src/5017000/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89 #ifndef PERL_IN_XSUB_RE
  90 #  include "charclass_invlists.h"
  91 #endif
  92
  93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  94
  95 #ifdef op
  96 #undef op
  97 #endif /* op */
  98
  99 #ifdef MSDOS
 100 #  if defined(BUGGY_MSC6)
 101  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 102 #    pragma optimize("a",off)
 103  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 104 #    pragma optimize("w",on )
 105 #  endif /* BUGGY_MSC6 */
 106 #endif /* MSDOS */
 107
 108 #ifndef STATIC
 109 #define STATIC  static
 110 #endif
 111
 112 typedef struct RExC_state_t {
 113     U32         flags;                  /* are we folding, multilining? */
 114     char        *precomp;               /* uncompiled string. */
 115     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 116     regexp      *rx;                    /* perl core regexp structure */
 117     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 118     char        *start;                 /* Start of input for compile */
 119     char        *end;                   /* End of input for compile */
 120     char        *parse;                 /* Input-scan pointer. */
 121     I32         whilem_seen;            /* number of WHILEM in this expr */
 122     regnode     *emit_start;            /* Start of emitted-code area */
 123     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 124     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 125     I32         naughty;                /* How bad is this pattern? */
 126     I32         sawback;                /* Did we see \1, ...? */
 127     U32         seen;
 128     I32         size;                   /* Code size. */
 129     I32         npar;                   /* Capture buffer count, (OPEN). */
 130     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 131     I32         nestroot;               /* root parens we are in - used by accept */
 132     I32         extralen;
 133     I32         seen_zerolen;
 134     I32         seen_evals;
 135     regnode     **open_parens;          /* pointers to open parens */
 136     regnode     **close_parens;         /* pointers to close parens */
 137     regnode     *opend;                 /* END node in program */
 138     I32         utf8;           /* whether the pattern is utf8 or not */
 139     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 140                                 /* XXX use this for future optimisation of case
 141                                  * where pattern must be upgraded to utf8. */
 142     I32         uni_semantics;  /* If a d charset modifier should use unicode
 143                                    rules, even if the pattern is not in
 144                                    utf8 */
 145     HV          *paren_names;           /* Paren names */
 146
 147     regnode     **recurse;              /* Recurse regops */
 148     I32         recurse_count;          /* Number of recurse regops */
 149     I32         in_lookbehind;
 150     I32         contains_locale;
 151     I32         override_recoding;
 152 #if ADD_TO_REGEXEC
 153     char        *starttry;              /* -Dr: where regtry was called. */
 154 #define RExC_starttry   (pRExC_state->starttry)
 155 #endif
 156 #ifdef DEBUGGING
 157     const char  *lastparse;
 158     I32         lastnum;
 159     AV          *paren_name_list;       /* idx -> name */
 160 #define RExC_lastparse  (pRExC_state->lastparse)
 161 #define RExC_lastnum    (pRExC_state->lastnum)
 162 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 163 #endif
 164 } RExC_state_t;
 165
 166 #define RExC_flags      (pRExC_state->flags)
 167 #define RExC_precomp    (pRExC_state->precomp)
 168 #define RExC_rx_sv      (pRExC_state->rx_sv)
 169 #define RExC_rx         (pRExC_state->rx)
 170 #define RExC_rxi        (pRExC_state->rxi)
 171 #define RExC_start      (pRExC_state->start)
 172 #define RExC_end        (pRExC_state->end)
 173 #define RExC_parse      (pRExC_state->parse)
 174 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 175 #ifdef RE_TRACK_PATTERN_OFFSETS
 176 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 177 #endif
 178 #define RExC_emit       (pRExC_state->emit)
 179 #define RExC_emit_start (pRExC_state->emit_start)
 180 #define RExC_emit_bound (pRExC_state->emit_bound)
 181 #define RExC_naughty    (pRExC_state->naughty)
 182 #define RExC_sawback    (pRExC_state->sawback)
 183 #define RExC_seen       (pRExC_state->seen)
 184 #define RExC_size       (pRExC_state->size)
 185 #define RExC_npar       (pRExC_state->npar)
 186 #define RExC_nestroot   (pRExC_state->nestroot)
 187 #define RExC_extralen   (pRExC_state->extralen)
 188 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 189 #define RExC_seen_evals (pRExC_state->seen_evals)
 190 #define RExC_utf8       (pRExC_state->utf8)
 191 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 192 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 193 #define RExC_open_parens        (pRExC_state->open_parens)
 194 #define RExC_close_parens       (pRExC_state->close_parens)
 195 #define RExC_opend      (pRExC_state->opend)
 196 #define RExC_paren_names        (pRExC_state->paren_names)
 197 #define RExC_recurse    (pRExC_state->recurse)
 198 #define RExC_recurse_count      (pRExC_state->recurse_count)
 199 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 200 #define RExC_contains_locale    (pRExC_state->contains_locale)
 201 #define RExC_override_recoding  (pRExC_state->override_recoding)
 202
 203
 204 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 205 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 206         ((*s) == '{' && regcurly(s)))
 207
 208 #ifdef SPSTART
 209 #undef SPSTART          /* dratted cpp namespace... */
 210 #endif
 211 /*
 212  * Flags to be passed up and down.
 213  */
 214 #define WORST           0       /* Worst case. */
 215 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 216
 217 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 218  * character, and if utf8, must be invariant.  Note that this is not the same
 219  * thing as REGNODE_SIMPLE */
 220 #define SIMPLE          0x02
 221 #define SPSTART         0x04    /* Starts with * or +. */
 222 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 223 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 224
 225 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 226
 227 /* whether trie related optimizations are enabled */
 228 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 229 #define TRIE_STUDY_OPT
 230 #define FULL_TRIE_STUDY
 231 #define TRIE_STCLASS
 232 #endif
 233
 234
 235
 236 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 237 #define PBITVAL(paren) (1 << ((paren) & 7))
 238 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 239 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 240 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 241
 242 /* If not already in utf8, do a longjmp back to the beginning */
 243 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 244 #define REQUIRE_UTF8    STMT_START {                                       \
 245                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 246                         } STMT_END
 247
 248 /* About scan_data_t.
 249
 250   During optimisation we recurse through the regexp program performing
 251   various inplace (keyhole style) optimisations. In addition study_chunk
 252   and scan_commit populate this data structure with information about
 253   what strings MUST appear in the pattern. We look for the longest
 254   string that must appear at a fixed location, and we look for the
 255   longest string that may appear at a floating location. So for instance
 256   in the pattern:
 257
 258     /FOO[xX]A.*B[xX]BAR/
 259
 260   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 261   strings (because they follow a .* construct). study_chunk will identify
 262   both FOO and BAR as being the longest fixed and floating strings respectively.
 263
 264   The strings can be composites, for instance
 265
 266      /(f)(o)(o)/
 267
 268   will result in a composite fixed substring 'foo'.
 269
 270   For each string some basic information is maintained:
 271
 272   - offset or min_offset
 273     This is the position the string must appear at, or not before.
 274     It also implicitly (when combined with minlenp) tells us how many
 275     characters must match before the string we are searching for.
 276     Likewise when combined with minlenp and the length of the string it
 277     tells us how many characters must appear after the string we have
 278     found.
 279
 280   - max_offset
 281     Only used for floating strings. This is the rightmost point that
 282     the string can appear at. If set to I32 max it indicates that the
 283     string can occur infinitely far to the right.
 284
 285   - minlenp
 286     A pointer to the minimum length of the pattern that the string
 287     was found inside. This is important as in the case of positive
 288     lookahead or positive lookbehind we can have multiple patterns
 289     involved. Consider
 290
 291     /(?=FOO).*F/
 292
 293     The minimum length of the pattern overall is 3, the minimum length
 294     of the lookahead part is 3, but the minimum length of the part that
 295     will actually match is 1. So 'FOO's minimum length is 3, but the
 296     minimum length for the F is 1. This is important as the minimum length
 297     is used to determine offsets in front of and behind the string being
 298     looked for.  Since strings can be composites this is the length of the
 299     pattern at the time it was committed with a scan_commit. Note that
 300     the length is calculated by study_chunk, so that the minimum lengths
 301     are not known until the full pattern has been compiled, thus the
 302     pointer to the value.
 303
 304   - lookbehind
 305
 306     In the case of lookbehind the string being searched for can be
 307     offset past the start point of the final matching string.
 308     If this value was just blithely removed from the min_offset it would
 309     invalidate some of the calculations for how many chars must match
 310     before or after (as they are derived from min_offset and minlen and
 311     the length of the string being searched for).
 312     When the final pattern is compiled and the data is moved from the
 313     scan_data_t structure into the regexp structure the information
 314     about lookbehind is factored in, with the information that would
 315     have been lost precalculated in the end_shift field for the
 316     associated string.
 317
 318   The fields pos_min and pos_delta are used to store the minimum offset
 319   and the delta to the maximum offset at the current point in the pattern.
 320
 321 */
 322
 323 typedef struct scan_data_t {
 324     /*I32 len_min;      unused */
 325     /*I32 len_delta;    unused */
 326     I32 pos_min;
 327     I32 pos_delta;
 328     SV *last_found;
 329     I32 last_end;           /* min value, <0 unless valid. */
 330     I32 last_start_min;
 331     I32 last_start_max;
 332     SV **longest;           /* Either &l_fixed, or &l_float. */
 333     SV *longest_fixed;      /* longest fixed string found in pattern */
 334     I32 offset_fixed;       /* offset where it starts */
 335     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 336     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 337     SV *longest_float;      /* longest floating string found in pattern */
 338     I32 offset_float_min;   /* earliest point in string it can appear */
 339     I32 offset_float_max;   /* latest point in string it can appear */
 340     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 341     I32 lookbehind_float;   /* is the position of the string modified by LB */
 342     I32 flags;
 343     I32 whilem_c;
 344     I32 *last_closep;
 345     struct regnode_charclass_class *start_class;
 346 } scan_data_t;
 347
 348 /*
 349  * Forward declarations for pregcomp()'s friends.
 350  */
 351
 352 static const scan_data_t zero_scan_data =
 353   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 354
 355 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 356 #define SF_BEFORE_SEOL          0x0001
 357 #define SF_BEFORE_MEOL          0x0002
 358 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 359 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 360
 361 #ifdef NO_UNARY_PLUS
 362 #  define SF_FIX_SHIFT_EOL      (0+2)
 363 #  define SF_FL_SHIFT_EOL               (0+4)
 364 #else
 365 #  define SF_FIX_SHIFT_EOL      (+2)
 366 #  define SF_FL_SHIFT_EOL               (+4)
 367 #endif
 368
 369 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 370 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 371
 372 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 373 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 374 #define SF_IS_INF               0x0040
 375 #define SF_HAS_PAR              0x0080
 376 #define SF_IN_PAR               0x0100
 377 #define SF_HAS_EVAL             0x0200
 378 #define SCF_DO_SUBSTR           0x0400
 379 #define SCF_DO_STCLASS_AND      0x0800
 380 #define SCF_DO_STCLASS_OR       0x1000
 381 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 382 #define SCF_WHILEM_VISITED_POS  0x2000
 383
 384 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 385 #define SCF_SEEN_ACCEPT         0x8000
 386
 387 #define UTF cBOOL(RExC_utf8)
 388
 389 /* The enums for all these are ordered so things work out correctly */
 390 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 391 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 392 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 393 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 394 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 395 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 396 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 397
 398 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 399
 400 #define OOB_UNICODE             12345678
 401 #define OOB_NAMEDCLASS          -1
 402
 403 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 404 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 405
 406
 407 /* length of regex to show in messages that don't mark a position within */
 408 #define RegexLengthToShowInErrorMessages 127
 409
 410 /*
 411  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 412  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 413  * op/pragma/warn/regcomp.
 414  */
 415 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 416 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 417
 418 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 419
 420 /*
 421  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 422  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 423  * "...".
 424  */
 425 #define _FAIL(code) STMT_START {                                        \
 426     const char *ellipses = "";                                          \
 427     IV len = RExC_end - RExC_precomp;                                   \
 428                                                                         \
 429     if (!SIZE_ONLY)                                                     \
 430         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 431     if (len > RegexLengthToShowInErrorMessages) {                       \
 432         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 433         len = RegexLengthToShowInErrorMessages - 10;                    \
 434         ellipses = "...";                                               \
 435     }                                                                   \
 436     code;                                                               \
 437 } STMT_END
 438
 439 #define FAIL(msg) _FAIL(                            \
 440     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 441             msg, (int)len, RExC_precomp, ellipses))
 442
 443 #define FAIL2(msg,arg) _FAIL(                       \
 444     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 445             arg, (int)len, RExC_precomp, ellipses))
 446
 447 /*
 448  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 449  */
 450 #define Simple_vFAIL(m) STMT_START {                                    \
 451     const IV offset = RExC_parse - RExC_precomp;                        \
 452     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 453             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 454 } STMT_END
 455
 456 /*
 457  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 458  */
 459 #define vFAIL(m) STMT_START {                           \
 460     if (!SIZE_ONLY)                                     \
 461         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 462     Simple_vFAIL(m);                                    \
 463 } STMT_END
 464
 465 /*
 466  * Like Simple_vFAIL(), but accepts two arguments.
 467  */
 468 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 469     const IV offset = RExC_parse - RExC_precomp;                        \
 470     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 471             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 472 } STMT_END
 473
 474 /*
 475  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 476  */
 477 #define vFAIL2(m,a1) STMT_START {                       \
 478     if (!SIZE_ONLY)                                     \
 479         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 480     Simple_vFAIL2(m, a1);                               \
 481 } STMT_END
 482
 483
 484 /*
 485  * Like Simple_vFAIL(), but accepts three arguments.
 486  */
 487 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 488     const IV offset = RExC_parse - RExC_precomp;                \
 489     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 490             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 491 } STMT_END
 492
 493 /*
 494  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 495  */
 496 #define vFAIL3(m,a1,a2) STMT_START {                    \
 497     if (!SIZE_ONLY)                                     \
 498         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 499     Simple_vFAIL3(m, a1, a2);                           \
 500 } STMT_END
 501
 502 /*
 503  * Like Simple_vFAIL(), but accepts four arguments.
 504  */
 505 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 506     const IV offset = RExC_parse - RExC_precomp;                \
 507     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 508             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 509 } STMT_END
 510
 511 #define ckWARNreg(loc,m) STMT_START {                                   \
 512     const IV offset = loc - RExC_precomp;                               \
 513     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 514             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 515 } STMT_END
 516
 517 #define ckWARNregdep(loc,m) STMT_START {                                \
 518     const IV offset = loc - RExC_precomp;                               \
 519     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 520             m REPORT_LOCATION,                                          \
 521             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 522 } STMT_END
 523
 524 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 527             m REPORT_LOCATION,                                          \
 528             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 529 } STMT_END
 530
 531 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 532     const IV offset = loc - RExC_precomp;                               \
 533     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 534             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 535 } STMT_END
 536
 537 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 538     const IV offset = loc - RExC_precomp;                               \
 539     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 540             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 541 } STMT_END
 542
 543 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 544     const IV offset = loc - RExC_precomp;                               \
 545     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 546             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 547 } STMT_END
 548
 549 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 550     const IV offset = loc - RExC_precomp;                               \
 551     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 552             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 553 } STMT_END
 554
 555 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 556     const IV offset = loc - RExC_precomp;                               \
 557     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 558             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 559 } STMT_END
 560
 561 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 562     const IV offset = loc - RExC_precomp;                               \
 563     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 564             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 565 } STMT_END
 566
 567
 568 /* Allow for side effects in s */
 569 #define REGC(c,s) STMT_START {                  \
 570     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 571 } STMT_END
 572
 573 /* Macros for recording node offsets.   20001227 mjd@plover.com
 574  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 575  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 576  * Element 0 holds the number n.
 577  * Position is 1 indexed.
 578  */
 579 #ifndef RE_TRACK_PATTERN_OFFSETS
 580 #define Set_Node_Offset_To_R(node,byte)
 581 #define Set_Node_Offset(node,byte)
 582 #define Set_Cur_Node_Offset
 583 #define Set_Node_Length_To_R(node,len)
 584 #define Set_Node_Length(node,len)
 585 #define Set_Node_Cur_Length(node)
 586 #define Node_Offset(n)
 587 #define Node_Length(n)
 588 #define Set_Node_Offset_Length(node,offset,len)
 589 #define ProgLen(ri) ri->u.proglen
 590 #define SetProgLen(ri,x) ri->u.proglen = x
 591 #else
 592 #define ProgLen(ri) ri->u.offsets[0]
 593 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 594 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 595     if (! SIZE_ONLY) {                                                  \
 596         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 597                     __LINE__, (int)(node), (int)(byte)));               \
 598         if((node) < 0) {                                                \
 599             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 600         } else {                                                        \
 601             RExC_offsets[2*(node)-1] = (byte);                          \
 602         }                                                               \
 603     }                                                                   \
 604 } STMT_END
 605
 606 #define Set_Node_Offset(node,byte) \
 607     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 608 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 609
 610 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 611     if (! SIZE_ONLY) {                                                  \
 612         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 613                 __LINE__, (int)(node), (int)(len)));                    \
 614         if((node) < 0) {                                                \
 615             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 616         } else {                                                        \
 617             RExC_offsets[2*(node)] = (len);                             \
 618         }                                                               \
 619     }                                                                   \
 620 } STMT_END
 621
 622 #define Set_Node_Length(node,len) \
 623     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 624 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 625 #define Set_Node_Cur_Length(node) \
 626     Set_Node_Length(node, RExC_parse - parse_start)
 627
 628 /* Get offsets and lengths */
 629 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 630 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 631
 632 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 633     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 634     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 635 } STMT_END
 636 #endif
 637
 638 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 639 #define EXPERIMENTAL_INPLACESCAN
 640 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 641
 642 #define DEBUG_STUDYDATA(str,data,depth)                              \
 643 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 644     PerlIO_printf(Perl_debug_log,                                    \
 645         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 646         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 647         (int)(depth)*2, "",                                          \
 648         (IV)((data)->pos_min),                                       \
 649         (IV)((data)->pos_delta),                                     \
 650         (UV)((data)->flags),                                         \
 651         (IV)((data)->whilem_c),                                      \
 652         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 653         is_inf ? "INF " : ""                                         \
 654     );                                                               \
 655     if ((data)->last_found)                                          \
 656         PerlIO_printf(Perl_debug_log,                                \
 657             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 658             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 659             SvPVX_const((data)->last_found),                         \
 660             (IV)((data)->last_end),                                  \
 661             (IV)((data)->last_start_min),                            \
 662             (IV)((data)->last_start_max),                            \
 663             ((data)->longest &&                                      \
 664              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 665             SvPVX_const((data)->longest_fixed),                      \
 666             (IV)((data)->offset_fixed),                              \
 667             ((data)->longest &&                                      \
 668              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 669             SvPVX_const((data)->longest_float),                      \
 670             (IV)((data)->offset_float_min),                          \
 671             (IV)((data)->offset_float_max)                           \
 672         );                                                           \
 673     PerlIO_printf(Perl_debug_log,"\n");                              \
 674 });
 675
 676 static void clear_re(pTHX_ void *r);
 677
 678 /* Mark that we cannot extend a found fixed substring at this point.
 679    Update the longest found anchored substring and the longest found
 680    floating substrings if needed. */
 681
 682 STATIC void
 683 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 684 {
 685     const STRLEN l = CHR_SVLEN(data->last_found);
 686     const STRLEN old_l = CHR_SVLEN(*data->longest);
 687     GET_RE_DEBUG_FLAGS_DECL;
 688
 689     PERL_ARGS_ASSERT_SCAN_COMMIT;
 690
 691     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 692         SvSetMagicSV(*data->longest, data->last_found);
 693         if (*data->longest == data->longest_fixed) {
 694             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 695             if (data->flags & SF_BEFORE_EOL)
 696                 data->flags
 697                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 698             else
 699                 data->flags &= ~SF_FIX_BEFORE_EOL;
 700             data->minlen_fixed=minlenp;
 701             data->lookbehind_fixed=0;
 702         }
 703         else { /* *data->longest == data->longest_float */
 704             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 705             data->offset_float_max = (l
 706                                       ? data->last_start_max
 707                                       : data->pos_min + data->pos_delta);
 708             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 709                 data->offset_float_max = I32_MAX;
 710             if (data->flags & SF_BEFORE_EOL)
 711                 data->flags
 712                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 713             else
 714                 data->flags &= ~SF_FL_BEFORE_EOL;
 715             data->minlen_float=minlenp;
 716             data->lookbehind_float=0;
 717         }
 718     }
 719     SvCUR_set(data->last_found, 0);
 720     {
 721         SV * const sv = data->last_found;
 722         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 723             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 724             if (mg)
 725                 mg->mg_len = 0;
 726         }
 727     }
 728     data->last_end = -1;
 729     data->flags &= ~SF_BEFORE_EOL;
 730     DEBUG_STUDYDATA("commit: ",data,0);
 731 }
 732
 733 /* Can match anything (initialization) */
 734 STATIC void
 735 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 736 {
 737     PERL_ARGS_ASSERT_CL_ANYTHING;
 738
 739     ANYOF_BITMAP_SETALL(cl);
 740     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 741                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 742
 743     /* If any portion of the regex is to operate under locale rules,
 744      * initialization includes it.  The reason this isn't done for all regexes
 745      * is that the optimizer was written under the assumption that locale was
 746      * all-or-nothing.  Given the complexity and lack of documentation in the
 747      * optimizer, and that there are inadequate test cases for locale, so many
 748      * parts of it may not work properly, it is safest to avoid locale unless
 749      * necessary. */
 750     if (RExC_contains_locale) {
 751         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 752         cl->flags |= ANYOF_LOCALE;
 753     }
 754     else {
 755         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 756     }
 757 }
 758
 759 /* Can match anything (initialization) */
 760 STATIC int
 761 S_cl_is_anything(const struct regnode_charclass_class *cl)
 762 {
 763     int value;
 764
 765     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 766
 767     for (value = 0; value <= ANYOF_MAX; value += 2)
 768         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 769             return 1;
 770     if (!(cl->flags & ANYOF_UNICODE_ALL))
 771         return 0;
 772     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 773         return 0;
 774     return 1;
 775 }
 776
 777 /* Can match anything (initialization) */
 778 STATIC void
 779 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 780 {
 781     PERL_ARGS_ASSERT_CL_INIT;
 782
 783     Zero(cl, 1, struct regnode_charclass_class);
 784     cl->type = ANYOF;
 785     cl_anything(pRExC_state, cl);
 786     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 787 }
 788
 789 /* These two functions currently do the exact same thing */
 790 #define cl_init_zero            S_cl_init
 791
 792 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 793  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 794  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 795 STATIC void
 796 S_cl_and(struct regnode_charclass_class *cl,
 797         const struct regnode_charclass_class *and_with)
 798 {
 799     PERL_ARGS_ASSERT_CL_AND;
 800
 801     assert(and_with->type == ANYOF);
 802
 803     /* I (khw) am not sure all these restrictions are necessary XXX */
 804     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 805         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 806         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 807         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 808         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 809         int i;
 810
 811         if (and_with->flags & ANYOF_INVERT)
 812             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 813                 cl->bitmap[i] &= ~and_with->bitmap[i];
 814         else
 815             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 816                 cl->bitmap[i] &= and_with->bitmap[i];
 817     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 818
 819     if (and_with->flags & ANYOF_INVERT) {
 820
 821         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 822          * aren't affected by the inversion.  Those that are affected are
 823          * handled individually below */
 824         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 825         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 826         cl->flags |= affected_flags;
 827
 828         /* We currently don't know how to deal with things that aren't in the
 829          * bitmap, but we know that the intersection is no greater than what
 830          * is already in cl, so let there be false positives that get sorted
 831          * out after the synthetic start class succeeds, and the node is
 832          * matched for real. */
 833
 834         /* The inversion of these two flags indicate that the resulting
 835          * intersection doesn't have them */
 836         if (and_with->flags & ANYOF_UNICODE_ALL) {
 837             cl->flags &= ~ANYOF_UNICODE_ALL;
 838         }
 839         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 840             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 841         }
 842     }
 843     else {   /* and'd node is not inverted */
 844         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 845
 846         if (! ANYOF_NONBITMAP(and_with)) {
 847
 848             /* Here 'and_with' doesn't match anything outside the bitmap
 849              * (except possibly ANYOF_UNICODE_ALL), which means the
 850              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 851              * which case we don't know what the intersection is, but it's no
 852              * greater than what cl already has, so can just leave it alone,
 853              * with possible false positives */
 854             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 855                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 856                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 857             }
 858         }
 859         else if (! ANYOF_NONBITMAP(cl)) {
 860
 861             /* Here, 'and_with' does match something outside the bitmap, and cl
 862              * doesn't have a list of things to match outside the bitmap.  If
 863              * cl can match all code points above 255, the intersection will
 864              * be those above-255 code points that 'and_with' matches.  If cl
 865              * can't match all Unicode code points, it means that it can't
 866              * match anything outside the bitmap (since the 'if' that got us
 867              * into this block tested for that), so we leave the bitmap empty.
 868              */
 869             if (cl->flags & ANYOF_UNICODE_ALL) {
 870                 ARG_SET(cl, ARG(and_with));
 871
 872                 /* and_with's ARG may match things that don't require UTF8.
 873                  * And now cl's will too, in spite of this being an 'and'.  See
 874                  * the comments below about the kludge */
 875                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 876             }
 877         }
 878         else {
 879             /* Here, both 'and_with' and cl match something outside the
 880              * bitmap.  Currently we do not do the intersection, so just match
 881              * whatever cl had at the beginning.  */
 882         }
 883
 884
 885         /* Take the intersection of the two sets of flags.  However, the
 886          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 887          * kludge around the fact that this flag is not treated like the others
 888          * which are initialized in cl_anything().  The way the optimizer works
 889          * is that the synthetic start class (SSC) is initialized to match
 890          * anything, and then the first time a real node is encountered, its
 891          * values are AND'd with the SSC's with the result being the values of
 892          * the real node.  However, there are paths through the optimizer where
 893          * the AND never gets called, so those initialized bits are set
 894          * inappropriately, which is not usually a big deal, as they just cause
 895          * false positives in the SSC, which will just mean a probably
 896          * imperceptible slow down in execution.  However this bit has a
 897          * higher false positive consequence in that it can cause utf8.pm,
 898          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 899          * bigger slowdown and also causes significant extra memory to be used.
 900          * In order to prevent this, the code now takes a different tack.  The
 901          * bit isn't set unless some part of the regular expression needs it,
 902          * but once set it won't get cleared.  This means that these extra
 903          * modules won't get loaded unless there was some path through the
 904          * pattern that would have required them anyway, and  so any false
 905          * positives that occur by not ANDing them out when they could be
 906          * aren't as severe as they would be if we treated this bit like all
 907          * the others */
 908         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 909                                       & ANYOF_NONBITMAP_NON_UTF8;
 910         cl->flags &= and_with->flags;
 911         cl->flags |= outside_bitmap_but_not_utf8;
 912     }
 913 }
 914
 915 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 916  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 917  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 918 STATIC void
 919 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 920 {
 921     PERL_ARGS_ASSERT_CL_OR;
 922
 923     if (or_with->flags & ANYOF_INVERT) {
 924
 925         /* Here, the or'd node is to be inverted.  This means we take the
 926          * complement of everything not in the bitmap, but currently we don't
 927          * know what that is, so give up and match anything */
 928         if (ANYOF_NONBITMAP(or_with)) {
 929             cl_anything(pRExC_state, cl);
 930         }
 931         /* We do not use
 932          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 933          *   <= (B1 | !B2) | (CL1 | !CL2)
 934          * which is wasteful if CL2 is small, but we ignore CL2:
 935          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 936          * XXXX Can we handle case-fold?  Unclear:
 937          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 938          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 939          */
 940         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 941              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 942              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 943             int i;
 944
 945             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 946                 cl->bitmap[i] |= ~or_with->bitmap[i];
 947         } /* XXXX: logic is complicated otherwise */
 948         else {
 949             cl_anything(pRExC_state, cl);
 950         }
 951
 952         /* And, we can just take the union of the flags that aren't affected
 953          * by the inversion */
 954         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 955
 956         /* For the remaining flags:
 957             ANYOF_UNICODE_ALL and inverted means to not match anything above
 958                     255, which means that the union with cl should just be
 959                     what cl has in it, so can ignore this flag
 960             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 961                     is 127-255 to match them, but then invert that, so the
 962                     union with cl should just be what cl has in it, so can
 963                     ignore this flag
 964          */
 965     } else {    /* 'or_with' is not inverted */
 966         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 967         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 968              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 969                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 970             int i;
 971
 972             /* OR char bitmap and class bitmap separately */
 973             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 974                 cl->bitmap[i] |= or_with->bitmap[i];
 975             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 976                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 977                     cl->classflags[i] |= or_with->classflags[i];
 978                 cl->flags |= ANYOF_CLASS;
 979             }
 980         }
 981         else { /* XXXX: logic is complicated, leave it along for a moment. */
 982             cl_anything(pRExC_state, cl);
 983         }
 984
 985         if (ANYOF_NONBITMAP(or_with)) {
 986
 987             /* Use the added node's outside-the-bit-map match if there isn't a
 988              * conflict.  If there is a conflict (both nodes match something
 989              * outside the bitmap, but what they match outside is not the same
 990              * pointer, and hence not easily compared until XXX we extend
 991              * inversion lists this far), give up and allow the start class to
 992              * match everything outside the bitmap.  If that stuff is all above
 993              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 994             if (! ANYOF_NONBITMAP(cl)) {
 995                 ARG_SET(cl, ARG(or_with));
 996             }
 997             else if (ARG(cl) != ARG(or_with)) {
 998
 999                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1000                     cl_anything(pRExC_state, cl);
1001                 }
1002                 else {
1003                     cl->flags |= ANYOF_UNICODE_ALL;
1004                 }
1005             }
1006         }
1007
1008         /* Take the union */
1009         cl->flags |= or_with->flags;
1010     }
1011 }
1012
1013 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1014 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1015 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1016 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1017
1018
1019 #ifdef DEBUGGING
1020 /*
1021    dump_trie(trie,widecharmap,revcharmap)
1022    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1023    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1024
1025    These routines dump out a trie in a somewhat readable format.
1026    The _interim_ variants are used for debugging the interim
1027    tables that are used to generate the final compressed
1028    representation which is what dump_trie expects.
1029
1030    Part of the reason for their existence is to provide a form
1031    of documentation as to how the different representations function.
1032
1033 */
1034
1035 /*
1036   Dumps the final compressed table form of the trie to Perl_debug_log.
1037   Used for debugging make_trie().
1038 */
1039
1040 STATIC void
1041 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1042             AV *revcharmap, U32 depth)
1043 {
1044     U32 state;
1045     SV *sv=sv_newmortal();
1046     int colwidth= widecharmap ? 6 : 4;
1047     U16 word;
1048     GET_RE_DEBUG_FLAGS_DECL;
1049
1050     PERL_ARGS_ASSERT_DUMP_TRIE;
1051
1052     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1053         (int)depth * 2 + 2,"",
1054         "Match","Base","Ofs" );
1055
1056     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1057         SV ** const tmp = av_fetch( revcharmap, state, 0);
1058         if ( tmp ) {
1059             PerlIO_printf( Perl_debug_log, "%*s",
1060                 colwidth,
1061                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1062                             PL_colors[0], PL_colors[1],
1063                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1064                             PERL_PV_ESCAPE_FIRSTCHAR
1065                 )
1066             );
1067         }
1068     }
1069     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1070         (int)depth * 2 + 2,"");
1071
1072     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1073         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1074     PerlIO_printf( Perl_debug_log, "\n");
1075
1076     for( state = 1 ; state < trie->statecount ; state++ ) {
1077         const U32 base = trie->states[ state ].trans.base;
1078
1079         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1080
1081         if ( trie->states[ state ].wordnum ) {
1082             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1083         } else {
1084             PerlIO_printf( Perl_debug_log, "%6s", "" );
1085         }
1086
1087         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1088
1089         if ( base ) {
1090             U32 ofs = 0;
1091
1092             while( ( base + ofs  < trie->uniquecharcount ) ||
1093                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1094                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1095                     ofs++;
1096
1097             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1098
1099             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1100                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1101                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1102                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1103                 {
1104                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1105                     colwidth,
1106                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1107                 } else {
1108                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1109                 }
1110             }
1111
1112             PerlIO_printf( Perl_debug_log, "]");
1113
1114         }
1115         PerlIO_printf( Perl_debug_log, "\n" );
1116     }
1117     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1118     for (word=1; word <= trie->wordcount; word++) {
1119         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1120             (int)word, (int)(trie->wordinfo[word].prev),
1121             (int)(trie->wordinfo[word].len));
1122     }
1123     PerlIO_printf(Perl_debug_log, "\n" );
1124 }
1125 /*
1126   Dumps a fully constructed but uncompressed trie in list form.
1127   List tries normally only are used for construction when the number of
1128   possible chars (trie->uniquecharcount) is very high.
1129   Used for debugging make_trie().
1130 */
1131 STATIC void
1132 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1133                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1134                          U32 depth)
1135 {
1136     U32 state;
1137     SV *sv=sv_newmortal();
1138     int colwidth= widecharmap ? 6 : 4;
1139     GET_RE_DEBUG_FLAGS_DECL;
1140
1141     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1142
1143     /* print out the table precompression.  */
1144     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1145         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1146         "------:-----+-----------------\n" );
1147
1148     for( state=1 ; state < next_alloc ; state ++ ) {
1149         U16 charid;
1150
1151         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1152             (int)depth * 2 + 2,"", (UV)state  );
1153         if ( ! trie->states[ state ].wordnum ) {
1154             PerlIO_printf( Perl_debug_log, "%5s| ","");
1155         } else {
1156             PerlIO_printf( Perl_debug_log, "W%4x| ",
1157                 trie->states[ state ].wordnum
1158             );
1159         }
1160         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1161             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1162             if ( tmp ) {
1163                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1164                     colwidth,
1165                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1166                             PL_colors[0], PL_colors[1],
1167                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1168                             PERL_PV_ESCAPE_FIRSTCHAR
1169                     ) ,
1170                     TRIE_LIST_ITEM(state,charid).forid,
1171                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1172                 );
1173                 if (!(charid % 10))
1174                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1175                         (int)((depth * 2) + 14), "");
1176             }
1177         }
1178         PerlIO_printf( Perl_debug_log, "\n");
1179     }
1180 }
1181
1182 /*
1183   Dumps a fully constructed but uncompressed trie in table form.
1184   This is the normal DFA style state transition table, with a few
1185   twists to facilitate compression later.
1186   Used for debugging make_trie().
1187 */
1188 STATIC void
1189 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1190                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1191                           U32 depth)
1192 {
1193     U32 state;
1194     U16 charid;
1195     SV *sv=sv_newmortal();
1196     int colwidth= widecharmap ? 6 : 4;
1197     GET_RE_DEBUG_FLAGS_DECL;
1198
1199     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1200
1201     /*
1202        print out the table precompression so that we can do a visual check
1203        that they are identical.
1204      */
1205
1206     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1207
1208     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1209         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1210         if ( tmp ) {
1211             PerlIO_printf( Perl_debug_log, "%*s",
1212                 colwidth,
1213                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1214                             PL_colors[0], PL_colors[1],
1215                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1216                             PERL_PV_ESCAPE_FIRSTCHAR
1217                 )
1218             );
1219         }
1220     }
1221
1222     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1223
1224     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1225         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1226     }
1227
1228     PerlIO_printf( Perl_debug_log, "\n" );
1229
1230     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1231
1232         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1233             (int)depth * 2 + 2,"",
1234             (UV)TRIE_NODENUM( state ) );
1235
1236         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1237             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1238             if (v)
1239                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1240             else
1241                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1242         }
1243         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1244             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1245         } else {
1246             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1247             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1248         }
1249     }
1250 }
1251
1252 #endif
1253
1254
1255 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1256   startbranch: the first branch in the whole branch sequence
1257   first      : start branch of sequence of branch-exact nodes.
1258                May be the same as startbranch
1259   last       : Thing following the last branch.
1260                May be the same as tail.
1261   tail       : item following the branch sequence
1262   count      : words in the sequence
1263   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1264   depth      : indent depth
1265
1266 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1267
1268 A trie is an N'ary tree where the branches are determined by digital
1269 decomposition of the key. IE, at the root node you look up the 1st character and
1270 follow that branch repeat until you find the end of the branches. Nodes can be
1271 marked as "accepting" meaning they represent a complete word. Eg:
1272
1273   /he|she|his|hers/
1274
1275 would convert into the following structure. Numbers represent states, letters
1276 following numbers represent valid transitions on the letter from that state, if
1277 the number is in square brackets it represents an accepting state, otherwise it
1278 will be in parenthesis.
1279
1280       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1281       |    |
1282       |   (2)
1283       |    |
1284      (1)   +-i->(6)-+-s->[7]
1285       |
1286       +-s->(3)-+-h->(4)-+-e->[5]
1287
1288       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1289
1290 This shows that when matching against the string 'hers' we will begin at state 1
1291 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1292 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1293 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1294 single traverse. We store a mapping from accepting to state to which word was
1295 matched, and then when we have multiple possibilities we try to complete the
1296 rest of the regex in the order in which they occured in the alternation.
1297
1298 The only prior NFA like behaviour that would be changed by the TRIE support is
1299 the silent ignoring of duplicate alternations which are of the form:
1300
1301  / (DUPE|DUPE) X? (?{ ... }) Y /x
1302
1303 Thus EVAL blocks following a trie may be called a different number of times with
1304 and without the optimisation. With the optimisations dupes will be silently
1305 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1306 the following demonstrates:
1307
1308  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1309
1310 which prints out 'word' three times, but
1311
1312  'words'=~/(word|word|word)(?{ print $1 })S/
1313
1314 which doesnt print it out at all. This is due to other optimisations kicking in.
1315
1316 Example of what happens on a structural level:
1317
1318 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1319
1320    1: CURLYM[1] {1,32767}(18)
1321    5:   BRANCH(8)
1322    6:     EXACT <ac>(16)
1323    8:   BRANCH(11)
1324    9:     EXACT <ad>(16)
1325   11:   BRANCH(14)
1326   12:     EXACT <ab>(16)
1327   16:   SUCCEED(0)
1328   17:   NOTHING(18)
1329   18: END(0)
1330
1331 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1332 and should turn into:
1333
1334    1: CURLYM[1] {1,32767}(18)
1335    5:   TRIE(16)
1336         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1337           <ac>
1338           <ad>
1339           <ab>
1340   16:   SUCCEED(0)
1341   17:   NOTHING(18)
1342   18: END(0)
1343
1344 Cases where tail != last would be like /(?foo|bar)baz/:
1345
1346    1: BRANCH(4)
1347    2:   EXACT <foo>(8)
1348    4: BRANCH(7)
1349    5:   EXACT <bar>(8)
1350    7: TAIL(8)
1351    8: EXACT <baz>(10)
1352   10: END(0)
1353
1354 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1355 and would end up looking like:
1356
1357     1: TRIE(8)
1358       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1359         <foo>
1360         <bar>
1361    7: TAIL(8)
1362    8: EXACT <baz>(10)
1363   10: END(0)
1364
1365     d = uvuni_to_utf8_flags(d, uv, 0);
1366
1367 is the recommended Unicode-aware way of saying
1368
1369     *(d++) = uv;
1370 */
1371
1372 #define TRIE_STORE_REVCHAR(val)                                            \
1373     STMT_START {                                                           \
1374         if (UTF) {                                                         \
1375             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1376             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1377             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1378             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1379             SvPOK_on(zlopp);                                               \
1380             SvUTF8_on(zlopp);                                              \
1381             av_push(revcharmap, zlopp);                                    \
1382         } else {                                                           \
1383             char ooooff = (char)val;                                           \
1384             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1385         }                                                                  \
1386         } STMT_END
1387
1388 #define TRIE_READ_CHAR STMT_START {                                                     \
1389     wordlen++;                                                                          \
1390     if ( UTF ) {                                                                        \
1391         /* if it is UTF then it is either already folded, or does not need folding */   \
1392         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1393     }                                                                                   \
1394     else if (folder == PL_fold_latin1) {                                                \
1395         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1396         if ( foldlen > 0 ) {                                                            \
1397            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1398            foldlen -= len;                                                              \
1399            scan += len;                                                                 \
1400            len = 0;                                                                     \
1401         } else {                                                                        \
1402             len = 1;                                                                    \
1403             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1404             skiplen = UNISKIP(uvc);                                                     \
1405             foldlen -= skiplen;                                                         \
1406             scan = foldbuf + skiplen;                                                   \
1407         }                                                                               \
1408     } else {                                                                            \
1409         /* raw data, will be folded later if needed */                                  \
1410         uvc = (U32)*uc;                                                                 \
1411         len = 1;                                                                        \
1412     }                                                                                   \
1413 } STMT_END
1414
1415
1416
1417 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1418     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1419         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1420         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1421     }                                                           \
1422     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1423     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1424     TRIE_LIST_CUR( state )++;                                   \
1425 } STMT_END
1426
1427 #define TRIE_LIST_NEW(state) STMT_START {                       \
1428     Newxz( trie->states[ state ].trans.list,               \
1429         4, reg_trie_trans_le );                                 \
1430      TRIE_LIST_CUR( state ) = 1;                                \
1431      TRIE_LIST_LEN( state ) = 4;                                \
1432 } STMT_END
1433
1434 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1435     U16 dupe= trie->states[ state ].wordnum;                    \
1436     regnode * const noper_next = regnext( noper );              \
1437                                                                 \
1438     DEBUG_r({                                                   \
1439         /* store the word for dumping */                        \
1440         SV* tmp;                                                \
1441         if (OP(noper) != NOTHING)                               \
1442             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1443         else                                                    \
1444             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1445         av_push( trie_words, tmp );                             \
1446     });                                                         \
1447                                                                 \
1448     curword++;                                                  \
1449     trie->wordinfo[curword].prev   = 0;                         \
1450     trie->wordinfo[curword].len    = wordlen;                   \
1451     trie->wordinfo[curword].accept = state;                     \
1452                                                                 \
1453     if ( noper_next < tail ) {                                  \
1454         if (!trie->jump)                                        \
1455             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1456         trie->jump[curword] = (U16)(noper_next - convert);      \
1457         if (!jumper)                                            \
1458             jumper = noper_next;                                \
1459         if (!nextbranch)                                        \
1460             nextbranch= regnext(cur);                           \
1461     }                                                           \
1462                                                                 \
1463     if ( dupe ) {                                               \
1464         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1465         /* chain, so that when the bits of chain are later    */\
1466         /* linked together, the dups appear in the chain      */\
1467         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1468         trie->wordinfo[dupe].prev = curword;                    \
1469     } else {                                                    \
1470         /* we haven't inserted this word yet.                */ \
1471         trie->states[ state ].wordnum = curword;                \
1472     }                                                           \
1473 } STMT_END
1474
1475
1476 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1477      ( ( base + charid >=  ucharcount                                   \
1478          && base + charid < ubound                                      \
1479          && state == trie->trans[ base - ucharcount + charid ].check    \
1480          && trie->trans[ base - ucharcount + charid ].next )            \
1481            ? trie->trans[ base - ucharcount + charid ].next             \
1482            : ( state==1 ? special : 0 )                                 \
1483       )
1484
1485 #define MADE_TRIE       1
1486 #define MADE_JUMP_TRIE  2
1487 #define MADE_EXACT_TRIE 4
1488
1489 STATIC I32
1490 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1491 {
1492     dVAR;
1493     /* first pass, loop through and scan words */
1494     reg_trie_data *trie;
1495     HV *widecharmap = NULL;
1496     AV *revcharmap = newAV();
1497     regnode *cur;
1498     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1499     STRLEN len = 0;
1500     UV uvc = 0;
1501     U16 curword = 0;
1502     U32 next_alloc = 0;
1503     regnode *jumper = NULL;
1504     regnode *nextbranch = NULL;
1505     regnode *convert = NULL;
1506     U32 *prev_states; /* temp array mapping each state to previous one */
1507     /* we just use folder as a flag in utf8 */
1508     const U8 * folder = NULL;
1509
1510 #ifdef DEBUGGING
1511     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1512     AV *trie_words = NULL;
1513     /* along with revcharmap, this only used during construction but both are
1514      * useful during debugging so we store them in the struct when debugging.
1515      */
1516 #else
1517     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1518     STRLEN trie_charcount=0;
1519 #endif
1520     SV *re_trie_maxbuff;
1521     GET_RE_DEBUG_FLAGS_DECL;
1522
1523     PERL_ARGS_ASSERT_MAKE_TRIE;
1524 #ifndef DEBUGGING
1525     PERL_UNUSED_ARG(depth);
1526 #endif
1527
1528     switch (flags) {
1529         case EXACT: break;
1530         case EXACTFA:
1531         case EXACTFU_SS:
1532         case EXACTFU_TRICKYFOLD:
1533         case EXACTFU: folder = PL_fold_latin1; break;
1534         case EXACTF:  folder = PL_fold; break;
1535         case EXACTFL: folder = PL_fold_locale; break;
1536         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1537     }
1538
1539     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1540     trie->refcount = 1;
1541     trie->startstate = 1;
1542     trie->wordcount = word_count;
1543     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1544     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1545     if (flags == EXACT)
1546         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1547     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1548                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1549
1550     DEBUG_r({
1551         trie_words = newAV();
1552     });
1553
1554     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1555     if (!SvIOK(re_trie_maxbuff)) {
1556         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1557     }
1558     DEBUG_OPTIMISE_r({
1559                 PerlIO_printf( Perl_debug_log,
1560                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1561                   (int)depth * 2 + 2, "",
1562                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1563                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1564                   (int)depth);
1565     });
1566
1567    /* Find the node we are going to overwrite */
1568     if ( first == startbranch && OP( last ) != BRANCH ) {
1569         /* whole branch chain */
1570         convert = first;
1571     } else {
1572         /* branch sub-chain */
1573         convert = NEXTOPER( first );
1574     }
1575
1576     /*  -- First loop and Setup --
1577
1578        We first traverse the branches and scan each word to determine if it
1579        contains widechars, and how many unique chars there are, this is
1580        important as we have to build a table with at least as many columns as we
1581        have unique chars.
1582
1583        We use an array of integers to represent the character codes 0..255
1584        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1585        native representation of the character value as the key and IV's for the
1586        coded index.
1587
1588        *TODO* If we keep track of how many times each character is used we can
1589        remap the columns so that the table compression later on is more
1590        efficient in terms of memory by ensuring the most common value is in the
1591        middle and the least common are on the outside.  IMO this would be better
1592        than a most to least common mapping as theres a decent chance the most
1593        common letter will share a node with the least common, meaning the node
1594        will not be compressible. With a middle is most common approach the worst
1595        case is when we have the least common nodes twice.
1596
1597      */
1598
1599     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1600         regnode * const noper = NEXTOPER( cur );
1601         const U8 *uc = (U8*)STRING( noper );
1602         const U8 * const e  = uc + STR_LEN( noper );
1603         STRLEN foldlen = 0;
1604         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1605         STRLEN skiplen = 0;
1606         const U8 *scan = (U8*)NULL;
1607         U32 wordlen      = 0;         /* required init */
1608         STRLEN chars = 0;
1609         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1610
1611         if (OP(noper) == NOTHING) {
1612             trie->minlen= 0;
1613             continue;
1614         }
1615         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1616             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1617                                           regardless of encoding */
1618             if (OP( noper ) == EXACTFU_SS) {
1619                 /* false positives are ok, so just set this */
1620                 TRIE_BITMAP_SET(trie,0xDF);
1621             }
1622         }
1623         for ( ; uc < e ; uc += len ) {
1624             TRIE_CHARCOUNT(trie)++;
1625             TRIE_READ_CHAR;
1626             chars++;
1627             if ( uvc < 256 ) {
1628                 if ( folder ) {
1629                     U8 folded= folder[ (U8) uvc ];
1630                     if ( !trie->charmap[ folded ] ) {
1631                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1632                         TRIE_STORE_REVCHAR( folded );
1633                     }
1634                 }
1635                 if ( !trie->charmap[ uvc ] ) {
1636                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1637                     TRIE_STORE_REVCHAR( uvc );
1638                 }
1639                 if ( set_bit ) {
1640                     /* store the codepoint in the bitmap, and its folded
1641                      * equivalent. */
1642                     TRIE_BITMAP_SET(trie, uvc);
1643
1644                     /* store the folded codepoint */
1645                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1646
1647                     if ( !UTF ) {
1648                         /* store first byte of utf8 representation of
1649                            variant codepoints */
1650                         if (! UNI_IS_INVARIANT(uvc)) {
1651                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1652                         }
1653                     }
1654                     set_bit = 0; /* We've done our bit :-) */
1655                 }
1656             } else {
1657                 SV** svpp;
1658                 if ( !widecharmap )
1659                     widecharmap = newHV();
1660
1661                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1662
1663                 if ( !svpp )
1664                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1665
1666                 if ( !SvTRUE( *svpp ) ) {
1667                     sv_setiv( *svpp, ++trie->uniquecharcount );
1668                     TRIE_STORE_REVCHAR(uvc);
1669                 }
1670             }
1671         }
1672         if( cur == first ) {
1673             trie->minlen = chars;
1674             trie->maxlen = chars;
1675         } else if (chars < trie->minlen) {
1676             trie->minlen = chars;
1677         } else if (chars > trie->maxlen) {
1678             trie->maxlen = chars;
1679         }
1680         if (OP( noper ) == EXACTFU_SS) {
1681             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1682             if (trie->minlen > 1)
1683                 trie->minlen= 1;
1684         }
1685         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1686             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1687              *                - We assume that any such sequence might match a 2 byte string */
1688             if (trie->minlen > 2 )
1689                 trie->minlen= 2;
1690         }
1691
1692     } /* end first pass */
1693     DEBUG_TRIE_COMPILE_r(
1694         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1695                 (int)depth * 2 + 2,"",
1696                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1697                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1698                 (int)trie->minlen, (int)trie->maxlen )
1699     );
1700
1701     /*
1702         We now know what we are dealing with in terms of unique chars and
1703         string sizes so we can calculate how much memory a naive
1704         representation using a flat table  will take. If it's over a reasonable
1705         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1706         conservative but potentially much slower representation using an array
1707         of lists.
1708
1709         At the end we convert both representations into the same compressed
1710         form that will be used in regexec.c for matching with. The latter
1711         is a form that cannot be used to construct with but has memory
1712         properties similar to the list form and access properties similar
1713         to the table form making it both suitable for fast searches and
1714         small enough that its feasable to store for the duration of a program.
1715
1716         See the comment in the code where the compressed table is produced
1717         inplace from the flat tabe representation for an explanation of how
1718         the compression works.
1719
1720     */
1721
1722
1723     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1724     prev_states[1] = 0;
1725
1726     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1727         /*
1728             Second Pass -- Array Of Lists Representation
1729
1730             Each state will be represented by a list of charid:state records
1731             (reg_trie_trans_le) the first such element holds the CUR and LEN
1732             points of the allocated array. (See defines above).
1733
1734             We build the initial structure using the lists, and then convert
1735             it into the compressed table form which allows faster lookups
1736             (but cant be modified once converted).
1737         */
1738
1739         STRLEN transcount = 1;
1740
1741         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1742             "%*sCompiling trie using list compiler\n",
1743             (int)depth * 2 + 2, ""));
1744
1745         trie->states = (reg_trie_state *)
1746             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1747                                   sizeof(reg_trie_state) );
1748         TRIE_LIST_NEW(1);
1749         next_alloc = 2;
1750
1751         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1752
1753             regnode * const noper = NEXTOPER( cur );
1754             U8 *uc           = (U8*)STRING( noper );
1755             const U8 * const e = uc + STR_LEN( noper );
1756             U32 state        = 1;         /* required init */
1757             U16 charid       = 0;         /* sanity init */
1758             U8 *scan         = (U8*)NULL; /* sanity init */
1759             STRLEN foldlen   = 0;         /* required init */
1760             U32 wordlen      = 0;         /* required init */
1761             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1762             STRLEN skiplen   = 0;
1763
1764             if (OP(noper) != NOTHING) {
1765                 for ( ; uc < e ; uc += len ) {
1766
1767                     TRIE_READ_CHAR;
1768
1769                     if ( uvc < 256 ) {
1770                         charid = trie->charmap[ uvc ];
1771                     } else {
1772                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1773                         if ( !svpp ) {
1774                             charid = 0;
1775                         } else {
1776                             charid=(U16)SvIV( *svpp );
1777                         }
1778                     }
1779                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1780                     if ( charid ) {
1781
1782                         U16 check;
1783                         U32 newstate = 0;
1784
1785                         charid--;
1786                         if ( !trie->states[ state ].trans.list ) {
1787                             TRIE_LIST_NEW( state );
1788                         }
1789                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1790                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1791                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1792                                 break;
1793                             }
1794                         }
1795                         if ( ! newstate ) {
1796                             newstate = next_alloc++;
1797                             prev_states[newstate] = state;
1798                             TRIE_LIST_PUSH( state, charid, newstate );
1799                             transcount++;
1800                         }
1801                         state = newstate;
1802                     } else {
1803                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1804                     }
1805                 }
1806             }
1807             TRIE_HANDLE_WORD(state);
1808
1809         } /* end second pass */
1810
1811         /* next alloc is the NEXT state to be allocated */
1812         trie->statecount = next_alloc;
1813         trie->states = (reg_trie_state *)
1814             PerlMemShared_realloc( trie->states,
1815                                    next_alloc
1816                                    * sizeof(reg_trie_state) );
1817
1818         /* and now dump it out before we compress it */
1819         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1820                                                          revcharmap, next_alloc,
1821                                                          depth+1)
1822         );
1823
1824         trie->trans = (reg_trie_trans *)
1825             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1826         {
1827             U32 state;
1828             U32 tp = 0;
1829             U32 zp = 0;
1830
1831
1832             for( state=1 ; state < next_alloc ; state ++ ) {
1833                 U32 base=0;
1834
1835                 /*
1836                 DEBUG_TRIE_COMPILE_MORE_r(
1837                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1838                 );
1839                 */
1840
1841                 if (trie->states[state].trans.list) {
1842                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1843                     U16 maxid=minid;
1844                     U16 idx;
1845
1846                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1847                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1848                         if ( forid < minid ) {
1849                             minid=forid;
1850                         } else if ( forid > maxid ) {
1851                             maxid=forid;
1852                         }
1853                     }
1854                     if ( transcount < tp + maxid - minid + 1) {
1855                         transcount *= 2;
1856                         trie->trans = (reg_trie_trans *)
1857                             PerlMemShared_realloc( trie->trans,
1858                                                      transcount
1859                                                      * sizeof(reg_trie_trans) );
1860                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1861                     }
1862                     base = trie->uniquecharcount + tp - minid;
1863                     if ( maxid == minid ) {
1864                         U32 set = 0;
1865                         for ( ; zp < tp ; zp++ ) {
1866                             if ( ! trie->trans[ zp ].next ) {
1867                                 base = trie->uniquecharcount + zp - minid;
1868                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1869                                 trie->trans[ zp ].check = state;
1870                                 set = 1;
1871                                 break;
1872                             }
1873                         }
1874                         if ( !set ) {
1875                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1876                             trie->trans[ tp ].check = state;
1877                             tp++;
1878                             zp = tp;
1879                         }
1880                     } else {
1881                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1882                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1883                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1884                             trie->trans[ tid ].check = state;
1885                         }
1886                         tp += ( maxid - minid + 1 );
1887                     }
1888                     Safefree(trie->states[ state ].trans.list);
1889                 }
1890                 /*
1891                 DEBUG_TRIE_COMPILE_MORE_r(
1892                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1893                 );
1894                 */
1895                 trie->states[ state ].trans.base=base;
1896             }
1897             trie->lasttrans = tp + 1;
1898         }
1899     } else {
1900         /*
1901            Second Pass -- Flat Table Representation.
1902
1903            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1904            We know that we will need Charcount+1 trans at most to store the data
1905            (one row per char at worst case) So we preallocate both structures
1906            assuming worst case.
1907
1908            We then construct the trie using only the .next slots of the entry
1909            structs.
1910
1911            We use the .check field of the first entry of the node temporarily to
1912            make compression both faster and easier by keeping track of how many non
1913            zero fields are in the node.
1914
1915            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1916            transition.
1917
1918            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1919            number representing the first entry of the node, and state as a
1920            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1921            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1922            are 2 entrys per node. eg:
1923
1924              A B       A B
1925           1. 2 4    1. 3 7
1926           2. 0 3    3. 0 5
1927           3. 0 0    5. 0 0
1928           4. 0 0    7. 0 0
1929
1930            The table is internally in the right hand, idx form. However as we also
1931            have to deal with the states array which is indexed by nodenum we have to
1932            use TRIE_NODENUM() to convert.
1933
1934         */
1935         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1936             "%*sCompiling trie using table compiler\n",
1937             (int)depth * 2 + 2, ""));
1938
1939         trie->trans = (reg_trie_trans *)
1940             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1941                                   * trie->uniquecharcount + 1,
1942                                   sizeof(reg_trie_trans) );
1943         trie->states = (reg_trie_state *)
1944             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1945                                   sizeof(reg_trie_state) );
1946         next_alloc = trie->uniquecharcount + 1;
1947
1948
1949         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1950
1951             regnode * const noper   = NEXTOPER( cur );
1952             const U8 *uc     = (U8*)STRING( noper );
1953             const U8 * const e = uc + STR_LEN( noper );
1954
1955             U32 state        = 1;         /* required init */
1956
1957             U16 charid       = 0;         /* sanity init */
1958             U32 accept_state = 0;         /* sanity init */
1959             U8 *scan         = (U8*)NULL; /* sanity init */
1960
1961             STRLEN foldlen   = 0;         /* required init */
1962             U32 wordlen      = 0;         /* required init */
1963             STRLEN skiplen   = 0;
1964             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1965
1966
1967             if ( OP(noper) != NOTHING ) {
1968                 for ( ; uc < e ; uc += len ) {
1969
1970                     TRIE_READ_CHAR;
1971
1972                     if ( uvc < 256 ) {
1973                         charid = trie->charmap[ uvc ];
1974                     } else {
1975                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1976                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1977                     }
1978                     if ( charid ) {
1979                         charid--;
1980                         if ( !trie->trans[ state + charid ].next ) {
1981                             trie->trans[ state + charid ].next = next_alloc;
1982                             trie->trans[ state ].check++;
1983                             prev_states[TRIE_NODENUM(next_alloc)]
1984                                     = TRIE_NODENUM(state);
1985                             next_alloc += trie->uniquecharcount;
1986                         }
1987                         state = trie->trans[ state + charid ].next;
1988                     } else {
1989                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1990                     }
1991                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1992                 }
1993             }
1994             accept_state = TRIE_NODENUM( state );
1995             TRIE_HANDLE_WORD(accept_state);
1996
1997         } /* end second pass */
1998
1999         /* and now dump it out before we compress it */
2000         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2001                                                           revcharmap,
2002                                                           next_alloc, depth+1));
2003
2004         {
2005         /*
2006            * Inplace compress the table.*
2007
2008            For sparse data sets the table constructed by the trie algorithm will
2009            be mostly 0/FAIL transitions or to put it another way mostly empty.
2010            (Note that leaf nodes will not contain any transitions.)
2011
2012            This algorithm compresses the tables by eliminating most such
2013            transitions, at the cost of a modest bit of extra work during lookup:
2014
2015            - Each states[] entry contains a .base field which indicates the
2016            index in the state[] array wheres its transition data is stored.
2017
2018            - If .base is 0 there are no valid transitions from that node.
2019
2020            - If .base is nonzero then charid is added to it to find an entry in
2021            the trans array.
2022
2023            -If trans[states[state].base+charid].check!=state then the
2024            transition is taken to be a 0/Fail transition. Thus if there are fail
2025            transitions at the front of the node then the .base offset will point
2026            somewhere inside the previous nodes data (or maybe even into a node
2027            even earlier), but the .check field determines if the transition is
2028            valid.
2029
2030            XXX - wrong maybe?
2031            The following process inplace converts the table to the compressed
2032            table: We first do not compress the root node 1,and mark all its
2033            .check pointers as 1 and set its .base pointer as 1 as well. This
2034            allows us to do a DFA construction from the compressed table later,
2035            and ensures that any .base pointers we calculate later are greater
2036            than 0.
2037
2038            - We set 'pos' to indicate the first entry of the second node.
2039
2040            - We then iterate over the columns of the node, finding the first and
2041            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2042            and set the .check pointers accordingly, and advance pos
2043            appropriately and repreat for the next node. Note that when we copy
2044            the next pointers we have to convert them from the original
2045            NODEIDX form to NODENUM form as the former is not valid post
2046            compression.
2047
2048            - If a node has no transitions used we mark its base as 0 and do not
2049            advance the pos pointer.
2050
2051            - If a node only has one transition we use a second pointer into the
2052            structure to fill in allocated fail transitions from other states.
2053            This pointer is independent of the main pointer and scans forward
2054            looking for null transitions that are allocated to a state. When it
2055            finds one it writes the single transition into the "hole".  If the
2056            pointer doesnt find one the single transition is appended as normal.
2057
2058            - Once compressed we can Renew/realloc the structures to release the
2059            excess space.
2060
2061            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2062            specifically Fig 3.47 and the associated pseudocode.
2063
2064            demq
2065         */
2066         const U32 laststate = TRIE_NODENUM( next_alloc );
2067         U32 state, charid;
2068         U32 pos = 0, zp=0;
2069         trie->statecount = laststate;
2070
2071         for ( state = 1 ; state < laststate ; state++ ) {
2072             U8 flag = 0;
2073             const U32 stateidx = TRIE_NODEIDX( state );
2074             const U32 o_used = trie->trans[ stateidx ].check;
2075             U32 used = trie->trans[ stateidx ].check;
2076             trie->trans[ stateidx ].check = 0;
2077
2078             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2079                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2080                     if ( trie->trans[ stateidx + charid ].next ) {
2081                         if (o_used == 1) {
2082                             for ( ; zp < pos ; zp++ ) {
2083                                 if ( ! trie->trans[ zp ].next ) {
2084                                     break;
2085                                 }
2086                             }
2087                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2088                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2089                             trie->trans[ zp ].check = state;
2090                             if ( ++zp > pos ) pos = zp;
2091                             break;
2092                         }
2093                         used--;
2094                     }
2095                     if ( !flag ) {
2096                         flag = 1;
2097                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2098                     }
2099                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2100                     trie->trans[ pos ].check = state;
2101                     pos++;
2102                 }
2103             }
2104         }
2105         trie->lasttrans = pos + 1;
2106         trie->states = (reg_trie_state *)
2107             PerlMemShared_realloc( trie->states, laststate
2108                                    * sizeof(reg_trie_state) );
2109         DEBUG_TRIE_COMPILE_MORE_r(
2110                 PerlIO_printf( Perl_debug_log,
2111                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2112                     (int)depth * 2 + 2,"",
2113                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2114                     (IV)next_alloc,
2115                     (IV)pos,
2116                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2117             );
2118
2119         } /* end table compress */
2120     }
2121     DEBUG_TRIE_COMPILE_MORE_r(
2122             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2123                 (int)depth * 2 + 2, "",
2124                 (UV)trie->statecount,
2125                 (UV)trie->lasttrans)
2126     );
2127     /* resize the trans array to remove unused space */
2128     trie->trans = (reg_trie_trans *)
2129         PerlMemShared_realloc( trie->trans, trie->lasttrans
2130                                * sizeof(reg_trie_trans) );
2131
2132     {   /* Modify the program and insert the new TRIE node */
2133         U8 nodetype =(U8)(flags & 0xFF);
2134         char *str=NULL;
2135
2136 #ifdef DEBUGGING
2137         regnode *optimize = NULL;
2138 #ifdef RE_TRACK_PATTERN_OFFSETS
2139
2140         U32 mjd_offset = 0;
2141         U32 mjd_nodelen = 0;
2142 #endif /* RE_TRACK_PATTERN_OFFSETS */
2143 #endif /* DEBUGGING */
2144         /*
2145            This means we convert either the first branch or the first Exact,
2146            depending on whether the thing following (in 'last') is a branch
2147            or not and whther first is the startbranch (ie is it a sub part of
2148            the alternation or is it the whole thing.)
2149            Assuming its a sub part we convert the EXACT otherwise we convert
2150            the whole branch sequence, including the first.
2151          */
2152         /* Find the node we are going to overwrite */
2153         if ( first != startbranch || OP( last ) == BRANCH ) {
2154             /* branch sub-chain */
2155             NEXT_OFF( first ) = (U16)(last - first);
2156 #ifdef RE_TRACK_PATTERN_OFFSETS
2157             DEBUG_r({
2158                 mjd_offset= Node_Offset((convert));
2159                 mjd_nodelen= Node_Length((convert));
2160             });
2161 #endif
2162             /* whole branch chain */
2163         }
2164 #ifdef RE_TRACK_PATTERN_OFFSETS
2165         else {
2166             DEBUG_r({
2167                 const  regnode *nop = NEXTOPER( convert );
2168                 mjd_offset= Node_Offset((nop));
2169                 mjd_nodelen= Node_Length((nop));
2170             });
2171         }
2172         DEBUG_OPTIMISE_r(
2173             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2174                 (int)depth * 2 + 2, "",
2175                 (UV)mjd_offset, (UV)mjd_nodelen)
2176         );
2177 #endif
2178         /* But first we check to see if there is a common prefix we can
2179            split out as an EXACT and put in front of the TRIE node.  */
2180         trie->startstate= 1;
2181         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2182             U32 state;
2183             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2184                 U32 ofs = 0;
2185                 I32 idx = -1;
2186                 U32 count = 0;
2187                 const U32 base = trie->states[ state ].trans.base;
2188
2189                 if ( trie->states[state].wordnum )
2190                         count = 1;
2191
2192                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2193                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2194                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2195                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2196                     {
2197                         if ( ++count > 1 ) {
2198                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2199                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2200                             if ( state == 1 ) break;
2201                             if ( count == 2 ) {
2202                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2203                                 DEBUG_OPTIMISE_r(
2204                                     PerlIO_printf(Perl_debug_log,
2205                                         "%*sNew Start State=%"UVuf" Class: [",
2206                                         (int)depth * 2 + 2, "",
2207                                         (UV)state));
2208                                 if (idx >= 0) {
2209                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2210                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2211
2212                                     TRIE_BITMAP_SET(trie,*ch);
2213                                     if ( folder )
2214                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2215                                     DEBUG_OPTIMISE_r(
2216                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2217                                     );
2218                                 }
2219                             }
2220                             TRIE_BITMAP_SET(trie,*ch);
2221                             if ( folder )
2222                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2223                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2224                         }
2225                         idx = ofs;
2226                     }
2227                 }
2228                 if ( count == 1 ) {
2229                     SV **tmp = av_fetch( revcharmap, idx, 0);
2230                     STRLEN len;
2231                     char *ch = SvPV( *tmp, len );
2232                     DEBUG_OPTIMISE_r({
2233                         SV *sv=sv_newmortal();
2234                         PerlIO_printf( Perl_debug_log,
2235                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2236                             (int)depth * 2 + 2, "",
2237                             (UV)state, (UV)idx,
2238                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2239                                 PL_colors[0], PL_colors[1],
2240                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2241                                 PERL_PV_ESCAPE_FIRSTCHAR
2242                             )
2243                         );
2244                     });
2245                     if ( state==1 ) {
2246                         OP( convert ) = nodetype;
2247                         str=STRING(convert);
2248                         STR_LEN(convert)=0;
2249                     }
2250                     STR_LEN(convert) += len;
2251                     while (len--)
2252                         *str++ = *ch++;
2253                 } else {
2254 #ifdef DEBUGGING
2255                     if (state>1)
2256                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2257 #endif
2258                     break;
2259                 }
2260             }
2261             trie->prefixlen = (state-1);
2262             if (str) {
2263                 regnode *n = convert+NODE_SZ_STR(convert);
2264                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2265                 trie->startstate = state;
2266                 trie->minlen -= (state - 1);
2267                 trie->maxlen -= (state - 1);
2268 #ifdef DEBUGGING
2269                /* At least the UNICOS C compiler choked on this
2270                 * being argument to DEBUG_r(), so let's just have
2271                 * it right here. */
2272                if (
2273 #ifdef PERL_EXT_RE_BUILD
2274                    1
2275 #else
2276                    DEBUG_r_TEST
2277 #endif
2278                    ) {
2279                    regnode *fix = convert;
2280                    U32 word = trie->wordcount;
2281                    mjd_nodelen++;
2282                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2283                    while( ++fix < n ) {
2284                        Set_Node_Offset_Length(fix, 0, 0);
2285                    }
2286                    while (word--) {
2287                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2288                        if (tmp) {
2289                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2290                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2291                            else
2292                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2293                        }
2294                    }
2295                }
2296 #endif
2297                 if (trie->maxlen) {
2298                     convert = n;
2299                 } else {
2300                     NEXT_OFF(convert) = (U16)(tail - convert);
2301                     DEBUG_r(optimize= n);
2302                 }
2303             }
2304         }
2305         if (!jumper)
2306             jumper = last;
2307         if ( trie->maxlen ) {
2308             NEXT_OFF( convert ) = (U16)(tail - convert);
2309             ARG_SET( convert, data_slot );
2310             /* Store the offset to the first unabsorbed branch in
2311                jump[0], which is otherwise unused by the jump logic.
2312                We use this when dumping a trie and during optimisation. */
2313             if (trie->jump)
2314                 trie->jump[0] = (U16)(nextbranch - convert);
2315
2316             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2317              *   and there is a bitmap
2318              *   and the first "jump target" node we found leaves enough room
2319              * then convert the TRIE node into a TRIEC node, with the bitmap
2320              * embedded inline in the opcode - this is hypothetically faster.
2321              */
2322             if ( !trie->states[trie->startstate].wordnum
2323                  && trie->bitmap
2324                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2325             {
2326                 OP( convert ) = TRIEC;
2327                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2328                 PerlMemShared_free(trie->bitmap);
2329                 trie->bitmap= NULL;
2330             } else
2331                 OP( convert ) = TRIE;
2332
2333             /* store the type in the flags */
2334             convert->flags = nodetype;
2335             DEBUG_r({
2336             optimize = convert
2337                       + NODE_STEP_REGNODE
2338                       + regarglen[ OP( convert ) ];
2339             });
2340             /* XXX We really should free up the resource in trie now,
2341                    as we won't use them - (which resources?) dmq */
2342         }
2343         /* needed for dumping*/
2344         DEBUG_r(if (optimize) {
2345             regnode *opt = convert;
2346
2347             while ( ++opt < optimize) {
2348                 Set_Node_Offset_Length(opt,0,0);
2349             }
2350             /*
2351                 Try to clean up some of the debris left after the
2352                 optimisation.
2353              */
2354             while( optimize < jumper ) {
2355                 mjd_nodelen += Node_Length((optimize));
2356                 OP( optimize ) = OPTIMIZED;
2357                 Set_Node_Offset_Length(optimize,0,0);
2358                 optimize++;
2359             }
2360             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2361         });
2362     } /* end node insert */
2363
2364     /*  Finish populating the prev field of the wordinfo array.  Walk back
2365      *  from each accept state until we find another accept state, and if
2366      *  so, point the first word's .prev field at the second word. If the
2367      *  second already has a .prev field set, stop now. This will be the
2368      *  case either if we've already processed that word's accept state,
2369      *  or that state had multiple words, and the overspill words were
2370      *  already linked up earlier.
2371      */
2372     {
2373         U16 word;
2374         U32 state;
2375         U16 prev;
2376
2377         for (word=1; word <= trie->wordcount; word++) {
2378             prev = 0;
2379             if (trie->wordinfo[word].prev)
2380                 continue;
2381             state = trie->wordinfo[word].accept;
2382             while (state) {
2383                 state = prev_states[state];
2384                 if (!state)
2385                     break;
2386                 prev = trie->states[state].wordnum;
2387                 if (prev)
2388                     break;
2389             }
2390             trie->wordinfo[word].prev = prev;
2391         }
2392         Safefree(prev_states);
2393     }
2394
2395
2396     /* and now dump out the compressed format */
2397     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2398
2399     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2400 #ifdef DEBUGGING
2401     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2402     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2403 #else
2404     SvREFCNT_dec(revcharmap);
2405 #endif
2406     return trie->jump
2407            ? MADE_JUMP_TRIE
2408            : trie->startstate>1
2409              ? MADE_EXACT_TRIE
2410              : MADE_TRIE;
2411 }
2412
2413 STATIC void
2414 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2415 {
2416 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2417
2418    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2419    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2420    ISBN 0-201-10088-6
2421
2422    We find the fail state for each state in the trie, this state is the longest proper
2423    suffix of the current state's 'word' that is also a proper prefix of another word in our
2424    trie. State 1 represents the word '' and is thus the default fail state. This allows
2425    the DFA not to have to restart after its tried and failed a word at a given point, it
2426    simply continues as though it had been matching the other word in the first place.
2427    Consider
2428       'abcdgu'=~/abcdefg|cdgu/
2429    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2430    fail, which would bring us to the state representing 'd' in the second word where we would
2431    try 'g' and succeed, proceeding to match 'cdgu'.
2432  */
2433  /* add a fail transition */
2434     const U32 trie_offset = ARG(source);
2435     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2436     U32 *q;
2437     const U32 ucharcount = trie->uniquecharcount;
2438     const U32 numstates = trie->statecount;
2439     const U32 ubound = trie->lasttrans + ucharcount;
2440     U32 q_read = 0;
2441     U32 q_write = 0;
2442     U32 charid;
2443     U32 base = trie->states[ 1 ].trans.base;
2444     U32 *fail;
2445     reg_ac_data *aho;
2446     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2447     GET_RE_DEBUG_FLAGS_DECL;
2448
2449     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2450 #ifndef DEBUGGING
2451     PERL_UNUSED_ARG(depth);
2452 #endif
2453
2454
2455     ARG_SET( stclass, data_slot );
2456     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2457     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2458     aho->trie=trie_offset;
2459     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2460     Copy( trie->states, aho->states, numstates, reg_trie_state );
2461     Newxz( q, numstates, U32);
2462     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2463     aho->refcount = 1;
2464     fail = aho->fail;
2465     /* initialize fail[0..1] to be 1 so that we always have
2466        a valid final fail state */
2467     fail[ 0 ] = fail[ 1 ] = 1;
2468
2469     for ( charid = 0; charid < ucharcount ; charid++ ) {
2470         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2471         if ( newstate ) {
2472             q[ q_write ] = newstate;
2473             /* set to point at the root */
2474             fail[ q[ q_write++ ] ]=1;
2475         }
2476     }
2477     while ( q_read < q_write) {
2478         const U32 cur = q[ q_read++ % numstates ];
2479         base = trie->states[ cur ].trans.base;
2480
2481         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2482             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2483             if (ch_state) {
2484                 U32 fail_state = cur;
2485                 U32 fail_base;
2486                 do {
2487                     fail_state = fail[ fail_state ];
2488                     fail_base = aho->states[ fail_state ].trans.base;
2489                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2490
2491                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2492                 fail[ ch_state ] = fail_state;
2493                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2494                 {
2495                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2496                 }
2497                 q[ q_write++ % numstates] = ch_state;
2498             }
2499         }
2500     }
2501     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2502        when we fail in state 1, this allows us to use the
2503        charclass scan to find a valid start char. This is based on the principle
2504        that theres a good chance the string being searched contains lots of stuff
2505        that cant be a start char.
2506      */
2507     fail[ 0 ] = fail[ 1 ] = 0;
2508     DEBUG_TRIE_COMPILE_r({
2509         PerlIO_printf(Perl_debug_log,
2510                       "%*sStclass Failtable (%"UVuf" states): 0",
2511                       (int)(depth * 2), "", (UV)numstates
2512         );
2513         for( q_read=1; q_read<numstates; q_read++ ) {
2514             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2515         }
2516         PerlIO_printf(Perl_debug_log, "\n");
2517     });
2518     Safefree(q);
2519     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2520 }
2521
2522
2523 /*
2524  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2525  * These need to be revisited when a newer toolchain becomes available.
2526  */
2527 #if defined(__sparc64__) && defined(__GNUC__)
2528 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2529 #       undef  SPARC64_GCC_WORKAROUND
2530 #       define SPARC64_GCC_WORKAROUND 1
2531 #   endif
2532 #endif
2533
2534 #define DEBUG_PEEP(str,scan,depth) \
2535     DEBUG_OPTIMISE_r({if (scan){ \
2536        SV * const mysv=sv_newmortal(); \
2537        regnode *Next = regnext(scan); \
2538        regprop(RExC_rx, mysv, scan); \
2539        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2540        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2541        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2542    }});
2543
2544
2545 /* The below joins as many adjacent EXACTish nodes as possible into a single
2546  * one, and looks for problematic sequences of characters whose folds vs.
2547  * non-folds have sufficiently different lengths, that the optimizer would be
2548  * fooled into rejecting legitimate matches of them, and the trie construction
2549  * code can't cope with them.  The joining is only done if:
2550  * 1) there is room in the current conglomerated node to entirely contain the
2551  *    next one.
2552  * 2) they are the exact same node type
2553  *
2554  * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2555  * these get optimized out
2556  *
2557  * If there are problematic code sequences, *min_subtract is set to the delta
2558  * that the minimum size of the node can be less than its actual size.  And,
2559  * the node type of the result is changed to reflect that it contains these
2560  * sequences.
2561  *
2562  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2563  * and contains LATIN SMALL LETTER SHARP S
2564  *
2565  * This is as good a place as any to discuss the design of handling these
2566  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2567  * are three code points in Unicode whose folded lengths differ so much from
2568  * the un-folded lengths that it causes problems for the optimizer and trie
2569  * construction.  Why only these are problematic, and not others where lengths
2570  * also differ is something I (khw) do not understand.  New versions of Unicode
2571  * might add more such code points.  Hopefully the logic in fold_grind.t that
2572  * figures out what to test (in part by verifying that each size-combination
2573  * gets tested) will catch any that do come along, so they can be added to the
2574  * special handling below.  The chances of new ones are actually rather small,
2575  * as most, if not all, of the world's scripts that have casefolding have
2576  * already been encoded by Unicode.  Also, a number of Unicode's decisions were
2577  * made to allow compatibility with pre-existing standards, and almost all of
2578  * those have already been dealt with.  These would otherwise be the most
2579  * likely candidates for generating further tricky sequences.  In other words,
2580  * Unicode by itself is unlikely to add new ones unless it is for compatibility
2581  * with pre-existing standards, and there aren't many of those left.
2582  *
2583  * The previous designs for dealing with these involved assigning a special
2584  * node for them.  This approach doesn't work, as evidenced by this example:
2585  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2586  * Both these fold to "sss", but if the pattern is parsed to create a node of
2587  * that would match just the \xDF, it won't be able to handle the case where a
2588  * successful match would have to cross the node's boundary.  The new approach
2589  * that hopefully generally solves the problem generates an EXACTFU_SS node
2590  * that is "sss".
2591  *
2592  * There are a number of components to the approach (a lot of work for just
2593  * three code points!):
2594  * 1)   This routine examines each EXACTFish node that could contain the
2595  *      problematic sequences.  It returns in *min_subtract how much to
2596  *      subtract from the the actual length of the string to get a real minimum
2597  *      for one that could match it.  This number is usually 0 except for the
2598  *      problematic sequences.  This delta is used by the caller to adjust the
2599  *      min length of the match, and the delta between min and max, so that the
2600  *      optimizer doesn't reject these possibilities based on size constraints.
2601  * 2)   These sequences are not currently correctly handled by the trie code
2602  *      either, so it changes the joined node type to ops that are not handled
2603  *      by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2604  * 3)   This is sufficient for the two Greek sequences (described below), but
2605  *      the one involving the Sharp s (\xDF) needs more.  The node type
2606  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2607  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2608  *      case where there is a possible fold length change.  That means that a
2609  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2610  *      itself with length changes, and so can be processed faster.  regexec.c
2611  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2612  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2613  *      However, probably mostly for historical reasons, the pre-folding isn't
2614  *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2615  *      nodes, as what they fold to isn't known until runtime.)  The fold
2616  *      possibilities for the non-UTF8 patterns are quite simple, except for
2617  *      the sharp s.  All the ones that don't involve a UTF-8 target string
2618  *      are members of a fold-pair, and arrays are set up for all of them
2619  *      that quickly find the other member of the pair.  It might actually
2620  *      be faster to pre-fold these, but it isn't currently done, except for
2621  *      the sharp s.  Code elsewhere in this file makes sure that it gets
2622  *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
2623  *      issues described in the next item.
2624  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2625  *      'ss' or not is not knowable at compile time.  It will match iff the
2626  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2627  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2628  *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
2629  *      described in item 3).  An assumption that the optimizer part of
2630  *      regexec.c (probably unwittingly) makes is that a character in the
2631  *      pattern corresponds to at most a single character in the target string.
2632  *      (And I do mean character, and not byte here, unlike other parts of the
2633  *      documentation that have never been updated to account for multibyte
2634  *      Unicode.)  This assumption is wrong only in this case, as all other
2635  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2636  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2637  *      reluctant to try to change this assumption, so instead the code punts.
2638  *      This routine examines EXACTF nodes for the sharp s, and returns a
2639  *      boolean indicating whether or not the node is an EXACTF node that
2640  *      contains a sharp s.  When it is true, the caller sets a flag that later
2641  *      causes the optimizer in this file to not set values for the floating
2642  *      and fixed string lengths, and thus avoids the optimizer code in
2643  *      regexec.c that makes the invalid assumption.  Thus, there is no
2644  *      optimization based on string lengths for EXACTF nodes that contain the
2645  *      sharp s.  This only happens for /id rules (which means the pattern
2646  *      isn't in UTF-8).
2647  */
2648
2649 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2650     if (PL_regkind[OP(scan)] == EXACT) \
2651         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2652
2653 STATIC U32
2654 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2655     /* Merge several consecutive EXACTish nodes into one. */
2656     regnode *n = regnext(scan);
2657     U32 stringok = 1;
2658     regnode *next = scan + NODE_SZ_STR(scan);
2659     U32 merged = 0;
2660     U32 stopnow = 0;
2661 #ifdef DEBUGGING
2662     regnode *stop = scan;
2663     GET_RE_DEBUG_FLAGS_DECL;
2664 #else
2665     PERL_UNUSED_ARG(depth);
2666 #endif
2667
2668     PERL_ARGS_ASSERT_JOIN_EXACT;
2669 #ifndef EXPERIMENTAL_INPLACESCAN
2670     PERL_UNUSED_ARG(flags);
2671     PERL_UNUSED_ARG(val);
2672 #endif
2673     DEBUG_PEEP("join",scan,depth);
2674
2675     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2676      * EXACT ones that are mergeable to the current one. */
2677     while (n
2678            && (PL_regkind[OP(n)] == NOTHING
2679                || (stringok && OP(n) == OP(scan)))
2680            && NEXT_OFF(n)
2681            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2682     {
2683
2684         if (OP(n) == TAIL || n > next)
2685             stringok = 0;
2686         if (PL_regkind[OP(n)] == NOTHING) {
2687             DEBUG_PEEP("skip:",n,depth);
2688             NEXT_OFF(scan) += NEXT_OFF(n);
2689             next = n + NODE_STEP_REGNODE;
2690 #ifdef DEBUGGING
2691             if (stringok)
2692                 stop = n;
2693 #endif
2694             n = regnext(n);
2695         }
2696         else if (stringok) {
2697             const unsigned int oldl = STR_LEN(scan);
2698             regnode * const nnext = regnext(n);
2699
2700             if (oldl + STR_LEN(n) > U8_MAX)
2701                 break;
2702
2703             DEBUG_PEEP("merg",n,depth);
2704             merged++;
2705
2706             NEXT_OFF(scan) += NEXT_OFF(n);
2707             STR_LEN(scan) += STR_LEN(n);
2708             next = n + NODE_SZ_STR(n);
2709             /* Now we can overwrite *n : */
2710             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2711 #ifdef DEBUGGING
2712             stop = next - 1;
2713 #endif
2714             n = nnext;
2715             if (stopnow) break;
2716         }
2717
2718 #ifdef EXPERIMENTAL_INPLACESCAN
2719         if (flags && !NEXT_OFF(n)) {
2720             DEBUG_PEEP("atch", val, depth);
2721             if (reg_off_by_arg[OP(n)]) {
2722                 ARG_SET(n, val - n);
2723             }
2724             else {
2725                 NEXT_OFF(n) = val - n;
2726             }
2727             stopnow = 1;
2728         }
2729 #endif
2730     }
2731
2732     *min_subtract = 0;
2733     *has_exactf_sharp_s = FALSE;
2734
2735     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2736      * can now analyze for sequences of problematic code points.  (Prior to
2737      * this final joining, sequences could have been split over boundaries, and
2738      * hence missed).  The sequences only happen in folding, hence for any
2739      * non-EXACT EXACTish node */
2740     if (OP(scan) != EXACT) {
2741         U8 *s;
2742         U8 * s0 = (U8*) STRING(scan);
2743         U8 * const s_end = s0 + STR_LEN(scan);
2744
2745         /* The below is perhaps overboard, but this allows us to save a test
2746          * each time through the loop at the expense of a mask.  This is
2747          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2748          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2749          * This uses an exclusive 'or' to find that bit and then inverts it to
2750          * form a mask, with just a single 0, in the bit position where 'S' and
2751          * 's' differ. */
2752         const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2753         const U8 s_masked = 's' & S_or_s_mask;
2754
2755         /* One pass is made over the node's string looking for all the
2756          * possibilities.  to avoid some tests in the loop, there are two main
2757          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2758          * non-UTF-8 */
2759         if (UTF) {
2760
2761             /* There are two problematic Greek code points in Unicode
2762              * casefolding
2763              *
2764              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2765              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2766              *
2767              * which casefold to
2768              *
2769              * Unicode                      UTF-8
2770              *
2771              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2772              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2773              *
2774              * This means that in case-insensitive matching (or "loose
2775              * matching", as Unicode calls it), an EXACTF of length six (the
2776              * UTF-8 encoded byte length of the above casefolded versions) can
2777              * match a target string of length two (the byte length of UTF-8
2778              * encoded U+0390 or U+03B0).  This would rather mess up the
2779              * minimum length computation.  (there are other code points that
2780              * also fold to these two sequences, but the delta is smaller)
2781              *
2782              * If these sequences are found, the minimum length is decreased by
2783              * four (six minus two).
2784              *
2785              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2786              * LETTER SHARP S.  We decrease the min length by 1 for each
2787              * occurrence of 'ss' found */
2788
2789 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2790 #           define U390_first_byte 0xb4
2791             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2792 #           define U3B0_first_byte 0xb5
2793             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2794 #else
2795 #           define U390_first_byte 0xce
2796             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2797 #           define U3B0_first_byte 0xcf
2798             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2799 #endif
2800             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2801                                                  yields a net of 0 */
2802             /* Examine the string for one of the problematic sequences */
2803             for (s = s0;
2804                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2805                                  * sequence we are looking for is 2 */
2806                  s += UTF8SKIP(s))
2807             {
2808
2809                 /* Look for the first byte in each problematic sequence */
2810                 switch (*s) {
2811                     /* We don't have to worry about other things that fold to
2812                      * 's' (such as the long s, U+017F), as all above-latin1
2813                      * code points have been pre-folded */
2814                     case 's':
2815                     case 'S':
2816
2817                         /* Current character is an 's' or 'S'.  If next one is
2818                          * as well, we have the dreaded sequence */
2819                         if (((*(s+1) & S_or_s_mask) == s_masked)
2820                             /* These two node types don't have special handling
2821                              * for 'ss' */
2822                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2823                         {
2824                             *min_subtract += 1;
2825                             OP(scan) = EXACTFU_SS;
2826                             s++;    /* No need to look at this character again */
2827                         }
2828                         break;
2829
2830                     case U390_first_byte:
2831                         if (s_end - s >= len
2832
2833                             /* The 1's are because are skipping comparing the
2834                              * first byte */
2835                             && memEQ(s + 1, U390_tail, len - 1))
2836                         {
2837                             goto greek_sequence;
2838                         }
2839                         break;
2840
2841                     case U3B0_first_byte:
2842                         if (! (s_end - s >= len
2843                                && memEQ(s + 1, U3B0_tail, len - 1)))
2844                         {
2845                             break;
2846                         }
2847                       greek_sequence:
2848                         *min_subtract += 4;
2849
2850                         /* This can't currently be handled by trie's, so change
2851                          * the node type to indicate this.  If EXACTFA and
2852                          * EXACTFL were ever to be handled by trie's, this
2853                          * would have to be changed.  If this node has already
2854                          * been changed to EXACTFU_SS in this loop, leave it as
2855                          * is.  (I (khw) think it doesn't matter in regexec.c
2856                          * for UTF patterns, but no need to change it */
2857                         if (OP(scan) == EXACTFU) {
2858                             OP(scan) = EXACTFU_TRICKYFOLD;
2859                         }
2860                         s += 6; /* We already know what this sequence is.  Skip
2861                                    the rest of it */
2862                         break;
2863                 }
2864             }
2865         }
2866         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2867
2868             /* Here, the pattern is not UTF-8.  We need to look only for the
2869              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2870              * in the final position.  Otherwise we can stop looking 1 byte
2871              * earlier because have to find both the first and second 's' */
2872             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2873
2874             for (s = s0; s < upper; s++) {
2875                 switch (*s) {
2876                     case 'S':
2877                     case 's':
2878                         if (s_end - s > 1
2879                             && ((*(s+1) & S_or_s_mask) == s_masked))
2880                         {
2881                             *min_subtract += 1;
2882
2883                             /* EXACTF nodes need to know that the minimum
2884                              * length changed so that a sharp s in the string
2885                              * can match this ss in the pattern, but they
2886                              * remain EXACTF nodes, as they are not trie'able,
2887                              * so don't have to invent a new node type to
2888                              * exclude them from the trie code */
2889                             if (OP(scan) != EXACTF) {
2890                                 OP(scan) = EXACTFU_SS;
2891                             }
2892                             s++;
2893                         }
2894                         break;
2895                     case LATIN_SMALL_LETTER_SHARP_S:
2896                         if (OP(scan) == EXACTF) {
2897                             *has_exactf_sharp_s = TRUE;
2898                         }
2899                         break;
2900                 }
2901             }
2902         }
2903     }
2904
2905 #ifdef DEBUGGING
2906     /* Allow dumping but overwriting the collection of skipped
2907      * ops and/or strings with fake optimized ops */
2908     n = scan + NODE_SZ_STR(scan);
2909     while (n <= stop) {
2910         OP(n) = OPTIMIZED;
2911         FLAGS(n) = 0;
2912         NEXT_OFF(n) = 0;
2913         n++;
2914     }
2915 #endif
2916     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2917     return stopnow;
2918 }
2919
2920 /* REx optimizer.  Converts nodes into quicker variants "in place".
2921    Finds fixed substrings.  */
2922
2923 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2924    to the position after last scanned or to NULL. */
2925
2926 #define INIT_AND_WITHP \
2927     assert(!and_withp); \
2928     Newx(and_withp,1,struct regnode_charclass_class); \
2929     SAVEFREEPV(and_withp)
2930
2931 /* this is a chain of data about sub patterns we are processing that
2932    need to be handled separately/specially in study_chunk. Its so
2933    we can simulate recursion without losing state.  */
2934 struct scan_frame;
2935 typedef struct scan_frame {
2936     regnode *last;  /* last node to process in this frame */
2937     regnode *next;  /* next node to process when last is reached */
2938     struct scan_frame *prev; /*previous frame*/
2939     I32 stop; /* what stopparen do we use */
2940 } scan_frame;
2941
2942
2943 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2944
2945 #define CASE_SYNST_FNC(nAmE)                                       \
2946 case nAmE:                                                         \
2947     if (flags & SCF_DO_STCLASS_AND) {                              \
2948             for (value = 0; value < 256; value++)                  \
2949                 if (!is_ ## nAmE ## _cp(value))                       \
2950                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2951     }                                                              \
2952     else {                                                         \
2953             for (value = 0; value < 256; value++)                  \
2954                 if (is_ ## nAmE ## _cp(value))                        \
2955                     ANYOF_BITMAP_SET(data->start_class, value);    \
2956     }                                                              \
2957     break;                                                         \
2958 case N ## nAmE:                                                    \
2959     if (flags & SCF_DO_STCLASS_AND) {                              \
2960             for (value = 0; value < 256; value++)                   \
2961                 if (is_ ## nAmE ## _cp(value))                         \
2962                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2963     }                                                               \
2964     else {                                                          \
2965             for (value = 0; value < 256; value++)                   \
2966                 if (!is_ ## nAmE ## _cp(value))                        \
2967                     ANYOF_BITMAP_SET(data->start_class, value);     \
2968     }                                                               \
2969     break
2970
2971
2972
2973 STATIC I32
2974 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2975                         I32 *minlenp, I32 *deltap,
2976                         regnode *last,
2977                         scan_data_t *data,
2978                         I32 stopparen,
2979                         U8* recursed,
2980                         struct regnode_charclass_class *and_withp,
2981                         U32 flags, U32 depth)
2982                         /* scanp: Start here (read-write). */
2983                         /* deltap: Write maxlen-minlen here. */
2984                         /* last: Stop before this one. */
2985                         /* data: string data about the pattern */
2986                         /* stopparen: treat close N as END */
2987                         /* recursed: which subroutines have we recursed into */
2988                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2989 {
2990     dVAR;
2991     I32 min = 0, pars = 0, code;
2992     regnode *scan = *scanp, *next;
2993     I32 delta = 0;
2994     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2995     int is_inf_internal = 0;            /* The studied chunk is infinite */
2996     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2997     scan_data_t data_fake;
2998     SV *re_trie_maxbuff = NULL;
2999     regnode *first_non_open = scan;
3000     I32 stopmin = I32_MAX;
3001     scan_frame *frame = NULL;
3002     GET_RE_DEBUG_FLAGS_DECL;
3003
3004     PERL_ARGS_ASSERT_STUDY_CHUNK;
3005
3006 #ifdef DEBUGGING
3007     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3008 #endif
3009
3010     if ( depth == 0 ) {
3011         while (first_non_open && OP(first_non_open) == OPEN)
3012             first_non_open=regnext(first_non_open);
3013     }
3014
3015
3016   fake_study_recurse:
3017     while ( scan && OP(scan) != END && scan < last ){
3018         UV min_subtract = 0;    /* How much to subtract from the minimum node
3019                                    length to get a real minimum (because the
3020                                    folded version may be shorter) */
3021         bool has_exactf_sharp_s = FALSE;
3022         /* Peephole optimizer: */
3023         DEBUG_STUDYDATA("Peep:", data,depth);
3024         DEBUG_PEEP("Peep",scan,depth);
3025
3026         /* Its not clear to khw or hv why this is done here, and not in the
3027          * clauses that deal with EXACT nodes.  khw's guess is that it's
3028          * because of a previous design */
3029         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3030
3031         /* Follow the next-chain of the current node and optimize
3032            away all the NOTHINGs from it.  */
3033         if (OP(scan) != CURLYX) {
3034             const int max = (reg_off_by_arg[OP(scan)]
3035                        ? I32_MAX
3036                        /* I32 may be smaller than U16 on CRAYs! */
3037                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3038             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3039             int noff;
3040             regnode *n = scan;
3041
3042             /* Skip NOTHING and LONGJMP. */
3043             while ((n = regnext(n))
3044                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3045                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3046                    && off + noff < max)
3047                 off += noff;
3048             if (reg_off_by_arg[OP(scan)])
3049                 ARG(scan) = off;
3050             else
3051                 NEXT_OFF(scan) = off;
3052         }
3053
3054
3055
3056         /* The principal pseudo-switch.  Cannot be a switch, since we
3057            look into several different things.  */
3058         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3059                    || OP(scan) == IFTHEN) {
3060             next = regnext(scan);
3061             code = OP(scan);
3062             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3063
3064             if (OP(next) == code || code == IFTHEN) {
3065                 /* NOTE - There is similar code to this block below for handling
3066                    TRIE nodes on a re-study.  If you change stuff here check there
3067                    too. */
3068                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3069                 struct regnode_charclass_class accum;
3070                 regnode * const startbranch=scan;
3071
3072                 if (flags & SCF_DO_SUBSTR)
3073                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3074                 if (flags & SCF_DO_STCLASS)
3075                     cl_init_zero(pRExC_state, &accum);
3076
3077                 while (OP(scan) == code) {
3078                     I32 deltanext, minnext, f = 0, fake;
3079                     struct regnode_charclass_class this_class;
3080
3081                     num++;
3082                     data_fake.flags = 0;
3083                     if (data) {
3084                         data_fake.whilem_c = data->whilem_c;
3085                         data_fake.last_closep = data->last_closep;
3086                     }
3087                     else
3088                         data_fake.last_closep = &fake;
3089
3090                     data_fake.pos_delta = delta;
3091                     next = regnext(scan);
3092                     scan = NEXTOPER(scan);
3093                     if (code != BRANCH)
3094                         scan = NEXTOPER(scan);
3095                     if (flags & SCF_DO_STCLASS) {
3096                         cl_init(pRExC_state, &this_class);
3097                         data_fake.start_class = &this_class;
3098                         f = SCF_DO_STCLASS_AND;
3099                     }
3100                     if (flags & SCF_WHILEM_VISITED_POS)
3101                         f |= SCF_WHILEM_VISITED_POS;
3102
3103                     /* we suppose the run is continuous, last=next...*/
3104                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3105                                           next, &data_fake,
3106                                           stopparen, recursed, NULL, f,depth+1);
3107                     if (min1 > minnext)
3108                         min1 = minnext;
3109                     if (max1 < minnext + deltanext)
3110                         max1 = minnext + deltanext;
3111                     if (deltanext == I32_MAX)
3112                         is_inf = is_inf_internal = 1;
3113                     scan = next;
3114                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3115                         pars++;
3116                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3117                         if ( stopmin > minnext)
3118                             stopmin = min + min1;
3119                         flags &= ~SCF_DO_SUBSTR;
3120                         if (data)
3121                             data->flags |= SCF_SEEN_ACCEPT;
3122                     }
3123                     if (data) {
3124                         if (data_fake.flags & SF_HAS_EVAL)
3125                             data->flags |= SF_HAS_EVAL;
3126                         data->whilem_c = data_fake.whilem_c;
3127                     }
3128                     if (flags & SCF_DO_STCLASS)
3129                         cl_or(pRExC_state, &accum, &this_class);
3130                 }
3131                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3132                     min1 = 0;
3133                 if (flags & SCF_DO_SUBSTR) {
3134                     data->pos_min += min1;
3135                     data->pos_delta += max1 - min1;
3136                     if (max1 != min1 || is_inf)
3137                         data->longest = &(data->longest_float);
3138                 }
3139                 min += min1;
3140                 delta += max1 - min1;
3141                 if (flags & SCF_DO_STCLASS_OR) {
3142                     cl_or(pRExC_state, data->start_class, &accum);
3143                     if (min1) {
3144                         cl_and(data->start_class, and_withp);
3145                         flags &= ~SCF_DO_STCLASS;
3146                     }
3147                 }
3148                 else if (flags & SCF_DO_STCLASS_AND) {
3149                     if (min1) {
3150                         cl_and(data->start_class, &accum);
3151                         flags &= ~SCF_DO_STCLASS;
3152                     }
3153                     else {
3154                         /* Switch to OR mode: cache the old value of
3155                          * data->start_class */
3156                         INIT_AND_WITHP;
3157                         StructCopy(data->start_class, and_withp,
3158                                    struct regnode_charclass_class);
3159                         flags &= ~SCF_DO_STCLASS_AND;
3160                         StructCopy(&accum, data->start_class,
3161                                    struct regnode_charclass_class);
3162                         flags |= SCF_DO_STCLASS_OR;
3163                         data->start_class->flags |= ANYOF_EOS;
3164                     }
3165                 }
3166
3167                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3168                 /* demq.
3169
3170                    Assuming this was/is a branch we are dealing with: 'scan' now
3171                    points at the item that follows the branch sequence, whatever
3172                    it is. We now start at the beginning of the sequence and look
3173                    for subsequences of
3174
3175                    BRANCH->EXACT=>x1
3176                    BRANCH->EXACT=>x2
3177                    tail
3178
3179                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3180
3181                    If we can find such a subsequence we need to turn the first
3182                    element into a trie and then add the subsequent branch exact
3183                    strings to the trie.
3184
3185                    We have two cases
3186
3187                      1. patterns where the whole set of branches can be converted.
3188
3189                      2. patterns where only a subset can be converted.
3190
3191                    In case 1 we can replace the whole set with a single regop
3192                    for the trie. In case 2 we need to keep the start and end
3193                    branches so
3194
3195                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3196                      becomes BRANCH TRIE; BRANCH X;
3197
3198                   There is an additional case, that being where there is a
3199                   common prefix, which gets split out into an EXACT like node
3200                   preceding the TRIE node.
3201
3202                   If x(1..n)==tail then we can do a simple trie, if not we make
3203                   a "jump" trie, such that when we match the appropriate word
3204                   we "jump" to the appropriate tail node. Essentially we turn
3205                   a nested if into a case structure of sorts.
3206
3207                 */
3208
3209                     int made=0;
3210                     if (!re_trie_maxbuff) {
3211                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3212                         if (!SvIOK(re_trie_maxbuff))
3213                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3214                     }
3215                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3216                         regnode *cur;
3217                         regnode *first = (regnode *)NULL;
3218                         regnode *last = (regnode *)NULL;
3219                         regnode *tail = scan;
3220                         U8 trietype = 0;
3221                         U32 count=0;
3222
3223 #ifdef DEBUGGING
3224                         SV * const mysv = sv_newmortal();       /* for dumping */
3225 #endif
3226                         /* var tail is used because there may be a TAIL
3227                            regop in the way. Ie, the exacts will point to the
3228                            thing following the TAIL, but the last branch will
3229                            point at the TAIL. So we advance tail. If we
3230                            have nested (?:) we may have to move through several
3231                            tails.
3232                          */
3233
3234                         while ( OP( tail ) == TAIL ) {
3235                             /* this is the TAIL generated by (?:) */
3236                             tail = regnext( tail );
3237                         }
3238
3239
3240                         DEBUG_OPTIMISE_r({
3241                             regprop(RExC_rx, mysv, tail );
3242                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3243                                 (int)depth * 2 + 2, "",
3244                                 "Looking for TRIE'able sequences. Tail node is: ",
3245                                 SvPV_nolen_const( mysv )
3246                             );
3247                         });
3248
3249                         /*
3250
3251                             Step through the branches
3252                                 cur represents each branch,
3253                                 noper is the first thing to be matched as part of that branch
3254                                 noper_next is the regnext() of that node.
3255
3256                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3257                             via a "jump trie" but we also support building with NOJUMPTRIE,
3258                             which restricts the trie logic to structures like /FOO|BAR/.
3259
3260                             If noper is a trieable nodetype then the branch is a possible optimization
3261                             target. If we are building under NOJUMPTRIE then we require that noper_next
3262                             is the same as scan (our current position in the regex program).
3263
3264                             Once we have two or more consecutive such branches we can create a
3265                             trie of the EXACT's contents and stitch it in place into the program.
3266
3267                             If the sequence represents all of the branches in the alternation we
3268                             replace the entire thing with a single TRIE node.
3269
3270                             Otherwise when it is a subsequence we need to stitch it in place and
3271                             replace only the relevant branches. This means the first branch has
3272                             to remain as it is used by the alternation logic, and its next pointer,
3273                             and needs to be repointed at the item on the branch chain following
3274                             the last branch we have optimized away.
3275
3276                             This could be either a BRANCH, in which case the subsequence is internal,
3277                             or it could be the item following the branch sequence in which case the
3278                             subsequence is at the end (which does not necessarily mean the first node
3279                             is the start of the alternation).
3280
3281                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3282
3283                                 optype          |  trietype
3284                                 ----------------+-----------
3285                                 NOTHING         | NOTHING
3286                                 EXACT           | EXACT
3287                                 EXACTFU         | EXACTFU
3288                                 EXACTFU_SS      | EXACTFU
3289                                 EXACTFU_TRICKYFOLD | EXACTFU
3290                                 EXACTFA         | 0
3291
3292
3293                         */
3294 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3295                        ( EXACT == (X) )   ? EXACT :        \
3296                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3297                        0 )
3298
3299                         /* dont use tail as the end marker for this traverse */
3300                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3301                             regnode * const noper = NEXTOPER( cur );
3302                             U8 noper_type = OP( noper );
3303                             U8 noper_trietype = TRIE_TYPE( noper_type );
3304 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3305                             regnode * const noper_next = regnext( noper );
3306 #endif
3307
3308                             DEBUG_OPTIMISE_r({
3309                                 regprop(RExC_rx, mysv, cur);
3310                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3311                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3312
3313                                 regprop(RExC_rx, mysv, noper);
3314                                 PerlIO_printf( Perl_debug_log, " -> %s",
3315                                     SvPV_nolen_const(mysv));
3316
3317                                 if ( noper_next ) {
3318                                   regprop(RExC_rx, mysv, noper_next );
3319                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3320                                     SvPV_nolen_const(mysv));
3321                                 }
3322                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3323                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3324                             });
3325
3326                             /* Is noper a trieable nodetype that can be merged with the
3327                              * current trie (if there is one)? */
3328                             if ( noper_trietype
3329                                   &&
3330                                   (
3331                                         /* XXX: Currently we cannot allow a NOTHING node to be the first element
3332                                          * of a TRIEABLE sequence, Otherwise we will overwrite the regop following
3333                                          * the NOTHING with the TRIE regop later on. This is because a NOTHING node
3334                                          * is only one regnode wide, and a TRIE is two regnodes. An example of a
3335                                          * problematic pattern is: "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/
3336                                          * At a later point of time we can somewhat workaround this by handling
3337                                          * NOTHING -> EXACT sequences as generated by /(?:)A|(?:)B/ type patterns,
3338                                          * as we can effectively ignore the NOTHING regop in that case.
3339                                          * This clause, which allows NOTHING to start a sequence is left commented
3340                                          * out as a reference.
3341                                          * - Yves
3342
3343                                            ( noper_trietype == NOTHING)
3344                                            || ( trietype == NOTHING )
3345                                         */
3346                                         ( noper_trietype == NOTHING && trietype )
3347                                         || ( trietype == noper_trietype )
3348                                   )
3349 #ifdef NOJUMPTRIE
3350                                   && noper_next == tail
3351 #endif
3352                                   && count < U16_MAX)
3353                             {
3354                                 /* Handle mergable triable node
3355                                  * Either we are the first node in a new trieable sequence,
3356                                  * in which case we do some bookkeeping, otherwise we update
3357                                  * the end pointer. */
3358                                 count++;
3359                                 if ( !first ) {
3360                                     first = cur;
3361                                     trietype = noper_trietype;
3362                                 } else {
3363                                     if ( trietype == NOTHING )
3364                                         trietype = noper_trietype;
3365                                     last = cur;
3366                                 }
3367                             } /* end handle mergable triable node */
3368                             else {
3369                                 /* handle unmergable node -
3370                                  * noper may either be a triable node which can not be tried
3371                                  * together with the current trie, or a non triable node */
3372                                 if ( last ) {
3373                                     /* If last is set and trietype is not NOTHING then we have found
3374                                      * at least two triable branch sequences in a row of a similar
3375                                      * trietype so we can turn them into a trie. If/when we
3376                                      * allow NOTHING to start a trie sequence this condition will be
3377                                      * required, and it isn't expensive so we leave it in for now. */
3378                                     if ( trietype != NOTHING )
3379                                         make_trie( pRExC_state,
3380                                                 startbranch, first, cur, tail, count,
3381                                                 trietype, depth+1 );
3382                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3383                                 }
3384                                 if ( noper_trietype
3385 #ifdef NOJUMPTRIE
3386                                      && noper_next == tail
3387 #endif
3388                                 ){
3389                                     /* noper is triable, so we can start a new trie sequence */
3390                                     count = 1;
3391                                     first = cur;
3392                                     trietype = noper_trietype;
3393                                 } else if (first) {
3394                                     /* if we already saw a first but the current node is not triable then we have
3395                                      * to reset the first information. */
3396                                     count = 0;
3397                                     first = NULL;
3398                                     trietype = 0;
3399                                 }
3400                             } /* end handle unmergable node */
3401                         } /* loop over branches */
3402                         DEBUG_OPTIMISE_r({
3403                             regprop(RExC_rx, mysv, cur);
3404                             PerlIO_printf( Perl_debug_log,
3405                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3406                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3407
3408                         });
3409                         if ( last && trietype != NOTHING ) {
3410                             /* the last branch of the sequence was part of a trie,
3411                              * so we have to construct it here outside of the loop
3412                              */
3413                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3414 #ifdef TRIE_STUDY_OPT
3415                             if ( ((made == MADE_EXACT_TRIE &&
3416                                  startbranch == first)
3417                                  || ( first_non_open == first )) &&
3418                                  depth==0 ) {
3419                                 flags |= SCF_TRIE_RESTUDY;
3420                                 if ( startbranch == first
3421                                      && scan == tail )
3422                                 {
3423                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3424                                 }
3425                             }
3426 #endif
3427                         } /* end if ( last) */
3428                     } /* TRIE_MAXBUF is non zero */
3429
3430                 } /* do trie */
3431
3432             }
3433             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3434                 scan = NEXTOPER(NEXTOPER(scan));
3435             } else                      /* single branch is optimized. */
3436                 scan = NEXTOPER(scan);
3437             continue;
3438         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3439             scan_frame *newframe = NULL;
3440             I32 paren;
3441             regnode *start;
3442             regnode *end;
3443
3444             if (OP(scan) != SUSPEND) {
3445             /* set the pointer */
3446                 if (OP(scan) == GOSUB) {
3447                     paren = ARG(scan);
3448                     RExC_recurse[ARG2L(scan)] = scan;
3449                     start = RExC_open_parens[paren-1];
3450                     end   = RExC_close_parens[paren-1];
3451                 } else {
3452                     paren = 0;
3453                     start = RExC_rxi->program + 1;
3454                     end   = RExC_opend;
3455                 }
3456                 if (!recursed) {
3457                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3458                     SAVEFREEPV(recursed);
3459                 }
3460                 if (!PAREN_TEST(recursed,paren+1)) {
3461                     PAREN_SET(recursed,paren+1);
3462                     Newx(newframe,1,scan_frame);
3463                 } else {
3464                     if (flags & SCF_DO_SUBSTR) {
3465                         SCAN_COMMIT(pRExC_state,data,minlenp);
3466                         data->longest = &(data->longest_float);
3467                     }
3468                     is_inf = is_inf_internal = 1;
3469                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3470                         cl_anything(pRExC_state, data->start_class);
3471                     flags &= ~SCF_DO_STCLASS;
3472                 }
3473             } else {
3474                 Newx(newframe,1,scan_frame);
3475                 paren = stopparen;
3476                 start = scan+2;
3477                 end = regnext(scan);
3478             }
3479             if (newframe) {
3480                 assert(start);
3481                 assert(end);
3482                 SAVEFREEPV(newframe);
3483                 newframe->next = regnext(scan);
3484                 newframe->last = last;
3485                 newframe->stop = stopparen;
3486                 newframe->prev = frame;
3487
3488                 frame = newframe;
3489                 scan =  start;
3490                 stopparen = paren;
3491                 last = end;
3492
3493                 continue;
3494             }
3495         }
3496         else if (OP(scan) == EXACT) {
3497             I32 l = STR_LEN(scan);
3498             UV uc;
3499             if (UTF) {
3500                 const U8 * const s = (U8*)STRING(scan);
3501                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3502                 l = utf8_length(s, s + l);
3503             } else {
3504                 uc = *((U8*)STRING(scan));
3505             }
3506             min += l;
3507             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3508                 /* The code below prefers earlier match for fixed
3509                    offset, later match for variable offset.  */
3510                 if (data->last_end == -1) { /* Update the start info. */
3511                     data->last_start_min = data->pos_min;
3512                     data->last_start_max = is_inf
3513                         ? I32_MAX : data->pos_min + data->pos_delta;
3514                 }
3515                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3516                 if (UTF)
3517                     SvUTF8_on(data->last_found);
3518                 {
3519                     SV * const sv = data->last_found;
3520                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3521                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3522                     if (mg && mg->mg_len >= 0)
3523                         mg->mg_len += utf8_length((U8*)STRING(scan),
3524                                                   (U8*)STRING(scan)+STR_LEN(scan));
3525                 }
3526                 data->last_end = data->pos_min + l;
3527                 data->pos_min += l; /* As in the first entry. */
3528                 data->flags &= ~SF_BEFORE_EOL;
3529             }
3530             if (flags & SCF_DO_STCLASS_AND) {
3531                 /* Check whether it is compatible with what we know already! */
3532                 int compat = 1;
3533
3534
3535                 /* If compatible, we or it in below.  It is compatible if is
3536                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3537                  * it's for a locale.  Even if there isn't unicode semantics
3538                  * here, at runtime there may be because of matching against a
3539                  * utf8 string, so accept a possible false positive for
3540                  * latin1-range folds */
3541                 if (uc >= 0x100 ||
3542                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3543                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3544                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3545                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3546                     )
3547                 {
3548                     compat = 0;
3549                 }
3550                 ANYOF_CLASS_ZERO(data->start_class);
3551                 ANYOF_BITMAP_ZERO(data->start_class);
3552                 if (compat)
3553                     ANYOF_BITMAP_SET(data->start_class, uc);
3554                 else if (uc >= 0x100) {
3555                     int i;
3556
3557                     /* Some Unicode code points fold to the Latin1 range; as
3558                      * XXX temporary code, instead of figuring out if this is
3559                      * one, just assume it is and set all the start class bits
3560                      * that could be some such above 255 code point's fold
3561                      * which will generate fals positives.  As the code
3562                      * elsewhere that does compute the fold settles down, it
3563                      * can be extracted out and re-used here */
3564                     for (i = 0; i < 256; i++){
3565                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3566                             ANYOF_BITMAP_SET(data->start_class, i);
3567                         }
3568                     }
3569                 }
3570                 data->start_class->flags &= ~ANYOF_EOS;
3571                 if (uc < 0x100)
3572                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3573             }
3574             else if (flags & SCF_DO_STCLASS_OR) {
3575                 /* false positive possible if the class is case-folded */
3576                 if (uc < 0x100)
3577                     ANYOF_BITMAP_SET(data->start_class, uc);
3578                 else
3579                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3580                 data->start_class->flags &= ~ANYOF_EOS;
3581                 cl_and(data->start_class, and_withp);
3582             }
3583             flags &= ~SCF_DO_STCLASS;
3584         }
3585         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3586             I32 l = STR_LEN(scan);
3587             UV uc = *((U8*)STRING(scan));
3588
3589             /* Search for fixed substrings supports EXACT only. */
3590             if (flags & SCF_DO_SUBSTR) {
3591                 assert(data);
3592                 SCAN_COMMIT(pRExC_state, data, minlenp);
3593             }
3594             if (UTF) {
3595                 const U8 * const s = (U8 *)STRING(scan);
3596                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3597                 l = utf8_length(s, s + l);
3598             }
3599             else if (has_exactf_sharp_s) {
3600                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3601             }
3602             min += l - min_subtract;
3603             if (min < 0) {
3604                 min = 0;
3605             }
3606             delta += min_subtract;
3607             if (flags & SCF_DO_SUBSTR) {
3608                 data->pos_min += l - min_subtract;
3609                 if (data->pos_min < 0) {
3610                     data->pos_min = 0;
3611                 }
3612                 data->pos_delta += min_subtract;
3613                 if (min_subtract) {
3614                     data->longest = &(data->longest_float);
3615                 }
3616             }
3617             if (flags & SCF_DO_STCLASS_AND) {
3618                 /* Check whether it is compatible with what we know already! */
3619                 int compat = 1;
3620                 if (uc >= 0x100 ||
3621                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3622                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3623                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3624                 {
3625                     compat = 0;
3626                 }
3627                 ANYOF_CLASS_ZERO(data->start_class);
3628                 ANYOF_BITMAP_ZERO(data->start_class);
3629                 if (compat) {
3630                     ANYOF_BITMAP_SET(data->start_class, uc);
3631                     data->start_class->flags &= ~ANYOF_EOS;
3632                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3633                     if (OP(scan) == EXACTFL) {
3634                         /* XXX This set is probably no longer necessary, and
3635                          * probably wrong as LOCALE now is on in the initial
3636                          * state */
3637                         data->start_class->flags |= ANYOF_LOCALE;
3638                     }
3639                     else {
3640
3641                         /* Also set the other member of the fold pair.  In case
3642                          * that unicode semantics is called for at runtime, use
3643                          * the full latin1 fold.  (Can't do this for locale,
3644                          * because not known until runtime) */
3645                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3646
3647                         /* All other (EXACTFL handled above) folds except under
3648                          * /iaa that include s, S, and sharp_s also may include
3649                          * the others */
3650                         if (OP(scan) != EXACTFA) {
3651                             if (uc == 's' || uc == 'S') {
3652                                 ANYOF_BITMAP_SET(data->start_class,
3653                                                  LATIN_SMALL_LETTER_SHARP_S);
3654                             }
3655                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3656                                 ANYOF_BITMAP_SET(data->start_class, 's');
3657                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3658                             }
3659                         }
3660                     }
3661                 }
3662                 else if (uc >= 0x100) {
3663                     int i;
3664                     for (i = 0; i < 256; i++){
3665                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3666                             ANYOF_BITMAP_SET(data->start_class, i);
3667                         }
3668                     }
3669                 }
3670             }
3671             else if (flags & SCF_DO_STCLASS_OR) {
3672                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3673                     /* false positive possible if the class is case-folded.
3674                        Assume that the locale settings are the same... */
3675                     if (uc < 0x100) {
3676                         ANYOF_BITMAP_SET(data->start_class, uc);
3677                         if (OP(scan) != EXACTFL) {
3678
3679                             /* And set the other member of the fold pair, but
3680                              * can't do that in locale because not known until
3681                              * run-time */
3682                             ANYOF_BITMAP_SET(data->start_class,
3683                                              PL_fold_latin1[uc]);
3684
3685                             /* All folds except under /iaa that include s, S,
3686                              * and sharp_s also may include the others */
3687                             if (OP(scan) != EXACTFA) {
3688                                 if (uc == 's' || uc == 'S') {
3689                                     ANYOF_BITMAP_SET(data->start_class,
3690                                                    LATIN_SMALL_LETTER_SHARP_S);
3691                                 }
3692                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3693                                     ANYOF_BITMAP_SET(data->start_class, 's');
3694                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3695                                 }
3696                             }
3697                         }
3698                     }
3699                     data->start_class->flags &= ~ANYOF_EOS;
3700                 }
3701                 cl_and(data->start_class, and_withp);
3702             }
3703             flags &= ~SCF_DO_STCLASS;
3704         }
3705         else if (REGNODE_VARIES(OP(scan))) {
3706             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3707             I32 f = flags, pos_before = 0;
3708             regnode * const oscan = scan;
3709             struct regnode_charclass_class this_class;
3710             struct regnode_charclass_class *oclass = NULL;
3711             I32 next_is_eval = 0;
3712
3713             switch (PL_regkind[OP(scan)]) {
3714             case WHILEM:                /* End of (?:...)* . */
3715                 scan = NEXTOPER(scan);
3716                 goto finish;
3717             case PLUS:
3718                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3719                     next = NEXTOPER(scan);
3720                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3721                         mincount = 1;
3722                         maxcount = REG_INFTY;
3723                         next = regnext(scan);
3724                         scan = NEXTOPER(scan);
3725                         goto do_curly;
3726                     }
3727                 }
3728                 if (flags & SCF_DO_SUBSTR)
3729                     data->pos_min++;
3730                 min++;
3731                 /* Fall through. */
3732             case STAR:
3733                 if (flags & SCF_DO_STCLASS) {
3734                     mincount = 0;
3735                     maxcount = REG_INFTY;
3736                     next = regnext(scan);
3737                     scan = NEXTOPER(scan);
3738                     goto do_curly;
3739                 }
3740                 is_inf = is_inf_internal = 1;
3741                 scan = regnext(scan);
3742                 if (flags & SCF_DO_SUBSTR) {
3743                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3744                     data->longest = &(data->longest_float);
3745                 }
3746                 goto optimize_curly_tail;
3747             case CURLY:
3748                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3749                     && (scan->flags == stopparen))
3750                 {
3751                     mincount = 1;
3752                     maxcount = 1;
3753                 } else {
3754                     mincount = ARG1(scan);
3755                     maxcount = ARG2(scan);
3756                 }
3757                 next = regnext(scan);
3758                 if (OP(scan) == CURLYX) {
3759                     I32 lp = (data ? *(data->last_closep) : 0);
3760                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3761                 }
3762                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3763                 next_is_eval = (OP(scan) == EVAL);
3764               do_curly:
3765                 if (flags & SCF_DO_SUBSTR) {
3766                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3767                     pos_before = data->pos_min;
3768                 }
3769                 if (data) {
3770                     fl = data->flags;
3771                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3772                     if (is_inf)
3773                         data->flags |= SF_IS_INF;
3774                 }
3775                 if (flags & SCF_DO_STCLASS) {
3776                     cl_init(pRExC_state, &this_class);
3777                     oclass = data->start_class;
3778                     data->start_class = &this_class;
3779                     f |= SCF_DO_STCLASS_AND;
3780                     f &= ~SCF_DO_STCLASS_OR;
3781                 }
3782                 /* Exclude from super-linear cache processing any {n,m}
3783                    regops for which the combination of input pos and regex
3784                    pos is not enough information to determine if a match
3785                    will be possible.
3786
3787                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3788                    regex pos at the \s*, the prospects for a match depend not
3789                    only on the input position but also on how many (bar\s*)
3790                    repeats into the {4,8} we are. */
3791                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3792                     f &= ~SCF_WHILEM_VISITED_POS;
3793
3794                 /* This will finish on WHILEM, setting scan, or on NULL: */
3795                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3796                                       last, data, stopparen, recursed, NULL,
3797                                       (mincount == 0
3798                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3799
3800                 if (flags & SCF_DO_STCLASS)
3801                     data->start_class = oclass;
3802                 if (mincount == 0 || minnext == 0) {
3803                     if (flags & SCF_DO_STCLASS_OR) {
3804                         cl_or(pRExC_state, data->start_class, &this_class);
3805                     }
3806                     else if (flags & SCF_DO_STCLASS_AND) {
3807                         /* Switch to OR mode: cache the old value of
3808                          * data->start_class */
3809                         INIT_AND_WITHP;
3810                         StructCopy(data->start_class, and_withp,
3811                                    struct regnode_charclass_class);
3812                         flags &= ~SCF_DO_STCLASS_AND;
3813                         StructCopy(&this_class, data->start_class,
3814                                    struct regnode_charclass_class);
3815                         flags |= SCF_DO_STCLASS_OR;
3816                         data->start_class->flags |= ANYOF_EOS;
3817                     }
3818                 } else {                /* Non-zero len */
3819                     if (flags & SCF_DO_STCLASS_OR) {
3820                         cl_or(pRExC_state, data->start_class, &this_class);
3821                         cl_and(data->start_class, and_withp);
3822                     }
3823                     else if (flags & SCF_DO_STCLASS_AND)
3824                         cl_and(data->start_class, &this_class);
3825                     flags &= ~SCF_DO_STCLASS;
3826                 }
3827                 if (!scan)              /* It was not CURLYX, but CURLY. */
3828                     scan = next;
3829                 if ( /* ? quantifier ok, except for (?{ ... }) */
3830                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3831                     && (minnext == 0) && (deltanext == 0)
3832                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3833                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3834                 {
3835                     ckWARNreg(RExC_parse,
3836                               "Quantifier unexpected on zero-length expression");
3837                 }
3838
3839                 min += minnext * mincount;
3840                 is_inf_internal |= ((maxcount == REG_INFTY
3841                                      && (minnext + deltanext) > 0)
3842                                     || deltanext == I32_MAX);
3843                 is_inf |= is_inf_internal;
3844                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3845
3846                 /* Try powerful optimization CURLYX => CURLYN. */
3847                 if (  OP(oscan) == CURLYX && data
3848                       && data->flags & SF_IN_PAR
3849                       && !(data->flags & SF_HAS_EVAL)
3850                       && !deltanext && minnext == 1 ) {
3851                     /* Try to optimize to CURLYN.  */
3852                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3853                     regnode * const nxt1 = nxt;
3854 #ifdef DEBUGGING
3855                     regnode *nxt2;
3856 #endif
3857
3858                     /* Skip open. */
3859                     nxt = regnext(nxt);
3860                     if (!REGNODE_SIMPLE(OP(nxt))
3861                         && !(PL_regkind[OP(nxt)] == EXACT
3862                              && STR_LEN(nxt) == 1))
3863                         goto nogo;
3864 #ifdef DEBUGGING
3865                     nxt2 = nxt;
3866 #endif
3867                     nxt = regnext(nxt);
3868                     if (OP(nxt) != CLOSE)
3869                         goto nogo;
3870                     if (RExC_open_parens) {
3871                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3872                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3873                     }
3874                     /* Now we know that nxt2 is the only contents: */
3875                     oscan->flags = (U8)ARG(nxt);
3876                     OP(oscan) = CURLYN;
3877                     OP(nxt1) = NOTHING; /* was OPEN. */
3878
3879 #ifdef DEBUGGING
3880                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3881                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3882                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3883                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3884                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3885                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3886 #endif
3887                 }
3888               nogo:
3889
3890                 /* Try optimization CURLYX => CURLYM. */
3891                 if (  OP(oscan) == CURLYX && data
3892                       && !(data->flags & SF_HAS_PAR)
3893                       && !(data->flags & SF_HAS_EVAL)
3894                       && !deltanext     /* atom is fixed width */
3895                       && minnext != 0   /* CURLYM can't handle zero width */
3896                 ) {
3897                     /* XXXX How to optimize if data == 0? */
3898                     /* Optimize to a simpler form.  */
3899                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3900                     regnode *nxt2;
3901
3902                     OP(oscan) = CURLYM;
3903                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3904                             && (OP(nxt2) != WHILEM))
3905                         nxt = nxt2;
3906                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3907                     /* Need to optimize away parenths. */
3908                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3909                         /* Set the parenth number.  */
3910                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3911
3912                         oscan->flags = (U8)ARG(nxt);
3913                         if (RExC_open_parens) {
3914                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3915                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3916                         }
3917                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3918                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3919
3920 #ifdef DEBUGGING
3921                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3922                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3923                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3924                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3925 #endif
3926 #if 0
3927                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3928                             regnode *nnxt = regnext(nxt1);
3929                             if (nnxt == nxt) {
3930                                 if (reg_off_by_arg[OP(nxt1)])
3931                                     ARG_SET(nxt1, nxt2 - nxt1);
3932                                 else if (nxt2 - nxt1 < U16_MAX)
3933                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3934                                 else
3935                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3936                             }
3937                             nxt1 = nnxt;
3938                         }
3939 #endif
3940                         /* Optimize again: */
3941                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3942                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3943                     }
3944                     else
3945                         oscan->flags = 0;
3946                 }
3947                 else if ((OP(oscan) == CURLYX)
3948                          && (flags & SCF_WHILEM_VISITED_POS)
3949                          /* See the comment on a similar expression above.
3950                             However, this time it's not a subexpression
3951                             we care about, but the expression itself. */
3952                          && (maxcount == REG_INFTY)
3953                          && data && ++data->whilem_c < 16) {
3954                     /* This stays as CURLYX, we can put the count/of pair. */
3955                     /* Find WHILEM (as in regexec.c) */
3956                     regnode *nxt = oscan + NEXT_OFF(oscan);
3957
3958                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3959                         nxt += ARG(nxt);
3960                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3961                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3962                 }
3963                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3964                     pars++;
3965                 if (flags & SCF_DO_SUBSTR) {
3966                     SV *last_str = NULL;
3967                     int counted = mincount != 0;
3968
3969                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3970 #if defined(SPARC64_GCC_WORKAROUND)
3971                         I32 b = 0;
3972                         STRLEN l = 0;
3973                         const char *s = NULL;
3974                         I32 old = 0;
3975
3976                         if (pos_before >= data->last_start_min)
3977                             b = pos_before;
3978                         else
3979                             b = data->last_start_min;
3980
3981                         l = 0;
3982                         s = SvPV_const(data->last_found, l);
3983                         old = b - data->last_start_min;
3984
3985 #else
3986                         I32 b = pos_before >= data->last_start_min
3987                             ? pos_before : data->last_start_min;
3988                         STRLEN l;
3989                         const char * const s = SvPV_const(data->last_found, l);
3990                         I32 old = b - data->last_start_min;
3991 #endif
3992
3993                         if (UTF)
3994                             old = utf8_hop((U8*)s, old) - (U8*)s;
3995                         l -= old;
3996                         /* Get the added string: */
3997                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3998                         if (deltanext == 0 && pos_before == b) {
3999                             /* What was added is a constant string */
4000                             if (mincount > 1) {
4001                                 SvGROW(last_str, (mincount * l) + 1);
4002                                 repeatcpy(SvPVX(last_str) + l,
4003                                           SvPVX_const(last_str), l, mincount - 1);
4004                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4005                                 /* Add additional parts. */
4006                                 SvCUR_set(data->last_found,
4007                                           SvCUR(data->last_found) - l);
4008                                 sv_catsv(data->last_found, last_str);
4009                                 {
4010                                     SV * sv = data->last_found;
4011                                     MAGIC *mg =
4012                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4013                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4014                                     if (mg && mg->mg_len >= 0)
4015                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4016                                 }
4017                                 data->last_end += l * (mincount - 1);
4018                             }
4019                         } else {
4020                             /* start offset must point into the last copy */
4021                             data->last_start_min += minnext * (mincount - 1);
4022                             data->last_start_max += is_inf ? I32_MAX
4023                                 : (maxcount - 1) * (minnext + data->pos_delta);
4024                         }
4025                     }
4026                     /* It is counted once already... */
4027                     data->pos_min += minnext * (mincount - counted);
4028                     data->pos_delta += - counted * deltanext +
4029                         (minnext + deltanext) * maxcount - minnext * mincount;
4030                     if (mincount != maxcount) {
4031                          /* Cannot extend fixed substrings found inside
4032                             the group.  */
4033                         SCAN_COMMIT(pRExC_state,data,minlenp);
4034                         if (mincount && last_str) {
4035                             SV * const sv = data->last_found;
4036                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4037                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4038
4039                             if (mg)
4040                                 mg->mg_len = -1;
4041                             sv_setsv(sv, last_str);
4042                             data->last_end = data->pos_min;
4043                             data->last_start_min =
4044                                 data->pos_min - CHR_SVLEN(last_str);
4045                             data->last_start_max = is_inf
4046                                 ? I32_MAX
4047                                 : data->pos_min + data->pos_delta
4048                                 - CHR_SVLEN(last_str);
4049                         }
4050                         data->longest = &(data->longest_float);
4051                     }
4052                     SvREFCNT_dec(last_str);
4053                 }
4054                 if (data && (fl & SF_HAS_EVAL))
4055                     data->flags |= SF_HAS_EVAL;
4056               optimize_curly_tail:
4057                 if (OP(oscan) != CURLYX) {
4058                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4059                            && NEXT_OFF(next))
4060                         NEXT_OFF(oscan) += NEXT_OFF(next);
4061                 }
4062                 continue;
4063             default:                    /* REF, ANYOFV, and CLUMP only? */
4064                 if (flags & SCF_DO_SUBSTR) {
4065                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4066                     data->longest = &(data->longest_float);
4067                 }
4068                 is_inf = is_inf_internal = 1;
4069                 if (flags & SCF_DO_STCLASS_OR)
4070                     cl_anything(pRExC_state, data->start_class);
4071                 flags &= ~SCF_DO_STCLASS;
4072                 break;
4073             }
4074         }
4075         else if (OP(scan) == LNBREAK) {
4076             if (flags & SCF_DO_STCLASS) {
4077                 int value = 0;
4078                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4079                 if (flags & SCF_DO_STCLASS_AND) {
4080                     for (value = 0; value < 256; value++)
4081                         if (!is_VERTWS_cp(value))
4082                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4083                 }
4084                 else {
4085                     for (value = 0; value < 256; value++)
4086                         if (is_VERTWS_cp(value))
4087                             ANYOF_BITMAP_SET(data->start_class, value);
4088                 }
4089                 if (flags & SCF_DO_STCLASS_OR)
4090                     cl_and(data->start_class, and_withp);
4091                 flags &= ~SCF_DO_STCLASS;
4092             }
4093             min += 1;
4094             delta += 1;
4095             if (flags & SCF_DO_SUBSTR) {
4096                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4097                 data->pos_min += 1;
4098                 data->pos_delta += 1;
4099                 data->longest = &(data->longest_float);
4100             }
4101         }
4102         else if (REGNODE_SIMPLE(OP(scan))) {
4103             int value = 0;
4104
4105             if (flags & SCF_DO_SUBSTR) {
4106                 SCAN_COMMIT(pRExC_state,data,minlenp);
4107                 data->pos_min++;
4108             }
4109             min++;
4110             if (flags & SCF_DO_STCLASS) {
4111                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4112
4113                 /* Some of the logic below assumes that switching
4114                    locale on will only add false positives. */
4115                 switch (PL_regkind[OP(scan)]) {
4116                 case SANY:
4117                 default:
4118                   do_default:
4119                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4120                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4121                         cl_anything(pRExC_state, data->start_class);
4122                     break;
4123                 case REG_ANY:
4124                     if (OP(scan) == SANY)
4125                         goto do_default;
4126                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4127                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4128                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4129                         cl_anything(pRExC_state, data->start_class);
4130                     }
4131                     if (flags & SCF_DO_STCLASS_AND || !value)
4132                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4133                     break;
4134                 case ANYOF:
4135                     if (flags & SCF_DO_STCLASS_AND)
4136                         cl_and(data->start_class,
4137                                (struct regnode_charclass_class*)scan);
4138                     else
4139                         cl_or(pRExC_state, data->start_class,
4140                               (struct regnode_charclass_class*)scan);
4141                     break;
4142                 case ALNUM:
4143                     if (flags & SCF_DO_STCLASS_AND) {
4144                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4145                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4146                             if (OP(scan) == ALNUMU) {
4147                                 for (value = 0; value < 256; value++) {
4148                                     if (!isWORDCHAR_L1(value)) {
4149                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4150                                     }
4151                                 }
4152                             } else {
4153                                 for (value = 0; value < 256; value++) {
4154                                     if (!isALNUM(value)) {
4155                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4156                                     }
4157                                 }
4158                             }
4159                         }
4160                     }
4161                     else {
4162                         if (data->start_class->flags & ANYOF_LOCALE)
4163                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4164
4165                         /* Even if under locale, set the bits for non-locale
4166                          * in case it isn't a true locale-node.  This will
4167                          * create false positives if it truly is locale */
4168                         if (OP(scan) == ALNUMU) {
4169                             for (value = 0; value < 256; value++) {
4170                                 if (isWORDCHAR_L1(value)) {
4171                                     ANYOF_BITMAP_SET(data->start_class, value);
4172                                 }
4173                             }
4174                         } else {
4175                             for (value = 0; value < 256; value++) {
4176                                 if (isALNUM(value)) {
4177                                     ANYOF_BITMAP_SET(data->start_class, value);
4178                                 }
4179                             }
4180                         }
4181                     }
4182                     break;
4183                 case NALNUM:
4184                     if (flags & SCF_DO_STCLASS_AND) {
4185                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4186                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4187                             if (OP(scan) == NALNUMU) {
4188                                 for (value = 0; value < 256; value++) {
4189                                     if (isWORDCHAR_L1(value)) {
4190                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4191                                     }
4192                                 }
4193                             } else {
4194                                 for (value = 0; value < 256; value++) {
4195                                     if (isALNUM(value)) {
4196                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4197                                     }
4198                                 }
4199                             }
4200                         }
4201                     }
4202                     else {
4203                         if (data->start_class->flags & ANYOF_LOCALE)
4204                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4205
4206                         /* Even if under locale, set the bits for non-locale in
4207                          * case it isn't a true locale-node.  This will create
4208                          * false positives if it truly is locale */
4209                         if (OP(scan) == NALNUMU) {
4210                             for (value = 0; value < 256; value++) {
4211                                 if (! isWORDCHAR_L1(value)) {
4212                                     ANYOF_BITMAP_SET(data->start_class, value);
4213                                 }
4214                             }
4215                         } else {
4216                             for (value = 0; value < 256; value++) {
4217                                 if (! isALNUM(value)) {
4218                                     ANYOF_BITMAP_SET(data->start_class, value);
4219                                 }
4220                             }
4221                         }
4222                     }
4223                     break;
4224                 case SPACE:
4225                     if (flags & SCF_DO_STCLASS_AND) {
4226                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4227                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4228                             if (OP(scan) == SPACEU) {
4229                                 for (value = 0; value < 256; value++) {
4230                                     if (!isSPACE_L1(value)) {
4231                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4232                                     }
4233                                 }
4234                             } else {
4235                                 for (value = 0; value < 256; value++) {
4236                                     if (!isSPACE(value)) {
4237                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4238                                     }
4239                                 }
4240                             }
4241                         }
4242                     }
4243                     else {
4244                         if (data->start_class->flags & ANYOF_LOCALE) {
4245                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4246                         }
4247                         if (OP(scan) == SPACEU) {
4248                             for (value = 0; value < 256; value++) {
4249                                 if (isSPACE_L1(value)) {
4250                                     ANYOF_BITMAP_SET(data->start_class, value);
4251                                 }
4252                             }
4253                         } else {
4254                             for (value = 0; value < 256; value++) {
4255                                 if (isSPACE(value)) {
4256                                     ANYOF_BITMAP_SET(data->start_class, value);
4257                                 }
4258                             }
4259                         }
4260                     }
4261                     break;
4262                 case NSPACE:
4263                     if (flags & SCF_DO_STCLASS_AND) {
4264                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4265                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4266                             if (OP(scan) == NSPACEU) {
4267                                 for (value = 0; value < 256; value++) {
4268                                     if (isSPACE_L1(value)) {
4269                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4270                                     }
4271                                 }
4272                             } else {
4273                                 for (value = 0; value < 256; value++) {
4274                                     if (isSPACE(value)) {
4275                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4276                                     }
4277                                 }
4278                             }
4279                         }
4280                     }
4281                     else {
4282                         if (data->start_class->flags & ANYOF_LOCALE)
4283                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4284                         if (OP(scan) == NSPACEU) {
4285                             for (value = 0; value < 256; value++) {
4286                                 if (!isSPACE_L1(value)) {
4287                                     ANYOF_BITMAP_SET(data->start_class, value);
4288                                 }
4289                             }
4290                         }
4291                         else {
4292                             for (value = 0; value < 256; value++) {
4293                                 if (!isSPACE(value)) {
4294                                     ANYOF_BITMAP_SET(data->start_class, value);
4295                                 }
4296                             }
4297                         }
4298                     }
4299                     break;
4300                 case DIGIT:
4301                     if (flags & SCF_DO_STCLASS_AND) {
4302                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4303                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4304                             for (value = 0; value < 256; value++)
4305                                 if (!isDIGIT(value))
4306                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4307                         }
4308                     }
4309                     else {
4310                         if (data->start_class->flags & ANYOF_LOCALE)
4311                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4312                         for (value = 0; value < 256; value++)
4313                             if (isDIGIT(value))
4314                                 ANYOF_BITMAP_SET(data->start_class, value);
4315                     }
4316                     break;
4317                 case NDIGIT:
4318                     if (flags & SCF_DO_STCLASS_AND) {
4319                         if (!(data->start_class->flags & ANYOF_LOCALE))
4320                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4321                         for (value = 0; value < 256; value++)
4322                             if (isDIGIT(value))
4323                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4324                     }
4325                     else {
4326                         if (data->start_class->flags & ANYOF_LOCALE)
4327                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4328                         for (value = 0; value < 256; value++)
4329                             if (!isDIGIT(value))
4330                                 ANYOF_BITMAP_SET(data->start_class, value);
4331                     }
4332                     break;
4333                 CASE_SYNST_FNC(VERTWS);
4334                 CASE_SYNST_FNC(HORIZWS);
4335
4336                 }
4337                 if (flags & SCF_DO_STCLASS_OR)
4338                     cl_and(data->start_class, and_withp);
4339                 flags &= ~SCF_DO_STCLASS;
4340             }
4341         }
4342         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4343             data->flags |= (OP(scan) == MEOL
4344                             ? SF_BEFORE_MEOL
4345                             : SF_BEFORE_SEOL);
4346         }
4347         else if (  PL_regkind[OP(scan)] == BRANCHJ
4348                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4349                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4350                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4351             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4352                 || OP(scan) == UNLESSM )
4353             {
4354                 /* Negative Lookahead/lookbehind
4355                    In this case we can't do fixed string optimisation.
4356                 */
4357
4358                 I32 deltanext, minnext, fake = 0;
4359                 regnode *nscan;
4360                 struct regnode_charclass_class intrnl;
4361                 int f = 0;
4362
4363                 data_fake.flags = 0;
4364                 if (data) {
4365                     data_fake.whilem_c = data->whilem_c;
4366                     data_fake.last_closep = data->last_closep;
4367                 }
4368                 else
4369                     data_fake.last_closep = &fake;
4370                 data_fake.pos_delta = delta;
4371                 if ( flags & SCF_DO_STCLASS && !scan->flags
4372                      && OP(scan) == IFMATCH ) { /* Lookahead */
4373                     cl_init(pRExC_state, &intrnl);
4374                     data_fake.start_class = &intrnl;
4375                     f |= SCF_DO_STCLASS_AND;
4376                 }
4377                 if (flags & SCF_WHILEM_VISITED_POS)
4378                     f |= SCF_WHILEM_VISITED_POS;
4379                 next = regnext(scan);
4380                 nscan = NEXTOPER(NEXTOPER(scan));
4381                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4382                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4383                 if (scan->flags) {
4384                     if (deltanext) {
4385                         FAIL("Variable length lookbehind not implemented");
4386                     }
4387                     else if (minnext > (I32)U8_MAX) {
4388                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4389                     }
4390                     scan->flags = (U8)minnext;
4391                 }
4392                 if (data) {
4393                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4394                         pars++;
4395                     if (data_fake.flags & SF_HAS_EVAL)
4396                         data->flags |= SF_HAS_EVAL;
4397                     data->whilem_c = data_fake.whilem_c;
4398                 }
4399                 if (f & SCF_DO_STCLASS_AND) {
4400                     if (flags & SCF_DO_STCLASS_OR) {
4401                         /* OR before, AND after: ideally we would recurse with
4402                          * data_fake to get the AND applied by study of the
4403                          * remainder of the pattern, and then derecurse;
4404                          * *** HACK *** for now just treat as "no information".
4405                          * See [perl #56690].
4406                          */
4407                         cl_init(pRExC_state, data->start_class);
4408                     }  else {
4409                         /* AND before and after: combine and continue */
4410                         const int was = (data->start_class->flags & ANYOF_EOS);
4411
4412                         cl_and(data->start_class, &intrnl);
4413                         if (was)
4414                             data->start_class->flags |= ANYOF_EOS;
4415                     }
4416                 }
4417             }
4418 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4419             else {
4420                 /* Positive Lookahead/lookbehind
4421                    In this case we can do fixed string optimisation,
4422                    but we must be careful about it. Note in the case of
4423                    lookbehind the positions will be offset by the minimum
4424                    length of the pattern, something we won't know about
4425                    until after the recurse.
4426                 */
4427                 I32 deltanext, fake = 0;
4428                 regnode *nscan;
4429                 struct regnode_charclass_class intrnl;
4430                 int f = 0;
4431                 /* We use SAVEFREEPV so that when the full compile
4432                     is finished perl will clean up the allocated
4433                     minlens when it's all done. This way we don't
4434                     have to worry about freeing them when we know
4435                     they wont be used, which would be a pain.
4436                  */
4437                 I32 *minnextp;
4438                 Newx( minnextp, 1, I32 );
4439                 SAVEFREEPV(minnextp);
4440
4441                 if (data) {
4442                     StructCopy(data, &data_fake, scan_data_t);
4443                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4444                         f |= SCF_DO_SUBSTR;
4445                         if (scan->flags)
4446                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4447                         data_fake.last_found=newSVsv(data->last_found);
4448                     }
4449                 }
4450                 else
4451                     data_fake.last_closep = &fake;
4452                 data_fake.flags = 0;
4453                 data_fake.pos_delta = delta;
4454                 if (is_inf)
4455                     data_fake.flags |= SF_IS_INF;
4456                 if ( flags & SCF_DO_STCLASS && !scan->flags
4457                      && OP(scan) == IFMATCH ) { /* Lookahead */
4458                     cl_init(pRExC_state, &intrnl);
4459                     data_fake.start_class = &intrnl;
4460                     f |= SCF_DO_STCLASS_AND;
4461                 }
4462                 if (flags & SCF_WHILEM_VISITED_POS)
4463                     f |= SCF_WHILEM_VISITED_POS;
4464                 next = regnext(scan);
4465                 nscan = NEXTOPER(NEXTOPER(scan));
4466
4467                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4468                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4469                 if (scan->flags) {
4470                     if (deltanext) {
4471                         FAIL("Variable length lookbehind not implemented");
4472                     }
4473                     else if (*minnextp > (I32)U8_MAX) {
4474                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4475                     }
4476                     scan->flags = (U8)*minnextp;
4477                 }
4478
4479                 *minnextp += min;
4480
4481                 if (f & SCF_DO_STCLASS_AND) {
4482                     const int was = (data->start_class->flags & ANYOF_EOS);
4483
4484                     cl_and(data->start_class, &intrnl);
4485                     if (was)
4486                         data->start_class->flags |= ANYOF_EOS;
4487                 }
4488                 if (data) {
4489                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4490                         pars++;
4491                     if (data_fake.flags & SF_HAS_EVAL)
4492                         data->flags |= SF_HAS_EVAL;
4493                     data->whilem_c = data_fake.whilem_c;
4494                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4495                         if (RExC_rx->minlen<*minnextp)
4496                             RExC_rx->minlen=*minnextp;
4497                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4498                         SvREFCNT_dec(data_fake.last_found);
4499
4500                         if ( data_fake.minlen_fixed != minlenp )
4501                         {
4502                             data->offset_fixed= data_fake.offset_fixed;
4503                             data->minlen_fixed= data_fake.minlen_fixed;
4504                             data->lookbehind_fixed+= scan->flags;
4505                         }
4506                         if ( data_fake.minlen_float != minlenp )
4507                         {
4508                             data->minlen_float= data_fake.minlen_float;
4509                             data->offset_float_min=data_fake.offset_float_min;
4510                             data->offset_float_max=data_fake.offset_float_max;
4511                             data->lookbehind_float+= scan->flags;
4512                         }
4513                     }
4514                 }
4515
4516
4517             }
4518 #endif
4519         }
4520         else if (OP(scan) == OPEN) {
4521             if (stopparen != (I32)ARG(scan))
4522                 pars++;
4523         }
4524         else if (OP(scan) == CLOSE) {
4525             if (stopparen == (I32)ARG(scan)) {
4526                 break;
4527             }
4528             if ((I32)ARG(scan) == is_par) {
4529                 next = regnext(scan);
4530
4531                 if ( next && (OP(next) != WHILEM) && next < last)
4532                     is_par = 0;         /* Disable optimization */
4533             }
4534             if (data)
4535                 *(data->last_closep) = ARG(scan);
4536         }
4537         else if (OP(scan) == EVAL) {
4538                 if (data)
4539                     data->flags |= SF_HAS_EVAL;
4540         }
4541         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4542             if (flags & SCF_DO_SUBSTR) {
4543                 SCAN_COMMIT(pRExC_state,data,minlenp);
4544                 flags &= ~SCF_DO_SUBSTR;
4545             }
4546             if (data && OP(scan)==ACCEPT) {
4547                 data->flags |= SCF_SEEN_ACCEPT;
4548                 if (stopmin > min)
4549                     stopmin = min;
4550             }
4551         }
4552         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4553         {
4554                 if (flags & SCF_DO_SUBSTR) {
4555                     SCAN_COMMIT(pRExC_state,data,minlenp);
4556                     data->longest = &(data->longest_float);
4557                 }
4558                 is_inf = is_inf_internal = 1;
4559                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4560                     cl_anything(pRExC_state, data->start_class);
4561                 flags &= ~SCF_DO_STCLASS;
4562         }
4563         else if (OP(scan) == GPOS) {
4564             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4565                 !(delta || is_inf || (data && data->pos_delta)))
4566             {
4567                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4568                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4569                 if (RExC_rx->gofs < (U32)min)
4570                     RExC_rx->gofs = min;
4571             } else {
4572                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4573                 RExC_rx->gofs = 0;
4574             }
4575         }
4576 #ifdef TRIE_STUDY_OPT
4577 #ifdef FULL_TRIE_STUDY
4578         else if (PL_regkind[OP(scan)] == TRIE) {
4579             /* NOTE - There is similar code to this block above for handling
4580                BRANCH nodes on the initial study.  If you change stuff here
4581                check there too. */
4582             regnode *trie_node= scan;
4583             regnode *tail= regnext(scan);
4584             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4585             I32 max1 = 0, min1 = I32_MAX;
4586             struct regnode_charclass_class accum;
4587
4588             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4589                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4590             if (flags & SCF_DO_STCLASS)
4591                 cl_init_zero(pRExC_state, &accum);
4592
4593             if (!trie->jump) {
4594                 min1= trie->minlen;
4595                 max1= trie->maxlen;
4596             } else {
4597                 const regnode *nextbranch= NULL;
4598                 U32 word;
4599
4600                 for ( word=1 ; word <= trie->wordcount ; word++)
4601                 {
4602                     I32 deltanext=0, minnext=0, f = 0, fake;
4603                     struct regnode_charclass_class this_class;
4604
4605                     data_fake.flags = 0;
4606                     if (data) {
4607                         data_fake.whilem_c = data->whilem_c;
4608                         data_fake.last_closep = data->last_closep;
4609                     }
4610                     else
4611                         data_fake.last_closep = &fake;
4612                     data_fake.pos_delta = delta;
4613                     if (flags & SCF_DO_STCLASS) {
4614                         cl_init(pRExC_state, &this_class);
4615                         data_fake.start_class = &this_class;
4616                         f = SCF_DO_STCLASS_AND;
4617                     }
4618                     if (flags & SCF_WHILEM_VISITED_POS)
4619                         f |= SCF_WHILEM_VISITED_POS;
4620
4621                     if (trie->jump[word]) {
4622                         if (!nextbranch)
4623                             nextbranch = trie_node + trie->jump[0];
4624                         scan= trie_node + trie->jump[word];
4625                         /* We go from the jump point to the branch that follows
4626                            it. Note this means we need the vestigal unused branches
4627                            even though they arent otherwise used.
4628                          */
4629                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4630                             &deltanext, (regnode *)nextbranch, &data_fake,
4631                             stopparen, recursed, NULL, f,depth+1);
4632                     }
4633                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4634                         nextbranch= regnext((regnode*)nextbranch);
4635
4636                     if (min1 > (I32)(minnext + trie->minlen))
4637                         min1 = minnext + trie->minlen;
4638                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4639                         max1 = minnext + deltanext + trie->maxlen;
4640                     if (deltanext == I32_MAX)
4641                         is_inf = is_inf_internal = 1;
4642
4643                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4644                         pars++;
4645                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4646                         if ( stopmin > min + min1)
4647                             stopmin = min + min1;
4648                         flags &= ~SCF_DO_SUBSTR;
4649                         if (data)
4650                             data->flags |= SCF_SEEN_ACCEPT;
4651                     }
4652                     if (data) {
4653                         if (data_fake.flags & SF_HAS_EVAL)
4654                             data->flags |= SF_HAS_EVAL;
4655                         data->whilem_c = data_fake.whilem_c;
4656                     }
4657                     if (flags & SCF_DO_STCLASS)
4658                         cl_or(pRExC_state, &accum, &this_class);
4659                 }
4660             }
4661             if (flags & SCF_DO_SUBSTR) {
4662                 data->pos_min += min1;
4663                 data->pos_delta += max1 - min1;
4664                 if (max1 != min1 || is_inf)
4665                     data->longest = &(data->longest_float);
4666             }
4667             min += min1;
4668             delta += max1 - min1;
4669             if (flags & SCF_DO_STCLASS_OR) {
4670                 cl_or(pRExC_state, data->start_class, &accum);
4671                 if (min1) {
4672                     cl_and(data->start_class, and_withp);
4673                     flags &= ~SCF_DO_STCLASS;
4674                 }
4675             }
4676             else if (flags & SCF_DO_STCLASS_AND) {
4677                 if (min1) {
4678                     cl_and(data->start_class, &accum);
4679                     flags &= ~SCF_DO_STCLASS;
4680                 }
4681                 else {
4682                     /* Switch to OR mode: cache the old value of
4683                      * data->start_class */
4684                     INIT_AND_WITHP;
4685                     StructCopy(data->start_class, and_withp,
4686                                struct regnode_charclass_class);
4687                     flags &= ~SCF_DO_STCLASS_AND;
4688                     StructCopy(&accum, data->start_class,
4689                                struct regnode_charclass_class);
4690                     flags |= SCF_DO_STCLASS_OR;
4691                     data->start_class->flags |= ANYOF_EOS;
4692                 }
4693             }
4694             scan= tail;
4695             continue;
4696         }
4697 #else
4698         else if (PL_regkind[OP(scan)] == TRIE) {
4699             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4700             U8*bang=NULL;
4701
4702             min += trie->minlen;
4703             delta += (trie->maxlen - trie->minlen);
4704             flags &= ~SCF_DO_STCLASS; /* xxx */
4705             if (flags & SCF_DO_SUBSTR) {
4706                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4707                 data->pos_min += trie->minlen;
4708                 data->pos_delta += (trie->maxlen - trie->minlen);
4709                 if (trie->maxlen != trie->minlen)
4710                     data->longest = &(data->longest_float);
4711             }
4712             if (trie->jump) /* no more substrings -- for now /grr*/
4713                 flags &= ~SCF_DO_SUBSTR;
4714         }
4715 #endif /* old or new */
4716 #endif /* TRIE_STUDY_OPT */
4717
4718         /* Else: zero-length, ignore. */
4719         scan = regnext(scan);
4720     }
4721     if (frame) {
4722         last = frame->last;
4723         scan = frame->next;
4724         stopparen = frame->stop;
4725         frame = frame->prev;
4726         goto fake_study_recurse;
4727     }
4728
4729   finish:
4730     assert(!frame);
4731     DEBUG_STUDYDATA("pre-fin:",data,depth);
4732
4733     *scanp = scan;
4734     *deltap = is_inf_internal ? I32_MAX : delta;
4735     if (flags & SCF_DO_SUBSTR && is_inf)
4736         data->pos_delta = I32_MAX - data->pos_min;
4737     if (is_par > (I32)U8_MAX)
4738         is_par = 0;
4739     if (is_par && pars==1 && data) {
4740         data->flags |= SF_IN_PAR;
4741         data->flags &= ~SF_HAS_PAR;
4742     }
4743     else if (pars && data) {
4744         data->flags |= SF_HAS_PAR;
4745         data->flags &= ~SF_IN_PAR;
4746     }
4747     if (flags & SCF_DO_STCLASS_OR)
4748         cl_and(data->start_class, and_withp);
4749     if (flags & SCF_TRIE_RESTUDY)
4750         data->flags |=  SCF_TRIE_RESTUDY;
4751
4752     DEBUG_STUDYDATA("post-fin:",data,depth);
4753
4754     return min < stopmin ? min : stopmin;
4755 }
4756
4757 STATIC U32
4758 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4759 {
4760     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4761
4762     PERL_ARGS_ASSERT_ADD_DATA;
4763
4764     Renewc(RExC_rxi->data,
4765            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4766            char, struct reg_data);
4767     if(count)
4768         Renew(RExC_rxi->data->what, count + n, U8);
4769     else
4770         Newx(RExC_rxi->data->what, n, U8);
4771     RExC_rxi->data->count = count + n;
4772     Copy(s, RExC_rxi->data->what + count, n, U8);
4773     return count;
4774 }
4775
4776 /*XXX: todo make this not included in a non debugging perl */
4777 #ifndef PERL_IN_XSUB_RE
4778 void
4779 Perl_reginitcolors(pTHX)
4780 {
4781     dVAR;
4782     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4783     if (s) {
4784         char *t = savepv(s);
4785         int i = 0;
4786         PL_colors[0] = t;
4787         while (++i < 6) {
4788             t = strchr(t, '\t');
4789             if (t) {
4790                 *t = '\0';
4791                 PL_colors[i] = ++t;
4792             }
4793             else
4794                 PL_colors[i] = t = (char *)"";
4795         }
4796     } else {
4797         int i = 0;
4798         while (i < 6)
4799             PL_colors[i++] = (char *)"";
4800     }
4801     PL_colorset = 1;
4802 }
4803 #endif
4804
4805
4806 #ifdef TRIE_STUDY_OPT
4807 #define CHECK_RESTUDY_GOTO                                  \
4808         if (                                                \
4809               (data.flags & SCF_TRIE_RESTUDY)               \
4810               && ! restudied++                              \
4811         )     goto reStudy
4812 #else
4813 #define CHECK_RESTUDY_GOTO
4814 #endif
4815
4816 /*
4817  - pregcomp - compile a regular expression into internal code
4818  *
4819  * We can't allocate space until we know how big the compiled form will be,
4820  * but we can't compile it (and thus know how big it is) until we've got a
4821  * place to put the code.  So we cheat:  we compile it twice, once with code
4822  * generation turned off and size counting turned on, and once "for real".
4823  * This also means that we don't allocate space until we are sure that the
4824  * thing really will compile successfully, and we never have to move the
4825  * code and thus invalidate pointers into it.  (Note that it has to be in
4826  * one piece because free() must be able to free it all.) [NB: not true in perl]
4827  *
4828  * Beware that the optimization-preparation code in here knows about some
4829  * of the structure of the compiled regexp.  [I'll say.]
4830  */
4831
4832
4833
4834 #ifndef PERL_IN_XSUB_RE
4835 #define RE_ENGINE_PTR &PL_core_reg_engine
4836 #else
4837 extern const struct regexp_engine my_reg_engine;
4838 #define RE_ENGINE_PTR &my_reg_engine
4839 #endif
4840
4841 #ifndef PERL_IN_XSUB_RE
4842 REGEXP *
4843 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4844 {
4845     dVAR;
4846     HV * const table = GvHV(PL_hintgv);
4847
4848     PERL_ARGS_ASSERT_PREGCOMP;
4849
4850     /* Dispatch a request to compile a regexp to correct
4851        regexp engine. */
4852     if (table) {
4853         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4854         GET_RE_DEBUG_FLAGS_DECL;
4855         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4856             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4857             DEBUG_COMPILE_r({
4858                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4859                     SvIV(*ptr));
4860             });
4861             return CALLREGCOMP_ENG(eng, pattern, flags);
4862         }
4863     }
4864     return Perl_re_compile(aTHX_ pattern, flags);
4865 }
4866 #endif
4867
4868 REGEXP *
4869 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4870 {
4871     dVAR;
4872     REGEXP *rx;
4873     struct regexp *r;
4874     register regexp_internal *ri;
4875     STRLEN plen;
4876     char* VOL exp;
4877     char* xend;
4878     regnode *scan;
4879     I32 flags;
4880     I32 minlen = 0;
4881     U32 pm_flags;
4882
4883     /* these are all flags - maybe they should be turned
4884      * into a single int with different bit masks */
4885     I32 sawlookahead = 0;
4886     I32 sawplus = 0;
4887     I32 sawopen = 0;
4888     bool used_setjump = FALSE;
4889     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4890
4891     U8 jump_ret = 0;
4892     dJMPENV;
4893     scan_data_t data;
4894     RExC_state_t RExC_state;
4895     RExC_state_t * const pRExC_state = &RExC_state;
4896 #ifdef TRIE_STUDY_OPT
4897     int restudied;
4898     RExC_state_t copyRExC_state;
4899 #endif
4900     GET_RE_DEBUG_FLAGS_DECL;
4901
4902     PERL_ARGS_ASSERT_RE_COMPILE;
4903
4904     DEBUG_r(if (!PL_colorset) reginitcolors());
4905
4906 #ifndef PERL_IN_XSUB_RE
4907     /* Initialize these here instead of as-needed, as is quick and avoids
4908      * having to test them each time otherwise */
4909     if (! PL_AboveLatin1) {
4910         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4911         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4912         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
4913
4914         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4915         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4916
4917         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4918         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4919
4920         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4921         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4922
4923         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4924
4925         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4926         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4927
4928         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4929
4930         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4931         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4932
4933         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4934         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4935
4936         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4937         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4938
4939         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4940         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4941
4942         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4943         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4944
4945         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4946         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4947
4948         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4949         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4950
4951         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4952         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4953
4954         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4955
4956         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4957         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4958
4959         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4960         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
4961     }
4962 #endif
4963
4964     exp = SvPV(pattern, plen);
4965
4966     if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4967         RExC_utf8 = RExC_orig_utf8 = 0;
4968     }
4969     else {
4970         RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4971     }
4972     RExC_uni_semantics = 0;
4973     RExC_contains_locale = 0;
4974
4975     /****************** LONG JUMP TARGET HERE***********************/
4976     /* Longjmp back to here if have to switch in midstream to utf8 */
4977     if (! RExC_orig_utf8) {
4978         JMPENV_PUSH(jump_ret);
4979         used_setjump = TRUE;
4980     }
4981
4982     if (jump_ret == 0) {    /* First time through */
4983         xend = exp + plen;
4984
4985         DEBUG_COMPILE_r({
4986             SV *dsv= sv_newmortal();
4987             RE_PV_QUOTED_DECL(s, RExC_utf8,
4988                 dsv, exp, plen, 60);
4989             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4990                            PL_colors[4],PL_colors[5],s);
4991         });
4992     }
4993     else {  /* longjumped back */
4994         STRLEN len = plen;
4995
4996         /* If the cause for the longjmp was other than changing to utf8, pop
4997          * our own setjmp, and longjmp to the correct handler */
4998         if (jump_ret != UTF8_LONGJMP) {
4999             JMPENV_POP;
5000             JMPENV_JUMP(jump_ret);
5001         }
5002
5003         GET_RE_DEBUG_FLAGS;
5004
5005         /* It's possible to write a regexp in ascii that represents Unicode
5006         codepoints outside of the byte range, such as via \x{100}. If we
5007         detect such a sequence we have to convert the entire pattern to utf8
5008         and then recompile, as our sizing calculation will have been based
5009         on 1 byte == 1 character, but we will need to use utf8 to encode
5010         at least some part of the pattern, and therefore must convert the whole
5011         thing.
5012         -- dmq */
5013         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5014             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5015         exp = (char*)Perl_bytes_to_utf8(aTHX_
5016                                         (U8*)SvPV_nomg(pattern, plen),
5017                                         &len);
5018         xend = exp + len;
5019         RExC_orig_utf8 = RExC_utf8 = 1;
5020         SAVEFREEPV(exp);
5021     }
5022
5023 #ifdef TRIE_STUDY_OPT
5024     restudied = 0;
5025 #endif
5026
5027     pm_flags = orig_pm_flags;
5028
5029     if (initial_charset == REGEX_LOCALE_CHARSET) {
5030         RExC_contains_locale = 1;
5031     }
5032     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5033
5034         /* Set to use unicode semantics if the pattern is in utf8 and has the
5035          * 'depends' charset specified, as it means unicode when utf8  */
5036         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5037     }
5038
5039     RExC_precomp = exp;
5040     RExC_flags = pm_flags;
5041     RExC_sawback = 0;
5042
5043     RExC_seen = 0;
5044     RExC_in_lookbehind = 0;
5045     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5046     RExC_seen_evals = 0;
5047     RExC_extralen = 0;
5048     RExC_override_recoding = 0;
5049
5050     /* First pass: determine size, legality. */
5051     RExC_parse = exp;
5052     RExC_start = exp;
5053     RExC_end = xend;
5054     RExC_naughty = 0;
5055     RExC_npar = 1;
5056     RExC_nestroot = 0;
5057     RExC_size = 0L;
5058     RExC_emit = &PL_regdummy;
5059     RExC_whilem_seen = 0;
5060     RExC_open_parens = NULL;
5061     RExC_close_parens = NULL;
5062     RExC_opend = NULL;
5063     RExC_paren_names = NULL;
5064 #ifdef DEBUGGING
5065     RExC_paren_name_list = NULL;
5066 #endif
5067     RExC_recurse = NULL;
5068     RExC_recurse_count = 0;
5069
5070 #if 0 /* REGC() is (currently) a NOP at the first pass.
5071        * Clever compilers notice this and complain. --jhi */
5072     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5073 #endif
5074     DEBUG_PARSE_r(
5075         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5076         RExC_lastnum=0;
5077         RExC_lastparse=NULL;
5078     );
5079     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5080         RExC_precomp = NULL;
5081         return(NULL);
5082     }
5083
5084     /* Here, finished first pass.  Get rid of any added setjmp */
5085     if (used_setjump) {
5086         JMPENV_POP;
5087     }
5088
5089     DEBUG_PARSE_r({
5090         PerlIO_printf(Perl_debug_log,
5091             "Required size %"IVdf" nodes\n"
5092             "Starting second pass (creation)\n",
5093             (IV)RExC_size);
5094         RExC_lastnum=0;
5095         RExC_lastparse=NULL;
5096     });
5097
5098     /* The first pass could have found things that force Unicode semantics */
5099     if ((RExC_utf8 || RExC_uni_semantics)
5100          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5101     {
5102         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5103     }
5104
5105     /* Small enough for pointer-storage convention?
5106        If extralen==0, this means that we will not need long jumps. */
5107     if (RExC_size >= 0x10000L && RExC_extralen)
5108         RExC_size += RExC_extralen;
5109     else
5110         RExC_extralen = 0;
5111     if (RExC_whilem_seen > 15)
5112         RExC_whilem_seen = 15;
5113
5114     /* Allocate space and zero-initialize. Note, the two step process
5115        of zeroing when in debug mode, thus anything assigned has to
5116        happen after that */
5117     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5118     r = (struct regexp*)SvANY(rx);
5119     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5120          char, regexp_internal);
5121     if ( r == NULL || ri == NULL )
5122         FAIL("Regexp out of space");
5123 #ifdef DEBUGGING
5124     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5125     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5126 #else
5127     /* bulk initialize base fields with 0. */
5128     Zero(ri, sizeof(regexp_internal), char);
5129 #endif
5130
5131     /* non-zero initialization begins here */
5132     RXi_SET( r, ri );
5133     r->engine= RE_ENGINE_PTR;
5134     r->extflags = pm_flags;
5135     {
5136         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5137         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5138
5139         /* The caret is output if there are any defaults: if not all the STD
5140          * flags are set, or if no character set specifier is needed */
5141         bool has_default =
5142                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5143                     || ! has_charset);
5144         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5145         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5146                             >> RXf_PMf_STD_PMMOD_SHIFT);
5147         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5148         char *p;
5149         /* Allocate for the worst case, which is all the std flags are turned
5150          * on.  If more precision is desired, we could do a population count of
5151          * the flags set.  This could be done with a small lookup table, or by
5152          * shifting, masking and adding, or even, when available, assembly
5153          * language for a machine-language population count.
5154          * We never output a minus, as all those are defaults, so are
5155          * covered by the caret */
5156         const STRLEN wraplen = plen + has_p + has_runon
5157             + has_default       /* If needs a caret */
5158
5159                 /* If needs a character set specifier */
5160             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5161             + (sizeof(STD_PAT_MODS) - 1)
5162             + (sizeof("(?:)") - 1);
5163
5164         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5165         SvPOK_on(rx);
5166         SvFLAGS(rx) |= SvUTF8(pattern);
5167         *p++='('; *p++='?';
5168
5169         /* If a default, cover it using the caret */
5170         if (has_default) {
5171             *p++= DEFAULT_PAT_MOD;
5172         }
5173         if (has_charset) {
5174             STRLEN len;
5175             const char* const name = get_regex_charset_name(r->extflags, &len);
5176             Copy(name, p, len, char);
5177             p += len;
5178         }
5179         if (has_p)
5180             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5181         {
5182             char ch;
5183             while((ch = *fptr++)) {
5184                 if(reganch & 1)
5185                     *p++ = ch;
5186                 reganch >>= 1;
5187             }
5188         }
5189
5190         *p++ = ':';
5191         Copy(RExC_precomp, p, plen, char);
5192         assert ((RX_WRAPPED(rx) - p) < 16);
5193         r->pre_prefix = p - RX_WRAPPED(rx);
5194         p += plen;
5195         if (has_runon)
5196             *p++ = '\n';
5197         *p++ = ')';
5198         *p = 0;
5199         SvCUR_set(rx, p - SvPVX_const(rx));
5200     }
5201
5202     r->intflags = 0;
5203     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5204
5205     if (RExC_seen & REG_SEEN_RECURSE) {
5206         Newxz(RExC_open_parens, RExC_npar,regnode *);
5207         SAVEFREEPV(RExC_open_parens);
5208         Newxz(RExC_close_parens,RExC_npar,regnode *);
5209         SAVEFREEPV(RExC_close_parens);
5210     }
5211
5212     /* Useful during FAIL. */
5213 #ifdef RE_TRACK_PATTERN_OFFSETS
5214     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5215     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5216                           "%s %"UVuf" bytes for offset annotations.\n",
5217                           ri->u.offsets ? "Got" : "Couldn't get",
5218                           (UV)((2*RExC_size+1) * sizeof(U32))));
5219 #endif
5220     SetProgLen(ri,RExC_size);
5221     RExC_rx_sv = rx;
5222     RExC_rx = r;
5223     RExC_rxi = ri;
5224
5225     /* Second pass: emit code. */
5226     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
5227     RExC_parse = exp;
5228     RExC_end = xend;
5229     RExC_naughty = 0;
5230     RExC_npar = 1;
5231     RExC_emit_start = ri->program;
5232     RExC_emit = ri->program;
5233     RExC_emit_bound = ri->program + RExC_size + 1;
5234
5235     /* Store the count of eval-groups for security checks: */
5236     RExC_rx->seen_evals = RExC_seen_evals;
5237     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5238     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5239         ReREFCNT_dec(rx);
5240         return(NULL);
5241     }
5242     /* XXXX To minimize changes to RE engine we always allocate
5243        3-units-long substrs field. */
5244     Newx(r->substrs, 1, struct reg_substr_data);
5245     if (RExC_recurse_count) {
5246         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5247         SAVEFREEPV(RExC_recurse);
5248     }
5249
5250 reStudy:
5251     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5252     Zero(r->substrs, 1, struct reg_substr_data);
5253
5254 #ifdef TRIE_STUDY_OPT
5255     if (!restudied) {
5256         StructCopy(&zero_scan_data, &data, scan_data_t);
5257         copyRExC_state = RExC_state;
5258     } else {
5259         U32 seen=RExC_seen;
5260         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5261
5262         RExC_state = copyRExC_state;
5263         if (seen & REG_TOP_LEVEL_BRANCHES)
5264             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5265         else
5266             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5267         if (data.last_found) {
5268             SvREFCNT_dec(data.longest_fixed);
5269             SvREFCNT_dec(data.longest_float);
5270             SvREFCNT_dec(data.last_found);
5271         }
5272         StructCopy(&zero_scan_data, &data, scan_data_t);
5273     }
5274 #else
5275     StructCopy(&zero_scan_data, &data, scan_data_t);
5276 #endif
5277
5278     /* Dig out information for optimizations. */
5279     r->extflags = RExC_flags; /* was pm_op */
5280     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5281
5282     if (UTF)
5283         SvUTF8_on(rx);  /* Unicode in it? */
5284     ri->regstclass = NULL;
5285     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5286         r->intflags |= PREGf_NAUGHTY;
5287     scan = ri->program + 1;             /* First BRANCH. */
5288
5289     /* testing for BRANCH here tells us whether there is "must appear"
5290        data in the pattern. If there is then we can use it for optimisations */
5291     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5292         I32 fake;
5293         STRLEN longest_float_length, longest_fixed_length;
5294         struct regnode_charclass_class ch_class; /* pointed to by data */
5295         int stclass_flag;
5296         I32 last_close = 0; /* pointed to by data */
5297         regnode *first= scan;
5298         regnode *first_next= regnext(first);
5299         /*
5300          * Skip introductions and multiplicators >= 1
5301          * so that we can extract the 'meat' of the pattern that must
5302          * match in the large if() sequence following.
5303          * NOTE that EXACT is NOT covered here, as it is normally
5304          * picked up by the optimiser separately.
5305          *
5306          * This is unfortunate as the optimiser isnt handling lookahead
5307          * properly currently.
5308          *
5309          */
5310         while ((OP(first) == OPEN && (sawopen = 1)) ||
5311                /* An OR of *one* alternative - should not happen now. */
5312             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5313             /* for now we can't handle lookbehind IFMATCH*/
5314             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5315             (OP(first) == PLUS) ||
5316             (OP(first) == MINMOD) ||
5317                /* An {n,m} with n>0 */
5318             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5319             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5320         {
5321                 /*
5322                  * the only op that could be a regnode is PLUS, all the rest
5323                  * will be regnode_1 or regnode_2.
5324                  *
5325                  */
5326                 if (OP(first) == PLUS)
5327                     sawplus = 1;
5328                 else
5329                     first += regarglen[OP(first)];
5330
5331                 first = NEXTOPER(first);
5332                 first_next= regnext(first);
5333         }
5334
5335         /* Starting-point info. */
5336       again:
5337         DEBUG_PEEP("first:",first,0);
5338         /* Ignore EXACT as we deal with it later. */
5339         if (PL_regkind[OP(first)] == EXACT) {
5340             if (OP(first) == EXACT)
5341                 NOOP;   /* Empty, get anchored substr later. */
5342             else
5343                 ri->regstclass = first;
5344         }
5345 #ifdef TRIE_STCLASS
5346         else if (PL_regkind[OP(first)] == TRIE &&
5347                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5348         {
5349             regnode *trie_op;
5350             /* this can happen only on restudy */
5351             if ( OP(first) == TRIE ) {
5352                 struct regnode_1 *trieop = (struct regnode_1 *)
5353                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
5354                 StructCopy(first,trieop,struct regnode_1);
5355                 trie_op=(regnode *)trieop;
5356             } else {
5357                 struct regnode_charclass *trieop = (struct regnode_charclass *)
5358                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5359                 StructCopy(first,trieop,struct regnode_charclass);
5360                 trie_op=(regnode *)trieop;
5361             }
5362             OP(trie_op)+=2;
5363             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5364             ri->regstclass = trie_op;
5365         }
5366 #endif
5367         else if (REGNODE_SIMPLE(OP(first)))
5368             ri->regstclass = first;
5369         else if (PL_regkind[OP(first)] == BOUND ||
5370                  PL_regkind[OP(first)] == NBOUND)
5371             ri->regstclass = first;
5372         else if (PL_regkind[OP(first)] == BOL) {
5373             r->extflags |= (OP(first) == MBOL
5374                            ? RXf_ANCH_MBOL
5375                            : (OP(first) == SBOL
5376                               ? RXf_ANCH_SBOL
5377                               : RXf_ANCH_BOL));
5378             first = NEXTOPER(first);
5379             goto again;
5380         }
5381         else if (OP(first) == GPOS) {
5382             r->extflags |= RXf_ANCH_GPOS;
5383             first = NEXTOPER(first);
5384             goto again;
5385         }
5386         else if ((!sawopen || !RExC_sawback) &&
5387             (OP(first) == STAR &&
5388             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5389             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5390         {
5391             /* turn .* into ^.* with an implied $*=1 */
5392             const int type =
5393                 (OP(NEXTOPER(first)) == REG_ANY)
5394                     ? RXf_ANCH_MBOL
5395                     : RXf_ANCH_SBOL;
5396             r->extflags |= type;
5397             r->intflags |= PREGf_IMPLICIT;
5398             first = NEXTOPER(first);
5399             goto again;
5400         }
5401         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5402             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5403             /* x+ must match at the 1st pos of run of x's */
5404             r->intflags |= PREGf_SKIP;
5405
5406         /* Scan is after the zeroth branch, first is atomic matcher. */
5407 #ifdef TRIE_STUDY_OPT
5408         DEBUG_PARSE_r(
5409             if (!restudied)
5410                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5411                               (IV)(first - scan + 1))
5412         );
5413 #else
5414         DEBUG_PARSE_r(
5415             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5416                 (IV)(first - scan + 1))
5417         );
5418 #endif
5419
5420
5421         /*
5422         * If there's something expensive in the r.e., find the
5423         * longest literal string that must appear and make it the
5424         * regmust.  Resolve ties in favor of later strings, since
5425         * the regstart check works with the beginning of the r.e.
5426         * and avoiding duplication strengthens checking.  Not a
5427         * strong reason, but sufficient in the absence of others.
5428         * [Now we resolve ties in favor of the earlier string if
5429         * it happens that c_offset_min has been invalidated, since the
5430         * earlier string may buy us something the later one won't.]
5431         */
5432
5433         data.longest_fixed = newSVpvs("");
5434         data.longest_float = newSVpvs("");
5435         data.last_found = newSVpvs("");
5436         data.longest = &(data.longest_fixed);
5437         first = scan;
5438         if (!ri->regstclass) {
5439             cl_init(pRExC_state, &ch_class);
5440             data.start_class = &ch_class;
5441             stclass_flag = SCF_DO_STCLASS_AND;
5442         } else                          /* XXXX Check for BOUND? */
5443             stclass_flag = 0;
5444         data.last_closep = &last_close;
5445
5446         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5447             &data, -1, NULL, NULL,
5448             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5449
5450
5451         CHECK_RESTUDY_GOTO;
5452
5453
5454         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5455              && data.last_start_min == 0 && data.last_end > 0
5456              && !RExC_seen_zerolen
5457              && !(RExC_seen & REG_SEEN_VERBARG)
5458              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5459             r->extflags |= RXf_CHECK_ALL;
5460         scan_commit(pRExC_state, &data,&minlen,0);
5461         SvREFCNT_dec(data.last_found);
5462
5463         /* Note that code very similar to this but for anchored string
5464            follows immediately below, changes may need to be made to both.
5465            Be careful.
5466          */
5467         longest_float_length = CHR_SVLEN(data.longest_float);
5468         if (longest_float_length
5469             || (data.flags & SF_FL_BEFORE_EOL
5470                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5471                     || (RExC_flags & RXf_PMf_MULTILINE))))
5472         {
5473             I32 t,ml;
5474
5475             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5476             if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5477                 || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5478                     && data.offset_fixed == data.offset_float_min
5479                     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5480                     goto remove_float;          /* As in (a)+. */
5481
5482             /* copy the information about the longest float from the reg_scan_data
5483                over to the program. */
5484             if (SvUTF8(data.longest_float)) {
5485                 r->float_utf8 = data.longest_float;
5486                 r->float_substr = NULL;
5487             } else {
5488                 r->float_substr = data.longest_float;
5489                 r->float_utf8 = NULL;
5490             }
5491             /* float_end_shift is how many chars that must be matched that
5492                follow this item. We calculate it ahead of time as once the
5493                lookbehind offset is added in we lose the ability to correctly
5494                calculate it.*/
5495             ml = data.minlen_float ? *(data.minlen_float)
5496                                    : (I32)longest_float_length;
5497             r->float_end_shift = ml - data.offset_float_min
5498                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5499                 + data.lookbehind_float;
5500             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5501             r->float_max_offset = data.offset_float_max;
5502             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5503                 r->float_max_offset -= data.lookbehind_float;
5504
5505             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5506                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5507                            || (RExC_flags & RXf_PMf_MULTILINE)));
5508             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5509         }
5510         else {
5511           remove_float:
5512             r->float_substr = r->float_utf8 = NULL;
5513             SvREFCNT_dec(data.longest_float);
5514             longest_float_length = 0;
5515         }
5516
5517         /* Note that code very similar to this but for floating string
5518            is immediately above, changes may need to be made to both.
5519            Be careful.
5520          */
5521         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5522
5523         /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5524         if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5525             && (longest_fixed_length
5526                 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5527                     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5528                         || (RExC_flags & RXf_PMf_MULTILINE)))) )
5529         {
5530             I32 t,ml;
5531
5532             /* copy the information about the longest fixed
5533                from the reg_scan_data over to the program. */
5534             if (SvUTF8(data.longest_fixed)) {
5535                 r->anchored_utf8 = data.longest_fixed;
5536                 r->anchored_substr = NULL;
5537             } else {
5538                 r->anchored_substr = data.longest_fixed;
5539                 r->anchored_utf8 = NULL;
5540             }
5541             /* fixed_end_shift is how many chars that must be matched that
5542                follow this item. We calculate it ahead of time as once the
5543                lookbehind offset is added in we lose the ability to correctly
5544                calculate it.*/
5545             ml = data.minlen_fixed ? *(data.minlen_fixed)
5546                                    : (I32)longest_fixed_length;
5547             r->anchored_end_shift = ml - data.offset_fixed
5548                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5549                 + data.lookbehind_fixed;
5550             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5551
5552             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5553                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5554                      || (RExC_flags & RXf_PMf_MULTILINE)));
5555             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5556         }
5557         else {
5558             r->anchored_substr = r->anchored_utf8 = NULL;
5559             SvREFCNT_dec(data.longest_fixed);
5560             longest_fixed_length = 0;
5561         }
5562         if (ri->regstclass
5563             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5564             ri->regstclass = NULL;
5565
5566         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5567             && stclass_flag
5568             && !(data.start_class->flags & ANYOF_EOS)
5569             && !cl_is_anything(data.start_class))
5570         {
5571             const U32 n = add_data(pRExC_state, 1, "f");
5572             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5573
5574             Newx(RExC_rxi->data->data[n], 1,
5575                 struct regnode_charclass_class);
5576             StructCopy(data.start_class,
5577                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5578                        struct regnode_charclass_class);
5579             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5580             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5581             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5582                       regprop(r, sv, (regnode*)data.start_class);
5583                       PerlIO_printf(Perl_debug_log,
5584                                     "synthetic stclass \"%s\".\n",
5585                                     SvPVX_const(sv));});
5586         }
5587
5588         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5589         if (longest_fixed_length > longest_float_length) {
5590             r->check_end_shift = r->anchored_end_shift;
5591             r->check_substr = r->anchored_substr;
5592             r->check_utf8 = r->anchored_utf8;
5593             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5594             if (r->extflags & RXf_ANCH_SINGLE)
5595                 r->extflags |= RXf_NOSCAN;
5596         }
5597         else {
5598             r->check_end_shift = r->float_end_shift;
5599             r->check_substr = r->float_substr;
5600             r->check_utf8 = r->float_utf8;
5601             r->check_offset_min = r->float_min_offset;
5602             r->check_offset_max = r->float_max_offset;
5603         }
5604         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5605            This should be changed ASAP!  */
5606         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5607             r->extflags |= RXf_USE_INTUIT;
5608             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5609                 r->extflags |= RXf_INTUIT_TAIL;
5610         }
5611         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5612         if ( (STRLEN)minlen < longest_float_length )
5613             minlen= longest_float_length;
5614         if ( (STRLEN)minlen < longest_fixed_length )
5615             minlen= longest_fixed_length;
5616         */
5617     }
5618     else {
5619         /* Several toplevels. Best we can is to set minlen. */
5620         I32 fake;
5621         struct regnode_charclass_class ch_class;
5622         I32 last_close = 0;
5623
5624         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5625
5626         scan = ri->program + 1;
5627         cl_init(pRExC_state, &ch_class);
5628         data.start_class = &ch_class;
5629         data.last_closep = &last_close;
5630
5631
5632         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5633             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5634
5635         CHECK_RESTUDY_GOTO;
5636
5637         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5638                 = r->float_substr = r->float_utf8 = NULL;
5639
5640         if (!(data.start_class->flags & ANYOF_EOS)
5641             && !cl_is_anything(data.start_class))
5642         {
5643             const U32 n = add_data(pRExC_state, 1, "f");
5644             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5645
5646             Newx(RExC_rxi->data->data[n], 1,
5647                 struct regnode_charclass_class);
5648             StructCopy(data.start_class,
5649                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5650                        struct regnode_charclass_class);
5651             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5652             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5653             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5654                       regprop(r, sv, (regnode*)data.start_class);
5655                       PerlIO_printf(Perl_debug_log,
5656                                     "synthetic stclass \"%s\".\n",
5657                                     SvPVX_const(sv));});
5658         }
5659     }
5660
5661     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5662        the "real" pattern. */
5663     DEBUG_OPTIMISE_r({
5664         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5665                       (IV)minlen, (IV)r->minlen);
5666     });
5667     r->minlenret = minlen;
5668     if (r->minlen < minlen)
5669         r->minlen = minlen;
5670
5671     if (RExC_seen & REG_SEEN_GPOS)
5672         r->extflags |= RXf_GPOS_SEEN;
5673     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5674         r->extflags |= RXf_LOOKBEHIND_SEEN;
5675     if (RExC_seen & REG_SEEN_EVAL)
5676         r->extflags |= RXf_EVAL_SEEN;
5677     if (RExC_seen & REG_SEEN_CANY)
5678         r->extflags |= RXf_CANY_SEEN;
5679     if (RExC_seen & REG_SEEN_VERBARG)
5680         r->intflags |= PREGf_VERBARG_SEEN;
5681     if (RExC_seen & REG_SEEN_CUTGROUP)
5682         r->intflags |= PREGf_CUTGROUP_SEEN;
5683     if (RExC_paren_names)
5684         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5685     else
5686         RXp_PAREN_NAMES(r) = NULL;
5687
5688 #ifdef STUPID_PATTERN_CHECKS
5689     if (RX_PRELEN(rx) == 0)
5690         r->extflags |= RXf_NULL;
5691     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5692         /* XXX: this should happen BEFORE we compile */
5693         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5694     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5695         r->extflags |= RXf_WHITE;
5696     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5697         r->extflags |= RXf_START_ONLY;
5698 #else
5699     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5700             /* XXX: this should happen BEFORE we compile */
5701             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5702     else {
5703         regnode *first = ri->program + 1;
5704         U8 fop = OP(first);
5705
5706         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5707             r->extflags |= RXf_NULL;
5708         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5709             r->extflags |= RXf_START_ONLY;
5710         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5711                              && OP(regnext(first)) == END)
5712             r->extflags |= RXf_WHITE;
5713     }
5714 #endif
5715 #ifdef DEBUGGING
5716     if (RExC_paren_names) {
5717         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5718         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5719     } else
5720 #endif
5721         ri->name_list_idx = 0;
5722
5723     if (RExC_recurse_count) {
5724         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5725             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5726             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5727         }
5728     }
5729     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5730     /* assume we don't need to swap parens around before we match */
5731
5732     DEBUG_DUMP_r({
5733         PerlIO_printf(Perl_debug_log,"Final program:\n");
5734         regdump(r);
5735     });
5736 #ifdef RE_TRACK_PATTERN_OFFSETS
5737     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5738         const U32 len = ri->u.offsets[0];
5739         U32 i;
5740         GET_RE_DEBUG_FLAGS_DECL;
5741         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5742         for (i = 1; i <= len; i++) {
5743             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5744                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5745                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5746             }
5747         PerlIO_printf(Perl_debug_log, "\n");
5748     });
5749 #endif
5750     return rx;
5751 }
5752
5753 #undef RE_ENGINE_PTR
5754
5755
5756 SV*
5757 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5758                     const U32 flags)
5759 {
5760     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5761
5762     PERL_UNUSED_ARG(value);
5763
5764     if (flags & RXapif_FETCH) {
5765         return reg_named_buff_fetch(rx, key, flags);
5766     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5767         Perl_croak_no_modify(aTHX);
5768         return NULL;
5769     } else if (flags & RXapif_EXISTS) {
5770         return reg_named_buff_exists(rx, key, flags)
5771             ? &PL_sv_yes
5772             : &PL_sv_no;
5773     } else if (flags & RXapif_REGNAMES) {
5774         return reg_named_buff_all(rx, flags);
5775     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5776         return reg_named_buff_scalar(rx, flags);
5777     } else {
5778         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5779         return NULL;
5780     }
5781 }
5782
5783 SV*
5784 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5785                          const U32 flags)
5786 {
5787     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5788     PERL_UNUSED_ARG(lastkey);
5789
5790     if (flags & RXapif_FIRSTKEY)
5791         return reg_named_buff_firstkey(rx, flags);
5792     else if (flags & RXapif_NEXTKEY)
5793         return reg_named_buff_nextkey(rx, flags);
5794     else {
5795         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5796         return NULL;
5797     }
5798 }
5799
5800 SV*
5801 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5802                           const U32 flags)
5803 {
5804     AV *retarray = NULL;
5805     SV *ret;
5806     struct regexp *const rx = (struct regexp *)SvANY(r);
5807
5808     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5809
5810     if (flags & RXapif_ALL)
5811         retarray=newAV();
5812
5813     if (rx && RXp_PAREN_NAMES(rx)) {
5814         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5815         if (he_str) {
5816             IV i;
5817             SV* sv_dat=HeVAL(he_str);
5818             I32 *nums=(I32*)SvPVX(sv_dat);
5819             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5820                 if ((I32)(rx->nparens) >= nums[i]
5821                     && rx->offs[nums[i]].start != -1
5822                     && rx->offs[nums[i]].end != -1)
5823                 {
5824                     ret = newSVpvs("");
5825                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5826                     if (!retarray)
5827                         return ret;
5828                 } else {
5829                     if (retarray)
5830                         ret = newSVsv(&PL_sv_undef);
5831                 }
5832                 if (retarray)
5833                     av_push(retarray, ret);
5834             }
5835             if (retarray)
5836                 return newRV_noinc(MUTABLE_SV(retarray));
5837         }
5838     }
5839     return NULL;
5840 }
5841
5842 bool
5843 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5844                            const U32 flags)
5845 {
5846     struct regexp *const rx = (struct regexp *)SvANY(r);
5847
5848     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5849
5850     if (rx && RXp_PAREN_NAMES(rx)) {
5851         if (flags & RXapif_ALL) {
5852             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5853         } else {
5854             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5855             if (sv) {
5856                 SvREFCNT_dec(sv);
5857                 return TRUE;
5858             } else {
5859                 return FALSE;
5860             }
5861         }
5862     } else {
5863         return FALSE;
5864     }
5865 }
5866
5867 SV*
5868 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5869 {
5870     struct regexp *const rx = (struct regexp *)SvANY(r);
5871
5872     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5873
5874     if ( rx && RXp_PAREN_NAMES(rx) ) {
5875         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5876
5877         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5878     } else {
5879         return FALSE;
5880     }
5881 }
5882
5883 SV*
5884 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5885 {
5886     struct regexp *const rx = (struct regexp *)SvANY(r);
5887     GET_RE_DEBUG_FLAGS_DECL;
5888
5889     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5890
5891     if (rx && RXp_PAREN_NAMES(rx)) {
5892         HV *hv = RXp_PAREN_NAMES(rx);
5893         HE *temphe;
5894         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5895             IV i;
5896             IV parno = 0;
5897             SV* sv_dat = HeVAL(temphe);
5898             I32 *nums = (I32*)SvPVX(sv_dat);
5899             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5900                 if ((I32)(rx->lastparen) >= nums[i] &&
5901                     rx->offs[nums[i]].start != -1 &&
5902                     rx->offs[nums[i]].end != -1)
5903                 {
5904                     parno = nums[i];
5905                     break;
5906                 }
5907             }
5908             if (parno || flags & RXapif_ALL) {
5909                 return newSVhek(HeKEY_hek(temphe));
5910             }
5911         }
5912     }
5913     return NULL;
5914 }
5915
5916 SV*
5917 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5918 {
5919     SV *ret;
5920     AV *av;
5921     I32 length;
5922     struct regexp *const rx = (struct regexp *)SvANY(r);
5923
5924     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5925
5926     if (rx && RXp_PAREN_NAMES(rx)) {
5927         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5928             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5929         } else if (flags & RXapif_ONE) {
5930             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5931             av = MUTABLE_AV(SvRV(ret));
5932             length = av_len(av);
5933             SvREFCNT_dec(ret);
5934             return newSViv(length + 1);
5935         } else {
5936             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5937             return NULL;
5938         }
5939     }
5940     return &PL_sv_undef;
5941 }
5942
5943 SV*
5944 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5945 {
5946     struct regexp *const rx = (struct regexp *)SvANY(r);
5947     AV *av = newAV();
5948
5949     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5950
5951     if (rx && RXp_PAREN_NAMES(rx)) {
5952         HV *hv= RXp_PAREN_NAMES(rx);
5953         HE *temphe;
5954         (void)hv_iterinit(hv);
5955         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5956             IV i;
5957             IV parno = 0;
5958             SV* sv_dat = HeVAL(temphe);
5959             I32 *nums = (I32*)SvPVX(sv_dat);
5960             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5961                 if ((I32)(rx->lastparen) >= nums[i] &&
5962                     rx->offs[nums[i]].start != -1 &&
5963                     rx->offs[nums[i]].end != -1)
5964                 {
5965                     parno = nums[i];
5966                     break;
5967                 }
5968             }
5969             if (parno || flags & RXapif_ALL) {
5970                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5971             }
5972         }
5973     }
5974
5975     return newRV_noinc(MUTABLE_SV(av));
5976 }
5977
5978 void
5979 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5980                              SV * const sv)
5981 {
5982     struct regexp *const rx = (struct regexp *)SvANY(r);
5983     char *s = NULL;
5984     I32 i = 0;
5985     I32 s1, t1;
5986
5987     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5988
5989     if (!rx->subbeg) {
5990         sv_setsv(sv,&PL_sv_undef);
5991         return;
5992     }
5993     else
5994     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5995         /* $` */
5996         i = rx->offs[0].start;
5997         s = rx->subbeg;
5998     }
5999     else
6000     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6001         /* $' */
6002         s = rx->subbeg + rx->offs[0].end;
6003         i = rx->sublen - rx->offs[0].end;
6004     }
6005     else
6006     if ( 0 <= paren && paren <= (I32)rx->nparens &&
6007         (s1 = rx->offs[paren].start) != -1 &&
6008         (t1 = rx->offs[paren].end) != -1)
6009     {
6010         /* $& $1 ... */
6011         i = t1 - s1;
6012         s = rx->subbeg + s1;
6013     } else {
6014         sv_setsv(sv,&PL_sv_undef);
6015         return;
6016     }
6017     assert(rx->sublen >= (s - rx->subbeg) + i );
6018     if (i >= 0) {
6019         const int oldtainted = PL_tainted;
6020         TAINT_NOT;
6021         sv_setpvn(sv, s, i);
6022         PL_tainted = oldtainted;
6023         if ( (rx->extflags & RXf_CANY_SEEN)
6024             ? (RXp_MATCH_UTF8(rx)
6025                         && (!i || is_utf8_string((U8*)s, i)))
6026             : (RXp_MATCH_UTF8(rx)) )
6027         {
6028             SvUTF8_on(sv);
6029         }
6030         else
6031             SvUTF8_off(sv);
6032         if (PL_tainting) {
6033             if (RXp_MATCH_TAINTED(rx)) {
6034                 if (SvTYPE(sv) >= SVt_PVMG) {
6035                     MAGIC* const mg = SvMAGIC(sv);
6036                     MAGIC* mgt;
6037                     PL_tainted = 1;
6038                     SvMAGIC_set(sv, mg->mg_moremagic);
6039                     SvTAINT(sv);
6040                     if ((mgt = SvMAGIC(sv))) {
6041                         mg->mg_moremagic = mgt;
6042                         SvMAGIC_set(sv, mg);
6043                     }
6044                 } else {
6045                     PL_tainted = 1;
6046                     SvTAINT(sv);
6047                 }
6048             } else
6049                 SvTAINTED_off(sv);
6050         }
6051     } else {
6052         sv_setsv(sv,&PL_sv_undef);
6053         return;
6054     }
6055 }
6056
6057 void
6058 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6059                                                          SV const * const value)
6060 {
6061     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6062
6063     PERL_UNUSED_ARG(rx);
6064     PERL_UNUSED_ARG(paren);
6065     PERL_UNUSED_ARG(value);
6066
6067     if (!PL_localizing)
6068         Perl_croak_no_modify(aTHX);
6069 }
6070
6071 I32
6072 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6073                               const I32 paren)
6074 {
6075     struct regexp *const rx = (struct regexp *)SvANY(r);
6076     I32 i;
6077     I32 s1, t1;
6078
6079     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6080
6081     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6082         switch (paren) {
6083       /* $` / ${^PREMATCH} */
6084       case RX_BUFF_IDX_PREMATCH:
6085         if (rx->offs[0].start != -1) {
6086                         i = rx->offs[0].start;
6087                         if (i > 0) {
6088                                 s1 = 0;
6089                                 t1 = i;
6090                                 goto getlen;
6091                         }
6092             }
6093         return 0;
6094       /* $' / ${^POSTMATCH} */
6095       case RX_BUFF_IDX_POSTMATCH:
6096             if (rx->offs[0].end != -1) {
6097                         i = rx->sublen - rx->offs[0].end;
6098                         if (i > 0) {
6099                                 s1 = rx->offs[0].end;
6100                                 t1 = rx->sublen;
6101                                 goto getlen;
6102                         }
6103             }
6104         return 0;
6105       /* $& / ${^MATCH}, $1, $2, ... */
6106       default:
6107             if (paren <= (I32)rx->nparens &&
6108             (s1 = rx->offs[paren].start) != -1 &&
6109             (t1 = rx->offs[paren].end) != -1)
6110             {
6111             i = t1 - s1;
6112             goto getlen;
6113         } else {
6114             if (ckWARN(WARN_UNINITIALIZED))
6115                 report_uninit((const SV *)sv);
6116             return 0;
6117         }
6118     }
6119   getlen:
6120     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6121         const char * const s = rx->subbeg + s1;
6122         const U8 *ep;
6123         STRLEN el;
6124
6125         i = t1 - s1;
6126         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6127                         i = el;
6128     }
6129     return i;
6130 }
6131
6132 SV*
6133 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6134 {
6135     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6136         PERL_UNUSED_ARG(rx);
6137         if (0)
6138             return NULL;
6139         else
6140             return newSVpvs("Regexp");
6141 }
6142
6143 /* Scans the name of a named buffer from the pattern.
6144  * If flags is REG_RSN_RETURN_NULL returns null.
6145  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6146  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6147  * to the parsed name as looked up in the RExC_paren_names hash.
6148  * If there is an error throws a vFAIL().. type exception.
6149  */
6150
6151 #define REG_RSN_RETURN_NULL    0
6152 #define REG_RSN_RETURN_NAME    1
6153 #define REG_RSN_RETURN_DATA    2
6154
6155 STATIC SV*
6156 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6157 {
6158     char *name_start = RExC_parse;
6159
6160     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6161
6162     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6163          /* skip IDFIRST by using do...while */
6164         if (UTF)
6165             do {
6166                 RExC_parse += UTF8SKIP(RExC_parse);
6167             } while (isALNUM_utf8((U8*)RExC_parse));
6168         else
6169             do {
6170                 RExC_parse++;
6171             } while (isALNUM(*RExC_parse));
6172     }
6173
6174     if ( flags ) {
6175         SV* sv_name
6176             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6177                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6178         if ( flags == REG_RSN_RETURN_NAME)
6179             return sv_name;
6180         else if (flags==REG_RSN_RETURN_DATA) {
6181             HE *he_str = NULL;
6182             SV *sv_dat = NULL;
6183             if ( ! sv_name )      /* should not happen*/
6184                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6185             if (RExC_paren_names)
6186                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6187             if ( he_str )
6188                 sv_dat = HeVAL(he_str);
6189             if ( ! sv_dat )
6190                 vFAIL("Reference to nonexistent named group");
6191             return sv_dat;
6192         }
6193         else {
6194             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6195                        (unsigned long) flags);
6196         }
6197         /* NOT REACHED */
6198     }
6199     return NULL;
6200 }
6201
6202 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6203     int rem=(int)(RExC_end - RExC_parse);                       \
6204     int cut;                                                    \
6205     int num;                                                    \
6206     int iscut=0;                                                \
6207     if (rem>10) {                                               \
6208         rem=10;                                                 \
6209         iscut=1;                                                \
6210     }                                                           \
6211     cut=10-rem;                                                 \
6212     if (RExC_lastparse!=RExC_parse)                             \
6213         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6214             rem, RExC_parse,                                    \
6215             cut + 4,                                            \
6216             iscut ? "..." : "<"                                 \
6217         );                                                      \
6218     else                                                        \
6219         PerlIO_printf(Perl_debug_log,"%16s","");                \
6220                                                                 \
6221     if (SIZE_ONLY)                                              \
6222        num = RExC_size + 1;                                     \
6223     else                                                        \
6224        num=REG_NODE_NUM(RExC_emit);                             \
6225     if (RExC_lastnum!=num)                                      \
6226        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6227     else                                                        \
6228        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6229     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6230         (int)((depth*2)), "",                                   \
6231         (funcname)                                              \
6232     );                                                          \
6233     RExC_lastnum=num;                                           \
6234     RExC_lastparse=RExC_parse;                                  \
6235 })
6236
6237
6238
6239 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6240     DEBUG_PARSE_MSG((funcname));                            \
6241     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6242 })
6243 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6244     DEBUG_PARSE_MSG((funcname));                            \
6245     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6246 })
6247
6248 /* This section of code defines the inversion list object and its methods.  The
6249  * interfaces are highly subject to change, so as much as possible is static to
6250  * this file.  An inversion list is here implemented as a malloc'd C UV array
6251  * with some added info that is placed as UVs at the beginning in a header
6252  * portion.  An inversion list for Unicode is an array of code points, sorted
6253  * by ordinal number.  The zeroth element is the first code point in the list.
6254  * The 1th element is the first element beyond that not in the list.  In other
6255  * words, the first range is
6256  *  invlist[0]..(invlist[1]-1)
6257  * The other ranges follow.  Thus every element whose index is divisible by two
6258  * marks the beginning of a range that is in the list, and every element not
6259  * divisible by two marks the beginning of a range not in the list.  A single
6260  * element inversion list that contains the single code point N generally
6261  * consists of two elements
6262  *  invlist[0] == N
6263  *  invlist[1] == N+1
6264  * (The exception is when N is the highest representable value on the
6265  * machine, in which case the list containing just it would be a single
6266  * element, itself.  By extension, if the last range in the list extends to
6267  * infinity, then the first element of that range will be in the inversion list
6268  * at a position that is divisible by two, and is the final element in the
6269  * list.)
6270  * Taking the complement (inverting) an inversion list is quite simple, if the
6271  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6272  * This implementation reserves an element at the beginning of each inversion list
6273  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6274  * beginning of the list is either that element if 0, or the next one if 1.
6275  *
6276  * More about inversion lists can be found in "Unicode Demystified"
6277  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6278  * More will be coming when functionality is added later.
6279  *
6280  * The inversion list data structure is currently implemented as an SV pointing
6281  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6282  * array of UV whose memory management is automatically handled by the existing
6283  * facilities for SV's.
6284  *
6285  * Some of the methods should always be private to the implementation, and some
6286  * should eventually be made public */
6287
6288 #define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
6289 #define INVLIST_ITER_OFFSET 1   /* Current iteration position */
6290
6291 /* This is a combination of a version and data structure type, so that one
6292  * being passed in can be validated to be an inversion list of the correct
6293  * vintage.  When the structure of the header is changed, a new random number
6294  * in the range 2**31-1 should be generated and the new() method changed to
6295  * insert that at this location.  Then, if an auxiliary program doesn't change
6296  * correspondingly, it will be discovered immediately */
6297 #define INVLIST_VERSION_ID_OFFSET 2
6298 #define INVLIST_VERSION_ID 1064334010
6299
6300 /* For safety, when adding new elements, remember to #undef them at the end of
6301  * the inversion list code section */
6302
6303 #define INVLIST_ZERO_OFFSET 3   /* 0 or 1; must be last element in header */
6304 /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
6305  * contains the code point U+00000, and begins here.  If 1, the inversion list
6306  * doesn't contain U+0000, and it begins at the next UV in the array.
6307  * Inverting an inversion list consists of adding or removing the 0 at the
6308  * beginning of it.  By reserving a space for that 0, inversion can be made
6309  * very fast */
6310
6311 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6312
6313 /* Internally things are UVs */
6314 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6315 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6316
6317 #define INVLIST_INITIAL_LEN 10
6318
6319 PERL_STATIC_INLINE UV*
6320 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6321 {
6322     /* Returns a pointer to the first element in the inversion list's array.
6323      * This is called upon initialization of an inversion list.  Where the
6324      * array begins depends on whether the list has the code point U+0000
6325      * in it or not.  The other parameter tells it whether the code that
6326      * follows this call is about to put a 0 in the inversion list or not.
6327      * The first element is either the element with 0, if 0, or the next one,
6328      * if 1 */
6329
6330     UV* zero = get_invlist_zero_addr(invlist);
6331
6332     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6333
6334     /* Must be empty */
6335     assert(! *get_invlist_len_addr(invlist));
6336
6337     /* 1^1 = 0; 1^0 = 1 */
6338     *zero = 1 ^ will_have_0;
6339     return zero + *zero;
6340 }
6341
6342 PERL_STATIC_INLINE UV*
6343 S_invlist_array(pTHX_ SV* const invlist)
6344 {
6345     /* Returns the pointer to the inversion list's array.  Every time the
6346      * length changes, this needs to be called in case malloc or realloc moved
6347      * it */
6348
6349     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6350
6351     /* Must not be empty.  If these fail, you probably didn't check for <len>
6352      * being non-zero before trying to get the array */
6353     assert(*get_invlist_len_addr(invlist));
6354     assert(*get_invlist_zero_addr(invlist) == 0
6355            || *get_invlist_zero_addr(invlist) == 1);
6356
6357     /* The array begins either at the element reserved for zero if the
6358      * list contains 0 (that element will be set to 0), or otherwise the next
6359      * element (in which case the reserved element will be set to 1). */
6360     return (UV *) (get_invlist_zero_addr(invlist)
6361                    + *get_invlist_zero_addr(invlist));
6362 }
6363
6364 PERL_STATIC_INLINE UV*
6365 S_get_invlist_len_addr(pTHX_ SV* invlist)
6366 {
6367     /* Return the address of the UV that contains the current number
6368      * of used elements in the inversion list */
6369
6370     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6371
6372     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6373 }
6374
6375 PERL_STATIC_INLINE UV
6376 S_invlist_len(pTHX_ SV* const invlist)
6377 {
6378     /* Returns the current number of elements stored in the inversion list's
6379      * array */
6380
6381     PERL_ARGS_ASSERT_INVLIST_LEN;
6382
6383     return *get_invlist_len_addr(invlist);
6384 }
6385
6386 PERL_STATIC_INLINE void
6387 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6388 {
6389     /* Sets the current number of elements stored in the inversion list */
6390
6391     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6392
6393     *get_invlist_len_addr(invlist) = len;
6394
6395     assert(len <= SvLEN(invlist));
6396
6397     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6398     /* If the list contains U+0000, that element is part of the header,
6399      * and should not be counted as part of the array.  It will contain
6400      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
6401      * subtract:
6402      *  SvCUR_set(invlist,
6403      *            TO_INTERNAL_SIZE(len
6404      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
6405      * But, this is only valid if len is not 0.  The consequences of not doing
6406      * this is that the memory allocation code may think that 1 more UV is
6407      * being used than actually is, and so might do an unnecessary grow.  That
6408      * seems worth not bothering to make this the precise amount.
6409      *
6410      * Note that when inverting, SvCUR shouldn't change */
6411 }
6412
6413 PERL_STATIC_INLINE UV
6414 S_invlist_max(pTHX_ SV* const invlist)
6415 {
6416     /* Returns the maximum number of elements storable in the inversion list's
6417      * array, without having to realloc() */
6418
6419     PERL_ARGS_ASSERT_INVLIST_MAX;
6420
6421     return FROM_INTERNAL_SIZE(SvLEN(invlist));
6422 }
6423
6424 PERL_STATIC_INLINE UV*
6425 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6426 {
6427     /* Return the address of the UV that is reserved to hold 0 if the inversion
6428      * list contains 0.  This has to be the last element of the heading, as the
6429      * list proper starts with either it if 0, or the next element if not.
6430      * (But we force it to contain either 0 or 1) */
6431
6432     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6433
6434     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6435 }
6436
6437 #ifndef PERL_IN_XSUB_RE
6438 SV*
6439 Perl__new_invlist(pTHX_ IV initial_size)
6440 {
6441
6442     /* Return a pointer to a newly constructed inversion list, with enough
6443      * space to store 'initial_size' elements.  If that number is negative, a
6444      * system default is used instead */
6445
6446     SV* new_list;
6447
6448     if (initial_size < 0) {
6449         initial_size = INVLIST_INITIAL_LEN;
6450     }
6451
6452     /* Allocate the initial space */
6453     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6454     invlist_set_len(new_list, 0);
6455
6456     /* Force iterinit() to be used to get iteration to work */
6457     *get_invlist_iter_addr(new_list) = UV_MAX;
6458
6459     /* This should force a segfault if a method doesn't initialize this
6460      * properly */
6461     *get_invlist_zero_addr(new_list) = UV_MAX;
6462
6463     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6464 #if HEADER_LENGTH != 4
6465 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6466 #endif
6467
6468     return new_list;
6469 }
6470 #endif
6471
6472 STATIC SV*
6473 S__new_invlist_C_array(pTHX_ UV* list)
6474 {
6475     /* Return a pointer to a newly constructed inversion list, initialized to
6476      * point to <list>, which has to be in the exact correct inversion list
6477      * form, including internal fields.  Thus this is a dangerous routine that
6478      * should not be used in the wrong hands */
6479
6480     SV* invlist = newSV_type(SVt_PV);
6481
6482     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6483
6484     SvPV_set(invlist, (char *) list);
6485     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
6486                                shouldn't touch it */
6487     SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6488
6489     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6490         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6491     }
6492
6493     return invlist;
6494 }
6495
6496 STATIC void
6497 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6498 {
6499     /* Grow the maximum size of an inversion list */
6500
6501     PERL_ARGS_ASSERT_INVLIST_EXTEND;
6502
6503     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6504 }
6505
6506 PERL_STATIC_INLINE void
6507 S_invlist_trim(pTHX_ SV* const invlist)
6508 {
6509     PERL_ARGS_ASSERT_INVLIST_TRIM;
6510
6511     /* Change the length of the inversion list to how many entries it currently
6512      * has */
6513
6514     SvPV_shrink_to_cur((SV *) invlist);
6515 }
6516
6517 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6518  * etc */
6519 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6520 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6521
6522 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6523
6524 STATIC void
6525 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6526 {
6527    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6528     * the end of the inversion list.  The range must be above any existing
6529     * ones. */
6530
6531     UV* array;
6532     UV max = invlist_max(invlist);
6533     UV len = invlist_len(invlist);
6534
6535     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6536
6537     if (len == 0) { /* Empty lists must be initialized */
6538         array = _invlist_array_init(invlist, start == 0);
6539     }
6540     else {
6541         /* Here, the existing list is non-empty. The current max entry in the
6542          * list is generally the first value not in the set, except when the
6543          * set extends to the end of permissible values, in which case it is
6544          * the first entry in that final set, and so this call is an attempt to
6545          * append out-of-order */
6546
6547         UV final_element = len - 1;
6548         array = invlist_array(invlist);
6549         if (array[final_element] > start
6550             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6551         {
6552             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6553                        array[final_element], start,
6554                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6555         }
6556
6557         /* Here, it is a legal append.  If the new range begins with the first
6558          * value not in the set, it is extending the set, so the new first
6559          * value not in the set is one greater than the newly extended range.
6560          * */
6561         if (array[final_element] == start) {
6562             if (end != UV_MAX) {
6563                 array[final_element] = end + 1;
6564             }
6565             else {
6566                 /* But if the end is the maximum representable on the machine,
6567                  * just let the range that this would extend to have no end */
6568                 invlist_set_len(invlist, len - 1);
6569             }
6570             return;
6571         }
6572     }
6573
6574     /* Here the new range doesn't extend any existing set.  Add it */
6575
6576     len += 2;   /* Includes an element each for the start and end of range */
6577
6578     /* If overflows the existing space, extend, which may cause the array to be
6579      * moved */
6580     if (max < len) {
6581         invlist_extend(invlist, len);
6582         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
6583                                            failure in invlist_array() */
6584         array = invlist_array(invlist);
6585     }
6586     else {
6587         invlist_set_len(invlist, len);
6588     }
6589
6590     /* The next item on the list starts the range, the one after that is
6591      * one past the new range.  */
6592     array[len - 2] = start;
6593     if (end != UV_MAX) {
6594         array[len - 1] = end + 1;
6595     }
6596     else {
6597         /* But if the end is the maximum representable on the machine, just let
6598          * the range have no end */
6599         invlist_set_len(invlist, len - 1);
6600     }
6601 }
6602
6603 #ifndef PERL_IN_XSUB_RE
6604
6605 STATIC IV
6606 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6607 {
6608     /* Searches the inversion list for the entry that contains the input code
6609      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
6610      * return value is the index into the list's array of the range that
6611      * contains <cp> */
6612
6613     IV low = 0;
6614     IV high = invlist_len(invlist);
6615     const UV * const array = invlist_array(invlist);
6616
6617     PERL_ARGS_ASSERT_INVLIST_SEARCH;
6618
6619     /* If list is empty or the code point is before the first element, return
6620      * failure. */
6621     if (high == 0 || cp < array[0]) {
6622         return -1;
6623     }
6624
6625     /* Binary search.  What we are looking for is <i> such that
6626      *  array[i] <= cp < array[i+1]
6627      * The loop below converges on the i+1. */
6628     while (low < high) {
6629         IV mid = (low + high) / 2;
6630         if (array[mid] <= cp) {
6631             low = mid + 1;
6632
6633             /* We could do this extra test to exit the loop early.
6634             if (cp < array[low]) {
6635                 return mid;
6636             }
6637             */
6638         }
6639         else { /* cp < array[mid] */
6640             high = mid;
6641         }
6642     }
6643
6644     return high - 1;
6645 }
6646
6647 void
6648 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6649 {
6650     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6651      * but is used when the swash has an inversion list.  This makes this much
6652      * faster, as it uses a binary search instead of a linear one.  This is
6653      * intimately tied to that function, and perhaps should be in utf8.c,
6654      * except it is intimately tied to inversion lists as well.  It assumes
6655      * that <swatch> is all 0's on input */
6656
6657     UV current = start;
6658     const IV len = invlist_len(invlist);
6659     IV i;
6660     const UV * array;
6661
6662     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6663
6664     if (len == 0) { /* Empty inversion list */
6665         return;
6666     }
6667
6668     array = invlist_array(invlist);
6669
6670     /* Find which element it is */
6671     i = invlist_search(invlist, start);
6672
6673     /* We populate from <start> to <end> */
6674     while (current < end) {
6675         UV upper;
6676
6677         /* The inversion list gives the results for every possible code point
6678          * after the first one in the list.  Only those ranges whose index is
6679          * even are ones that the inversion list matches.  For the odd ones,
6680          * and if the initial code point is not in the list, we have to skip
6681          * forward to the next element */
6682         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6683             i++;
6684             if (i >= len) { /* Finished if beyond the end of the array */
6685                 return;
6686             }
6687             current = array[i];
6688             if (current >= end) {   /* Finished if beyond the end of what we
6689                                        are populating */
6690                 return;
6691             }
6692         }
6693         assert(current >= start);
6694
6695         /* The current range ends one below the next one, except don't go past
6696          * <end> */
6697         i++;
6698         upper = (i < len && array[i] < end) ? array[i] : end;
6699
6700         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
6701          * for each code point in it */
6702         for (; current < upper; current++) {
6703             const STRLEN offset = (STRLEN)(current - start);
6704             swatch[offset >> 3] |= 1 << (offset & 7);
6705         }
6706
6707         /* Quit if at the end of the list */
6708         if (i >= len) {
6709
6710             /* But first, have to deal with the highest possible code point on
6711              * the platform.  The previous code assumes that <end> is one
6712              * beyond where we want to populate, but that is impossible at the
6713              * platform's infinity, so have to handle it specially */
6714             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6715             {
6716                 const STRLEN offset = (STRLEN)(end - start);
6717                 swatch[offset >> 3] |= 1 << (offset & 7);
6718             }
6719             return;
6720         }
6721
6722         /* Advance to the next range, which will be for code points not in the
6723          * inversion list */
6724         current = array[i];
6725     }
6726
6727     return;
6728 }
6729
6730
6731 void
6732 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
6733 {
6734     /* Take the union of two inversion lists and point <output> to it.  *output
6735      * should be defined upon input, and if it points to one of the two lists,
6736      * the reference count to that list will be decremented.  The first list,
6737      * <a>, may be NULL, in which case a copy of the second list is returned.
6738      * If <complement_b> is TRUE, the union is taken of the complement
6739      * (inversion) of <b> instead of b itself.
6740      *
6741      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6742      * Richard Gillam, published by Addison-Wesley, and explained at some
6743      * length there.  The preface says to incorporate its examples into your
6744      * code at your own risk.
6745      *
6746      * The algorithm is like a merge sort.
6747      *
6748      * XXX A potential performance improvement is to keep track as we go along
6749      * if only one of the inputs contributes to the result, meaning the other
6750      * is a subset of that one.  In that case, we can skip the final copy and
6751      * return the larger of the input lists, but then outside code might need
6752      * to keep track of whether to free the input list or not */
6753
6754     UV* array_a;    /* a's array */
6755     UV* array_b;
6756     UV len_a;       /* length of a's array */
6757     UV len_b;
6758
6759     SV* u;                      /* the resulting union */
6760     UV* array_u;
6761     UV len_u;
6762
6763     UV i_a = 0;             /* current index into a's array */
6764     UV i_b = 0;
6765     UV i_u = 0;
6766
6767     /* running count, as explained in the algorithm source book; items are
6768      * stopped accumulating and are output when the count changes to/from 0.
6769      * The count is incremented when we start a range that's in the set, and
6770      * decremented when we start a range that's not in the set.  So its range
6771      * is 0 to 2.  Only when the count is zero is something not in the set.
6772      */
6773     UV count = 0;
6774
6775     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
6776     assert(a != b);
6777
6778     /* If either one is empty, the union is the other one */
6779     if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
6780         if (*output == a) {
6781             if (a != NULL) {
6782                 SvREFCNT_dec(a);
6783             }
6784         }
6785         if (*output != b) {
6786             *output = invlist_clone(b);
6787             if (complement_b) {
6788                 _invlist_invert(*output);
6789             }
6790         } /* else *output already = b; */
6791         return;
6792     }
6793     else if ((len_b = invlist_len(b)) == 0) {
6794         if (*output == b) {
6795             SvREFCNT_dec(b);
6796         }
6797
6798         /* The complement of an empty list is a list that has everything in it,
6799          * so the union with <a> includes everything too */
6800         if (complement_b) {
6801             if (a == *output) {
6802                 SvREFCNT_dec(a);
6803             }
6804             *output = _new_invlist(1);
6805             _append_range_to_invlist(*output, 0, UV_MAX);
6806         }
6807         else if (*output != a) {
6808             *output = invlist_clone(a);
6809         }
6810         /* else *output already = a; */
6811         return;
6812     }
6813
6814     /* Here both lists exist and are non-empty */
6815     array_a = invlist_array(a);
6816     array_b = invlist_array(b);
6817
6818     /* If are to take the union of 'a' with the complement of b, set it
6819      * up so are looking at b's complement. */
6820     if (complement_b) {
6821
6822         /* To complement, we invert: if the first element is 0, remove it.  To
6823          * do this, we just pretend the array starts one later, and clear the
6824          * flag as we don't have to do anything else later */
6825         if (array_b[0] == 0) {
6826             array_b++;
6827             len_b--;
6828             complement_b = FALSE;
6829         }
6830         else {
6831
6832             /* But if the first element is not zero, we unshift a 0 before the
6833              * array.  The data structure reserves a space for that 0 (which
6834              * should be a '1' right now), so physical shifting is unneeded,
6835              * but temporarily change that element to 0.  Before exiting the
6836              * routine, we must restore the element to '1' */
6837             array_b--;
6838             len_b++;
6839             array_b[0] = 0;
6840         }
6841     }
6842
6843     /* Size the union for the worst case: that the sets are completely
6844      * disjoint */
6845     u = _new_invlist(len_a + len_b);
6846
6847     /* Will contain U+0000 if either component does */
6848     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6849                                       || (len_b > 0 && array_b[0] == 0));
6850
6851     /* Go through each list item by item, stopping when exhausted one of
6852      * them */
6853     while (i_a < len_a && i_b < len_b) {
6854         UV cp;      /* The element to potentially add to the union's array */
6855         bool cp_in_set;   /* is it in the the input list's set or not */
6856
6857         /* We need to take one or the other of the two inputs for the union.
6858          * Since we are merging two sorted lists, we take the smaller of the
6859          * next items.  In case of a tie, we take the one that is in its set
6860          * first.  If we took one not in the set first, it would decrement the
6861          * count, possibly to 0 which would cause it to be output as ending the
6862          * range, and the next time through we would take the same number, and
6863          * output it again as beginning the next range.  By doing it the
6864          * opposite way, there is no possibility that the count will be
6865          * momentarily decremented to 0, and thus the two adjoining ranges will
6866          * be seamlessly merged.  (In a tie and both are in the set or both not
6867          * in the set, it doesn't matter which we take first.) */
6868         if (array_a[i_a] < array_b[i_b]
6869             || (array_a[i_a] == array_b[i_b]
6870                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6871         {
6872             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6873             cp= array_a[i_a++];
6874         }
6875         else {
6876             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6877             cp= array_b[i_b++];
6878         }
6879
6880         /* Here, have chosen which of the two inputs to look at.  Only output
6881          * if the running count changes to/from 0, which marks the
6882          * beginning/end of a range in that's in the set */
6883         if (cp_in_set) {
6884             if (count == 0) {
6885                 array_u[i_u++] = cp;
6886             }
6887             count++;
6888         }
6889         else {
6890             count--;
6891             if (count == 0) {
6892                 array_u[i_u++] = cp;
6893             }
6894         }
6895     }
6896
6897     /* Here, we are finished going through at least one of the lists, which
6898      * means there is something remaining in at most one.  We check if the list
6899      * that hasn't been exhausted is positioned such that we are in the middle
6900      * of a range in its set or not.  (i_a and i_b point to the element beyond
6901      * the one we care about.) If in the set, we decrement 'count'; if 0, there
6902      * is potentially more to output.
6903      * There are four cases:
6904      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6905      *     in the union is entirely from the non-exhausted set.
6906      *  2) Both were in their sets, count is 2.  Nothing further should
6907      *     be output, as everything that remains will be in the exhausted
6908      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6909      *     that
6910      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6911      *     Nothing further should be output because the union includes
6912      *     everything from the exhausted set.  Not decrementing ensures that.
6913      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6914      *     decrementing to 0 insures that we look at the remainder of the
6915      *     non-exhausted set */
6916     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6917         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6918     {
6919         count--;
6920     }
6921
6922     /* The final length is what we've output so far, plus what else is about to
6923      * be output.  (If 'count' is non-zero, then the input list we exhausted
6924      * has everything remaining up to the machine's limit in its set, and hence
6925      * in the union, so there will be no further output. */
6926     len_u = i_u;
6927     if (count == 0) {
6928         /* At most one of the subexpressions will be non-zero */
6929         len_u += (len_a - i_a) + (len_b - i_b);
6930     }
6931
6932     /* Set result to final length, which can change the pointer to array_u, so
6933      * re-find it */
6934     if (len_u != invlist_len(u)) {
6935         invlist_set_len(u, len_u);
6936         invlist_trim(u);
6937         array_u = invlist_array(u);
6938     }
6939
6940     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6941      * the other) ended with everything above it not in its set.  That means
6942      * that the remaining part of the union is precisely the same as the
6943      * non-exhausted list, so can just copy it unchanged.  (If both list were
6944      * exhausted at the same time, then the operations below will be both 0.)
6945      */
6946     if (count == 0) {
6947         IV copy_count; /* At most one will have a non-zero copy count */
6948         if ((copy_count = len_a - i_a) > 0) {
6949             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6950         }
6951         else if ((copy_count = len_b - i_b) > 0) {
6952             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6953         }
6954     }
6955
6956     /*  We may be removing a reference to one of the inputs */
6957     if (a == *output || b == *output) {
6958         SvREFCNT_dec(*output);
6959     }
6960
6961     /* If we've changed b, restore it */
6962     if (complement_b) {
6963         array_b[0] = 1;
6964     }
6965
6966     *output = u;
6967     return;
6968 }
6969
6970 void
6971 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
6972 {
6973     /* Take the intersection of two inversion lists and point <i> to it.  *i
6974      * should be defined upon input, and if it points to one of the two lists,
6975      * the reference count to that list will be decremented.
6976      * If <complement_b> is TRUE, the result will be the intersection of <a>
6977      * and the complement (or inversion) of <b> instead of <b> directly.
6978      *
6979      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6980      * Richard Gillam, published by Addison-Wesley, and explained at some
6981      * length there.  The preface says to incorporate its examples into your
6982      * code at your own risk.  In fact, it had bugs
6983      *
6984      * The algorithm is like a merge sort, and is essentially the same as the
6985      * union above
6986      */
6987
6988     UV* array_a;                /* a's array */
6989     UV* array_b;
6990     UV len_a;   /* length of a's array */
6991     UV len_b;
6992
6993     SV* r;                   /* the resulting intersection */
6994     UV* array_r;
6995     UV len_r;
6996
6997     UV i_a = 0;             /* current index into a's array */
6998     UV i_b = 0;
6999     UV i_r = 0;
7000
7001     /* running count, as explained in the algorithm source book; items are
7002      * stopped accumulating and are output when the count changes to/from 2.
7003      * The count is incremented when we start a range that's in the set, and
7004      * decremented when we start a range that's not in the set.  So its range
7005      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7006      */
7007     UV count = 0;
7008
7009     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7010     assert(a != b);
7011
7012     /* Special case if either one is empty */
7013     len_a = invlist_len(a);
7014     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7015
7016         if (len_a != 0 && complement_b) {
7017
7018             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7019              * be empty.  Here, also we are using 'b's complement, which hence
7020              * must be every possible code point.  Thus the intersection is
7021              * simply 'a'. */
7022             if (*i != a) {
7023                 *i = invlist_clone(a);
7024
7025                 if (*i == b) {
7026                     SvREFCNT_dec(b);
7027                 }
7028             }
7029             /* else *i is already 'a' */
7030             return;
7031         }
7032
7033         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7034          * intersection must be empty */
7035         if (*i == a) {
7036             SvREFCNT_dec(a);
7037         }
7038         else if (*i == b) {
7039             SvREFCNT_dec(b);
7040         }
7041         *i = _new_invlist(0);
7042         return;
7043     }
7044
7045     /* Here both lists exist and are non-empty */
7046     array_a = invlist_array(a);
7047     array_b = invlist_array(b);
7048
7049     /* If are to take the intersection of 'a' with the complement of b, set it
7050      * up so are looking at b's complement. */
7051     if (complement_b) {
7052
7053         /* To complement, we invert: if the first element is 0, remove it.  To
7054          * do this, we just pretend the array starts one later, and clear the
7055          * flag as we don't have to do anything else later */
7056         if (array_b[0] == 0) {
7057             array_b++;
7058             len_b--;
7059             complement_b = FALSE;
7060         }
7061         else {
7062
7063             /* But if the first element is not zero, we unshift a 0 before the
7064              * array.  The data structure reserves a space for that 0 (which
7065              * should be a '1' right now), so physical shifting is unneeded,
7066              * but temporarily change that element to 0.  Before exiting the
7067              * routine, we must restore the element to '1' */
7068             array_b--;
7069             len_b++;
7070             array_b[0] = 0;
7071         }
7072     }
7073
7074     /* Size the intersection for the worst case: that the intersection ends up
7075      * fragmenting everything to be completely disjoint */
7076     r= _new_invlist(len_a + len_b);
7077
7078     /* Will contain U+0000 iff both components do */
7079     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7080                                      && len_b > 0 && array_b[0] == 0);
7081
7082     /* Go through each list item by item, stopping when exhausted one of
7083      * them */
7084     while (i_a < len_a && i_b < len_b) {
7085         UV cp;      /* The element to potentially add to the intersection's
7086                        array */
7087         bool cp_in_set; /* Is it in the input list's set or not */
7088
7089         /* We need to take one or the other of the two inputs for the
7090          * intersection.  Since we are merging two sorted lists, we take the
7091          * smaller of the next items.  In case of a tie, we take the one that
7092          * is not in its set first (a difference from the union algorithm).  If
7093          * we took one in the set first, it would increment the count, possibly
7094          * to 2 which would cause it to be output as starting a range in the
7095          * intersection, and the next time through we would take that same
7096          * number, and output it again as ending the set.  By doing it the
7097          * opposite of this, there is no possibility that the count will be
7098          * momentarily incremented to 2.  (In a tie and both are in the set or
7099          * both not in the set, it doesn't matter which we take first.) */
7100         if (array_a[i_a] < array_b[i_b]
7101             || (array_a[i_a] == array_b[i_b]
7102                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7103         {
7104             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7105             cp= array_a[i_a++];
7106         }
7107         else {
7108             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7109             cp= array_b[i_b++];
7110         }
7111
7112         /* Here, have chosen which of the two inputs to look at.  Only output
7113          * if the running count changes to/from 2, which marks the
7114          * beginning/end of a range that's in the intersection */
7115         if (cp_in_set) {
7116             count++;
7117             if (count == 2) {
7118                 array_r[i_r++] = cp;
7119             }
7120         }
7121         else {
7122             if (count == 2) {
7123                 array_r[i_r++] = cp;
7124             }
7125             count--;
7126         }
7127     }
7128
7129     /* Here, we are finished going through at least one of the lists, which
7130      * means there is something remaining in at most one.  We check if the list
7131      * that has been exhausted is positioned such that we are in the middle
7132      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7133      * the ones we care about.)  There are four cases:
7134      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7135      *     nothing left in the intersection.
7136      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7137      *     above 2.  What should be output is exactly that which is in the
7138      *     non-exhausted set, as everything it has is also in the intersection
7139      *     set, and everything it doesn't have can't be in the intersection
7140      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7141      *     gets incremented to 2.  Like the previous case, the intersection is
7142      *     everything that remains in the non-exhausted set.
7143      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7144      *     remains 1.  And the intersection has nothing more. */
7145     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7146         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7147     {
7148         count++;
7149     }
7150
7151     /* The final length is what we've output so far plus what else is in the
7152      * intersection.  At most one of the subexpressions below will be non-zero */
7153     len_r = i_r;
7154     if (count >= 2) {
7155         len_r += (len_a - i_a) + (len_b - i_b);
7156     }
7157
7158     /* Set result to final length, which can change the pointer to array_r, so
7159      * re-find it */
7160     if (len_r != invlist_len(r)) {
7161         invlist_set_len(r, len_r);
7162         invlist_trim(r);
7163         array_r = invlist_array(r);
7164     }
7165
7166     /* Finish outputting any remaining */
7167     if (count >= 2) { /* At most one will have a non-zero copy count */
7168         IV copy_count;
7169         if ((copy_count = len_a - i_a) > 0) {
7170             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7171         }
7172         else if ((copy_count = len_b - i_b) > 0) {
7173             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7174         }
7175     }
7176
7177     /*  We may be removing a reference to one of the inputs */
7178     if (a == *i || b == *i) {
7179         SvREFCNT_dec(*i);
7180     }
7181
7182     /* If we've changed b, restore it */
7183     if (complement_b) {
7184         array_b[0] = 1;
7185     }
7186
7187     *i = r;
7188     return;
7189 }
7190
7191 SV*
7192 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7193 {
7194     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7195      * set.  A pointer to the inversion list is returned.  This may actually be
7196      * a new list, in which case the passed in one has been destroyed.  The
7197      * passed in inversion list can be NULL, in which case a new one is created
7198      * with just the one range in it */
7199
7200     SV* range_invlist;
7201     UV len;
7202
7203     if (invlist == NULL) {
7204         invlist = _new_invlist(2);
7205         len = 0;
7206     }
7207     else {
7208         len = invlist_len(invlist);
7209     }
7210
7211     /* If comes after the final entry, can just append it to the end */
7212     if (len == 0
7213         || start >= invlist_array(invlist)
7214                                     [invlist_len(invlist) - 1])
7215     {
7216         _append_range_to_invlist(invlist, start, end);
7217         return invlist;
7218     }
7219
7220     /* Here, can't just append things, create and return a new inversion list
7221      * which is the union of this range and the existing inversion list */
7222     range_invlist = _new_invlist(2);
7223     _append_range_to_invlist(range_invlist, start, end);
7224
7225     _invlist_union(invlist, range_invlist, &invlist);
7226
7227     /* The temporary can be freed */
7228     SvREFCNT_dec(range_invlist);
7229
7230     return invlist;
7231 }
7232
7233 #endif
7234
7235 PERL_STATIC_INLINE SV*
7236 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7237     return _add_range_to_invlist(invlist, cp, cp);
7238 }
7239
7240 #ifndef PERL_IN_XSUB_RE
7241 void
7242 Perl__invlist_invert(pTHX_ SV* const invlist)
7243 {
7244     /* Complement the input inversion list.  This adds a 0 if the list didn't
7245      * have a zero; removes it otherwise.  As described above, the data
7246      * structure is set up so that this is very efficient */
7247
7248     UV* len_pos = get_invlist_len_addr(invlist);
7249
7250     PERL_ARGS_ASSERT__INVLIST_INVERT;
7251
7252     /* The inverse of matching nothing is matching everything */
7253     if (*len_pos == 0) {
7254         _append_range_to_invlist(invlist, 0, UV_MAX);
7255         return;
7256     }
7257
7258     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7259      * zero element was a 0, so it is being removed, so the length decrements
7260      * by 1; and vice-versa.  SvCUR is unaffected */
7261     if (*get_invlist_zero_addr(invlist) ^= 1) {
7262         (*len_pos)--;
7263     }
7264     else {
7265         (*len_pos)++;
7266     }
7267 }
7268
7269 void
7270 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7271 {
7272     /* Complement the input inversion list (which must be a Unicode property,
7273      * all of which don't match above the Unicode maximum code point.)  And
7274      * Perl has chosen to not have the inversion match above that either.  This
7275      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7276      */
7277
7278     UV len;
7279     UV* array;
7280
7281     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7282
7283     _invlist_invert(invlist);
7284
7285     len = invlist_len(invlist);
7286
7287     if (len != 0) { /* If empty do nothing */
7288         array = invlist_array(invlist);
7289         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7290             /* Add 0x110000.  First, grow if necessary */
7291             len++;
7292             if (invlist_max(invlist) < len) {
7293                 invlist_extend(invlist, len);
7294                 array = invlist_array(invlist);
7295             }
7296             invlist_set_len(invlist, len);
7297             array[len - 1] = PERL_UNICODE_MAX + 1;
7298         }
7299         else {  /* Remove the 0x110000 */
7300             invlist_set_len(invlist, len - 1);
7301         }
7302     }
7303
7304     return;
7305 }
7306 #endif
7307
7308 PERL_STATIC_INLINE SV*
7309 S_invlist_clone(pTHX_ SV* const invlist)
7310 {
7311
7312     /* Return a new inversion list that is a copy of the input one, which is
7313      * unchanged */
7314
7315     /* Need to allocate extra space to accommodate Perl's addition of a
7316      * trailing NUL to SvPV's, since it thinks they are always strings */
7317     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7318     STRLEN length = SvCUR(invlist);
7319
7320     PERL_ARGS_ASSERT_INVLIST_CLONE;
7321
7322     SvCUR_set(new_invlist, length); /* This isn't done automatically */
7323     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7324
7325     return new_invlist;
7326 }
7327
7328 PERL_STATIC_INLINE UV*
7329 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7330 {
7331     /* Return the address of the UV that contains the current iteration
7332      * position */
7333
7334     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7335
7336     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7337 }
7338
7339 PERL_STATIC_INLINE UV*
7340 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7341 {
7342     /* Return the address of the UV that contains the version id. */
7343
7344     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7345
7346     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7347 }
7348
7349 PERL_STATIC_INLINE void
7350 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
7351 {
7352     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7353
7354     *get_invlist_iter_addr(invlist) = 0;
7355 }
7356
7357 STATIC bool
7358 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7359 {
7360     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7361      * This call sets in <*start> and <*end>, the next range in <invlist>.
7362      * Returns <TRUE> if successful and the next call will return the next
7363      * range; <FALSE> if was already at the end of the list.  If the latter,
7364      * <*start> and <*end> are unchanged, and the next call to this function
7365      * will start over at the beginning of the list */
7366
7367     UV* pos = get_invlist_iter_addr(invlist);
7368     UV len = invlist_len(invlist);
7369     UV *array;
7370
7371     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7372
7373     if (*pos >= len) {
7374         *pos = UV_MAX;  /* Force iternit() to be required next time */
7375         return FALSE;
7376     }
7377
7378     array = invlist_array(invlist);
7379
7380     *start = array[(*pos)++];
7381
7382     if (*pos >= len) {
7383         *end = UV_MAX;
7384     }
7385     else {
7386         *end = array[(*pos)++] - 1;
7387     }
7388
7389     return TRUE;
7390 }
7391
7392 #ifndef PERL_IN_XSUB_RE
7393 SV *
7394 Perl__invlist_contents(pTHX_ SV* const invlist)
7395 {
7396     /* Get the contents of an inversion list into a string SV so that they can
7397      * be printed out.  It uses the format traditionally done for debug tracing
7398      */
7399
7400     UV start, end;
7401     SV* output = newSVpvs("\n");
7402
7403     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7404
7405     invlist_iterinit(invlist);
7406     while (invlist_iternext(invlist, &start, &end)) {
7407         if (end == UV_MAX) {
7408             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7409         }
7410         else if (end != start) {
7411             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7412                     start,       end);
7413         }
7414         else {
7415             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7416         }
7417     }
7418
7419     return output;
7420 }
7421 #endif
7422
7423 #if 0
7424 void
7425 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7426 {
7427     /* Dumps out the ranges in an inversion list.  The string 'header'
7428      * if present is output on a line before the first range */
7429
7430     UV start, end;
7431
7432     if (header && strlen(header)) {
7433         PerlIO_printf(Perl_debug_log, "%s\n", header);
7434     }
7435     invlist_iterinit(invlist);
7436     while (invlist_iternext(invlist, &start, &end)) {
7437         if (end == UV_MAX) {
7438             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7439         }
7440         else {
7441             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7442         }
7443     }
7444 }
7445 #endif
7446
7447 #undef HEADER_LENGTH
7448 #undef INVLIST_INITIAL_LENGTH
7449 #undef TO_INTERNAL_SIZE
7450 #undef FROM_INTERNAL_SIZE
7451 #undef INVLIST_LEN_OFFSET
7452 #undef INVLIST_ZERO_OFFSET
7453 #undef INVLIST_ITER_OFFSET
7454 #undef INVLIST_VERSION_ID
7455
7456 /* End of inversion list object */
7457
7458 /*
7459  - reg - regular expression, i.e. main body or parenthesized thing
7460  *
7461  * Caller must absorb opening parenthesis.
7462  *
7463  * Combining parenthesis handling with the base level of regular expression
7464  * is a trifle forced, but the need to tie the tails of the branches to what
7465  * follows makes it hard to avoid.
7466  */
7467 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7468 #ifdef DEBUGGING
7469 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7470 #else
7471 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7472 #endif
7473
7474 STATIC regnode *
7475 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7476     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7477 {
7478     dVAR;
7479     register regnode *ret;              /* Will be the head of the group. */
7480     register regnode *br;
7481     register regnode *lastbr;
7482     register regnode *ender = NULL;
7483     register I32 parno = 0;
7484     I32 flags;
7485     U32 oregflags = RExC_flags;
7486     bool have_branch = 0;
7487     bool is_open = 0;
7488     I32 freeze_paren = 0;
7489     I32 after_freeze = 0;
7490
7491     /* for (?g), (?gc), and (?o) warnings; warning
7492        about (?c) will warn about (?g) -- japhy    */
7493
7494 #define WASTED_O  0x01
7495 #define WASTED_G  0x02
7496 #define WASTED_C  0x04
7497 #define WASTED_GC (0x02|0x04)
7498     I32 wastedflags = 0x00;
7499
7500     char * parse_start = RExC_parse; /* MJD */
7501     char * const oregcomp_parse = RExC_parse;
7502
7503     GET_RE_DEBUG_FLAGS_DECL;
7504
7505     PERL_ARGS_ASSERT_REG;
7506     DEBUG_PARSE("reg ");
7507
7508     *flagp = 0;                         /* Tentatively. */
7509
7510
7511     /* Make an OPEN node, if parenthesized. */
7512     if (paren) {
7513         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7514             char *start_verb = RExC_parse;
7515             STRLEN verb_len = 0;
7516             char *start_arg = NULL;
7517             unsigned char op = 0;
7518             int argok = 1;
7519             int internal_argval = 0; /* internal_argval is only useful if !argok */
7520             while ( *RExC_parse && *RExC_parse != ')' ) {
7521                 if ( *RExC_parse == ':' ) {
7522                     start_arg = RExC_parse + 1;
7523                     break;
7524                 }
7525                 RExC_parse++;
7526             }
7527             ++start_verb;
7528             verb_len = RExC_parse - start_verb;
7529             if ( start_arg ) {
7530                 RExC_parse++;
7531                 while ( *RExC_parse && *RExC_parse != ')' )
7532                     RExC_parse++;
7533                 if ( *RExC_parse != ')' )
7534                     vFAIL("Unterminated verb pattern argument");
7535                 if ( RExC_parse == start_arg )
7536                     start_arg = NULL;
7537             } else {
7538                 if ( *RExC_parse != ')' )
7539                     vFAIL("Unterminated verb pattern");
7540             }
7541
7542             switch ( *start_verb ) {
7543             case 'A':  /* (*ACCEPT) */
7544                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7545                     op = ACCEPT;
7546                     internal_argval = RExC_nestroot;
7547                 }
7548                 break;
7549             case 'C':  /* (*COMMIT) */
7550                 if ( memEQs(start_verb,verb_len,"COMMIT") )
7551                     op = COMMIT;
7552                 break;
7553             case 'F':  /* (*FAIL) */
7554                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7555                     op = OPFAIL;
7556                     argok = 0;
7557                 }
7558                 break;
7559             case ':':  /* (*:NAME) */
7560             case 'M':  /* (*MARK:NAME) */
7561                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7562                     op = MARKPOINT;
7563                     argok = -1;
7564                 }
7565                 break;
7566             case 'P':  /* (*PRUNE) */
7567                 if ( memEQs(start_verb,verb_len,"PRUNE") )
7568                     op = PRUNE;
7569                 break;
7570             case 'S':   /* (*SKIP) */
7571                 if ( memEQs(start_verb,verb_len,"SKIP") )
7572                     op = SKIP;
7573                 break;
7574             case 'T':  /* (*THEN) */
7575                 /* [19:06] <TimToady> :: is then */
7576                 if ( memEQs(start_verb,verb_len,"THEN") ) {
7577                     op = CUTGROUP;
7578                     RExC_seen |= REG_SEEN_CUTGROUP;
7579                 }
7580                 break;
7581             }
7582             if ( ! op ) {
7583                 RExC_parse++;
7584                 vFAIL3("Unknown verb pattern '%.*s'",
7585                     verb_len, start_verb);
7586             }
7587             if ( argok ) {
7588                 if ( start_arg && internal_argval ) {
7589                     vFAIL3("Verb pattern '%.*s' may not have an argument",
7590                         verb_len, start_verb);
7591                 } else if ( argok < 0 && !start_arg ) {
7592                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7593                         verb_len, start_verb);
7594                 } else {
7595                     ret = reganode(pRExC_state, op, internal_argval);
7596                     if ( ! internal_argval && ! SIZE_ONLY ) {
7597                         if (start_arg) {
7598                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7599                             ARG(ret) = add_data( pRExC_state, 1, "S" );
7600                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7601                             ret->flags = 0;
7602                         } else {
7603                             ret->flags = 1;
7604                         }
7605                     }
7606                 }
7607                 if (!internal_argval)
7608                     RExC_seen |= REG_SEEN_VERBARG;
7609             } else if ( start_arg ) {
7610                 vFAIL3("Verb pattern '%.*s' may not have an argument",
7611                         verb_len, start_verb);
7612             } else {
7613                 ret = reg_node(pRExC_state, op);
7614             }
7615             nextchar(pRExC_state);
7616             return ret;
7617         } else
7618         if (*RExC_parse == '?') { /* (?...) */
7619             bool is_logical = 0;
7620             const char * const seqstart = RExC_parse;
7621             bool has_use_defaults = FALSE;
7622
7623             RExC_parse++;
7624             paren = *RExC_parse++;
7625             ret = NULL;                 /* For look-ahead/behind. */
7626             switch (paren) {
7627
7628             case 'P':   /* (?P...) variants for those used to PCRE/Python */
7629                 paren = *RExC_parse++;
7630                 if ( paren == '<')         /* (?P<...>) named capture */
7631                     goto named_capture;
7632                 else if (paren == '>') {   /* (?P>name) named recursion */
7633                     goto named_recursion;
7634                 }
7635                 else if (paren == '=') {   /* (?P=...)  named backref */
7636                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
7637                        you change this make sure you change that */
7638                     char* name_start = RExC_parse;
7639                     U32 num = 0;
7640                     SV *sv_dat = reg_scan_name(pRExC_state,
7641                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7642                     if (RExC_parse == name_start || *RExC_parse != ')')
7643                         vFAIL2("Sequence %.3s... not terminated",parse_start);
7644
7645                     if (!SIZE_ONLY) {
7646                         num = add_data( pRExC_state, 1, "S" );
7647                         RExC_rxi->data->data[num]=(void*)sv_dat;
7648                         SvREFCNT_inc_simple_void(sv_dat);
7649                     }
7650                     RExC_sawback = 1;
7651                     ret = reganode(pRExC_state,
7652                                    ((! FOLD)
7653                                      ? NREF
7654                                      : (MORE_ASCII_RESTRICTED)
7655                                        ? NREFFA
7656                                        : (AT_LEAST_UNI_SEMANTICS)
7657                                          ? NREFFU
7658                                          : (LOC)
7659                                            ? NREFFL
7660                                            : NREFF),
7661                                     num);
7662                     *flagp |= HASWIDTH;
7663
7664                     Set_Node_Offset(ret, parse_start+1);
7665                     Set_Node_Cur_Length(ret); /* MJD */
7666
7667                     nextchar(pRExC_state);
7668                     return ret;
7669                 }
7670                 RExC_parse++;
7671                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7672                 /*NOTREACHED*/
7673             case '<':           /* (?<...) */
7674                 if (*RExC_parse == '!')
7675                     paren = ',';
7676                 else if (*RExC_parse != '=')
7677               named_capture:
7678                 {               /* (?<...>) */
7679                     char *name_start;
7680                     SV *svname;
7681                     paren= '>';
7682             case '\'':          /* (?'...') */
7683                     name_start= RExC_parse;
7684                     svname = reg_scan_name(pRExC_state,
7685                         SIZE_ONLY ?  /* reverse test from the others */
7686                         REG_RSN_RETURN_NAME :
7687                         REG_RSN_RETURN_NULL);
7688                     if (RExC_parse == name_start) {
7689                         RExC_parse++;
7690                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7691                         /*NOTREACHED*/
7692                     }
7693                     if (*RExC_parse != paren)
7694                         vFAIL2("Sequence (?%c... not terminated",
7695                             paren=='>' ? '<' : paren);
7696                     if (SIZE_ONLY) {
7697                         HE *he_str;
7698                         SV *sv_dat = NULL;
7699                         if (!svname) /* shouldn't happen */
7700                             Perl_croak(aTHX_
7701                                 "panic: reg_scan_name returned NULL");
7702                         if (!RExC_paren_names) {
7703                             RExC_paren_names= newHV();
7704                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
7705 #ifdef DEBUGGING
7706                             RExC_paren_name_list= newAV();
7707                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7708 #endif
7709                         }
7710                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7711                         if ( he_str )
7712                             sv_dat = HeVAL(he_str);
7713                         if ( ! sv_dat ) {
7714                             /* croak baby croak */
7715                             Perl_croak(aTHX_
7716                                 "panic: paren_name hash element allocation failed");
7717                         } else if ( SvPOK(sv_dat) ) {
7718                             /* (?|...) can mean we have dupes so scan to check
7719                                its already been stored. Maybe a flag indicating
7720                                we are inside such a construct would be useful,
7721                                but the arrays are likely to be quite small, so
7722                                for now we punt -- dmq */
7723                             IV count = SvIV(sv_dat);
7724                             I32 *pv = (I32*)SvPVX(sv_dat);
7725                             IV i;
7726                             for ( i = 0 ; i < count ; i++ ) {
7727                                 if ( pv[i] == RExC_npar ) {
7728                                     count = 0;
7729                                     break;
7730                                 }
7731                             }
7732                             if ( count ) {
7733                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7734                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7735                                 pv[count] = RExC_npar;
7736                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7737                             }
7738                         } else {
7739                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
7740                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7741                             SvIOK_on(sv_dat);
7742                             SvIV_set(sv_dat, 1);
7743                         }
7744 #ifdef DEBUGGING
7745                         /* Yes this does cause a memory leak in debugging Perls */
7746                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7747                             SvREFCNT_dec(svname);
7748 #endif
7749
7750                         /*sv_dump(sv_dat);*/
7751                     }
7752                     nextchar(pRExC_state);
7753                     paren = 1;
7754                     goto capturing_parens;
7755                 }
7756                 RExC_seen |= REG_SEEN_LOOKBEHIND;
7757                 RExC_in_lookbehind++;
7758                 RExC_parse++;
7759             case '=':           /* (?=...) */
7760                 RExC_seen_zerolen++;
7761                 break;
7762             case '!':           /* (?!...) */
7763                 RExC_seen_zerolen++;
7764                 if (*RExC_parse == ')') {
7765                     ret=reg_node(pRExC_state, OPFAIL);
7766                     nextchar(pRExC_state);
7767                     return ret;
7768                 }
7769                 break;
7770             case '|':           /* (?|...) */
7771                 /* branch reset, behave like a (?:...) except that
7772                    buffers in alternations share the same numbers */
7773                 paren = ':';
7774                 after_freeze = freeze_paren = RExC_npar;
7775                 break;
7776             case ':':           /* (?:...) */
7777             case '>':           /* (?>...) */
7778                 break;
7779             case '$':           /* (?$...) */
7780             case '@':           /* (?@...) */
7781                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7782                 break;
7783             case '#':           /* (?#...) */
7784                 while (*RExC_parse && *RExC_parse != ')')
7785                     RExC_parse++;
7786                 if (*RExC_parse != ')')
7787                     FAIL("Sequence (?#... not terminated");
7788                 nextchar(pRExC_state);
7789                 *flagp = TRYAGAIN;
7790                 return NULL;
7791             case '0' :           /* (?0) */
7792             case 'R' :           /* (?R) */
7793                 if (*RExC_parse != ')')
7794                     FAIL("Sequence (?R) not terminated");
7795                 ret = reg_node(pRExC_state, GOSTART);
7796                 *flagp |= POSTPONED;
7797                 nextchar(pRExC_state);
7798                 return ret;
7799                 /*notreached*/
7800             { /* named and numeric backreferences */
7801                 I32 num;
7802             case '&':            /* (?&NAME) */
7803                 parse_start = RExC_parse - 1;
7804               named_recursion:
7805                 {
7806                     SV *sv_dat = reg_scan_name(pRExC_state,
7807                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7808                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7809                 }
7810                 goto gen_recurse_regop;
7811                 /* NOT REACHED */
7812             case '+':
7813                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7814                     RExC_parse++;
7815                     vFAIL("Illegal pattern");
7816                 }
7817                 goto parse_recursion;
7818                 /* NOT REACHED*/
7819             case '-': /* (?-1) */
7820                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7821                     RExC_parse--; /* rewind to let it be handled later */
7822                     goto parse_flags;
7823                 }
7824                 /*FALLTHROUGH */
7825             case '1': case '2': case '3': case '4': /* (?1) */
7826             case '5': case '6': case '7': case '8': case '9':
7827                 RExC_parse--;
7828               parse_recursion:
7829                 num = atoi(RExC_parse);
7830                 parse_start = RExC_parse - 1; /* MJD */
7831                 if (*RExC_parse == '-')
7832                     RExC_parse++;
7833                 while (isDIGIT(*RExC_parse))
7834                         RExC_parse++;
7835                 if (*RExC_parse!=')')
7836                     vFAIL("Expecting close bracket");
7837
7838               gen_recurse_regop:
7839                 if ( paren == '-' ) {
7840                     /*
7841                     Diagram of capture buffer numbering.
7842                     Top line is the normal capture buffer numbers
7843                     Bottom line is the negative indexing as from
7844                     the X (the (?-2))
7845
7846                     +   1 2    3 4 5 X          6 7
7847                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7848                     -   5 4    3 2 1 X          x x
7849
7850                     */
7851                     num = RExC_npar + num;
7852                     if (num < 1)  {
7853                         RExC_parse++;
7854                         vFAIL("Reference to nonexistent group");
7855                     }
7856                 } else if ( paren == '+' ) {
7857                     num = RExC_npar + num - 1;
7858                 }
7859
7860                 ret = reganode(pRExC_state, GOSUB, num);
7861                 if (!SIZE_ONLY) {
7862                     if (num > (I32)RExC_rx->nparens) {
7863                         RExC_parse++;
7864                         vFAIL("Reference to nonexistent group");
7865                     }
7866                     ARG2L_SET( ret, RExC_recurse_count++);
7867                     RExC_emit++;
7868                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7869                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7870                 } else {
7871                     RExC_size++;
7872                 }
7873                 RExC_seen |= REG_SEEN_RECURSE;
7874                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7875                 Set_Node_Offset(ret, parse_start); /* MJD */
7876
7877                 *flagp |= POSTPONED;
7878                 nextchar(pRExC_state);
7879                 return ret;
7880             } /* named and numeric backreferences */
7881             /* NOT REACHED */
7882
7883             case '?':           /* (??...) */
7884                 is_logical = 1;
7885                 if (*RExC_parse != '{') {
7886                     RExC_parse++;
7887                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7888                     /*NOTREACHED*/
7889                 }
7890                 *flagp |= POSTPONED;
7891                 paren = *RExC_parse++;
7892                 /* FALL THROUGH */
7893             case '{':           /* (?{...}) */
7894             {
7895                 I32 count = 1;
7896                 U32 n = 0;
7897                 char c;
7898                 char *s = RExC_parse;
7899
7900                 RExC_seen_zerolen++;
7901                 RExC_seen |= REG_SEEN_EVAL;
7902                 while (count && (c = *RExC_parse)) {
7903                     if (c == '\\') {
7904                         if (RExC_parse[1])
7905                             RExC_parse++;
7906                     }
7907                     else if (c == '{')
7908                         count++;
7909                     else if (c == '}')
7910                         count--;
7911                     RExC_parse++;
7912                 }
7913                 if (*RExC_parse != ')') {
7914                     RExC_parse = s;
7915                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7916                 }
7917                 if (!SIZE_ONLY) {
7918                     PAD *pad;
7919                     OP_4tree *sop, *rop;
7920                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7921
7922                     ENTER;
7923                     Perl_save_re_context(aTHX);
7924                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7925                     sop->op_private |= OPpREFCOUNTED;
7926                     /* re_dup will OpREFCNT_inc */
7927                     OpREFCNT_set(sop, 1);
7928                     LEAVE;
7929
7930                     n = add_data(pRExC_state, 3, "nop");
7931                     RExC_rxi->data->data[n] = (void*)rop;
7932                     RExC_rxi->data->data[n+1] = (void*)sop;
7933                     RExC_rxi->data->data[n+2] = (void*)pad;
7934                     SvREFCNT_dec(sv);
7935                 }
7936                 else {                                          /* First pass */
7937                     if (PL_reginterp_cnt < ++RExC_seen_evals
7938                         && IN_PERL_RUNTIME)
7939                         /* No compiled RE interpolated, has runtime
7940                            components ===> unsafe.  */
7941                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
7942                     if (PL_tainting && PL_tainted)
7943                         FAIL("Eval-group in insecure regular expression");
7944 #if PERL_VERSION > 8
7945                     if (IN_PERL_COMPILETIME)
7946                         PL_cv_has_eval = 1;
7947 #endif
7948                 }
7949
7950                 nextchar(pRExC_state);
7951                 if (is_logical) {
7952                     ret = reg_node(pRExC_state, LOGICAL);
7953                     if (!SIZE_ONLY)
7954                         ret->flags = 2;
7955                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7956                     /* deal with the length of this later - MJD */
7957                     return ret;
7958                 }
7959                 ret = reganode(pRExC_state, EVAL, n);
7960                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7961                 Set_Node_Offset(ret, parse_start);
7962                 return ret;
7963             }
7964             case '(':           /* (?(?{...})...) and (?(?=...)...) */
7965             {
7966                 int is_define= 0;
7967                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
7968                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7969                         || RExC_parse[1] == '<'
7970                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
7971                         I32 flag;
7972
7973                         ret = reg_node(pRExC_state, LOGICAL);
7974                         if (!SIZE_ONLY)
7975                             ret->flags = 1;
7976                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7977                         goto insert_if;
7978                     }
7979                 }
7980                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
7981                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7982                 {
7983                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
7984                     char *name_start= RExC_parse++;
7985                     U32 num = 0;
7986                     SV *sv_dat=reg_scan_name(pRExC_state,
7987                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7988                     if (RExC_parse == name_start || *RExC_parse != ch)
7989                         vFAIL2("Sequence (?(%c... not terminated",
7990                             (ch == '>' ? '<' : ch));
7991                     RExC_parse++;
7992                     if (!SIZE_ONLY) {
7993                         num = add_data( pRExC_state, 1, "S" );
7994                         RExC_rxi->data->data[num]=(void*)sv_dat;
7995                         SvREFCNT_inc_simple_void(sv_dat);
7996                     }
7997                     ret = reganode(pRExC_state,NGROUPP,num);
7998                     goto insert_if_check_paren;
7999                 }
8000                 else if (RExC_parse[0] == 'D' &&
8001                          RExC_parse[1] == 'E' &&
8002                          RExC_parse[2] == 'F' &&
8003                          RExC_parse[3] == 'I' &&
8004                          RExC_parse[4] == 'N' &&
8005                          RExC_parse[5] == 'E')
8006                 {
8007                     ret = reganode(pRExC_state,DEFINEP,0);
8008                     RExC_parse +=6 ;
8009                     is_define = 1;
8010                     goto insert_if_check_paren;
8011                 }
8012                 else if (RExC_parse[0] == 'R') {
8013                     RExC_parse++;
8014                     parno = 0;
8015                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8016                         parno = atoi(RExC_parse++);
8017                         while (isDIGIT(*RExC_parse))
8018                             RExC_parse++;
8019                     } else if (RExC_parse[0] == '&') {
8020                         SV *sv_dat;
8021                         RExC_parse++;
8022                         sv_dat = reg_scan_name(pRExC_state,
8023                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8024                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8025                     }
8026                     ret = reganode(pRExC_state,INSUBP,parno);
8027                     goto insert_if_check_paren;
8028                 }
8029                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8030                     /* (?(1)...) */
8031                     char c;
8032                     parno = atoi(RExC_parse++);
8033
8034                     while (isDIGIT(*RExC_parse))
8035                         RExC_parse++;
8036                     ret = reganode(pRExC_state, GROUPP, parno);
8037
8038                  insert_if_check_paren:
8039                     if ((c = *nextchar(pRExC_state)) != ')')
8040                         vFAIL("Switch condition not recognized");
8041                   insert_if:
8042                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8043                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8044                     if (br == NULL)
8045                         br = reganode(pRExC_state, LONGJMP, 0);
8046                     else
8047                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8048                     c = *nextchar(pRExC_state);
8049                     if (flags&HASWIDTH)
8050                         *flagp |= HASWIDTH;
8051                     if (c == '|') {
8052                         if (is_define)
8053                             vFAIL("(?(DEFINE)....) does not allow branches");
8054                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8055                         regbranch(pRExC_state, &flags, 1,depth+1);
8056                         REGTAIL(pRExC_state, ret, lastbr);
8057                         if (flags&HASWIDTH)
8058                             *flagp |= HASWIDTH;
8059                         c = *nextchar(pRExC_state);
8060                     }
8061                     else
8062                         lastbr = NULL;
8063                     if (c != ')')
8064                         vFAIL("Switch (?(condition)... contains too many branches");
8065                     ender = reg_node(pRExC_state, TAIL);
8066                     REGTAIL(pRExC_state, br, ender);
8067                     if (lastbr) {
8068                         REGTAIL(pRExC_state, lastbr, ender);
8069                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8070                     }
8071                     else
8072                         REGTAIL(pRExC_state, ret, ender);
8073                     RExC_size++; /* XXX WHY do we need this?!!
8074                                     For large programs it seems to be required
8075                                     but I can't figure out why. -- dmq*/
8076                     return ret;
8077                 }
8078                 else {
8079                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8080                 }
8081             }
8082             case 0:
8083                 RExC_parse--; /* for vFAIL to print correctly */
8084                 vFAIL("Sequence (? incomplete");
8085                 break;
8086             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8087                                        that follow */
8088                 has_use_defaults = TRUE;
8089                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8090                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8091                                                 ? REGEX_UNICODE_CHARSET
8092                                                 : REGEX_DEPENDS_CHARSET);
8093                 goto parse_flags;
8094             default:
8095                 --RExC_parse;
8096                 parse_flags:      /* (?i) */
8097             {
8098                 U32 posflags = 0, negflags = 0;
8099                 U32 *flagsp = &posflags;
8100                 char has_charset_modifier = '\0';
8101                 regex_charset cs = get_regex_charset(RExC_flags);
8102                 if (cs == REGEX_DEPENDS_CHARSET
8103                     && (RExC_utf8 || RExC_uni_semantics))
8104                 {
8105                     cs = REGEX_UNICODE_CHARSET;
8106                 }
8107
8108                 while (*RExC_parse) {
8109                     /* && strchr("iogcmsx", *RExC_parse) */
8110                     /* (?g), (?gc) and (?o) are useless here
8111                        and must be globally applied -- japhy */
8112                     switch (*RExC_parse) {
8113                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8114                     case LOCALE_PAT_MOD:
8115                         if (has_charset_modifier) {
8116                             goto excess_modifier;
8117                         }
8118                         else if (flagsp == &negflags) {
8119                             goto neg_modifier;
8120                         }
8121                         cs = REGEX_LOCALE_CHARSET;
8122                         has_charset_modifier = LOCALE_PAT_MOD;
8123                         RExC_contains_locale = 1;
8124                         break;
8125                     case UNICODE_PAT_MOD:
8126                         if (has_charset_modifier) {
8127                             goto excess_modifier;
8128                         }
8129                         else if (flagsp == &negflags) {
8130                             goto neg_modifier;
8131                         }
8132                         cs = REGEX_UNICODE_CHARSET;
8133                         has_charset_modifier = UNICODE_PAT_MOD;
8134                         break;
8135                     case ASCII_RESTRICT_PAT_MOD:
8136                         if (flagsp == &negflags) {
8137                             goto neg_modifier;
8138                         }
8139                         if (has_charset_modifier) {
8140                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8141                                 goto excess_modifier;
8142                             }
8143                             /* Doubled modifier implies more restricted */
8144                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8145                         }
8146                         else {
8147                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8148                         }
8149                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8150                         break;
8151                     case DEPENDS_PAT_MOD:
8152                         if (has_use_defaults) {
8153                             goto fail_modifiers;
8154                         }
8155                         else if (flagsp == &negflags) {
8156                             goto neg_modifier;
8157                         }
8158                         else if (has_charset_modifier) {
8159                             goto excess_modifier;
8160                         }
8161
8162                         /* The dual charset means unicode semantics if the
8163                          * pattern (or target, not known until runtime) are
8164                          * utf8, or something in the pattern indicates unicode
8165                          * semantics */
8166                         cs = (RExC_utf8 || RExC_uni_semantics)
8167                              ? REGEX_UNICODE_CHARSET
8168                              : REGEX_DEPENDS_CHARSET;
8169                         has_charset_modifier = DEPENDS_PAT_MOD;
8170                         break;
8171                     excess_modifier:
8172                         RExC_parse++;
8173                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8174                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8175                         }
8176                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8177                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8178                         }
8179                         else {
8180                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8181                         }
8182                         /*NOTREACHED*/
8183                     neg_modifier:
8184                         RExC_parse++;
8185                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8186                         /*NOTREACHED*/
8187                     case ONCE_PAT_MOD: /* 'o' */
8188                     case GLOBAL_PAT_MOD: /* 'g' */
8189                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8190                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8191                             if (! (wastedflags & wflagbit) ) {
8192                                 wastedflags |= wflagbit;
8193                                 vWARN5(
8194                                     RExC_parse + 1,
8195                                     "Useless (%s%c) - %suse /%c modifier",
8196                                     flagsp == &negflags ? "?-" : "?",
8197                                     *RExC_parse,
8198                                     flagsp == &negflags ? "don't " : "",
8199                                     *RExC_parse
8200                                 );
8201                             }
8202                         }
8203                         break;
8204
8205                     case CONTINUE_PAT_MOD: /* 'c' */
8206                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8207                             if (! (wastedflags & WASTED_C) ) {
8208                                 wastedflags |= WASTED_GC;
8209                                 vWARN3(
8210                                     RExC_parse + 1,
8211                                     "Useless (%sc) - %suse /gc modifier",
8212                                     flagsp == &negflags ? "?-" : "?",
8213                                     flagsp == &negflags ? "don't " : ""
8214                                 );
8215                             }
8216                         }
8217                         break;
8218                     case KEEPCOPY_PAT_MOD: /* 'p' */
8219                         if (flagsp == &negflags) {
8220                             if (SIZE_ONLY)
8221                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8222                         } else {
8223                             *flagsp |= RXf_PMf_KEEPCOPY;
8224                         }
8225                         break;
8226                     case '-':
8227                         /* A flag is a default iff it is following a minus, so
8228                          * if there is a minus, it means will be trying to
8229                          * re-specify a default which is an error */
8230                         if (has_use_defaults || flagsp == &negflags) {
8231             fail_modifiers:
8232                             RExC_parse++;
8233                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8234                             /*NOTREACHED*/
8235                         }
8236                         flagsp = &negflags;
8237                         wastedflags = 0;  /* reset so (?g-c) warns twice */
8238                         break;
8239                     case ':':
8240                         paren = ':';
8241                         /*FALLTHROUGH*/
8242                     case ')':
8243                         RExC_flags |= posflags;
8244                         RExC_flags &= ~negflags;
8245                         set_regex_charset(&RExC_flags, cs);
8246                         if (paren != ':') {
8247                             oregflags |= posflags;
8248                             oregflags &= ~negflags;
8249                             set_regex_charset(&oregflags, cs);
8250                         }
8251                         nextchar(pRExC_state);
8252                         if (paren != ':') {
8253                             *flagp = TRYAGAIN;
8254                             return NULL;
8255                         } else {
8256                             ret = NULL;
8257                             goto parse_rest;
8258                         }
8259                         /*NOTREACHED*/
8260                     default:
8261                         RExC_parse++;
8262                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8263                         /*NOTREACHED*/
8264                     }
8265                     ++RExC_parse;
8266                 }
8267             }} /* one for the default block, one for the switch */
8268         }
8269         else {                  /* (...) */
8270           capturing_parens:
8271             parno = RExC_npar;
8272             RExC_npar++;
8273
8274             ret = reganode(pRExC_state, OPEN, parno);
8275             if (!SIZE_ONLY ){
8276                 if (!RExC_nestroot)
8277                     RExC_nestroot = parno;
8278                 if (RExC_seen & REG_SEEN_RECURSE
8279                     && !RExC_open_parens[parno-1])
8280                 {
8281                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8282                         "Setting open paren #%"IVdf" to %d\n",
8283                         (IV)parno, REG_NODE_NUM(ret)));
8284                     RExC_open_parens[parno-1]= ret;
8285                 }
8286             }
8287             Set_Node_Length(ret, 1); /* MJD */
8288             Set_Node_Offset(ret, RExC_parse); /* MJD */
8289             is_open = 1;
8290         }
8291     }
8292     else                        /* ! paren */
8293         ret = NULL;
8294
8295    parse_rest:
8296     /* Pick up the branches, linking them together. */
8297     parse_start = RExC_parse;   /* MJD */
8298     br = regbranch(pRExC_state, &flags, 1,depth+1);
8299
8300     /*     branch_len = (paren != 0); */
8301
8302     if (br == NULL)
8303         return(NULL);
8304     if (*RExC_parse == '|') {
8305         if (!SIZE_ONLY && RExC_extralen) {
8306             reginsert(pRExC_state, BRANCHJ, br, depth+1);
8307         }
8308         else {                  /* MJD */
8309             reginsert(pRExC_state, BRANCH, br, depth+1);
8310             Set_Node_Length(br, paren != 0);
8311             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8312         }
8313         have_branch = 1;
8314         if (SIZE_ONLY)
8315             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
8316     }
8317     else if (paren == ':') {
8318         *flagp |= flags&SIMPLE;
8319     }
8320     if (is_open) {                              /* Starts with OPEN. */
8321         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
8322     }
8323     else if (paren != '?')              /* Not Conditional */
8324         ret = br;
8325     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8326     lastbr = br;
8327     while (*RExC_parse == '|') {
8328         if (!SIZE_ONLY && RExC_extralen) {
8329             ender = reganode(pRExC_state, LONGJMP,0);
8330             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8331         }
8332         if (SIZE_ONLY)
8333             RExC_extralen += 2;         /* Account for LONGJMP. */
8334         nextchar(pRExC_state);
8335         if (freeze_paren) {
8336             if (RExC_npar > after_freeze)
8337                 after_freeze = RExC_npar;
8338             RExC_npar = freeze_paren;
8339         }
8340         br = regbranch(pRExC_state, &flags, 0, depth+1);
8341
8342         if (br == NULL)
8343             return(NULL);
8344         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
8345         lastbr = br;
8346         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8347     }
8348
8349     if (have_branch || paren != ':') {
8350         /* Make a closing node, and hook it on the end. */
8351         switch (paren) {
8352         case ':':
8353             ender = reg_node(pRExC_state, TAIL);
8354             break;
8355         case 1:
8356             ender = reganode(pRExC_state, CLOSE, parno);
8357             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8358                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8359                         "Setting close paren #%"IVdf" to %d\n",
8360                         (IV)parno, REG_NODE_NUM(ender)));
8361                 RExC_close_parens[parno-1]= ender;
8362                 if (RExC_nestroot == parno)
8363                     RExC_nestroot = 0;
8364             }
8365             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8366             Set_Node_Length(ender,1); /* MJD */
8367             break;
8368         case '<':
8369         case ',':
8370         case '=':
8371         case '!':
8372             *flagp &= ~HASWIDTH;
8373             /* FALL THROUGH */
8374         case '>':
8375             ender = reg_node(pRExC_state, SUCCEED);
8376             break;
8377         case 0:
8378             ender = reg_node(pRExC_state, END);
8379             if (!SIZE_ONLY) {
8380                 assert(!RExC_opend); /* there can only be one! */
8381                 RExC_opend = ender;
8382             }
8383             break;
8384         }
8385         REGTAIL(pRExC_state, lastbr, ender);
8386
8387         if (have_branch && !SIZE_ONLY) {
8388             if (depth==1)
8389                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8390
8391             /* Hook the tails of the branches to the closing node. */
8392             for (br = ret; br; br = regnext(br)) {
8393                 const U8 op = PL_regkind[OP(br)];
8394                 if (op == BRANCH) {
8395                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8396                 }
8397                 else if (op == BRANCHJ) {
8398                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8399                 }
8400             }
8401         }
8402     }
8403
8404     {
8405         const char *p;
8406         static const char parens[] = "=!<,>";
8407
8408         if (paren && (p = strchr(parens, paren))) {
8409             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8410             int flag = (p - parens) > 1;
8411
8412             if (paren == '>')
8413                 node = SUSPEND, flag = 0;
8414             reginsert(pRExC_state, node,ret, depth+1);
8415             Set_Node_Cur_Length(ret);
8416             Set_Node_Offset(ret, parse_start + 1);
8417             ret->flags = flag;
8418             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8419         }
8420     }
8421
8422     /* Check for proper termination. */
8423     if (paren) {
8424         RExC_flags = oregflags;
8425         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8426             RExC_parse = oregcomp_parse;
8427             vFAIL("Unmatched (");
8428         }
8429     }
8430     else if (!paren && RExC_parse < RExC_end) {
8431         if (*RExC_parse == ')') {
8432             RExC_parse++;
8433             vFAIL("Unmatched )");
8434         }
8435         else
8436             FAIL("Junk on end of regexp");      /* "Can't happen". */
8437         /* NOTREACHED */
8438     }
8439
8440     if (RExC_in_lookbehind) {
8441         RExC_in_lookbehind--;
8442     }
8443     if (after_freeze > RExC_npar)
8444         RExC_npar = after_freeze;
8445     return(ret);
8446 }
8447
8448 /*
8449  - regbranch - one alternative of an | operator
8450  *
8451  * Implements the concatenation operator.
8452  */
8453 STATIC regnode *
8454 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8455 {
8456     dVAR;
8457     register regnode *ret;
8458     register regnode *chain = NULL;
8459     register regnode *latest;
8460     I32 flags = 0, c = 0;
8461     GET_RE_DEBUG_FLAGS_DECL;
8462
8463     PERL_ARGS_ASSERT_REGBRANCH;
8464
8465     DEBUG_PARSE("brnc");
8466
8467     if (first)
8468         ret = NULL;
8469     else {
8470         if (!SIZE_ONLY && RExC_extralen)
8471             ret = reganode(pRExC_state, BRANCHJ,0);
8472         else {
8473             ret = reg_node(pRExC_state, BRANCH);
8474             Set_Node_Length(ret, 1);
8475         }
8476     }
8477
8478     if (!first && SIZE_ONLY)
8479         RExC_extralen += 1;                     /* BRANCHJ */
8480
8481     *flagp = WORST;                     /* Tentatively. */
8482
8483     RExC_parse--;
8484     nextchar(pRExC_state);
8485     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8486         flags &= ~TRYAGAIN;
8487         latest = regpiece(pRExC_state, &flags,depth+1);
8488         if (latest == NULL) {
8489             if (flags & TRYAGAIN)
8490                 continue;
8491             return(NULL);
8492         }
8493         else if (ret == NULL)
8494             ret = latest;
8495         *flagp |= flags&(HASWIDTH|POSTPONED);
8496         if (chain == NULL)      /* First piece. */
8497             *flagp |= flags&SPSTART;
8498         else {
8499             RExC_naughty++;
8500             REGTAIL(pRExC_state, chain, latest);
8501         }
8502         chain = latest;
8503         c++;
8504     }
8505     if (chain == NULL) {        /* Loop ran zero times. */
8506         chain = reg_node(pRExC_state, NOTHING);
8507         if (ret == NULL)
8508             ret = chain;
8509     }
8510     if (c == 1) {
8511         *flagp |= flags&SIMPLE;
8512     }
8513
8514     return ret;
8515 }
8516
8517 /*
8518  - regpiece - something followed by possible [*+?]
8519  *
8520  * Note that the branching code sequences used for ? and the general cases
8521  * of * and + are somewhat optimized:  they use the same NOTHING node as
8522  * both the endmarker for their branch list and the body of the last branch.
8523  * It might seem that this node could be dispensed with entirely, but the
8524  * endmarker role is not redundant.
8525  */
8526 STATIC regnode *
8527 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8528 {
8529     dVAR;
8530     register regnode *ret;
8531     register char op;
8532     register char *next;
8533     I32 flags;
8534     const char * const origparse = RExC_parse;
8535     I32 min;
8536     I32 max = REG_INFTY;
8537 #ifdef RE_TRACK_PATTERN_OFFSETS
8538     char *parse_start;
8539 #endif
8540     const char *maxpos = NULL;
8541     GET_RE_DEBUG_FLAGS_DECL;
8542
8543     PERL_ARGS_ASSERT_REGPIECE;
8544
8545     DEBUG_PARSE("piec");
8546
8547     ret = regatom(pRExC_state, &flags,depth+1);
8548     if (ret == NULL) {
8549         if (flags & TRYAGAIN)
8550             *flagp |= TRYAGAIN;
8551         return(NULL);
8552     }
8553
8554     op = *RExC_parse;
8555
8556     if (op == '{' && regcurly(RExC_parse)) {
8557         maxpos = NULL;
8558 #ifdef RE_TRACK_PATTERN_OFFSETS
8559         parse_start = RExC_parse; /* MJD */
8560 #endif
8561         next = RExC_parse + 1;
8562         while (isDIGIT(*next) || *next == ',') {
8563             if (*next == ',') {
8564                 if (maxpos)
8565                     break;
8566                 else
8567                     maxpos = next;
8568             }
8569             next++;
8570         }
8571         if (*next == '}') {             /* got one */
8572             if (!maxpos)
8573                 maxpos = next;
8574             RExC_parse++;
8575             min = atoi(RExC_parse);
8576             if (*maxpos == ',')
8577                 maxpos++;
8578             else
8579                 maxpos = RExC_parse;
8580             max = atoi(maxpos);
8581             if (!max && *maxpos != '0')
8582                 max = REG_INFTY;                /* meaning "infinity" */
8583             else if (max >= REG_INFTY)
8584                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8585             RExC_parse = next;
8586             nextchar(pRExC_state);
8587
8588         do_curly:
8589             if ((flags&SIMPLE)) {
8590                 RExC_naughty += 2 + RExC_naughty / 2;
8591                 reginsert(pRExC_state, CURLY, ret, depth+1);
8592                 Set_Node_Offset(ret, parse_start+1); /* MJD */
8593                 Set_Node_Cur_Length(ret);
8594             }
8595             else {
8596                 regnode * const w = reg_node(pRExC_state, WHILEM);
8597
8598                 w->flags = 0;
8599                 REGTAIL(pRExC_state, ret, w);
8600                 if (!SIZE_ONLY && RExC_extralen) {
8601                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
8602                     reginsert(pRExC_state, NOTHING,ret, depth+1);
8603                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
8604                 }
8605                 reginsert(pRExC_state, CURLYX,ret, depth+1);
8606                                 /* MJD hk */
8607                 Set_Node_Offset(ret, parse_start+1);
8608                 Set_Node_Length(ret,
8609                                 op == '{' ? (RExC_parse - parse_start) : 1);
8610
8611                 if (!SIZE_ONLY && RExC_extralen)
8612                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
8613                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
8614                 if (SIZE_ONLY)
8615                     RExC_whilem_seen++, RExC_extralen += 3;
8616                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
8617             }
8618             ret->flags = 0;
8619
8620             if (min > 0)
8621                 *flagp = WORST;
8622             if (max > 0)
8623                 *flagp |= HASWIDTH;
8624             if (max < min)
8625                 vFAIL("Can't do {n,m} with n > m");
8626             if (!SIZE_ONLY) {
8627                 ARG1_SET(ret, (U16)min);
8628                 ARG2_SET(ret, (U16)max);
8629             }
8630
8631             goto nest_check;
8632         }
8633     }
8634
8635     if (!ISMULT1(op)) {
8636         *flagp = flags;
8637         return(ret);
8638     }
8639
8640 #if 0                           /* Now runtime fix should be reliable. */
8641
8642     /* if this is reinstated, don't forget to put this back into perldiag:
8643
8644             =item Regexp *+ operand could be empty at {#} in regex m/%s/
8645
8646            (F) The part of the regexp subject to either the * or + quantifier
8647            could match an empty string. The {#} shows in the regular
8648            expression about where the problem was discovered.
8649
8650     */
8651
8652     if (!(flags&HASWIDTH) && op != '?')
8653       vFAIL("Regexp *+ operand could be empty");
8654 #endif
8655
8656 #ifdef RE_TRACK_PATTERN_OFFSETS
8657     parse_start = RExC_parse;
8658 #endif
8659     nextchar(pRExC_state);
8660
8661     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8662
8663     if (op == '*' && (flags&SIMPLE)) {
8664         reginsert(pRExC_state, STAR, ret, depth+1);
8665         ret->flags = 0;
8666         RExC_naughty += 4;
8667     }
8668     else if (op == '*') {
8669         min = 0;
8670         goto do_curly;
8671     }
8672     else if (op == '+' && (flags&SIMPLE)) {
8673         reginsert(pRExC_state, PLUS, ret, depth+1);
8674         ret->flags = 0;
8675         RExC_naughty += 3;
8676     }
8677     else if (op == '+') {
8678         min = 1;
8679         goto do_curly;
8680     }
8681     else if (op == '?') {
8682         min = 0; max = 1;
8683         goto do_curly;
8684     }
8685   nest_check:
8686     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8687         ckWARN3reg(RExC_parse,
8688                    "%.*s matches null string many times",
8689                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8690                    origparse);
8691     }
8692
8693     if (RExC_parse < RExC_end && *RExC_parse == '?') {
8694         nextchar(pRExC_state);
8695         reginsert(pRExC_state, MINMOD, ret, depth+1);
8696         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8697     }
8698 #ifndef REG_ALLOW_MINMOD_SUSPEND
8699     else
8700 #endif
8701     if (RExC_parse < RExC_end && *RExC_parse == '+') {
8702         regnode *ender;
8703         nextchar(pRExC_state);
8704         ender = reg_node(pRExC_state, SUCCEED);
8705         REGTAIL(pRExC_state, ret, ender);
8706         reginsert(pRExC_state, SUSPEND, ret, depth+1);
8707         ret->flags = 0;
8708         ender = reg_node(pRExC_state, TAIL);
8709         REGTAIL(pRExC_state, ret, ender);
8710         /*ret= ender;*/
8711     }
8712
8713     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8714         RExC_parse++;
8715         vFAIL("Nested quantifiers");
8716     }
8717
8718     return(ret);
8719 }
8720
8721
8722 /* reg_namedseq(pRExC_state,UVp, UV depth)
8723
8724    This is expected to be called by a parser routine that has
8725    recognized '\N' and needs to handle the rest. RExC_parse is
8726    expected to point at the first char following the N at the time
8727    of the call.
8728
8729    The \N may be inside (indicated by valuep not being NULL) or outside a
8730    character class.
8731
8732    \N may begin either a named sequence, or if outside a character class, mean
8733    to match a non-newline.  For non single-quoted regexes, the tokenizer has
8734    attempted to decide which, and in the case of a named sequence converted it
8735    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8736    where c1... are the characters in the sequence.  For single-quoted regexes,
8737    the tokenizer passes the \N sequence through unchanged; this code will not
8738    attempt to determine this nor expand those.  The net effect is that if the
8739    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8740    signals that this \N occurrence means to match a non-newline.
8741
8742    Only the \N{U+...} form should occur in a character class, for the same
8743    reason that '.' inside a character class means to just match a period: it
8744    just doesn't make sense.
8745
8746    If valuep is non-null then it is assumed that we are parsing inside
8747    of a charclass definition and the first codepoint in the resolved
8748    string is returned via *valuep and the routine will return NULL.
8749    In this mode if a multichar string is returned from the charnames
8750    handler, a warning will be issued, and only the first char in the
8751    sequence will be examined. If the string returned is zero length
8752    then the value of *valuep is undefined and NON-NULL will
8753    be returned to indicate failure. (This will NOT be a valid pointer
8754    to a regnode.)
8755
8756    If valuep is null then it is assumed that we are parsing normal text and a
8757    new EXACT node is inserted into the program containing the resolved string,
8758    and a pointer to the new node is returned.  But if the string is zero length
8759    a NOTHING node is emitted instead.
8760
8761    On success RExC_parse is set to the char following the endbrace.
8762    Parsing failures will generate a fatal error via vFAIL(...)
8763  */
8764 STATIC regnode *
8765 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8766 {
8767     char * endbrace;    /* '}' following the name */
8768     regnode *ret = NULL;
8769     char* p;
8770
8771     GET_RE_DEBUG_FLAGS_DECL;
8772
8773     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8774
8775     GET_RE_DEBUG_FLAGS;
8776
8777     /* The [^\n] meaning of \N ignores spaces and comments under the /x
8778      * modifier.  The other meaning does not */
8779     p = (RExC_flags & RXf_PMf_EXTENDED)
8780         ? regwhite( pRExC_state, RExC_parse )
8781         : RExC_parse;
8782
8783     /* Disambiguate between \N meaning a named character versus \N meaning
8784      * [^\n].  The former is assumed when it can't be the latter. */
8785     if (*p != '{' || regcurly(p)) {
8786         RExC_parse = p;
8787         if (valuep) {
8788             /* no bare \N in a charclass */
8789             vFAIL("\\N in a character class must be a named character: \\N{...}");
8790         }
8791         nextchar(pRExC_state);
8792         ret = reg_node(pRExC_state, REG_ANY);
8793         *flagp |= HASWIDTH|SIMPLE;
8794         RExC_naughty++;
8795         RExC_parse--;
8796         Set_Node_Length(ret, 1); /* MJD */
8797         return ret;
8798     }
8799
8800     /* Here, we have decided it should be a named sequence */
8801
8802     /* The test above made sure that the next real character is a '{', but
8803      * under the /x modifier, it could be separated by space (or a comment and
8804      * \n) and this is not allowed (for consistency with \x{...} and the
8805      * tokenizer handling of \N{NAME}). */
8806     if (*RExC_parse != '{') {
8807         vFAIL("Missing braces on \\N{}");
8808     }
8809
8810     RExC_parse++;       /* Skip past the '{' */
8811
8812     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8813         || ! (endbrace == RExC_parse            /* nothing between the {} */
8814               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
8815                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8816     {
8817         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
8818         vFAIL("\\N{NAME} must be resolved by the lexer");
8819     }
8820
8821     if (endbrace == RExC_parse) {   /* empty: \N{} */
8822         if (! valuep) {
8823             RExC_parse = endbrace + 1;
8824             return reg_node(pRExC_state,NOTHING);
8825         }
8826
8827         if (SIZE_ONLY) {
8828             ckWARNreg(RExC_parse,
8829                     "Ignoring zero length \\N{} in character class"
8830             );
8831             RExC_parse = endbrace + 1;
8832         }
8833         *valuep = 0;
8834         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8835     }
8836
8837     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
8838     RExC_parse += 2;    /* Skip past the 'U+' */
8839
8840     if (valuep) {   /* In a bracketed char class */
8841         /* We only pay attention to the first char of
8842         multichar strings being returned. I kinda wonder
8843         if this makes sense as it does change the behaviour
8844         from earlier versions, OTOH that behaviour was broken
8845         as well. XXX Solution is to recharacterize as
8846         [rest-of-class]|multi1|multi2... */
8847
8848         STRLEN length_of_hex;
8849         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8850             | PERL_SCAN_DISALLOW_PREFIX
8851             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8852
8853         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8854         if (endchar < endbrace) {
8855             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8856         }
8857
8858         length_of_hex = (STRLEN)(endchar - RExC_parse);
8859         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8860
8861         /* The tokenizer should have guaranteed validity, but it's possible to
8862          * bypass it by using single quoting, so check */
8863         if (length_of_hex == 0
8864             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8865         {
8866             RExC_parse += length_of_hex;        /* Includes all the valid */
8867             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
8868                             ? UTF8SKIP(RExC_parse)
8869                             : 1;
8870             /* Guard against malformed utf8 */
8871             if (RExC_parse >= endchar) RExC_parse = endchar;
8872             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8873         }
8874
8875         RExC_parse = endbrace + 1;
8876         if (endchar == endbrace) return NULL;
8877
8878         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
8879     }
8880     else {      /* Not a char class */
8881
8882         /* What is done here is to convert this to a sub-pattern of the form
8883          * (?:\x{char1}\x{char2}...)
8884          * and then call reg recursively.  That way, it retains its atomicness,
8885          * while not having to worry about special handling that some code
8886          * points may have.  toke.c has converted the original Unicode values
8887          * to native, so that we can just pass on the hex values unchanged.  We
8888          * do have to set a flag to keep recoding from happening in the
8889          * recursion */
8890
8891         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8892         STRLEN len;
8893         char *endchar;      /* Points to '.' or '}' ending cur char in the input
8894                                stream */
8895         char *orig_end = RExC_end;
8896
8897         while (RExC_parse < endbrace) {
8898
8899             /* Code points are separated by dots.  If none, there is only one
8900              * code point, and is terminated by the brace */
8901             endchar = RExC_parse + strcspn(RExC_parse, ".}");
8902
8903             /* Convert to notation the rest of the code understands */
8904             sv_catpv(substitute_parse, "\\x{");
8905             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8906             sv_catpv(substitute_parse, "}");
8907
8908             /* Point to the beginning of the next character in the sequence. */
8909             RExC_parse = endchar + 1;
8910         }
8911         sv_catpv(substitute_parse, ")");
8912
8913         RExC_parse = SvPV(substitute_parse, len);
8914
8915         /* Don't allow empty number */
8916         if (len < 8) {
8917             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8918         }
8919         RExC_end = RExC_parse + len;
8920
8921         /* The values are Unicode, and therefore not subject to recoding */
8922         RExC_override_recoding = 1;
8923
8924         ret = reg(pRExC_state, 1, flagp, depth+1);
8925
8926         RExC_parse = endbrace;
8927         RExC_end = orig_end;
8928         RExC_override_recoding = 0;
8929
8930         nextchar(pRExC_state);
8931     }
8932
8933     return ret;
8934 }
8935
8936
8937 /*
8938  * reg_recode
8939  *
8940  * It returns the code point in utf8 for the value in *encp.
8941  *    value: a code value in the source encoding
8942  *    encp:  a pointer to an Encode object
8943  *
8944  * If the result from Encode is not a single character,
8945  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8946  */
8947 STATIC UV
8948 S_reg_recode(pTHX_ const char value, SV **encp)
8949 {
8950     STRLEN numlen = 1;
8951     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
8952     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
8953     const STRLEN newlen = SvCUR(sv);
8954     UV uv = UNICODE_REPLACEMENT;
8955
8956     PERL_ARGS_ASSERT_REG_RECODE;
8957
8958     if (newlen)
8959         uv = SvUTF8(sv)
8960              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8961              : *(U8*)s;
8962
8963     if (!newlen || numlen != newlen) {
8964         uv = UNICODE_REPLACEMENT;
8965         *encp = NULL;
8966     }
8967     return uv;
8968 }
8969
8970
8971 /*
8972  - regatom - the lowest level
8973
8974    Try to identify anything special at the start of the pattern. If there
8975    is, then handle it as required. This may involve generating a single regop,
8976    such as for an assertion; or it may involve recursing, such as to
8977    handle a () structure.
8978
8979    If the string doesn't start with something special then we gobble up
8980    as much literal text as we can.
8981
8982    Once we have been able to handle whatever type of thing started the
8983    sequence, we return.
8984
8985    Note: we have to be careful with escapes, as they can be both literal
8986    and special, and in the case of \10 and friends can either, depending
8987    on context. Specifically there are two separate switches for handling
8988    escape sequences, with the one for handling literal escapes requiring
8989    a dummy entry for all of the special escapes that are actually handled
8990    by the other.
8991 */
8992
8993 STATIC regnode *
8994 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8995 {
8996     dVAR;
8997     register regnode *ret = NULL;
8998     I32 flags;
8999     char *parse_start = RExC_parse;
9000     U8 op;
9001     GET_RE_DEBUG_FLAGS_DECL;
9002     DEBUG_PARSE("atom");
9003     *flagp = WORST;             /* Tentatively. */
9004
9005     PERL_ARGS_ASSERT_REGATOM;
9006
9007 tryagain:
9008     switch ((U8)*RExC_parse) {
9009     case '^':
9010         RExC_seen_zerolen++;
9011         nextchar(pRExC_state);
9012         if (RExC_flags & RXf_PMf_MULTILINE)
9013             ret = reg_node(pRExC_state, MBOL);
9014         else if (RExC_flags & RXf_PMf_SINGLELINE)
9015             ret = reg_node(pRExC_state, SBOL);
9016         else
9017             ret = reg_node(pRExC_state, BOL);
9018         Set_Node_Length(ret, 1); /* MJD */
9019         break;
9020     case '$':
9021         nextchar(pRExC_state);
9022         if (*RExC_parse)
9023             RExC_seen_zerolen++;
9024         if (RExC_flags & RXf_PMf_MULTILINE)
9025             ret = reg_node(pRExC_state, MEOL);
9026         else if (RExC_flags & RXf_PMf_SINGLELINE)
9027             ret = reg_node(pRExC_state, SEOL);
9028         else
9029             ret = reg_node(pRExC_state, EOL);
9030         Set_Node_Length(ret, 1); /* MJD */
9031         break;
9032     case '.':
9033         nextchar(pRExC_state);
9034         if (RExC_flags & RXf_PMf_SINGLELINE)
9035             ret = reg_node(pRExC_state, SANY);
9036         else
9037             ret = reg_node(pRExC_state, REG_ANY);
9038         *flagp |= HASWIDTH|SIMPLE;
9039         RExC_naughty++;
9040         Set_Node_Length(ret, 1); /* MJD */
9041         break;
9042     case '[':
9043     {
9044         char * const oregcomp_parse = ++RExC_parse;
9045         ret = regclass(pRExC_state,depth+1);
9046         if (*RExC_parse != ']') {
9047             RExC_parse = oregcomp_parse;
9048             vFAIL("Unmatched [");
9049         }
9050         nextchar(pRExC_state);
9051         *flagp |= HASWIDTH|SIMPLE;
9052         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9053         break;
9054     }
9055     case '(':
9056         nextchar(pRExC_state);
9057         ret = reg(pRExC_state, 1, &flags,depth+1);
9058         if (ret == NULL) {
9059                 if (flags & TRYAGAIN) {
9060                     if (RExC_parse == RExC_end) {
9061                          /* Make parent create an empty node if needed. */
9062                         *flagp |= TRYAGAIN;
9063                         return(NULL);
9064                     }
9065                     goto tryagain;
9066                 }
9067                 return(NULL);
9068         }
9069         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9070         break;
9071     case '|':
9072     case ')':
9073         if (flags & TRYAGAIN) {
9074             *flagp |= TRYAGAIN;
9075             return NULL;
9076         }
9077         vFAIL("Internal urp");
9078                                 /* Supposed to be caught earlier. */
9079         break;
9080     case '?':
9081     case '+':
9082     case '*':
9083         RExC_parse++;
9084         vFAIL("Quantifier follows nothing");
9085         break;
9086     case '\\':
9087         /* Special Escapes
9088
9089            This switch handles escape sequences that resolve to some kind
9090            of special regop and not to literal text. Escape sequnces that
9091            resolve to literal text are handled below in the switch marked
9092            "Literal Escapes".
9093
9094            Every entry in this switch *must* have a corresponding entry
9095            in the literal escape switch. However, the opposite is not
9096            required, as the default for this switch is to jump to the
9097            literal text handling code.
9098         */
9099         switch ((U8)*++RExC_parse) {
9100         /* Special Escapes */
9101         case 'A':
9102             RExC_seen_zerolen++;
9103             ret = reg_node(pRExC_state, SBOL);
9104             *flagp |= SIMPLE;
9105             goto finish_meta_pat;
9106         case 'G':
9107             ret = reg_node(pRExC_state, GPOS);
9108             RExC_seen |= REG_SEEN_GPOS;
9109             *flagp |= SIMPLE;
9110             goto finish_meta_pat;
9111         case 'K':
9112             RExC_seen_zerolen++;
9113             ret = reg_node(pRExC_state, KEEPS);
9114             *flagp |= SIMPLE;
9115             /* XXX:dmq : disabling in-place substitution seems to
9116              * be necessary here to avoid cases of memory corruption, as
9117              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9118              */
9119             RExC_seen |= REG_SEEN_LOOKBEHIND;
9120             goto finish_meta_pat;
9121         case 'Z':
9122             ret = reg_node(pRExC_state, SEOL);
9123             *flagp |= SIMPLE;
9124             RExC_seen_zerolen++;                /* Do not optimize RE away */
9125             goto finish_meta_pat;
9126         case 'z':
9127             ret = reg_node(pRExC_state, EOS);
9128             *flagp |= SIMPLE;
9129             RExC_seen_zerolen++;                /* Do not optimize RE away */
9130             goto finish_meta_pat;
9131         case 'C':
9132             ret = reg_node(pRExC_state, CANY);
9133             RExC_seen |= REG_SEEN_CANY;
9134             *flagp |= HASWIDTH|SIMPLE;
9135             goto finish_meta_pat;
9136         case 'X':
9137             ret = reg_node(pRExC_state, CLUMP);
9138             *flagp |= HASWIDTH;
9139             goto finish_meta_pat;
9140         case 'w':
9141             switch (get_regex_charset(RExC_flags)) {
9142                 case REGEX_LOCALE_CHARSET:
9143                     op = ALNUML;
9144                     break;
9145                 case REGEX_UNICODE_CHARSET:
9146                     op = ALNUMU;
9147                     break;
9148                 case REGEX_ASCII_RESTRICTED_CHARSET:
9149                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9150                     op = ALNUMA;
9151                     break;
9152                 case REGEX_DEPENDS_CHARSET:
9153                     op = ALNUM;
9154                     break;
9155                 default:
9156                     goto bad_charset;
9157             }
9158             ret = reg_node(pRExC_state, op);
9159             *flagp |= HASWIDTH|SIMPLE;
9160             goto finish_meta_pat;
9161         case 'W':
9162             switch (get_regex_charset(RExC_flags)) {
9163                 case REGEX_LOCALE_CHARSET:
9164                     op = NALNUML;
9165                     break;
9166                 case REGEX_UNICODE_CHARSET:
9167                     op = NALNUMU;
9168                     break;
9169                 case REGEX_ASCII_RESTRICTED_CHARSET:
9170                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9171                     op = NALNUMA;
9172                     break;
9173                 case REGEX_DEPENDS_CHARSET:
9174                     op = NALNUM;
9175                     break;
9176                 default:
9177                     goto bad_charset;
9178             }
9179             ret = reg_node(pRExC_state, op);
9180             *flagp |= HASWIDTH|SIMPLE;
9181             goto finish_meta_pat;
9182         case 'b':
9183             RExC_seen_zerolen++;
9184             RExC_seen |= REG_SEEN_LOOKBEHIND;
9185             switch (get_regex_charset(RExC_flags)) {
9186                 case REGEX_LOCALE_CHARSET:
9187                     op = BOUNDL;
9188                     break;
9189                 case REGEX_UNICODE_CHARSET:
9190                     op = BOUNDU;
9191                     break;
9192                 case REGEX_ASCII_RESTRICTED_CHARSET:
9193                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9194                     op = BOUNDA;
9195                     break;
9196                 case REGEX_DEPENDS_CHARSET:
9197                     op = BOUND;
9198                     break;
9199                 default:
9200                     goto bad_charset;
9201             }
9202             ret = reg_node(pRExC_state, op);
9203             FLAGS(ret) = get_regex_charset(RExC_flags);
9204             *flagp |= SIMPLE;
9205             goto finish_meta_pat;
9206         case 'B':
9207             RExC_seen_zerolen++;
9208             RExC_seen |= REG_SEEN_LOOKBEHIND;
9209             switch (get_regex_charset(RExC_flags)) {
9210                 case REGEX_LOCALE_CHARSET:
9211                     op = NBOUNDL;
9212                     break;
9213                 case REGEX_UNICODE_CHARSET:
9214                     op = NBOUNDU;
9215                     break;
9216                 case REGEX_ASCII_RESTRICTED_CHARSET:
9217                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9218                     op = NBOUNDA;
9219                     break;
9220                 case REGEX_DEPENDS_CHARSET:
9221                     op = NBOUND;
9222                     break;
9223                 default:
9224                     goto bad_charset;
9225             }
9226             ret = reg_node(pRExC_state, op);
9227             FLAGS(ret) = get_regex_charset(RExC_flags);
9228             *flagp |= SIMPLE;
9229             goto finish_meta_pat;
9230         case 's':
9231             switch (get_regex_charset(RExC_flags)) {
9232                 case REGEX_LOCALE_CHARSET:
9233                     op = SPACEL;
9234                     break;
9235                 case REGEX_UNICODE_CHARSET:
9236                     op = SPACEU;
9237                     break;
9238                 case REGEX_ASCII_RESTRICTED_CHARSET:
9239                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9240                     op = SPACEA;
9241                     break;
9242                 case REGEX_DEPENDS_CHARSET:
9243                     op = SPACE;
9244                     break;
9245                 default:
9246                     goto bad_charset;
9247             }
9248             ret = reg_node(pRExC_state, op);
9249             *flagp |= HASWIDTH|SIMPLE;
9250             goto finish_meta_pat;
9251         case 'S':
9252             switch (get_regex_charset(RExC_flags)) {
9253                 case REGEX_LOCALE_CHARSET:
9254                     op = NSPACEL;
9255                     break;
9256                 case REGEX_UNICODE_CHARSET:
9257                     op = NSPACEU;
9258                     break;
9259                 case REGEX_ASCII_RESTRICTED_CHARSET:
9260                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9261                     op = NSPACEA;
9262                     break;
9263                 case REGEX_DEPENDS_CHARSET:
9264                     op = NSPACE;
9265                     break;
9266                 default:
9267                     goto bad_charset;
9268             }
9269             ret = reg_node(pRExC_state, op);
9270             *flagp |= HASWIDTH|SIMPLE;
9271             goto finish_meta_pat;
9272         case 'd':
9273             switch (get_regex_charset(RExC_flags)) {
9274                 case REGEX_LOCALE_CHARSET:
9275                     op = DIGITL;
9276                     break;
9277                 case REGEX_ASCII_RESTRICTED_CHARSET:
9278                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9279                     op = DIGITA;
9280                     break;
9281                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9282                 case REGEX_UNICODE_CHARSET:
9283                     op = DIGIT;
9284                     break;
9285                 default:
9286                     goto bad_charset;
9287             }
9288             ret = reg_node(pRExC_state, op);
9289             *flagp |= HASWIDTH|SIMPLE;
9290             goto finish_meta_pat;
9291         case 'D':
9292             switch (get_regex_charset(RExC_flags)) {
9293                 case REGEX_LOCALE_CHARSET:
9294                     op = NDIGITL;
9295                     break;
9296                 case REGEX_ASCII_RESTRICTED_CHARSET:
9297                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9298                     op = NDIGITA;
9299                     break;
9300                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9301                 case REGEX_UNICODE_CHARSET:
9302                     op = NDIGIT;
9303                     break;
9304                 default:
9305                     goto bad_charset;
9306             }
9307             ret = reg_node(pRExC_state, op);
9308             *flagp |= HASWIDTH|SIMPLE;
9309             goto finish_meta_pat;
9310         case 'R':
9311             ret = reg_node(pRExC_state, LNBREAK);
9312             *flagp |= HASWIDTH|SIMPLE;
9313             goto finish_meta_pat;
9314         case 'h':
9315             ret = reg_node(pRExC_state, HORIZWS);
9316             *flagp |= HASWIDTH|SIMPLE;
9317             goto finish_meta_pat;
9318         case 'H':
9319             ret = reg_node(pRExC_state, NHORIZWS);
9320             *flagp |= HASWIDTH|SIMPLE;
9321             goto finish_meta_pat;
9322         case 'v':
9323             ret = reg_node(pRExC_state, VERTWS);
9324             *flagp |= HASWIDTH|SIMPLE;
9325             goto finish_meta_pat;
9326         case 'V':
9327             ret = reg_node(pRExC_state, NVERTWS);
9328             *flagp |= HASWIDTH|SIMPLE;
9329          finish_meta_pat:
9330             nextchar(pRExC_state);
9331             Set_Node_Length(ret, 2); /* MJD */
9332             break;
9333         case 'p':
9334         case 'P':
9335             {
9336                 char* const oldregxend = RExC_end;
9337 #ifdef DEBUGGING
9338                 char* parse_start = RExC_parse - 2;
9339 #endif
9340
9341                 if (RExC_parse[1] == '{') {
9342                   /* a lovely hack--pretend we saw [\pX] instead */
9343                     RExC_end = strchr(RExC_parse, '}');
9344                     if (!RExC_end) {
9345                         const U8 c = (U8)*RExC_parse;
9346                         RExC_parse += 2;
9347                         RExC_end = oldregxend;
9348                         vFAIL2("Missing right brace on \\%c{}", c);
9349                     }
9350                     RExC_end++;
9351                 }
9352                 else {
9353                     RExC_end = RExC_parse + 2;
9354                     if (RExC_end > oldregxend)
9355                         RExC_end = oldregxend;
9356                 }
9357                 RExC_parse--;
9358
9359                 ret = regclass(pRExC_state,depth+1);
9360
9361                 RExC_end = oldregxend;
9362                 RExC_parse--;
9363
9364                 Set_Node_Offset(ret, parse_start + 2);
9365                 Set_Node_Cur_Length(ret);
9366                 nextchar(pRExC_state);
9367                 *flagp |= HASWIDTH|SIMPLE;
9368             }
9369             break;
9370         case 'N':
9371             /* Handle \N and \N{NAME} here and not below because it can be
9372             multicharacter. join_exact() will join them up later on.
9373             Also this makes sure that things like /\N{BLAH}+/ and
9374             \N{BLAH} being multi char Just Happen. dmq*/
9375             ++RExC_parse;
9376             ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9377             break;
9378         case 'k':    /* Handle \k<NAME> and \k'NAME' */
9379         parse_named_seq:
9380         {
9381             char ch= RExC_parse[1];
9382             if (ch != '<' && ch != '\'' && ch != '{') {
9383                 RExC_parse++;
9384                 vFAIL2("Sequence %.2s... not terminated",parse_start);
9385             } else {
9386                 /* this pretty much dupes the code for (?P=...) in reg(), if
9387                    you change this make sure you change that */
9388                 char* name_start = (RExC_parse += 2);
9389                 U32 num = 0;
9390                 SV *sv_dat = reg_scan_name(pRExC_state,
9391                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9392                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9393                 if (RExC_parse == name_start || *RExC_parse != ch)
9394                     vFAIL2("Sequence %.3s... not terminated",parse_start);
9395
9396                 if (!SIZE_ONLY) {
9397                     num = add_data( pRExC_state, 1, "S" );
9398                     RExC_rxi->data->data[num]=(void*)sv_dat;
9399                     SvREFCNT_inc_simple_void(sv_dat);
9400                 }
9401
9402                 RExC_sawback = 1;
9403                 ret = reganode(pRExC_state,
9404                                ((! FOLD)
9405                                  ? NREF
9406                                  : (MORE_ASCII_RESTRICTED)
9407                                    ? NREFFA
9408                                    : (AT_LEAST_UNI_SEMANTICS)
9409                                      ? NREFFU
9410                                      : (LOC)
9411                                        ? NREFFL
9412                                        : NREFF),
9413                                 num);
9414                 *flagp |= HASWIDTH;
9415
9416                 /* override incorrect value set in reganode MJD */
9417                 Set_Node_Offset(ret, parse_start+1);
9418                 Set_Node_Cur_Length(ret); /* MJD */
9419                 nextchar(pRExC_state);
9420
9421             }
9422             break;
9423         }
9424         case 'g':
9425         case '1': case '2': case '3': case '4':
9426         case '5': case '6': case '7': case '8': case '9':
9427             {
9428                 I32 num;
9429                 bool isg = *RExC_parse == 'g';
9430                 bool isrel = 0;
9431                 bool hasbrace = 0;
9432                 if (isg) {
9433                     RExC_parse++;
9434                     if (*RExC_parse == '{') {
9435                         RExC_parse++;
9436                         hasbrace = 1;
9437                     }
9438                     if (*RExC_parse == '-') {
9439                         RExC_parse++;
9440                         isrel = 1;
9441                     }
9442                     if (hasbrace && !isDIGIT(*RExC_parse)) {
9443                         if (isrel) RExC_parse--;
9444                         RExC_parse -= 2;
9445                         goto parse_named_seq;
9446                 }   }
9447                 num = atoi(RExC_parse);
9448                 if (isg && num == 0)
9449                     vFAIL("Reference to invalid group 0");
9450                 if (isrel) {
9451                     num = RExC_npar - num;
9452                     if (num < 1)
9453                         vFAIL("Reference to nonexistent or unclosed group");
9454                 }
9455                 if (!isg && num > 9 && num >= RExC_npar)
9456                     goto defchar;
9457                 else {
9458                     char * const parse_start = RExC_parse - 1; /* MJD */
9459                     while (isDIGIT(*RExC_parse))
9460                         RExC_parse++;
9461                     if (parse_start == RExC_parse - 1)
9462                         vFAIL("Unterminated \\g... pattern");
9463                     if (hasbrace) {
9464                         if (*RExC_parse != '}')
9465                             vFAIL("Unterminated \\g{...} pattern");
9466                         RExC_parse++;
9467                     }
9468                     if (!SIZE_ONLY) {
9469                         if (num > (I32)RExC_rx->nparens)
9470                             vFAIL("Reference to nonexistent group");
9471                     }
9472                     RExC_sawback = 1;
9473                     ret = reganode(pRExC_state,
9474                                    ((! FOLD)
9475                                      ? REF
9476                                      : (MORE_ASCII_RESTRICTED)
9477                                        ? REFFA
9478                                        : (AT_LEAST_UNI_SEMANTICS)
9479                                          ? REFFU
9480                                          : (LOC)
9481                                            ? REFFL
9482                                            : REFF),
9483                                     num);
9484                     *flagp |= HASWIDTH;
9485
9486                     /* override incorrect value set in reganode MJD */
9487                     Set_Node_Offset(ret, parse_start+1);
9488                     Set_Node_Cur_Length(ret); /* MJD */
9489                     RExC_parse--;
9490                     nextchar(pRExC_state);
9491                 }
9492             }
9493             break;
9494         case '\0':
9495             if (RExC_parse >= RExC_end)
9496                 FAIL("Trailing \\");
9497             /* FALL THROUGH */
9498         default:
9499             /* Do not generate "unrecognized" warnings here, we fall
9500                back into the quick-grab loop below */
9501             parse_start--;
9502             goto defchar;
9503         }
9504         break;
9505
9506     case '#':
9507         if (RExC_flags & RXf_PMf_EXTENDED) {
9508             if ( reg_skipcomment( pRExC_state ) )
9509                 goto tryagain;
9510         }
9511         /* FALL THROUGH */
9512
9513     default:
9514
9515             parse_start = RExC_parse - 1;
9516
9517             RExC_parse++;
9518
9519         defchar: {
9520             register STRLEN len;
9521             register UV ender;
9522             register char *p;
9523             char *s;
9524             STRLEN foldlen;
9525             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9526             U8 node_type;
9527
9528             /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
9529              * it is folded to 'ss' even if not utf8 */
9530             bool is_exactfu_sharp_s;
9531
9532             ender = 0;
9533             node_type = ((! FOLD) ? EXACT
9534                         : (LOC)
9535                           ? EXACTFL
9536                           : (MORE_ASCII_RESTRICTED)
9537                             ? EXACTFA
9538                             : (AT_LEAST_UNI_SEMANTICS)
9539                               ? EXACTFU
9540                               : EXACTF);
9541             ret = reg_node(pRExC_state, node_type);
9542             s = STRING(ret);
9543
9544             /* XXX The node can hold up to 255 bytes, yet this only goes to
9545              * 127.  I (khw) do not know why.  Keeping it somewhat less than
9546              * 255 allows us to not have to worry about overflow due to
9547              * converting to utf8 and fold expansion, but that value is
9548              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
9549              * split up by this limit into a single one using the real max of
9550              * 255.  Even at 127, this breaks under rare circumstances.  If
9551              * folding, we do not want to split a node at a character that is a
9552              * non-final in a multi-char fold, as an input string could just
9553              * happen to want to match across the node boundary.  The join
9554              * would solve that problem if the join actually happens.  But a
9555              * series of more than two nodes in a row each of 127 would cause
9556              * the first join to succeed to get to 254, but then there wouldn't
9557              * be room for the next one, which could at be one of those split
9558              * multi-char folds.  I don't know of any fool-proof solution.  One
9559              * could back off to end with only a code point that isn't such a
9560              * non-final, but it is possible for there not to be any in the
9561              * entire node. */
9562             for (len = 0, p = RExC_parse - 1;
9563                  len < 127 && p < RExC_end;
9564                  len++)
9565             {
9566                 char * const oldp = p;
9567
9568                 if (RExC_flags & RXf_PMf_EXTENDED)
9569                     p = regwhite( pRExC_state, p );
9570                 switch ((U8)*p) {
9571                 case '^':
9572                 case '$':
9573                 case '.':
9574                 case '[':
9575                 case '(':
9576                 case ')':
9577                 case '|':
9578                     goto loopdone;
9579                 case '\\':
9580                     /* Literal Escapes Switch
9581
9582                        This switch is meant to handle escape sequences that
9583                        resolve to a literal character.
9584
9585                        Every escape sequence that represents something
9586                        else, like an assertion or a char class, is handled
9587                        in the switch marked 'Special Escapes' above in this
9588                        routine, but also has an entry here as anything that
9589                        isn't explicitly mentioned here will be treated as
9590                        an unescaped equivalent literal.
9591                     */
9592
9593                     switch ((U8)*++p) {
9594                     /* These are all the special escapes. */
9595                     case 'A':             /* Start assertion */
9596                     case 'b': case 'B':   /* Word-boundary assertion*/
9597                     case 'C':             /* Single char !DANGEROUS! */
9598                     case 'd': case 'D':   /* digit class */
9599                     case 'g': case 'G':   /* generic-backref, pos assertion */
9600                     case 'h': case 'H':   /* HORIZWS */
9601                     case 'k': case 'K':   /* named backref, keep marker */
9602                     case 'N':             /* named char sequence */
9603                     case 'p': case 'P':   /* Unicode property */
9604                               case 'R':   /* LNBREAK */
9605                     case 's': case 'S':   /* space class */
9606                     case 'v': case 'V':   /* VERTWS */
9607                     case 'w': case 'W':   /* word class */
9608                     case 'X':             /* eXtended Unicode "combining character sequence" */
9609                     case 'z': case 'Z':   /* End of line/string assertion */
9610                         --p;
9611                         goto loopdone;
9612
9613                     /* Anything after here is an escape that resolves to a
9614                        literal. (Except digits, which may or may not)
9615                      */
9616                     case 'n':
9617                         ender = '\n';
9618                         p++;
9619                         break;
9620                     case 'r':
9621                         ender = '\r';
9622                         p++;
9623                         break;
9624                     case 't':
9625                         ender = '\t';
9626                         p++;
9627                         break;
9628                     case 'f':
9629                         ender = '\f';
9630                         p++;
9631                         break;
9632                     case 'e':
9633                           ender = ASCII_TO_NATIVE('\033');
9634                         p++;
9635                         break;
9636                     case 'a':
9637                           ender = ASCII_TO_NATIVE('\007');
9638                         p++;
9639                         break;
9640                     case 'o':
9641                         {
9642                             STRLEN brace_len = len;
9643                             UV result;
9644                             const char* error_msg;
9645
9646                             bool valid = grok_bslash_o(p,
9647                                                        &result,
9648                                                        &brace_len,
9649                                                        &error_msg,
9650                                                        1);
9651                             p += brace_len;
9652                             if (! valid) {
9653                                 RExC_parse = p; /* going to die anyway; point
9654                                                    to exact spot of failure */
9655                                 vFAIL(error_msg);
9656                             }
9657                             else
9658                             {
9659                                 ender = result;
9660                             }
9661                             if (PL_encoding && ender < 0x100) {
9662                                 goto recode_encoding;
9663                             }
9664                             if (ender > 0xff) {
9665                                 REQUIRE_UTF8;
9666                             }
9667                             break;
9668                         }
9669                     case 'x':
9670                         if (*++p == '{') {
9671                             char* const e = strchr(p, '}');
9672
9673                             if (!e) {
9674                                 RExC_parse = p + 1;
9675                                 vFAIL("Missing right brace on \\x{}");
9676                             }
9677                             else {
9678                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9679                                     | PERL_SCAN_DISALLOW_PREFIX;
9680                                 STRLEN numlen = e - p - 1;
9681                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9682                                 if (ender > 0xff)
9683                                     REQUIRE_UTF8;
9684                                 p = e + 1;
9685                             }
9686                         }
9687                         else {
9688                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9689                             STRLEN numlen = 2;
9690                             ender = grok_hex(p, &numlen, &flags, NULL);
9691                             p += numlen;
9692                         }
9693                         if (PL_encoding && ender < 0x100)
9694                             goto recode_encoding;
9695                         break;
9696                     case 'c':
9697                         p++;
9698                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9699                         break;
9700                     case '0': case '1': case '2': case '3':case '4':
9701                     case '5': case '6': case '7': case '8':case '9':
9702                         if (*p == '0' ||
9703                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9704                         {
9705                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9706                             STRLEN numlen = 3;
9707                             ender = grok_oct(p, &numlen, &flags, NULL);
9708                             if (ender > 0xff) {
9709                                 REQUIRE_UTF8;
9710                             }
9711                             p += numlen;
9712                         }
9713                         else {
9714                             --p;
9715                             goto loopdone;
9716                         }
9717                         if (PL_encoding && ender < 0x100)
9718                             goto recode_encoding;
9719                         break;
9720                     recode_encoding:
9721                         if (! RExC_override_recoding) {
9722                             SV* enc = PL_encoding;
9723                             ender = reg_recode((const char)(U8)ender, &enc);
9724                             if (!enc && SIZE_ONLY)
9725                                 ckWARNreg(p, "Invalid escape in the specified encoding");
9726                             REQUIRE_UTF8;
9727                         }
9728                         break;
9729                     case '\0':
9730                         if (p >= RExC_end)
9731                             FAIL("Trailing \\");
9732                         /* FALL THROUGH */
9733                     default:
9734                         if (!SIZE_ONLY&& isALPHA(*p)) {
9735                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
9736                         }
9737                         goto normal_default;
9738                     }
9739                     break;
9740                 case '{':
9741                     /* Currently we don't warn when the lbrace is at the start
9742                      * of a construct.  This catches it in the middle of a
9743                      * literal string, or when its the first thing after
9744                      * something like "\b" */
9745                     if (! SIZE_ONLY
9746                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
9747                     {
9748                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
9749                     }
9750                     /*FALLTHROUGH*/
9751                 default:
9752                   normal_default:
9753                     if (UTF8_IS_START(*p) && UTF) {
9754                         STRLEN numlen;
9755                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9756                                                &numlen, UTF8_ALLOW_DEFAULT);
9757                         p += numlen;
9758                     }
9759                     else
9760                         ender = (U8) *p++;
9761                     break;
9762                 } /* End of switch on the literal */
9763
9764                 is_exactfu_sharp_s = (node_type == EXACTFU
9765                                       && ender == LATIN_SMALL_LETTER_SHARP_S);
9766                 if ( RExC_flags & RXf_PMf_EXTENDED)
9767                     p = regwhite( pRExC_state, p );
9768                 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9769                     /* Prime the casefolded buffer.  Locale rules, which apply
9770                      * only to code points < 256, aren't known until execution,
9771                      * so for them, just output the original character using
9772                      * utf8.  If we start to fold non-UTF patterns, be sure to
9773                      * update join_exact() */
9774                     if (LOC && ender < 256) {
9775                         if (UNI_IS_INVARIANT(ender)) {
9776                             *tmpbuf = (U8) ender;
9777                             foldlen = 1;
9778                         } else {
9779                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9780                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9781                             foldlen = 2;
9782                         }
9783                     }
9784                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
9785                                                  */
9786                         ender = toLOWER(ender);
9787                         *tmpbuf = (U8) ender;
9788                         foldlen = 1;
9789                     }
9790                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9791
9792                         /* Locale and /aa require more selectivity about the
9793                          * fold, so are handled below.  Otherwise, here, just
9794                          * use the fold */
9795                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9796                     }
9797                     else {
9798                         /* Under locale rules or /aa we are not to mix,
9799                          * respectively, ords < 256 or ASCII with non-.  So
9800                          * reject folds that mix them, using only the
9801                          * non-folded code point.  So do the fold to a
9802                          * temporary, and inspect each character in it. */
9803                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9804                         U8* s = trialbuf;
9805                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9806                         U8* e = s + foldlen;
9807                         bool fold_ok = TRUE;
9808
9809                         while (s < e) {
9810                             if (isASCII(*s)
9811                                 || (LOC && (UTF8_IS_INVARIANT(*s)
9812                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
9813                             {
9814                                 fold_ok = FALSE;
9815                                 break;
9816                             }
9817                             s += UTF8SKIP(s);
9818                         }
9819                         if (fold_ok) {
9820                             Copy(trialbuf, tmpbuf, foldlen, U8);
9821                             ender = tmpender;
9822                         }
9823                         else {
9824                             uvuni_to_utf8(tmpbuf, ender);
9825                             foldlen = UNISKIP(ender);
9826                         }
9827                     }
9828                 }
9829                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9830                     if (len)
9831                         p = oldp;
9832                     else if (UTF || is_exactfu_sharp_s) {
9833                          if (FOLD) {
9834                               /* Emit all the Unicode characters. */
9835                               STRLEN numlen;
9836                               for (foldbuf = tmpbuf;
9837                                    foldlen;
9838                                    foldlen -= numlen) {
9839
9840                                    /* tmpbuf has been constructed by us, so we
9841                                     * know it is valid utf8 */
9842                                    ender = valid_utf8_to_uvchr(foldbuf, &numlen);
9843                                    if (numlen > 0) {
9844                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
9845                                         s       += unilen;
9846                                         len     += unilen;
9847                                         /* In EBCDIC the numlen
9848                                          * and unilen can differ. */
9849                                         foldbuf += numlen;
9850                                         if (numlen >= foldlen)
9851                                              break;
9852                                    }
9853                                    else
9854                                         break; /* "Can't happen." */
9855                               }
9856                          }
9857                          else {
9858                               const STRLEN unilen = reguni(pRExC_state, ender, s);
9859                               if (unilen > 0) {
9860                                    s   += unilen;
9861                                    len += unilen;
9862                               }
9863                          }
9864                     }
9865                     else {
9866                         len++;
9867                         REGC((char)ender, s++);
9868                     }
9869                     break;
9870                 }
9871                 if (UTF || is_exactfu_sharp_s) {
9872                      if (FOLD) {
9873                           /* Emit all the Unicode characters. */
9874                           STRLEN numlen;
9875                           for (foldbuf = tmpbuf;
9876                                foldlen;
9877                                foldlen -= numlen) {
9878                                ender = valid_utf8_to_uvchr(foldbuf, &numlen);
9879                                if (numlen > 0) {
9880                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
9881                                     len     += unilen;
9882                                     s       += unilen;
9883                                     /* In EBCDIC the numlen
9884                                      * and unilen can differ. */
9885                                     foldbuf += numlen;
9886                                     if (numlen >= foldlen)
9887                                          break;
9888                                }
9889                                else
9890                                     break;
9891                           }
9892                      }
9893                      else {
9894                           const STRLEN unilen = reguni(pRExC_state, ender, s);
9895                           if (unilen > 0) {
9896                                s   += unilen;
9897                                len += unilen;
9898                           }
9899                      }
9900                      len--;
9901                 }
9902                 else {
9903                     REGC((char)ender, s++);
9904                 }
9905             }
9906         loopdone:   /* Jumped to when encounters something that shouldn't be in
9907                        the node */
9908             RExC_parse = p - 1;
9909             Set_Node_Cur_Length(ret); /* MJD */
9910             nextchar(pRExC_state);
9911             {
9912                 /* len is STRLEN which is unsigned, need to copy to signed */
9913                 IV iv = len;
9914                 if (iv < 0)
9915                     vFAIL("Internal disaster");
9916             }
9917             if (len > 0)
9918                 *flagp |= HASWIDTH;
9919             if (len == 1 && UNI_IS_INVARIANT(ender))
9920                 *flagp |= SIMPLE;
9921
9922             if (SIZE_ONLY)
9923                 RExC_size += STR_SZ(len);
9924             else {
9925                 STR_LEN(ret) = len;
9926                 RExC_emit += STR_SZ(len);
9927             }
9928         }
9929         break;
9930     }
9931
9932     return(ret);
9933
9934 /* Jumped to when an unrecognized character set is encountered */
9935 bad_charset:
9936     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9937     return(NULL);
9938 }
9939
9940 STATIC char *
9941 S_regwhite( RExC_state_t *pRExC_state, char *p )
9942 {
9943     const char *e = RExC_end;
9944
9945     PERL_ARGS_ASSERT_REGWHITE;
9946
9947     while (p < e) {
9948         if (isSPACE(*p))
9949             ++p;
9950         else if (*p == '#') {
9951             bool ended = 0;
9952             do {
9953                 if (*p++ == '\n') {
9954                     ended = 1;
9955                     break;
9956                 }
9957             } while (p < e);
9958             if (!ended)
9959                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9960         }
9961         else
9962             break;
9963     }
9964     return p;
9965 }
9966
9967 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9968    Character classes ([:foo:]) can also be negated ([:^foo:]).
9969    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9970    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9971    but trigger failures because they are currently unimplemented. */
9972
9973 #define POSIXCC_DONE(c)   ((c) == ':')
9974 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9975 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9976
9977 STATIC I32
9978 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9979 {
9980     dVAR;
9981     I32 namedclass = OOB_NAMEDCLASS;
9982
9983     PERL_ARGS_ASSERT_REGPPOSIXCC;
9984
9985     if (value == '[' && RExC_parse + 1 < RExC_end &&
9986         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9987         POSIXCC(UCHARAT(RExC_parse))) {
9988         const char c = UCHARAT(RExC_parse);
9989         char* const s = RExC_parse++;
9990
9991         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9992             RExC_parse++;
9993         if (RExC_parse == RExC_end)
9994             /* Grandfather lone [:, [=, [. */
9995             RExC_parse = s;
9996         else {
9997             const char* const t = RExC_parse++; /* skip over the c */
9998             assert(*t == c);
9999
10000             if (UCHARAT(RExC_parse) == ']') {
10001                 const char *posixcc = s + 1;
10002                 RExC_parse++; /* skip over the ending ] */
10003
10004                 if (*s == ':') {
10005                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10006                     const I32 skip = t - posixcc;
10007
10008                     /* Initially switch on the length of the name.  */
10009                     switch (skip) {
10010                     case 4:
10011                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10012                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
10013                         break;
10014                     case 5:
10015                         /* Names all of length 5.  */
10016                         /* alnum alpha ascii blank cntrl digit graph lower
10017                            print punct space upper  */
10018                         /* Offset 4 gives the best switch position.  */
10019                         switch (posixcc[4]) {
10020                         case 'a':
10021                             if (memEQ(posixcc, "alph", 4)) /* alpha */
10022                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10023                             break;
10024                         case 'e':
10025                             if (memEQ(posixcc, "spac", 4)) /* space */
10026                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10027                             break;
10028                         case 'h':
10029                             if (memEQ(posixcc, "grap", 4)) /* graph */
10030                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10031                             break;
10032                         case 'i':
10033                             if (memEQ(posixcc, "asci", 4)) /* ascii */
10034                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10035                             break;
10036                         case 'k':
10037                             if (memEQ(posixcc, "blan", 4)) /* blank */
10038                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10039                             break;
10040                         case 'l':
10041                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10042                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10043                             break;
10044                         case 'm':
10045                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
10046                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10047                             break;
10048                         case 'r':
10049                             if (memEQ(posixcc, "lowe", 4)) /* lower */
10050                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10051                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
10052                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10053                             break;
10054                         case 't':
10055                             if (memEQ(posixcc, "digi", 4)) /* digit */
10056                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10057                             else if (memEQ(posixcc, "prin", 4)) /* print */
10058                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10059                             else if (memEQ(posixcc, "punc", 4)) /* punct */
10060                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10061                             break;
10062                         }
10063                         break;
10064                     case 6:
10065                         if (memEQ(posixcc, "xdigit", 6))
10066                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10067                         break;
10068                     }
10069
10070                     if (namedclass == OOB_NAMEDCLASS)
10071                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10072                                       t - s - 1, s + 1);
10073                     assert (posixcc[skip] == ':');
10074                     assert (posixcc[skip+1] == ']');
10075                 } else if (!SIZE_ONLY) {
10076                     /* [[=foo=]] and [[.foo.]] are still future. */
10077
10078                     /* adjust RExC_parse so the warning shows after
10079                        the class closes */
10080                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10081                         RExC_parse++;
10082                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10083                 }
10084             } else {
10085                 /* Maternal grandfather:
10086                  * "[:" ending in ":" but not in ":]" */
10087                 RExC_parse = s;
10088             }
10089         }
10090     }
10091
10092     return namedclass;
10093 }
10094
10095 STATIC void
10096 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10097 {
10098     dVAR;
10099
10100     PERL_ARGS_ASSERT_CHECKPOSIXCC;
10101
10102     if (POSIXCC(UCHARAT(RExC_parse))) {
10103         const char *s = RExC_parse;
10104         const char  c = *s++;
10105
10106         while (isALNUM(*s))
10107             s++;
10108         if (*s && c == *s && s[1] == ']') {
10109             ckWARN3reg(s+2,
10110                        "POSIX syntax [%c %c] belongs inside character classes",
10111                        c, c);
10112
10113             /* [[=foo=]] and [[.foo.]] are still future. */
10114             if (POSIXCC_NOTYET(c)) {
10115                 /* adjust RExC_parse so the error shows after
10116                    the class closes */
10117                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10118                     NOOP;
10119                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10120             }
10121         }
10122     }
10123 }
10124
10125 /* Generate the code to add a full posix character <class> to the bracketed
10126  * character class given by <node>.  (<node> is needed only under locale rules)
10127  * destlist     is the inversion list for non-locale rules that this class is
10128  *              to be added to
10129  * sourcelist   is the ASCII-range inversion list to add under /a rules
10130  * Xsourcelist  is the full Unicode range list to use otherwise. */
10131 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
10132     if (LOC) {                                                             \
10133         SV* scratch_list = NULL;                                           \
10134                                                                            \
10135         /* Set this class in the node for runtime matching */              \
10136         ANYOF_CLASS_SET(node, class);                                      \
10137                                                                            \
10138         /* For above Latin1 code points, we use the full Unicode range */  \
10139         _invlist_intersection(PL_AboveLatin1,                              \
10140                               Xsourcelist,                                 \
10141                               &scratch_list);                              \
10142         /* And set the output to it, adding instead if there already is an \
10143          * output.  Checking if <destlist> is NULL first saves an extra    \
10144          * clone.  Its reference count will be decremented at the next     \
10145          * union, etc, or if this is the only instance, at the end of the  \
10146          * routine */                                                      \
10147         if (! destlist) {                                                  \
10148             destlist = scratch_list;                                       \
10149         }                                                                  \
10150         else {                                                             \
10151             _invlist_union(destlist, scratch_list, &destlist);             \
10152             SvREFCNT_dec(scratch_list);                                    \
10153         }                                                                  \
10154     }                                                                      \
10155     else {                                                                 \
10156         /* For non-locale, just add it to any existing list */             \
10157         _invlist_union(destlist,                                           \
10158                        (AT_LEAST_ASCII_RESTRICTED)                         \
10159                            ? sourcelist                                    \
10160                            : Xsourcelist,                                  \
10161                        &destlist);                                         \
10162     }
10163
10164 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10165  */
10166 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
10167     if (LOC) {                                                             \
10168         SV* scratch_list = NULL;                                           \
10169         ANYOF_CLASS_SET(node, class);                                      \
10170         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
10171         if (! destlist) {                                                  \
10172             destlist = scratch_list;                                       \
10173         }                                                                  \
10174         else {                                                             \
10175             _invlist_union(destlist, scratch_list, &destlist);             \
10176             SvREFCNT_dec(scratch_list);                                    \
10177         }                                                                  \
10178     }                                                                      \
10179     else {                                                                 \
10180         _invlist_union_complement_2nd(destlist,                            \
10181                                     (AT_LEAST_ASCII_RESTRICTED)            \
10182                                         ? sourcelist                       \
10183                                         : Xsourcelist,                     \
10184                                     &destlist);                            \
10185         /* Under /d, everything in the upper half of the Latin1 range      \
10186          * matches this complement */                                      \
10187         if (DEPENDS_SEMANTICS) {                                           \
10188             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
10189         }                                                                  \
10190     }
10191
10192 /* Generate the code to add a posix character <class> to the bracketed
10193  * character class given by <node>.  (<node> is needed only under locale rules)
10194  * destlist       is the inversion list for non-locale rules that this class is
10195  *                to be added to
10196  * sourcelist     is the ASCII-range inversion list to add under /a rules
10197  * l1_sourcelist  is the Latin1 range list to use otherwise.
10198  * Xpropertyname  is the name to add to <run_time_list> of the property to
10199  *                specify the code points above Latin1 that will have to be
10200  *                determined at run-time
10201  * run_time_list  is a SV* that contains text names of properties that are to
10202  *                be computed at run time.  This concatenates <Xpropertyname>
10203  *                to it, apppropriately
10204  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10205  * time */
10206 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
10207                               l1_sourcelist, Xpropertyname, run_time_list) \
10208         /* First, resolve whether to use the ASCII-only list or the L1     \
10209          * list */                                                         \
10210         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
10211                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10212                 Xpropertyname, run_time_list)
10213
10214 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10215                 Xpropertyname, run_time_list)                              \
10216     /* If not /a matching, there are going to be code points we will have  \
10217      * to defer to runtime to look-up */                                   \
10218     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
10219         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10220     }                                                                      \
10221     if (LOC) {                                                             \
10222         ANYOF_CLASS_SET(node, class);                                      \
10223     }                                                                      \
10224     else {                                                                 \
10225         _invlist_union(destlist, sourcelist, &destlist);                   \
10226     }
10227
10228 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
10229  * this and DO_N_POSIX */
10230 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
10231                               l1_sourcelist, Xpropertyname, run_time_list) \
10232     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
10233         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
10234     }                                                                      \
10235     else {                                                                 \
10236         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10237         if (LOC) {                                                         \
10238             ANYOF_CLASS_SET(node, namedclass);                             \
10239         }                                                                  \
10240         else {                                                             \
10241             SV* scratch_list = NULL;                                       \
10242             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
10243             if (! destlist) {                                              \
10244                 destlist = scratch_list;                                   \
10245             }                                                              \
10246             else {                                                         \
10247                 _invlist_union(destlist, scratch_list, &destlist);         \
10248                 SvREFCNT_dec(scratch_list);                                \
10249             }                                                              \
10250             if (DEPENDS_SEMANTICS) {                                       \
10251                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
10252             }                                                              \
10253         }                                                                  \
10254     }
10255
10256 STATIC U8
10257 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10258 {
10259
10260     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10261      * Locale folding is done at run-time, so this function should not be
10262      * called for nodes that are for locales.
10263      *
10264      * This function sets the bit corresponding to the fold of the input
10265      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
10266      * 'F' is 'f'.
10267      *
10268      * It also knows about the characters that are in the bitmap that have
10269      * folds that are matchable only outside it, and sets the appropriate lists
10270      * and flags.
10271      *
10272      * It returns the number of bits that actually changed from 0 to 1 */
10273
10274     U8 stored = 0;
10275     U8 fold;
10276
10277     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10278
10279     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10280                                     : PL_fold[value];
10281
10282     /* It assumes the bit for 'value' has already been set */
10283     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10284         ANYOF_BITMAP_SET(node, fold);
10285         stored++;
10286     }
10287     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10288         /* Certain Latin1 characters have matches outside the bitmap.  To get
10289          * here, 'value' is one of those characters.   None of these matches is
10290          * valid for ASCII characters under /aa, which have been excluded by
10291          * the 'if' above.  The matches fall into three categories:
10292          * 1) They are singly folded-to or -from an above 255 character, as
10293          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10294          *    WITH DIAERESIS;
10295          * 2) They are part of a multi-char fold with another character in the
10296          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10297          * 3) They are part of a multi-char fold with a character not in the
10298          *    bitmap, such as various ligatures.
10299          * We aren't dealing fully with multi-char folds, except we do deal
10300          * with the pattern containing a character that has a multi-char fold
10301          * (not so much the inverse).
10302          * For types 1) and 3), the matches only happen when the target string
10303          * is utf8; that's not true for 2), and we set a flag for it.
10304          *
10305          * The code below adds to the passed in inversion list the single fold
10306          * closures for 'value'.  The values are hard-coded here so that an
10307          * innocent-looking character class, like /[ks]/i won't have to go out
10308          * to disk to find the possible matches.  XXX It would be better to
10309          * generate these via regen, in case a new version of the Unicode
10310          * standard adds new mappings, though that is not really likely. */
10311         switch (value) {
10312             case 'k':
10313             case 'K':
10314                 /* KELVIN SIGN */
10315                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10316                 break;
10317             case 's':
10318             case 'S':
10319                 /* LATIN SMALL LETTER LONG S */
10320                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10321                 break;
10322             case MICRO_SIGN:
10323                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10324                                                  GREEK_SMALL_LETTER_MU);
10325                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10326                                                  GREEK_CAPITAL_LETTER_MU);
10327                 break;
10328             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10329             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10330                 /* ANGSTROM SIGN */
10331                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10332                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
10333                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10334                                                      PL_fold_latin1[value]);
10335                 }
10336                 break;
10337             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10338                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10339                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10340                 break;
10341             case LATIN_SMALL_LETTER_SHARP_S:
10342                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10343                                         LATIN_CAPITAL_LETTER_SHARP_S);
10344
10345                 /* Under /a, /d, and /u, this can match the two chars "ss" */
10346                 if (! MORE_ASCII_RESTRICTED) {
10347                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
10348
10349                     /* And under /u or /a, it can match even if the target is
10350                      * not utf8 */
10351                     if (AT_LEAST_UNI_SEMANTICS) {
10352                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10353                     }
10354                 }
10355                 break;
10356             case 'F': case 'f':
10357             case 'I': case 'i':
10358             case 'L': case 'l':
10359             case 'T': case 't':
10360             case 'A': case 'a':
10361             case 'H': case 'h':
10362             case 'J': case 'j':
10363             case 'N': case 'n':
10364             case 'W': case 'w':
10365             case 'Y': case 'y':
10366                 /* These all are targets of multi-character folds from code
10367                  * points that require UTF8 to express, so they can't match
10368                  * unless the target string is in UTF-8, so no action here is
10369                  * necessary, as regexec.c properly handles the general case
10370                  * for UTF-8 matching */
10371                 break;
10372             default:
10373                 /* Use deprecated warning to increase the chances of this
10374                  * being output */
10375                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10376                 break;
10377         }
10378     }
10379     else if (DEPENDS_SEMANTICS
10380             && ! isASCII(value)
10381             && PL_fold_latin1[value] != value)
10382     {
10383            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10384             * folds only when the target string is in UTF-8.  We add the fold
10385             * here to the list of things to match outside the bitmap, which
10386             * won't be looked at unless it is UTF8 (or else if something else
10387             * says to look even if not utf8, but those things better not happen
10388             * under DEPENDS semantics. */
10389         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10390     }
10391
10392     return stored;
10393 }
10394
10395
10396 PERL_STATIC_INLINE U8
10397 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10398 {
10399     /* This inline function sets a bit in the bitmap if not already set, and if
10400      * appropriate, its fold, returning the number of bits that actually
10401      * changed from 0 to 1 */
10402
10403     U8 stored;
10404
10405     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10406
10407     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
10408         return 0;
10409     }
10410
10411     ANYOF_BITMAP_SET(node, value);
10412     stored = 1;
10413
10414     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
10415         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10416     }
10417
10418     return stored;
10419 }
10420
10421 STATIC void
10422 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10423 {
10424     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10425      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
10426      * the multi-character folds of characters in the node */
10427     SV *sv;
10428
10429     PERL_ARGS_ASSERT_ADD_ALTERNATE;
10430
10431     if (! *alternate_ptr) {
10432         *alternate_ptr = newAV();
10433     }
10434     sv = newSVpvn_utf8((char*)string, len, TRUE);
10435     av_push(*alternate_ptr, sv);
10436     return;
10437 }
10438
10439 /*
10440    parse a class specification and produce either an ANYOF node that
10441    matches the pattern or perhaps will be optimized into an EXACTish node
10442    instead. The node contains a bit map for the first 256 characters, with the
10443    corresponding bit set if that character is in the list.  For characters
10444    above 255, a range list is used */
10445
10446 STATIC regnode *
10447 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10448 {
10449     dVAR;
10450     register UV nextvalue;
10451     register IV prevvalue = OOB_UNICODE;
10452     register IV range = 0;
10453     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10454     register regnode *ret;
10455     STRLEN numlen;
10456     IV namedclass;
10457     char *rangebegin = NULL;
10458     bool need_class = 0;
10459     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
10460     SV *listsv = NULL;
10461     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10462                                       than just initialized.  */
10463     SV* properties = NULL;    /* Code points that match \p{} \P{} */
10464     UV element_count = 0;   /* Number of distinct elements in the class.
10465                                Optimizations may be possible if this is tiny */
10466     UV n;
10467
10468     /* Unicode properties are stored in a swash; this holds the current one
10469      * being parsed.  If this swash is the only above-latin1 component of the
10470      * character class, an optimization is to pass it directly on to the
10471      * execution engine.  Otherwise, it is set to NULL to indicate that there
10472      * are other things in the class that have to be dealt with at execution
10473      * time */
10474     SV* swash = NULL;           /* Code points that match \p{} \P{} */
10475
10476     /* Set if a component of this character class is user-defined; just passed
10477      * on to the engine */
10478     UV has_user_defined_property = 0;
10479
10480     /* code points this node matches that can't be stored in the bitmap */
10481     SV* nonbitmap = NULL;
10482
10483     /* The items that are to match that aren't stored in the bitmap, but are a
10484      * result of things that are stored there.  This is the fold closure of
10485      * such a character, either because it has DEPENDS semantics and shouldn't
10486      * be matched unless the target string is utf8, or is a code point that is
10487      * too large for the bit map, as for example, the fold of the MICRO SIGN is
10488      * above 255.  This all is solely for performance reasons.  By having this
10489      * code know the outside-the-bitmap folds that the bitmapped characters are
10490      * involved with, we don't have to go out to disk to find the list of
10491      * matches, unless the character class includes code points that aren't
10492      * storable in the bit map.  That means that a character class with an 's'
10493      * in it, for example, doesn't need to go out to disk to find everything
10494      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
10495      * empty unless there is something whose fold we don't know about, and will
10496      * have to go out to the disk to find. */
10497     SV* l1_fold_invlist = NULL;
10498
10499     /* List of multi-character folds that are matched by this node */
10500     AV* unicode_alternate  = NULL;
10501 #ifdef EBCDIC
10502     UV literal_endpoint = 0;
10503 #endif
10504     UV stored = 0;  /* how many chars stored in the bitmap */
10505
10506     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10507         case we need to change the emitted regop to an EXACT. */
10508     const char * orig_parse = RExC_parse;
10509     GET_RE_DEBUG_FLAGS_DECL;
10510
10511     PERL_ARGS_ASSERT_REGCLASS;
10512 #ifndef DEBUGGING
10513     PERL_UNUSED_ARG(depth);
10514 #endif
10515
10516     DEBUG_PARSE("clas");
10517
10518     /* Assume we are going to generate an ANYOF node. */
10519     ret = reganode(pRExC_state, ANYOF, 0);
10520
10521
10522     if (!SIZE_ONLY) {
10523         ANYOF_FLAGS(ret) = 0;
10524     }
10525
10526     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
10527         RExC_naughty++;
10528         RExC_parse++;
10529         if (!SIZE_ONLY)
10530             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10531
10532         /* We have decided to not allow multi-char folds in inverted character
10533          * classes, due to the confusion that can happen, especially with
10534          * classes that are designed for a non-Unicode world:  You have the
10535          * peculiar case that:
10536             "s s" =~ /^[^\xDF]+$/i => Y
10537             "ss"  =~ /^[^\xDF]+$/i => N
10538          *
10539          * See [perl #89750] */
10540         allow_full_fold = FALSE;
10541     }
10542
10543     if (SIZE_ONLY) {
10544         RExC_size += ANYOF_SKIP;
10545         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10546     }
10547     else {
10548         RExC_emit += ANYOF_SKIP;
10549         if (LOC) {
10550             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10551         }
10552         ANYOF_BITMAP_ZERO(ret);
10553         listsv = newSVpvs("# comment\n");
10554         initial_listsv_len = SvCUR(listsv);
10555     }
10556
10557     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10558
10559     if (!SIZE_ONLY && POSIXCC(nextvalue))
10560         checkposixcc(pRExC_state);
10561
10562     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10563     if (UCHARAT(RExC_parse) == ']')
10564         goto charclassloop;
10565
10566 parseit:
10567     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10568
10569     charclassloop:
10570
10571         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10572
10573         if (!range) {
10574             rangebegin = RExC_parse;
10575             element_count++;
10576         }
10577         if (UTF) {
10578             value = utf8n_to_uvchr((U8*)RExC_parse,
10579                                    RExC_end - RExC_parse,
10580                                    &numlen, UTF8_ALLOW_DEFAULT);
10581             RExC_parse += numlen;
10582         }
10583         else
10584             value = UCHARAT(RExC_parse++);
10585
10586         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10587         if (value == '[' && POSIXCC(nextvalue))
10588             namedclass = regpposixcc(pRExC_state, value);
10589         else if (value == '\\') {
10590             if (UTF) {
10591                 value = utf8n_to_uvchr((U8*)RExC_parse,
10592                                    RExC_end - RExC_parse,
10593                                    &numlen, UTF8_ALLOW_DEFAULT);
10594                 RExC_parse += numlen;
10595             }
10596             else
10597                 value = UCHARAT(RExC_parse++);
10598             /* Some compilers cannot handle switching on 64-bit integer
10599              * values, therefore value cannot be an UV.  Yes, this will
10600              * be a problem later if we want switch on Unicode.
10601              * A similar issue a little bit later when switching on
10602              * namedclass. --jhi */
10603             switch ((I32)value) {
10604             case 'w':   namedclass = ANYOF_ALNUM;       break;
10605             case 'W':   namedclass = ANYOF_NALNUM;      break;
10606             case 's':   namedclass = ANYOF_SPACE;       break;
10607             case 'S':   namedclass = ANYOF_NSPACE;      break;
10608             case 'd':   namedclass = ANYOF_DIGIT;       break;
10609             case 'D':   namedclass = ANYOF_NDIGIT;      break;
10610             case 'v':   namedclass = ANYOF_VERTWS;      break;
10611             case 'V':   namedclass = ANYOF_NVERTWS;     break;
10612             case 'h':   namedclass = ANYOF_HORIZWS;     break;
10613             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
10614             case 'N':  /* Handle \N{NAME} in class */
10615                 {
10616                     /* We only pay attention to the first char of
10617                     multichar strings being returned. I kinda wonder
10618                     if this makes sense as it does change the behaviour
10619                     from earlier versions, OTOH that behaviour was broken
10620                     as well. */
10621                     UV v; /* value is register so we cant & it /grrr */
10622                     if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10623                         goto parseit;
10624                     }
10625                     value= v;
10626                 }
10627                 break;
10628             case 'p':
10629             case 'P':
10630                 {
10631                 char *e;
10632                 if (RExC_parse >= RExC_end)
10633                     vFAIL2("Empty \\%c{}", (U8)value);
10634                 if (*RExC_parse == '{') {
10635                     const U8 c = (U8)value;
10636                     e = strchr(RExC_parse++, '}');
10637                     if (!e)
10638                         vFAIL2("Missing right brace on \\%c{}", c);
10639                     while (isSPACE(UCHARAT(RExC_parse)))
10640                         RExC_parse++;
10641                     if (e == RExC_parse)
10642                         vFAIL2("Empty \\%c{}", c);
10643                     n = e - RExC_parse;
10644                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10645                         n--;
10646                 }
10647                 else {
10648                     e = RExC_parse;
10649                     n = 1;
10650                 }
10651                 if (!SIZE_ONLY) {
10652                     SV** invlistsvp;
10653                     SV* invlist;
10654                     char* name;
10655                     if (UCHARAT(RExC_parse) == '^') {
10656                          RExC_parse++;
10657                          n--;
10658                          value = value == 'p' ? 'P' : 'p'; /* toggle */
10659                          while (isSPACE(UCHARAT(RExC_parse))) {
10660                               RExC_parse++;
10661                               n--;
10662                          }
10663                     }
10664                     /* Try to get the definition of the property into
10665                      * <invlist>.  If /i is in effect, the effective property
10666                      * will have its name be <__NAME_i>.  The design is
10667                      * discussed in commit
10668                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10669                     Newx(name, n + sizeof("_i__\n"), char);
10670
10671                     sprintf(name, "%s%.*s%s\n",
10672                                     (FOLD) ? "__" : "",
10673                                     (int)n,
10674                                     RExC_parse,
10675                                     (FOLD) ? "_i" : ""
10676                     );
10677
10678                     /* Look up the property name, and get its swash and
10679                      * inversion list, if the property is found  */
10680                     if (swash) {
10681                         SvREFCNT_dec(swash);
10682                     }
10683                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
10684                                              1, /* binary */
10685                                              0, /* not tr/// */
10686                                              TRUE, /* this routine will handle
10687                                                       undefined properties */
10688                                              NULL, FALSE /* No inversion list */
10689                                             );
10690                     if (   ! swash
10691                         || ! SvROK(swash)
10692                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10693                         || ! (invlistsvp =
10694                                 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10695                                 "INVLIST", FALSE))
10696                         || ! (invlist = *invlistsvp))
10697                     {
10698                         if (swash) {
10699                             SvREFCNT_dec(swash);
10700                             swash = NULL;
10701                         }
10702
10703                         /* Here didn't find it.  It could be a user-defined
10704                          * property that will be available at run-time.  Add it
10705                          * to the list to look up then */
10706                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10707                                         (value == 'p' ? '+' : '!'),
10708                                         name);
10709                         has_user_defined_property = 1;
10710
10711                         /* We don't know yet, so have to assume that the
10712                          * property could match something in the Latin1 range,
10713                          * hence something that isn't utf8 */
10714                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10715                     }
10716                     else {
10717
10718                         /* Here, did get the swash and its inversion list.  If
10719                          * the swash is from a user-defined property, then this
10720                          * whole character class should be regarded as such */
10721                         SV** user_defined_svp =
10722                                             hv_fetchs(MUTABLE_HV(SvRV(swash)),
10723                                                         "USER_DEFINED", FALSE);
10724                         if (user_defined_svp) {
10725                             has_user_defined_property
10726                                                     |= SvUV(*user_defined_svp);
10727                         }
10728
10729                         /* Invert if asking for the complement */
10730                         if (value == 'P') {
10731                             _invlist_union_complement_2nd(properties, invlist, &properties);
10732
10733                             /* The swash can't be used as-is, because we've
10734                              * inverted things; delay removing it to here after
10735                              * have copied its invlist above */
10736                             SvREFCNT_dec(swash);
10737                             swash = NULL;
10738                         }
10739                         else {
10740                             _invlist_union(properties, invlist, &properties);
10741                         }
10742                     }
10743                     Safefree(name);
10744                 }
10745                 RExC_parse = e + 1;
10746                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
10747
10748                 /* \p means they want Unicode semantics */
10749                 RExC_uni_semantics = 1;
10750                 }
10751                 break;
10752             case 'n':   value = '\n';                   break;
10753             case 'r':   value = '\r';                   break;
10754             case 't':   value = '\t';                   break;
10755             case 'f':   value = '\f';                   break;
10756             case 'b':   value = '\b';                   break;
10757             case 'e':   value = ASCII_TO_NATIVE('\033');break;
10758             case 'a':   value = ASCII_TO_NATIVE('\007');break;
10759             case 'o':
10760                 RExC_parse--;   /* function expects to be pointed at the 'o' */
10761                 {
10762                     const char* error_msg;
10763                     bool valid = grok_bslash_o(RExC_parse,
10764                                                &value,
10765                                                &numlen,
10766                                                &error_msg,
10767                                                SIZE_ONLY);
10768                     RExC_parse += numlen;
10769                     if (! valid) {
10770                         vFAIL(error_msg);
10771                     }
10772                 }
10773                 if (PL_encoding && value < 0x100) {
10774                     goto recode_encoding;
10775                 }
10776                 break;
10777             case 'x':
10778                 if (*RExC_parse == '{') {
10779                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10780                         | PERL_SCAN_DISALLOW_PREFIX;
10781                     char * const e = strchr(RExC_parse++, '}');
10782                     if (!e)
10783                         vFAIL("Missing right brace on \\x{}");
10784
10785                     numlen = e - RExC_parse;
10786                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10787                     RExC_parse = e + 1;
10788                 }
10789                 else {
10790                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10791                     numlen = 2;
10792                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10793                     RExC_parse += numlen;
10794                 }
10795                 if (PL_encoding && value < 0x100)
10796                     goto recode_encoding;
10797                 break;
10798             case 'c':
10799                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10800                 break;
10801             case '0': case '1': case '2': case '3': case '4':
10802             case '5': case '6': case '7':
10803                 {
10804                     /* Take 1-3 octal digits */
10805                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10806                     numlen = 3;
10807                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10808                     RExC_parse += numlen;
10809                     if (PL_encoding && value < 0x100)
10810                         goto recode_encoding;
10811                     break;
10812                 }
10813             recode_encoding:
10814                 if (! RExC_override_recoding) {
10815                     SV* enc = PL_encoding;
10816                     value = reg_recode((const char)(U8)value, &enc);
10817                     if (!enc && SIZE_ONLY)
10818                         ckWARNreg(RExC_parse,
10819                                   "Invalid escape in the specified encoding");
10820                     break;
10821                 }
10822             default:
10823                 /* Allow \_ to not give an error */
10824                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10825                     ckWARN2reg(RExC_parse,
10826                                "Unrecognized escape \\%c in character class passed through",
10827                                (int)value);
10828                 }
10829                 break;
10830             }
10831         } /* end of \blah */
10832 #ifdef EBCDIC
10833         else
10834             literal_endpoint++;
10835 #endif
10836
10837         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10838
10839             /* What matches in a locale is not known until runtime, so need to
10840              * (one time per class) allocate extra space to pass to regexec.
10841              * The space will contain a bit for each named class that is to be
10842              * matched against.  This isn't needed for \p{} and pseudo-classes,
10843              * as they are not affected by locale, and hence are dealt with
10844              * separately */
10845             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10846                 need_class = 1;
10847                 if (SIZE_ONLY) {
10848                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10849                 }
10850                 else {
10851                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10852                     ANYOF_CLASS_ZERO(ret);
10853                 }
10854                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10855             }
10856
10857             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
10858              * literal, as is the character that began the false range, i.e.
10859              * the 'a' in the examples */
10860             if (range) {
10861                 if (!SIZE_ONLY) {
10862                     const int w =
10863                         RExC_parse >= rangebegin ?
10864                         RExC_parse - rangebegin : 0;
10865                     ckWARN4reg(RExC_parse,
10866                                "False [] range \"%*.*s\"",
10867                                w, w, rangebegin);
10868
10869                     stored +=
10870                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10871                     if (prevvalue < 256) {
10872                         stored +=
10873                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10874                     }
10875                     else {
10876                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10877                     }
10878                 }
10879
10880                 range = 0; /* this was not a true range */
10881             }
10882
10883             if (!SIZE_ONLY) {
10884
10885                 /* Possible truncation here but in some 64-bit environments
10886                  * the compiler gets heartburn about switch on 64-bit values.
10887                  * A similar issue a little earlier when switching on value.
10888                  * --jhi */
10889                 switch ((I32)namedclass) {
10890
10891                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10892                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10893                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10894                     break;
10895                 case ANYOF_NALNUMC:
10896                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10897                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10898                     break;
10899                 case ANYOF_ALPHA:
10900                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10901                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10902                     break;
10903                 case ANYOF_NALPHA:
10904                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10905                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10906                     break;
10907                 case ANYOF_ASCII:
10908                     if (LOC) {
10909                         ANYOF_CLASS_SET(ret, namedclass);
10910                     }
10911                     else {
10912                         _invlist_union(properties, PL_ASCII, &properties);
10913                     }
10914                     break;
10915                 case ANYOF_NASCII:
10916                     if (LOC) {
10917                         ANYOF_CLASS_SET(ret, namedclass);
10918                     }
10919                     else {
10920                         _invlist_union_complement_2nd(properties,
10921                                                     PL_ASCII, &properties);
10922                         if (DEPENDS_SEMANTICS) {
10923                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10924                         }
10925                     }
10926                     break;
10927                 case ANYOF_BLANK:
10928                     DO_POSIX(ret, namedclass, properties,
10929                                             PL_PosixBlank, PL_XPosixBlank);
10930                     break;
10931                 case ANYOF_NBLANK:
10932                     DO_N_POSIX(ret, namedclass, properties,
10933                                             PL_PosixBlank, PL_XPosixBlank);
10934                     break;
10935                 case ANYOF_CNTRL:
10936                     DO_POSIX(ret, namedclass, properties,
10937                                             PL_PosixCntrl, PL_XPosixCntrl);
10938                     break;
10939                 case ANYOF_NCNTRL:
10940                     DO_N_POSIX(ret, namedclass, properties,
10941                                             PL_PosixCntrl, PL_XPosixCntrl);
10942                     break;
10943                 case ANYOF_DIGIT:
10944                     /* There are no digits in the Latin1 range outside of
10945                      * ASCII, so call the macro that doesn't have to resolve
10946                      * them */
10947                     DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
10948                         PL_PosixDigit, "XPosixDigit", listsv);
10949                     break;
10950                 case ANYOF_NDIGIT:
10951                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10952                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10953                     break;
10954                 case ANYOF_GRAPH:
10955                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10956                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10957                     break;
10958                 case ANYOF_NGRAPH:
10959                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10960                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10961                     break;
10962                 case ANYOF_HORIZWS:
10963                     /* For these, we use the nonbitmap, as /d doesn't make a
10964                      * difference in what these match.  There would be problems
10965                      * if these characters had folds other than themselves, as
10966                      * nonbitmap is subject to folding.  It turns out that \h
10967                      * is just a synonym for XPosixBlank */
10968                     _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
10969                     break;
10970                 case ANYOF_NHORIZWS:
10971                     _invlist_union_complement_2nd(nonbitmap,
10972                                                  PL_XPosixBlank, &nonbitmap);
10973                     break;
10974                 case ANYOF_LOWER:
10975                 case ANYOF_NLOWER:
10976                 {   /* These require special handling, as they differ under
10977                        folding, matching Cased there (which in the ASCII range
10978                        is the same as Alpha */
10979
10980                     SV* ascii_source;
10981                     SV* l1_source;
10982                     const char *Xname;
10983
10984                     if (FOLD && ! LOC) {
10985                         ascii_source = PL_PosixAlpha;
10986                         l1_source = PL_L1Cased;
10987                         Xname = "Cased";
10988                     }
10989                     else {
10990                         ascii_source = PL_PosixLower;
10991                         l1_source = PL_L1PosixLower;
10992                         Xname = "XPosixLower";
10993                     }
10994                     if (namedclass == ANYOF_LOWER) {
10995                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10996                                     ascii_source, l1_source, Xname, listsv);
10997                     }
10998                     else {
10999                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11000                             properties, ascii_source, l1_source, Xname, listsv);
11001                     }
11002                     break;
11003                 }
11004                 case ANYOF_PRINT:
11005                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11006                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11007                     break;
11008                 case ANYOF_NPRINT:
11009                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11010                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11011                     break;
11012                 case ANYOF_PUNCT:
11013                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11014                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11015                     break;
11016                 case ANYOF_NPUNCT:
11017                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11018                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11019                     break;
11020                 case ANYOF_PSXSPC:
11021                     DO_POSIX(ret, namedclass, properties,
11022                                             PL_PosixSpace, PL_XPosixSpace);
11023                     break;
11024                 case ANYOF_NPSXSPC:
11025                     DO_N_POSIX(ret, namedclass, properties,
11026                                             PL_PosixSpace, PL_XPosixSpace);
11027                     break;
11028                 case ANYOF_SPACE:
11029                     DO_POSIX(ret, namedclass, properties,
11030                                             PL_PerlSpace, PL_XPerlSpace);
11031                     break;
11032                 case ANYOF_NSPACE:
11033                     DO_N_POSIX(ret, namedclass, properties,
11034                                             PL_PerlSpace, PL_XPerlSpace);
11035                     break;
11036                 case ANYOF_UPPER:   /* Same as LOWER, above */
11037                 case ANYOF_NUPPER:
11038                 {
11039                     SV* ascii_source;
11040                     SV* l1_source;
11041                     const char *Xname;
11042
11043                     if (FOLD && ! LOC) {
11044                         ascii_source = PL_PosixAlpha;
11045                         l1_source = PL_L1Cased;
11046                         Xname = "Cased";
11047                     }
11048                     else {
11049                         ascii_source = PL_PosixUpper;
11050                         l1_source = PL_L1PosixUpper;
11051                         Xname = "XPosixUpper";
11052                     }
11053                     if (namedclass == ANYOF_UPPER) {
11054                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11055                                     ascii_source, l1_source, Xname, listsv);
11056                     }
11057                     else {
11058                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11059                         properties, ascii_source, l1_source, Xname, listsv);
11060                     }
11061                     break;
11062                 }
11063                 case ANYOF_ALNUM:   /* Really is 'Word' */
11064                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11065                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11066                     break;
11067                 case ANYOF_NALNUM:
11068                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11069                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11070                     break;
11071                 case ANYOF_VERTWS:
11072                     /* For these, we use the nonbitmap, as /d doesn't make a
11073                      * difference in what these match.  There would be problems
11074                      * if these characters had folds other than themselves, as
11075                      * nonbitmap is subject to folding */
11076                     _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11077                     break;
11078                 case ANYOF_NVERTWS:
11079                     _invlist_union_complement_2nd(nonbitmap,
11080                                                     PL_VertSpace, &nonbitmap);
11081                     break;
11082                 case ANYOF_XDIGIT:
11083                     DO_POSIX(ret, namedclass, properties,
11084                                             PL_PosixXDigit, PL_XPosixXDigit);
11085                     break;
11086                 case ANYOF_NXDIGIT:
11087                     DO_N_POSIX(ret, namedclass, properties,
11088                                             PL_PosixXDigit, PL_XPosixXDigit);
11089                     break;
11090                 case ANYOF_MAX:
11091                     /* this is to handle \p and \P */
11092                     break;
11093                 default:
11094                     vFAIL("Invalid [::] class");
11095                     break;
11096                 }
11097
11098                 continue;
11099             }
11100         } /* end of namedclass \blah */
11101
11102         if (range) {
11103             if (prevvalue > (IV)value) /* b-a */ {
11104                 const int w = RExC_parse - rangebegin;
11105                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11106                 range = 0; /* not a valid range */
11107             }
11108         }
11109         else {
11110             prevvalue = value; /* save the beginning of the range */
11111             if (RExC_parse+1 < RExC_end
11112                 && *RExC_parse == '-'
11113                 && RExC_parse[1] != ']')
11114             {
11115                 RExC_parse++;
11116
11117                 /* a bad range like \w-, [:word:]- ? */
11118                 if (namedclass > OOB_NAMEDCLASS) {
11119                     if (ckWARN(WARN_REGEXP)) {
11120                         const int w =
11121                             RExC_parse >= rangebegin ?
11122                             RExC_parse - rangebegin : 0;
11123                         vWARN4(RExC_parse,
11124                                "False [] range \"%*.*s\"",
11125                                w, w, rangebegin);
11126                     }
11127                     if (!SIZE_ONLY)
11128                         stored +=
11129                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11130                 } else
11131                     range = 1;  /* yeah, it's a range! */
11132                 continue;       /* but do it the next time */
11133             }
11134         }
11135
11136         /* non-Latin1 code point implies unicode semantics.  Must be set in
11137          * pass1 so is there for the whole of pass 2 */
11138         if (value > 255) {
11139             RExC_uni_semantics = 1;
11140         }
11141
11142         /* now is the next time */
11143         if (!SIZE_ONLY) {
11144             if (prevvalue < 256) {
11145                 const IV ceilvalue = value < 256 ? value : 255;
11146                 IV i;
11147 #ifdef EBCDIC
11148                 /* In EBCDIC [\x89-\x91] should include
11149                  * the \x8e but [i-j] should not. */
11150                 if (literal_endpoint == 2 &&
11151                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11152                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11153                 {
11154                     if (isLOWER(prevvalue)) {
11155                         for (i = prevvalue; i <= ceilvalue; i++)
11156                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11157                                 stored +=
11158                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11159                             }
11160                     } else {
11161                         for (i = prevvalue; i <= ceilvalue; i++)
11162                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11163                                 stored +=
11164                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11165                             }
11166                     }
11167                 }
11168                 else
11169 #endif
11170                       for (i = prevvalue; i <= ceilvalue; i++) {
11171                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11172                       }
11173           }
11174           if (value > 255) {
11175             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
11176             const UV natvalue      = NATIVE_TO_UNI(value);
11177             nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11178         }
11179 #ifdef EBCDIC
11180             literal_endpoint = 0;
11181 #endif
11182         }
11183
11184         range = 0; /* this range (if it was one) is done now */
11185     }
11186
11187
11188
11189     if (SIZE_ONLY)
11190         return ret;
11191     /****** !SIZE_ONLY AFTER HERE *********/
11192
11193     /* If folding and there are code points above 255, we calculate all
11194      * characters that could fold to or from the ones already on the list */
11195     if (FOLD && nonbitmap) {
11196         UV start, end;  /* End points of code point ranges */
11197
11198         SV* fold_intersection = NULL;
11199
11200         /* This is a list of all the characters that participate in folds
11201             * (except marks, etc in multi-char folds */
11202         if (! PL_utf8_foldable) {
11203             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11204             PL_utf8_foldable = _swash_to_invlist(swash);
11205             SvREFCNT_dec(swash);
11206         }
11207
11208         /* This is a hash that for a particular fold gives all characters
11209             * that are involved in it */
11210         if (! PL_utf8_foldclosures) {
11211
11212             /* If we were unable to find any folds, then we likely won't be
11213              * able to find the closures.  So just create an empty list.
11214              * Folding will effectively be restricted to the non-Unicode rules
11215              * hard-coded into Perl.  (This case happens legitimately during
11216              * compilation of Perl itself before the Unicode tables are
11217              * generated) */
11218             if (invlist_len(PL_utf8_foldable) == 0) {
11219                 PL_utf8_foldclosures = newHV();
11220             } else {
11221                 /* If the folds haven't been read in, call a fold function
11222                     * to force that */
11223                 if (! PL_utf8_tofold) {
11224                     U8 dummy[UTF8_MAXBYTES+1];
11225                     STRLEN dummy_len;
11226
11227                     /* This particular string is above \xff in both UTF-8 and
11228                      * UTFEBCDIC */
11229                     to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11230                     assert(PL_utf8_tofold); /* Verify that worked */
11231                 }
11232                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11233             }
11234         }
11235
11236         /* Only the characters in this class that participate in folds need be
11237          * checked.  Get the intersection of this class and all the possible
11238          * characters that are foldable.  This can quickly narrow down a large
11239          * class */
11240         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11241
11242         /* Now look at the foldable characters in this class individually */
11243         invlist_iterinit(fold_intersection);
11244         while (invlist_iternext(fold_intersection, &start, &end)) {
11245             UV j;
11246
11247             /* Look at every character in the range */
11248             for (j = start; j <= end; j++) {
11249
11250                 /* Get its fold */
11251                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11252                 STRLEN foldlen;
11253                 const UV f =
11254                     _to_uni_fold_flags(j, foldbuf, &foldlen,
11255                                        (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
11256
11257                 if (foldlen > (STRLEN)UNISKIP(f)) {
11258
11259                     /* Any multicharacter foldings (disallowed in lookbehind
11260                      * patterns) require the following transform: [ABCDEF] ->
11261                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11262                      * folds into "rst", all other characters fold to single
11263                      * characters.  We save away these multicharacter foldings,
11264                      * to be later saved as part of the additional "s" data. */
11265                     if (! RExC_in_lookbehind) {
11266                         U8* loc = foldbuf;
11267                         U8* e = foldbuf + foldlen;
11268
11269                         /* If any of the folded characters of this are in the
11270                          * Latin1 range, tell the regex engine that this can
11271                          * match a non-utf8 target string.  The only multi-byte
11272                          * fold whose source is in the Latin1 range (U+00DF)
11273                          * applies only when the target string is utf8, or
11274                          * under unicode rules */
11275                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11276                             while (loc < e) {
11277
11278                                 /* Can't mix ascii with non- under /aa */
11279                                 if (MORE_ASCII_RESTRICTED
11280                                     && (isASCII(*loc) != isASCII(j)))
11281                                 {
11282                                     goto end_multi_fold;
11283                                 }
11284                                 if (UTF8_IS_INVARIANT(*loc)
11285                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
11286                                 {
11287                                     /* Can't mix above and below 256 under LOC
11288                                      */
11289                                     if (LOC) {
11290                                         goto end_multi_fold;
11291                                     }
11292                                     ANYOF_FLAGS(ret)
11293                                             |= ANYOF_NONBITMAP_NON_UTF8;
11294                                     break;
11295                                 }
11296                                 loc += UTF8SKIP(loc);
11297                             }
11298                         }
11299
11300                         add_alternate(&unicode_alternate, foldbuf, foldlen);
11301                     end_multi_fold: ;
11302                     }
11303
11304                     /* This is special-cased, as it is the only letter which
11305                      * has both a multi-fold and single-fold in Latin1.  All
11306                      * the other chars that have single and multi-folds are
11307                      * always in utf8, and the utf8 folding algorithm catches
11308                      * them */
11309                     if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11310                         stored += set_regclass_bit(pRExC_state,
11311                                         ret,
11312                                         LATIN_SMALL_LETTER_SHARP_S,
11313                                         &l1_fold_invlist, &unicode_alternate);
11314                     }
11315                 }
11316                 else {
11317                     /* Single character fold.  Add everything in its fold
11318                      * closure to the list that this node should match */
11319                     SV** listp;
11320
11321                     /* The fold closures data structure is a hash with the keys
11322                      * being every character that is folded to, like 'k', and
11323                      * the values each an array of everything that folds to its
11324                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
11325                     if ((listp = hv_fetch(PL_utf8_foldclosures,
11326                                     (char *) foldbuf, foldlen, FALSE)))
11327                     {
11328                         AV* list = (AV*) *listp;
11329                         IV k;
11330                         for (k = 0; k <= av_len(list); k++) {
11331                             SV** c_p = av_fetch(list, k, FALSE);
11332                             UV c;
11333                             if (c_p == NULL) {
11334                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11335                             }
11336                             c = SvUV(*c_p);
11337
11338                             /* /aa doesn't allow folds between ASCII and non-;
11339                              * /l doesn't allow them between above and below
11340                              * 256 */
11341                             if ((MORE_ASCII_RESTRICTED
11342                                  && (isASCII(c) != isASCII(j)))
11343                                     || (LOC && ((c < 256) != (j < 256))))
11344                             {
11345                                 continue;
11346                             }
11347
11348                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11349                                 stored += set_regclass_bit(pRExC_state,
11350                                         ret,
11351                                         (U8) c,
11352                                         &l1_fold_invlist, &unicode_alternate);
11353                             }
11354                                 /* It may be that the code point is already in
11355                                  * this range or already in the bitmap, in
11356                                  * which case we need do nothing */
11357                             else if ((c < start || c > end)
11358                                         && (c > 255
11359                                             || ! ANYOF_BITMAP_TEST(ret, c)))
11360                             {
11361                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11362                             }
11363                         }
11364                     }
11365                 }
11366             }
11367         }
11368         SvREFCNT_dec(fold_intersection);
11369     }
11370
11371     /* Combine the two lists into one. */
11372     if (l1_fold_invlist) {
11373         if (nonbitmap) {
11374             _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11375             SvREFCNT_dec(l1_fold_invlist);
11376         }
11377         else {
11378             nonbitmap = l1_fold_invlist;
11379         }
11380     }
11381
11382     /* And combine the result (if any) with any inversion list from properties.
11383      * The lists are kept separate up to now because we don't want to fold the
11384      * properties */
11385     if (properties) {
11386         if (nonbitmap) {
11387             _invlist_union(nonbitmap, properties, &nonbitmap);
11388             SvREFCNT_dec(properties);
11389         }
11390         else {
11391             nonbitmap = properties;
11392         }
11393     }
11394
11395     /* Here, <nonbitmap> contains all the code points we can determine at
11396      * compile time that we haven't put into the bitmap.  Go through it, and
11397      * for things that belong in the bitmap, put them there, and delete from
11398      * <nonbitmap> */
11399     if (nonbitmap) {
11400
11401         /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11402          * possibly only should match when the target string is UTF-8 */
11403         UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11404
11405         /* This gets set if we actually need to modify things */
11406         bool change_invlist = FALSE;
11407
11408         UV start, end;
11409
11410         /* Start looking through <nonbitmap> */
11411         invlist_iterinit(nonbitmap);
11412         while (invlist_iternext(nonbitmap, &start, &end)) {
11413             UV high;
11414             int i;
11415
11416             /* Quit if are above what we should change */
11417             if (start > max_cp_to_set) {
11418                 break;
11419             }
11420
11421             change_invlist = TRUE;
11422
11423             /* Set all the bits in the range, up to the max that we are doing */
11424             high = (end < max_cp_to_set) ? end : max_cp_to_set;
11425             for (i = start; i <= (int) high; i++) {
11426                 if (! ANYOF_BITMAP_TEST(ret, i)) {
11427                     ANYOF_BITMAP_SET(ret, i);
11428                     stored++;
11429                     prevvalue = value;
11430                     value = i;
11431                 }
11432             }
11433         }
11434
11435         /* Done with loop; remove any code points that are in the bitmap from
11436          * <nonbitmap> */
11437         if (change_invlist) {
11438             _invlist_subtract(nonbitmap,
11439                               (DEPENDS_SEMANTICS)
11440                                 ? PL_ASCII
11441                                 : PL_Latin1,
11442                               &nonbitmap);
11443         }
11444
11445         /* If have completely emptied it, remove it completely */
11446         if (invlist_len(nonbitmap) == 0) {
11447             SvREFCNT_dec(nonbitmap);
11448             nonbitmap = NULL;
11449         }
11450     }
11451
11452     /* Here, we have calculated what code points should be in the character
11453      * class.  <nonbitmap> does not overlap the bitmap except possibly in the
11454      * case of DEPENDS rules.
11455      *
11456      * Now we can see about various optimizations.  Fold calculation (which we
11457      * did above) needs to take place before inversion.  Otherwise /[^k]/i
11458      * would invert to include K, which under /i would match k, which it
11459      * shouldn't. */
11460
11461     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
11462      * set the FOLD flag yet, so this does optimize those.  It doesn't
11463      * optimize locale.  Doing so perhaps could be done as long as there is
11464      * nothing like \w in it; some thought also would have to be given to the
11465      * interaction with above 0x100 chars */
11466     if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11467         && ! LOC
11468         && ! unicode_alternate
11469         /* In case of /d, there are some things that should match only when in
11470          * not in the bitmap, i.e., they require UTF8 to match.  These are
11471          * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11472          * case, they don't require UTF8, so can invert here */
11473         && (! nonbitmap
11474             || ! DEPENDS_SEMANTICS
11475             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11476         && SvCUR(listsv) == initial_listsv_len)
11477     {
11478         int i;
11479         if (! nonbitmap) {
11480             for (i = 0; i < 256; ++i) {
11481                 if (ANYOF_BITMAP_TEST(ret, i)) {
11482                     ANYOF_BITMAP_CLEAR(ret, i);
11483                 }
11484                 else {
11485                     ANYOF_BITMAP_SET(ret, i);
11486                     prevvalue = value;
11487                     value = i;
11488                 }
11489             }
11490             /* The inversion means that everything above 255 is matched */
11491             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11492         }
11493         else {
11494             /* Here, also has things outside the bitmap that may overlap with
11495              * the bitmap.  We have to sync them up, so that they get inverted
11496              * in both places.  Earlier, we removed all overlaps except in the
11497              * case of /d rules, so no syncing is needed except for this case
11498              */
11499             SV *remove_list = NULL;
11500
11501             if (DEPENDS_SEMANTICS) {
11502                 UV start, end;
11503
11504                 /* Set the bits that correspond to the ones that aren't in the
11505                  * bitmap.  Otherwise, when we invert, we'll miss these.
11506                  * Earlier, we removed from the nonbitmap all code points
11507                  * < 128, so there is no extra work here */
11508                 invlist_iterinit(nonbitmap);
11509                 while (invlist_iternext(nonbitmap, &start, &end)) {
11510                     if (start > 255) {  /* The bit map goes to 255 */
11511                         break;
11512                     }
11513                     if (end > 255) {
11514                         end = 255;
11515                     }
11516                     for (i = start; i <= (int) end; ++i) {
11517                         ANYOF_BITMAP_SET(ret, i);
11518                         prevvalue = value;
11519                         value = i;
11520                     }
11521                 }
11522             }
11523
11524             /* Now invert both the bitmap and the nonbitmap.  Anything in the
11525              * bitmap has to also be removed from the non-bitmap, but again,
11526              * there should not be overlap unless is /d rules. */
11527             _invlist_invert(nonbitmap);
11528
11529             /* Any swash can't be used as-is, because we've inverted things */
11530             if (swash) {
11531                 SvREFCNT_dec(swash);
11532                 swash = NULL;
11533             }
11534
11535             for (i = 0; i < 256; ++i) {
11536                 if (ANYOF_BITMAP_TEST(ret, i)) {
11537                     ANYOF_BITMAP_CLEAR(ret, i);
11538                     if (DEPENDS_SEMANTICS) {
11539                         if (! remove_list) {
11540                             remove_list = _new_invlist(2);
11541                         }
11542                         remove_list = add_cp_to_invlist(remove_list, i);
11543                     }
11544                 }
11545                 else {
11546                     ANYOF_BITMAP_SET(ret, i);
11547                     prevvalue = value;
11548                     value = i;
11549                 }
11550             }
11551
11552             /* And do the removal */
11553             if (DEPENDS_SEMANTICS) {
11554                 if (remove_list) {
11555                     _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11556                     SvREFCNT_dec(remove_list);
11557                 }
11558             }
11559             else {
11560                 /* There is no overlap for non-/d, so just delete anything
11561                  * below 256 */
11562                 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
11563             }
11564         }
11565
11566         stored = 256 - stored;
11567
11568         /* Clear the invert flag since have just done it here */
11569         ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11570     }
11571
11572     /* Folding in the bitmap is taken care of above, but not for locale (for
11573      * which we have to wait to see what folding is in effect at runtime), and
11574      * for some things not in the bitmap (only the upper latin folds in this
11575      * case, as all other single-char folding has been set above).  Set
11576      * run-time fold flag for these */
11577     if (FOLD && (LOC
11578                 || (DEPENDS_SEMANTICS
11579                     && nonbitmap
11580                     && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11581                 || unicode_alternate))
11582     {
11583         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11584     }
11585
11586     /* A single character class can be "optimized" into an EXACTish node.
11587      * Note that since we don't currently count how many characters there are
11588      * outside the bitmap, we are XXX missing optimization possibilities for
11589      * them.  This optimization can't happen unless this is a truly single
11590      * character class, which means that it can't be an inversion into a
11591      * many-character class, and there must be no possibility of there being
11592      * things outside the bitmap.  'stored' (only) for locales doesn't include
11593      * \w, etc, so have to make a special test that they aren't present
11594      *
11595      * Similarly A 2-character class of the very special form like [bB] can be
11596      * optimized into an EXACTFish node, but only for non-locales, and for
11597      * characters which only have the two folds; so things like 'fF' and 'Ii'
11598      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11599      * FI'. */
11600     if (! nonbitmap
11601         && ! unicode_alternate
11602         && SvCUR(listsv) == initial_listsv_len
11603         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11604         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11605                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11606             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11607                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11608                                  /* If the latest code point has a fold whose
11609                                   * bit is set, it must be the only other one */
11610                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11611                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11612     {
11613         /* Note that the information needed to decide to do this optimization
11614          * is not currently available until the 2nd pass, and that the actually
11615          * used EXACTish node takes less space than the calculated ANYOF node,
11616          * and hence the amount of space calculated in the first pass is larger
11617          * than actually used, so this optimization doesn't gain us any space.
11618          * But an EXACT node is faster than an ANYOF node, and can be combined
11619          * with any adjacent EXACT nodes later by the optimizer for further
11620          * gains.  The speed of executing an EXACTF is similar to an ANYOF
11621          * node, so the optimization advantage comes from the ability to join
11622          * it to adjacent EXACT nodes */
11623
11624         const char * cur_parse= RExC_parse;
11625         U8 op;
11626         RExC_emit = (regnode *)orig_emit;
11627         RExC_parse = (char *)orig_parse;
11628
11629         if (stored == 1) {
11630
11631             /* A locale node with one point can be folded; all the other cases
11632              * with folding will have two points, since we calculate them above
11633              */
11634             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11635                  op = EXACTFL;
11636             }
11637             else {
11638                 op = EXACT;
11639             }
11640         }
11641         else {   /* else 2 chars in the bit map: the folds of each other */
11642
11643             /* Use the folded value, which for the cases where we get here,
11644              * is just the lower case of the current one (which may resolve to
11645              * itself, or to the other one */
11646             value = toLOWER_LATIN1(value);
11647
11648             /* To join adjacent nodes, they must be the exact EXACTish type.
11649              * Try to use the most likely type, by using EXACTFA if possible,
11650              * then EXACTFU if the regex calls for it, or is required because
11651              * the character is non-ASCII.  (If <value> is ASCII, its fold is
11652              * also ASCII for the cases where we get here.) */
11653             if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11654                 op = EXACTFA;
11655             }
11656             else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11657                 op = EXACTFU;
11658             }
11659             else {    /* Otherwise, more likely to be EXACTF type */
11660                 op = EXACTF;
11661             }
11662         }
11663
11664         ret = reg_node(pRExC_state, op);
11665         RExC_parse = (char *)cur_parse;
11666         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11667             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11668             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11669             STR_LEN(ret)= 2;
11670             RExC_emit += STR_SZ(2);
11671         }
11672         else {
11673             *STRING(ret)= (char)value;
11674             STR_LEN(ret)= 1;
11675             RExC_emit += STR_SZ(1);
11676         }
11677         SvREFCNT_dec(listsv);
11678         return ret;
11679     }
11680
11681     /* If there is a swash and more than one element, we can't use the swash in
11682      * the optimization below. */
11683     if (swash && element_count > 1) {
11684         SvREFCNT_dec(swash);
11685         swash = NULL;
11686     }
11687     if (! nonbitmap
11688         && SvCUR(listsv) == initial_listsv_len
11689         && ! unicode_alternate)
11690     {
11691         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11692         SvREFCNT_dec(listsv);
11693         SvREFCNT_dec(unicode_alternate);
11694     }
11695     else {
11696         /* av[0] stores the character class description in its textual form:
11697          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
11698          *       appropriate swash, and is also useful for dumping the regnode.
11699          * av[1] if NULL, is a placeholder to later contain the swash computed
11700          *       from av[0].  But if no further computation need be done, the
11701          *       swash is stored there now.
11702          * av[2] stores the multicharacter foldings, used later in
11703          *       regexec.c:S_reginclass().
11704          * av[3] stores the nonbitmap inversion list for use in addition or
11705          *       instead of av[0]; not used if av[1] isn't NULL
11706          * av[4] is set if any component of the class is from a user-defined
11707          *       property; not used if av[1] isn't NULL */
11708         AV * const av = newAV();
11709         SV *rv;
11710
11711         av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11712                         ? &PL_sv_undef
11713                         : listsv);
11714         if (swash) {
11715             av_store(av, 1, swash);
11716             SvREFCNT_dec(nonbitmap);
11717         }
11718         else {
11719             av_store(av, 1, NULL);
11720             if (nonbitmap) {
11721                 av_store(av, 3, nonbitmap);
11722                 av_store(av, 4, newSVuv(has_user_defined_property));
11723             }
11724         }
11725
11726         /* Store any computed multi-char folds only if we are allowing
11727          * them */
11728         if (allow_full_fold) {
11729             av_store(av, 2, MUTABLE_SV(unicode_alternate));
11730             if (unicode_alternate) { /* This node is variable length */
11731                 OP(ret) = ANYOFV;
11732             }
11733         }
11734         else {
11735             av_store(av, 2, NULL);
11736         }
11737         rv = newRV_noinc(MUTABLE_SV(av));
11738         n = add_data(pRExC_state, 1, "s");
11739         RExC_rxi->data->data[n] = (void*)rv;
11740         ARG_SET(ret, n);
11741     }
11742     return ret;
11743 }
11744
11745
11746 /* reg_skipcomment()
11747
11748    Absorbs an /x style # comments from the input stream.
11749    Returns true if there is more text remaining in the stream.
11750    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11751    terminates the pattern without including a newline.
11752
11753    Note its the callers responsibility to ensure that we are
11754    actually in /x mode
11755
11756 */
11757
11758 STATIC bool
11759 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11760 {
11761     bool ended = 0;
11762
11763     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11764
11765     while (RExC_parse < RExC_end)
11766         if (*RExC_parse++ == '\n') {
11767             ended = 1;
11768             break;
11769         }
11770     if (!ended) {
11771         /* we ran off the end of the pattern without ending
11772            the comment, so we have to add an \n when wrapping */
11773         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11774         return 0;
11775     } else
11776         return 1;
11777 }
11778
11779 /* nextchar()
11780
11781    Advances the parse position, and optionally absorbs
11782    "whitespace" from the inputstream.
11783
11784    Without /x "whitespace" means (?#...) style comments only,
11785    with /x this means (?#...) and # comments and whitespace proper.
11786
11787    Returns the RExC_parse point from BEFORE the scan occurs.
11788
11789    This is the /x friendly way of saying RExC_parse++.
11790 */
11791
11792 STATIC char*
11793 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11794 {
11795     char* const retval = RExC_parse++;
11796
11797     PERL_ARGS_ASSERT_NEXTCHAR;
11798
11799     for (;;) {
11800         if (RExC_end - RExC_parse >= 3
11801             && *RExC_parse == '('
11802             && RExC_parse[1] == '?'
11803             && RExC_parse[2] == '#')
11804         {
11805             while (*RExC_parse != ')') {
11806                 if (RExC_parse == RExC_end)
11807                     FAIL("Sequence (?#... not terminated");
11808                 RExC_parse++;
11809             }
11810             RExC_parse++;
11811             continue;
11812         }
11813         if (RExC_flags & RXf_PMf_EXTENDED) {
11814             if (isSPACE(*RExC_parse)) {
11815                 RExC_parse++;
11816                 continue;
11817             }
11818             else if (*RExC_parse == '#') {
11819                 if ( reg_skipcomment( pRExC_state ) )
11820                     continue;
11821             }
11822         }
11823         return retval;
11824     }
11825 }
11826
11827 /*
11828 - reg_node - emit a node
11829 */
11830 STATIC regnode *                        /* Location. */
11831 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11832 {
11833     dVAR;
11834     register regnode *ptr;
11835     regnode * const ret = RExC_emit;
11836     GET_RE_DEBUG_FLAGS_DECL;
11837
11838     PERL_ARGS_ASSERT_REG_NODE;
11839
11840     if (SIZE_ONLY) {
11841         SIZE_ALIGN(RExC_size);
11842         RExC_size += 1;
11843         return(ret);
11844     }
11845     if (RExC_emit >= RExC_emit_bound)
11846         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11847                    op, RExC_emit, RExC_emit_bound);
11848
11849     NODE_ALIGN_FILL(ret);
11850     ptr = ret;
11851     FILL_ADVANCE_NODE(ptr, op);
11852 #ifdef RE_TRACK_PATTERN_OFFSETS
11853     if (RExC_offsets) {         /* MJD */
11854         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11855               "reg_node", __LINE__,
11856               PL_reg_name[op],
11857               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11858                 ? "Overwriting end of array!\n" : "OK",
11859               (UV)(RExC_emit - RExC_emit_start),
11860               (UV)(RExC_parse - RExC_start),
11861               (UV)RExC_offsets[0]));
11862         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11863     }
11864 #endif
11865     RExC_emit = ptr;
11866     return(ret);
11867 }
11868
11869 /*
11870 - reganode - emit a node with an argument
11871 */
11872 STATIC regnode *                        /* Location. */
11873 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11874 {
11875     dVAR;
11876     register regnode *ptr;
11877     regnode * const ret = RExC_emit;
11878     GET_RE_DEBUG_FLAGS_DECL;
11879
11880     PERL_ARGS_ASSERT_REGANODE;
11881
11882     if (SIZE_ONLY) {
11883         SIZE_ALIGN(RExC_size);
11884         RExC_size += 2;
11885         /*
11886            We can't do this:
11887
11888            assert(2==regarglen[op]+1);
11889
11890            Anything larger than this has to allocate the extra amount.
11891            If we changed this to be:
11892
11893            RExC_size += (1 + regarglen[op]);
11894
11895            then it wouldn't matter. Its not clear what side effect
11896            might come from that so its not done so far.
11897            -- dmq
11898         */
11899         return(ret);
11900     }
11901     if (RExC_emit >= RExC_emit_bound)
11902         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11903                    op, RExC_emit, RExC_emit_bound);
11904
11905     NODE_ALIGN_FILL(ret);
11906     ptr = ret;
11907     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11908 #ifdef RE_TRACK_PATTERN_OFFSETS
11909     if (RExC_offsets) {         /* MJD */
11910         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11911               "reganode",
11912               __LINE__,
11913               PL_reg_name[op],
11914               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11915               "Overwriting end of array!\n" : "OK",
11916               (UV)(RExC_emit - RExC_emit_start),
11917               (UV)(RExC_parse - RExC_start),
11918               (UV)RExC_offsets[0]));
11919         Set_Cur_Node_Offset;
11920     }
11921 #endif
11922     RExC_emit = ptr;
11923     return(ret);
11924 }
11925
11926 /*
11927 - reguni - emit (if appropriate) a Unicode character
11928 */
11929 STATIC STRLEN
11930 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11931 {
11932     dVAR;
11933
11934     PERL_ARGS_ASSERT_REGUNI;
11935
11936     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11937 }
11938
11939 /*
11940 - reginsert - insert an operator in front of already-emitted operand
11941 *
11942 * Means relocating the operand.
11943 */
11944 STATIC void
11945 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11946 {
11947     dVAR;
11948     register regnode *src;
11949     register regnode *dst;
11950     register regnode *place;
11951     const int offset = regarglen[(U8)op];
11952     const int size = NODE_STEP_REGNODE + offset;
11953     GET_RE_DEBUG_FLAGS_DECL;
11954
11955     PERL_ARGS_ASSERT_REGINSERT;
11956     PERL_UNUSED_ARG(depth);
11957 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11958     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11959     if (SIZE_ONLY) {
11960         RExC_size += size;
11961         return;
11962     }
11963
11964     src = RExC_emit;
11965     RExC_emit += size;
11966     dst = RExC_emit;
11967     if (RExC_open_parens) {
11968         int paren;
11969         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11970         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11971             if ( RExC_open_parens[paren] >= opnd ) {
11972                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11973                 RExC_open_parens[paren] += size;
11974             } else {
11975                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11976             }
11977             if ( RExC_close_parens[paren] >= opnd ) {
11978                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11979                 RExC_close_parens[paren] += size;
11980             } else {
11981                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11982             }
11983         }
11984     }
11985
11986     while (src > opnd) {
11987         StructCopy(--src, --dst, regnode);
11988 #ifdef RE_TRACK_PATTERN_OFFSETS
11989         if (RExC_offsets) {     /* MJD 20010112 */
11990             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11991                   "reg_insert",
11992                   __LINE__,
11993                   PL_reg_name[op],
11994                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11995                     ? "Overwriting end of array!\n" : "OK",
11996                   (UV)(src - RExC_emit_start),
11997                   (UV)(dst - RExC_emit_start),
11998                   (UV)RExC_offsets[0]));
11999             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12000             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
12001         }
12002 #endif
12003     }
12004
12005
12006     place = opnd;               /* Op node, where operand used to be. */
12007 #ifdef RE_TRACK_PATTERN_OFFSETS
12008     if (RExC_offsets) {         /* MJD */
12009         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12010               "reginsert",
12011               __LINE__,
12012               PL_reg_name[op],
12013               (UV)(place - RExC_emit_start) > RExC_offsets[0]
12014               ? "Overwriting end of array!\n" : "OK",
12015               (UV)(place - RExC_emit_start),
12016               (UV)(RExC_parse - RExC_start),
12017               (UV)RExC_offsets[0]));
12018         Set_Node_Offset(place, RExC_parse);
12019         Set_Node_Length(place, 1);
12020     }
12021 #endif
12022     src = NEXTOPER(place);
12023     FILL_ADVANCE_NODE(place, op);
12024     Zero(src, offset, regnode);
12025 }
12026
12027 /*
12028 - regtail - set the next-pointer at the end of a node chain of p to val.
12029 - SEE ALSO: regtail_study
12030 */
12031 /* TODO: All three parms should be const */
12032 STATIC void
12033 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12034 {
12035     dVAR;
12036     register regnode *scan;
12037     GET_RE_DEBUG_FLAGS_DECL;
12038
12039     PERL_ARGS_ASSERT_REGTAIL;
12040 #ifndef DEBUGGING
12041     PERL_UNUSED_ARG(depth);
12042 #endif
12043
12044     if (SIZE_ONLY)
12045         return;
12046
12047     /* Find last node. */
12048     scan = p;
12049     for (;;) {
12050         regnode * const temp = regnext(scan);
12051         DEBUG_PARSE_r({
12052             SV * const mysv=sv_newmortal();
12053             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12054             regprop(RExC_rx, mysv, scan);
12055             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12056                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12057                     (temp == NULL ? "->" : ""),
12058                     (temp == NULL ? PL_reg_name[OP(val)] : "")
12059             );
12060         });
12061         if (temp == NULL)
12062             break;
12063         scan = temp;
12064     }
12065
12066     if (reg_off_by_arg[OP(scan)]) {
12067         ARG_SET(scan, val - scan);
12068     }
12069     else {
12070         NEXT_OFF(scan) = val - scan;
12071     }
12072 }
12073
12074 #ifdef DEBUGGING
12075 /*
12076 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12077 - Look for optimizable sequences at the same time.
12078 - currently only looks for EXACT chains.
12079
12080 This is experimental code. The idea is to use this routine to perform
12081 in place optimizations on branches and groups as they are constructed,
12082 with the long term intention of removing optimization from study_chunk so
12083 that it is purely analytical.
12084
12085 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12086 to control which is which.
12087
12088 */
12089 /* TODO: All four parms should be const */
12090
12091 STATIC U8
12092 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12093 {
12094     dVAR;
12095     register regnode *scan;
12096     U8 exact = PSEUDO;
12097 #ifdef EXPERIMENTAL_INPLACESCAN
12098     I32 min = 0;
12099 #endif
12100     GET_RE_DEBUG_FLAGS_DECL;
12101
12102     PERL_ARGS_ASSERT_REGTAIL_STUDY;
12103
12104
12105     if (SIZE_ONLY)
12106         return exact;
12107
12108     /* Find last node. */
12109
12110     scan = p;
12111     for (;;) {
12112         regnode * const temp = regnext(scan);
12113 #ifdef EXPERIMENTAL_INPLACESCAN
12114         if (PL_regkind[OP(scan)] == EXACT) {
12115             bool has_exactf_sharp_s;    /* Unexamined in this routine */
12116             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12117                 return EXACT;
12118         }
12119 #endif
12120         if ( exact ) {
12121             switch (OP(scan)) {
12122                 case EXACT:
12123                 case EXACTF:
12124                 case EXACTFA:
12125                 case EXACTFU:
12126                 case EXACTFU_SS:
12127                 case EXACTFU_TRICKYFOLD:
12128                 case EXACTFL:
12129                         if( exact == PSEUDO )
12130                             exact= OP(scan);
12131                         else if ( exact != OP(scan) )
12132                             exact= 0;
12133                 case NOTHING:
12134                     break;
12135                 default:
12136                     exact= 0;
12137             }
12138         }
12139         DEBUG_PARSE_r({
12140             SV * const mysv=sv_newmortal();
12141             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12142             regprop(RExC_rx, mysv, scan);
12143             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12144                 SvPV_nolen_const(mysv),
12145                 REG_NODE_NUM(scan),
12146                 PL_reg_name[exact]);
12147         });
12148         if (temp == NULL)
12149             break;
12150         scan = temp;
12151     }
12152     DEBUG_PARSE_r({
12153         SV * const mysv_val=sv_newmortal();
12154         DEBUG_PARSE_MSG("");
12155         regprop(RExC_rx, mysv_val, val);
12156         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12157                       SvPV_nolen_const(mysv_val),
12158                       (IV)REG_NODE_NUM(val),
12159                       (IV)(val - scan)
12160         );
12161     });
12162     if (reg_off_by_arg[OP(scan)]) {
12163         ARG_SET(scan, val - scan);
12164     }
12165     else {
12166         NEXT_OFF(scan) = val - scan;
12167     }
12168
12169     return exact;
12170 }
12171 #endif
12172
12173 /*
12174  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12175  */
12176 #ifdef DEBUGGING
12177 static void
12178 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12179 {
12180     int bit;
12181     int set=0;
12182     regex_charset cs;
12183
12184     for (bit=0; bit<32; bit++) {
12185         if (flags & (1<<bit)) {
12186             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
12187                 continue;
12188             }
12189             if (!set++ && lead)
12190                 PerlIO_printf(Perl_debug_log, "%s",lead);
12191             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12192         }
12193     }
12194     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12195             if (!set++ && lead) {
12196                 PerlIO_printf(Perl_debug_log, "%s",lead);
12197             }
12198             switch (cs) {
12199                 case REGEX_UNICODE_CHARSET:
12200                     PerlIO_printf(Perl_debug_log, "UNICODE");
12201                     break;
12202                 case REGEX_LOCALE_CHARSET:
12203                     PerlIO_printf(Perl_debug_log, "LOCALE");
12204                     break;
12205                 case REGEX_ASCII_RESTRICTED_CHARSET:
12206                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12207                     break;
12208                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12209                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12210                     break;
12211                 default:
12212                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12213                     break;
12214             }
12215     }
12216     if (lead)  {
12217         if (set)
12218             PerlIO_printf(Perl_debug_log, "\n");
12219         else
12220             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12221     }
12222 }
12223 #endif
12224
12225 void
12226 Perl_regdump(pTHX_ const regexp *r)
12227 {
12228 #ifdef DEBUGGING
12229     dVAR;
12230     SV * const sv = sv_newmortal();
12231     SV *dsv= sv_newmortal();
12232     RXi_GET_DECL(r,ri);
12233     GET_RE_DEBUG_FLAGS_DECL;
12234
12235     PERL_ARGS_ASSERT_REGDUMP;
12236
12237     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12238
12239     /* Header fields of interest. */
12240     if (r->anchored_substr) {
12241         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12242             RE_SV_DUMPLEN(r->anchored_substr), 30);
12243         PerlIO_printf(Perl_debug_log,
12244                       "anchored %s%s at %"IVdf" ",
12245                       s, RE_SV_TAIL(r->anchored_substr),
12246                       (IV)r->anchored_offset);
12247     } else if (r->anchored_utf8) {
12248         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12249             RE_SV_DUMPLEN(r->anchored_utf8), 30);
12250         PerlIO_printf(Perl_debug_log,
12251                       "anchored utf8 %s%s at %"IVdf" ",
12252                       s, RE_SV_TAIL(r->anchored_utf8),
12253                       (IV)r->anchored_offset);
12254     }
12255     if (r->float_substr) {
12256         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12257             RE_SV_DUMPLEN(r->float_substr), 30);
12258         PerlIO_printf(Perl_debug_log,
12259                       "floating %s%s at %"IVdf"..%"UVuf" ",
12260                       s, RE_SV_TAIL(r->float_substr),
12261                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12262     } else if (r->float_utf8) {
12263         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12264             RE_SV_DUMPLEN(r->float_utf8), 30);
12265         PerlIO_printf(Perl_debug_log,
12266                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12267                       s, RE_SV_TAIL(r->float_utf8),
12268                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12269     }
12270     if (r->check_substr || r->check_utf8)
12271         PerlIO_printf(Perl_debug_log,
12272                       (const char *)
12273                       (r->check_substr == r->float_substr
12274                        && r->check_utf8 == r->float_utf8
12275                        ? "(checking floating" : "(checking anchored"));
12276     if (r->extflags & RXf_NOSCAN)
12277         PerlIO_printf(Perl_debug_log, " noscan");
12278     if (r->extflags & RXf_CHECK_ALL)
12279         PerlIO_printf(Perl_debug_log, " isall");
12280     if (r->check_substr || r->check_utf8)
12281         PerlIO_printf(Perl_debug_log, ") ");
12282
12283     if (ri->regstclass) {
12284         regprop(r, sv, ri->regstclass);
12285         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12286     }
12287     if (r->extflags & RXf_ANCH) {
12288         PerlIO_printf(Perl_debug_log, "anchored");
12289         if (r->extflags & RXf_ANCH_BOL)
12290             PerlIO_printf(Perl_debug_log, "(BOL)");
12291         if (r->extflags & RXf_ANCH_MBOL)
12292             PerlIO_printf(Perl_debug_log, "(MBOL)");
12293         if (r->extflags & RXf_ANCH_SBOL)
12294             PerlIO_printf(Perl_debug_log, "(SBOL)");
12295         if (r->extflags & RXf_ANCH_GPOS)
12296             PerlIO_printf(Perl_debug_log, "(GPOS)");
12297         PerlIO_putc(Perl_debug_log, ' ');
12298     }
12299     if (r->extflags & RXf_GPOS_SEEN)
12300         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12301     if (r->intflags & PREGf_SKIP)
12302         PerlIO_printf(Perl_debug_log, "plus ");
12303     if (r->intflags & PREGf_IMPLICIT)
12304         PerlIO_printf(Perl_debug_log, "implicit ");
12305     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12306     if (r->extflags & RXf_EVAL_SEEN)
12307         PerlIO_printf(Perl_debug_log, "with eval ");
12308     PerlIO_printf(Perl_debug_log, "\n");
12309     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12310 #else
12311     PERL_ARGS_ASSERT_REGDUMP;
12312     PERL_UNUSED_CONTEXT;
12313     PERL_UNUSED_ARG(r);
12314 #endif  /* DEBUGGING */
12315 }
12316
12317 /*
12318 - regprop - printable representation of opcode
12319 */
12320 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12321 STMT_START { \
12322         if (do_sep) {                           \
12323             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12324             if (flags & ANYOF_INVERT)           \
12325                 /*make sure the invert info is in each */ \
12326                 sv_catpvs(sv, "^");             \
12327             do_sep = 0;                         \
12328         }                                       \
12329 } STMT_END
12330
12331 void
12332 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12333 {
12334 #ifdef DEBUGGING
12335     dVAR;
12336     register int k;
12337     RXi_GET_DECL(prog,progi);
12338     GET_RE_DEBUG_FLAGS_DECL;
12339
12340     PERL_ARGS_ASSERT_REGPROP;
12341
12342     sv_setpvs(sv, "");
12343
12344     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
12345         /* It would be nice to FAIL() here, but this may be called from
12346            regexec.c, and it would be hard to supply pRExC_state. */
12347         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12348     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12349
12350     k = PL_regkind[OP(o)];
12351
12352     if (k == EXACT) {
12353         sv_catpvs(sv, " ");
12354         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12355          * is a crude hack but it may be the best for now since
12356          * we have no flag "this EXACTish node was UTF-8"
12357          * --jhi */
12358         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12359                   PERL_PV_ESCAPE_UNI_DETECT |
12360                   PERL_PV_ESCAPE_NONASCII   |
12361                   PERL_PV_PRETTY_ELLIPSES   |
12362                   PERL_PV_PRETTY_LTGT       |
12363                   PERL_PV_PRETTY_NOCLEAR
12364                   );
12365     } else if (k == TRIE) {
12366         /* print the details of the trie in dumpuntil instead, as
12367          * progi->data isn't available here */
12368         const char op = OP(o);
12369         const U32 n = ARG(o);
12370         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12371                (reg_ac_data *)progi->data->data[n] :
12372                NULL;
12373         const reg_trie_data * const trie
12374             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12375
12376         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12377         DEBUG_TRIE_COMPILE_r(
12378             Perl_sv_catpvf(aTHX_ sv,
12379                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12380                 (UV)trie->startstate,
12381                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12382                 (UV)trie->wordcount,
12383                 (UV)trie->minlen,
12384                 (UV)trie->maxlen,
12385                 (UV)TRIE_CHARCOUNT(trie),
12386                 (UV)trie->uniquecharcount
12387             )
12388         );
12389         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12390             int i;
12391             int rangestart = -1;
12392             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12393             sv_catpvs(sv, "[");
12394             for (i = 0; i <= 256; i++) {
12395                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12396                     if (rangestart == -1)
12397                         rangestart = i;
12398                 } else if (rangestart != -1) {
12399                     if (i <= rangestart + 3)
12400                         for (; rangestart < i; rangestart++)
12401                             put_byte(sv, rangestart);
12402                     else {
12403                         put_byte(sv, rangestart);
12404                         sv_catpvs(sv, "-");
12405                         put_byte(sv, i - 1);
12406                     }
12407                     rangestart = -1;
12408                 }
12409             }
12410             sv_catpvs(sv, "]");
12411         }
12412
12413     } else if (k == CURLY) {
12414         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12415             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12416         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12417     }
12418     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
12419         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12420     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12421         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
12422         if ( RXp_PAREN_NAMES(prog) ) {
12423             if ( k != REF || (OP(o) < NREF)) {
12424                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12425                 SV **name= av_fetch(list, ARG(o), 0 );
12426                 if (name)
12427                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12428             }
12429             else {
12430                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12431                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12432                 I32 *nums=(I32*)SvPVX(sv_dat);
12433                 SV **name= av_fetch(list, nums[0], 0 );
12434                 I32 n;
12435                 if (name) {
12436                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
12437                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12438                                     (n ? "," : ""), (IV)nums[n]);
12439                     }
12440                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12441                 }
12442             }
12443         }
12444     } else if (k == GOSUB)
12445         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12446     else if (k == VERB) {
12447         if (!o->flags)
12448             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12449                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12450     } else if (k == LOGICAL)
12451         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
12452     else if (k == ANYOF) {
12453         int i, rangestart = -1;
12454         const U8 flags = ANYOF_FLAGS(o);
12455         int do_sep = 0;
12456
12457         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12458         static const char * const anyofs[] = {
12459             "\\w",
12460             "\\W",
12461             "\\s",
12462             "\\S",
12463             "\\d",
12464             "\\D",
12465             "[:alnum:]",
12466             "[:^alnum:]",
12467             "[:alpha:]",
12468             "[:^alpha:]",
12469             "[:ascii:]",
12470             "[:^ascii:]",
12471             "[:cntrl:]",
12472             "[:^cntrl:]",
12473             "[:graph:]",
12474             "[:^graph:]",
12475             "[:lower:]",
12476             "[:^lower:]",
12477             "[:print:]",
12478             "[:^print:]",
12479             "[:punct:]",
12480             "[:^punct:]",
12481             "[:upper:]",
12482             "[:^upper:]",
12483             "[:xdigit:]",
12484             "[:^xdigit:]",
12485             "[:space:]",
12486             "[:^space:]",
12487             "[:blank:]",
12488             "[:^blank:]"
12489         };
12490
12491         if (flags & ANYOF_LOCALE)
12492             sv_catpvs(sv, "{loc}");
12493         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12494             sv_catpvs(sv, "{i}");
12495         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12496         if (flags & ANYOF_INVERT)
12497             sv_catpvs(sv, "^");
12498
12499         /* output what the standard cp 0-255 bitmap matches */
12500         for (i = 0; i <= 256; i++) {
12501             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12502                 if (rangestart == -1)
12503                     rangestart = i;
12504             } else if (rangestart != -1) {
12505                 if (i <= rangestart + 3)
12506                     for (; rangestart < i; rangestart++)
12507                         put_byte(sv, rangestart);
12508                 else {
12509                     put_byte(sv, rangestart);
12510                     sv_catpvs(sv, "-");
12511                     put_byte(sv, i - 1);
12512                 }
12513                 do_sep = 1;
12514                 rangestart = -1;
12515             }
12516         }
12517
12518         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12519         /* output any special charclass tests (used entirely under use locale) */
12520         if (ANYOF_CLASS_TEST_ANY_SET(o))
12521             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12522                 if (ANYOF_CLASS_TEST(o,i)) {
12523                     sv_catpv(sv, anyofs[i]);
12524                     do_sep = 1;
12525                 }
12526
12527         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12528
12529         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12530             sv_catpvs(sv, "{non-utf8-latin1-all}");
12531         }
12532
12533         /* output information about the unicode matching */
12534         if (flags & ANYOF_UNICODE_ALL)
12535             sv_catpvs(sv, "{unicode_all}");
12536         else if (ANYOF_NONBITMAP(o))
12537             sv_catpvs(sv, "{unicode}");
12538         if (flags & ANYOF_NONBITMAP_NON_UTF8)
12539             sv_catpvs(sv, "{outside bitmap}");
12540
12541         if (ANYOF_NONBITMAP(o)) {
12542             SV *lv; /* Set if there is something outside the bit map */
12543             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12544             bool byte_output = FALSE;   /* If something in the bitmap has been
12545                                            output */
12546
12547             if (lv && lv != &PL_sv_undef) {
12548                 if (sw) {
12549                     U8 s[UTF8_MAXBYTES_CASE+1];
12550
12551                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12552                         uvchr_to_utf8(s, i);
12553
12554                         if (i < 256
12555                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
12556                                                                things already
12557                                                                output as part
12558                                                                of the bitmap */
12559                             && swash_fetch(sw, s, TRUE))
12560                         {
12561                             if (rangestart == -1)
12562                                 rangestart = i;
12563                         } else if (rangestart != -1) {
12564                             byte_output = TRUE;
12565                             if (i <= rangestart + 3)
12566                                 for (; rangestart < i; rangestart++) {
12567                                     put_byte(sv, rangestart);
12568                                 }
12569                             else {
12570                                 put_byte(sv, rangestart);
12571                                 sv_catpvs(sv, "-");
12572                                 put_byte(sv, i-1);
12573                             }
12574                             rangestart = -1;
12575                         }
12576                     }
12577                 }
12578
12579                 {
12580                     char *s = savesvpv(lv);
12581                     char * const origs = s;
12582
12583                     while (*s && *s != '\n')
12584                         s++;
12585
12586                     if (*s == '\n') {
12587                         const char * const t = ++s;
12588
12589                         if (byte_output) {
12590                             sv_catpvs(sv, " ");
12591                         }
12592
12593                         while (*s) {
12594                             if (*s == '\n') {
12595
12596                                 /* Truncate very long output */
12597                                 if (s - origs > 256) {
12598                                     Perl_sv_catpvf(aTHX_ sv,
12599                                                    "%.*s...",
12600                                                    (int) (s - origs - 1),
12601                                                    t);
12602                                     goto out_dump;
12603                                 }
12604                                 *s = ' ';
12605                             }
12606                             else if (*s == '\t') {
12607                                 *s = '-';
12608                             }
12609                             s++;
12610                         }
12611                         if (s[-1] == ' ')
12612                             s[-1] = 0;
12613
12614                         sv_catpv(sv, t);
12615                     }
12616
12617                 out_dump:
12618
12619                     Safefree(origs);
12620                 }
12621                 SvREFCNT_dec(lv);
12622             }
12623         }
12624
12625         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12626     }
12627     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12628         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12629 #else
12630     PERL_UNUSED_CONTEXT;
12631     PERL_UNUSED_ARG(sv);
12632     PERL_UNUSED_ARG(o);
12633     PERL_UNUSED_ARG(prog);
12634 #endif  /* DEBUGGING */
12635 }
12636
12637 SV *
12638 Perl_re_intuit_string(pTHX_ REGEXP * const r)
12639 {                               /* Assume that RE_INTUIT is set */
12640     dVAR;
12641     struct regexp *const prog = (struct regexp *)SvANY(r);
12642     GET_RE_DEBUG_FLAGS_DECL;
12643
12644     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12645     PERL_UNUSED_CONTEXT;
12646
12647     DEBUG_COMPILE_r(
12648         {
12649             const char * const s = SvPV_nolen_const(prog->check_substr
12650                       ? prog->check_substr : prog->check_utf8);
12651
12652             if (!PL_colorset) reginitcolors();
12653             PerlIO_printf(Perl_debug_log,
12654                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12655                       PL_colors[4],
12656                       prog->check_substr ? "" : "utf8 ",
12657                       PL_colors[5],PL_colors[0],
12658                       s,
12659                       PL_colors[1],
12660                       (strlen(s) > 60 ? "..." : ""));
12661         } );
12662
12663     return prog->check_substr ? prog->check_substr : prog->check_utf8;
12664 }
12665
12666 /*
12667    pregfree()
12668
12669    handles refcounting and freeing the perl core regexp structure. When
12670    it is necessary to actually free the structure the first thing it
12671    does is call the 'free' method of the regexp_engine associated to
12672    the regexp, allowing the handling of the void *pprivate; member
12673    first. (This routine is not overridable by extensions, which is why
12674    the extensions free is called first.)
12675
12676    See regdupe and regdupe_internal if you change anything here.
12677 */
12678 #ifndef PERL_IN_XSUB_RE
12679 void
12680 Perl_pregfree(pTHX_ REGEXP *r)
12681 {
12682     SvREFCNT_dec(r);
12683 }
12684
12685 void
12686 Perl_pregfree2(pTHX_ REGEXP *rx)
12687 {
12688     dVAR;
12689     struct regexp *const r = (struct regexp *)SvANY(rx);
12690     GET_RE_DEBUG_FLAGS_DECL;
12691
12692     PERL_ARGS_ASSERT_PREGFREE2;
12693
12694     if (r->mother_re) {
12695         ReREFCNT_dec(r->mother_re);
12696     } else {
12697         CALLREGFREE_PVT(rx); /* free the private data */
12698         SvREFCNT_dec(RXp_PAREN_NAMES(r));
12699     }
12700     if (r->substrs) {
12701         SvREFCNT_dec(r->anchored_substr);
12702         SvREFCNT_dec(r->anchored_utf8);
12703         SvREFCNT_dec(r->float_substr);
12704         SvREFCNT_dec(r->float_utf8);
12705         Safefree(r->substrs);
12706     }
12707     RX_MATCH_COPY_FREE(rx);
12708 #ifdef PERL_OLD_COPY_ON_WRITE
12709     SvREFCNT_dec(r->saved_copy);
12710 #endif
12711     Safefree(r->offs);
12712 }
12713
12714 /*  reg_temp_copy()
12715
12716     This is a hacky workaround to the structural issue of match results
12717     being stored in the regexp structure which is in turn stored in
12718     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12719     could be PL_curpm in multiple contexts, and could require multiple
12720     result sets being associated with the pattern simultaneously, such
12721     as when doing a recursive match with (??{$qr})
12722
12723     The solution is to make a lightweight copy of the regexp structure
12724     when a qr// is returned from the code executed by (??{$qr}) this
12725     lightweight copy doesn't actually own any of its data except for
12726     the starp/end and the actual regexp structure itself.
12727
12728 */
12729
12730
12731 REGEXP *
12732 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12733 {
12734     struct regexp *ret;
12735     struct regexp *const r = (struct regexp *)SvANY(rx);
12736     register const I32 npar = r->nparens+1;
12737
12738     PERL_ARGS_ASSERT_REG_TEMP_COPY;
12739
12740     if (!ret_x)
12741         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12742     ret = (struct regexp *)SvANY(ret_x);
12743
12744     (void)ReREFCNT_inc(rx);
12745     /* We can take advantage of the existing "copied buffer" mechanism in SVs
12746        by pointing directly at the buffer, but flagging that the allocated
12747        space in the copy is zero. As we've just done a struct copy, it's now
12748        a case of zero-ing that, rather than copying the current length.  */
12749     SvPV_set(ret_x, RX_WRAPPED(rx));
12750     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12751     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12752            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12753     SvLEN_set(ret_x, 0);
12754     SvSTASH_set(ret_x, NULL);
12755     SvMAGIC_set(ret_x, NULL);
12756     Newx(ret->offs, npar, regexp_paren_pair);
12757     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12758     if (r->substrs) {
12759         Newx(ret->substrs, 1, struct reg_substr_data);
12760         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12761
12762         SvREFCNT_inc_void(ret->anchored_substr);
12763         SvREFCNT_inc_void(ret->anchored_utf8);
12764         SvREFCNT_inc_void(ret->float_substr);
12765         SvREFCNT_inc_void(ret->float_utf8);
12766
12767         /* check_substr and check_utf8, if non-NULL, point to either their
12768            anchored or float namesakes, and don't hold a second reference.  */
12769     }
12770     RX_MATCH_COPIED_off(ret_x);
12771 #ifdef PERL_OLD_COPY_ON_WRITE
12772     ret->saved_copy = NULL;
12773 #endif
12774     ret->mother_re = rx;
12775
12776     return ret_x;
12777 }
12778 #endif
12779
12780 /* regfree_internal()
12781
12782    Free the private data in a regexp. This is overloadable by
12783    extensions. Perl takes care of the regexp structure in pregfree(),
12784    this covers the *pprivate pointer which technically perl doesn't
12785    know about, however of course we have to handle the
12786    regexp_internal structure when no extension is in use.
12787
12788    Note this is called before freeing anything in the regexp
12789    structure.
12790  */
12791
12792 void
12793 Perl_regfree_internal(pTHX_ REGEXP * const rx)
12794 {
12795     dVAR;
12796     struct regexp *const r = (struct regexp *)SvANY(rx);
12797     RXi_GET_DECL(r,ri);
12798     GET_RE_DEBUG_FLAGS_DECL;
12799
12800     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12801
12802     DEBUG_COMPILE_r({
12803         if (!PL_colorset)
12804             reginitcolors();
12805         {
12806             SV *dsv= sv_newmortal();
12807             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12808                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12809             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12810                 PL_colors[4],PL_colors[5],s);
12811         }
12812     });
12813 #ifdef RE_TRACK_PATTERN_OFFSETS
12814     if (ri->u.offsets)
12815         Safefree(ri->u.offsets);             /* 20010421 MJD */
12816 #endif
12817     if (ri->data) {
12818         int n = ri->data->count;
12819         PAD* new_comppad = NULL;
12820         PAD* old_comppad;
12821         PADOFFSET refcnt;
12822
12823         while (--n >= 0) {
12824           /* If you add a ->what type here, update the comment in regcomp.h */
12825             switch (ri->data->what[n]) {
12826             case 'a':
12827             case 's':
12828             case 'S':
12829             case 'u':
12830                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12831                 break;
12832             case 'f':
12833                 Safefree(ri->data->data[n]);
12834                 break;
12835             case 'p':
12836                 new_comppad = MUTABLE_AV(ri->data->data[n]);
12837                 break;
12838             case 'o':
12839                 if (new_comppad == NULL)
12840                     Perl_croak(aTHX_ "panic: pregfree comppad");
12841                 PAD_SAVE_LOCAL(old_comppad,
12842                     /* Watch out for global destruction's random ordering. */
12843                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12844                 );
12845                 OP_REFCNT_LOCK;
12846                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12847                 OP_REFCNT_UNLOCK;
12848                 if (!refcnt)
12849                     op_free((OP_4tree*)ri->data->data[n]);
12850
12851                 PAD_RESTORE_LOCAL(old_comppad);
12852                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12853                 new_comppad = NULL;
12854                 break;
12855             case 'n':
12856                 break;
12857             case 'T':
12858                 { /* Aho Corasick add-on structure for a trie node.
12859                      Used in stclass optimization only */
12860                     U32 refcount;
12861                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12862                     OP_REFCNT_LOCK;
12863                     refcount = --aho->refcount;
12864                     OP_REFCNT_UNLOCK;
12865                     if ( !refcount ) {
12866                         PerlMemShared_free(aho->states);
12867                         PerlMemShared_free(aho->fail);
12868                          /* do this last!!!! */
12869                         PerlMemShared_free(ri->data->data[n]);
12870                         PerlMemShared_free(ri->regstclass);
12871                     }
12872                 }
12873                 break;
12874             case 't':
12875                 {
12876                     /* trie structure. */
12877                     U32 refcount;
12878                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12879                     OP_REFCNT_LOCK;
12880                     refcount = --trie->refcount;
12881                     OP_REFCNT_UNLOCK;
12882                     if ( !refcount ) {
12883                         PerlMemShared_free(trie->charmap);
12884                         PerlMemShared_free(trie->states);
12885                         PerlMemShared_free(trie->trans);
12886                         if (trie->bitmap)
12887                             PerlMemShared_free(trie->bitmap);
12888                         if (trie->jump)
12889                             PerlMemShared_free(trie->jump);
12890                         PerlMemShared_free(trie->wordinfo);
12891                         /* do this last!!!! */
12892                         PerlMemShared_free(ri->data->data[n]);
12893                     }
12894                 }
12895                 break;
12896             default:
12897                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12898             }
12899         }
12900         Safefree(ri->data->what);
12901         Safefree(ri->data);
12902     }
12903
12904     Safefree(ri);
12905 }
12906
12907 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12908 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
12909 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
12910
12911 /*
12912    re_dup - duplicate a regexp.
12913
12914    This routine is expected to clone a given regexp structure. It is only
12915    compiled under USE_ITHREADS.
12916
12917    After all of the core data stored in struct regexp is duplicated
12918    the regexp_engine.dupe method is used to copy any private data
12919    stored in the *pprivate pointer. This allows extensions to handle
12920    any duplication it needs to do.
12921
12922    See pregfree() and regfree_internal() if you change anything here.
12923 */
12924 #if defined(USE_ITHREADS)
12925 #ifndef PERL_IN_XSUB_RE
12926 void
12927 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12928 {
12929     dVAR;
12930     I32 npar;
12931     const struct regexp *r = (const struct regexp *)SvANY(sstr);
12932     struct regexp *ret = (struct regexp *)SvANY(dstr);
12933
12934     PERL_ARGS_ASSERT_RE_DUP_GUTS;
12935
12936     npar = r->nparens+1;
12937     Newx(ret->offs, npar, regexp_paren_pair);
12938     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12939     if(ret->swap) {
12940         /* no need to copy these */
12941         Newx(ret->swap, npar, regexp_paren_pair);
12942     }
12943
12944     if (ret->substrs) {
12945         /* Do it this way to avoid reading from *r after the StructCopy().
12946            That way, if any of the sv_dup_inc()s dislodge *r from the L1
12947            cache, it doesn't matter.  */
12948         const bool anchored = r->check_substr
12949             ? r->check_substr == r->anchored_substr
12950             : r->check_utf8 == r->anchored_utf8;
12951         Newx(ret->substrs, 1, struct reg_substr_data);
12952         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12953
12954         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12955         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12956         ret->float_substr = sv_dup_inc(ret->float_substr, param);
12957         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12958
12959         /* check_substr and check_utf8, if non-NULL, point to either their
12960            anchored or float namesakes, and don't hold a second reference.  */
12961
12962         if (ret->check_substr) {
12963             if (anchored) {
12964                 assert(r->check_utf8 == r->anchored_utf8);
12965                 ret->check_substr = ret->anchored_substr;
12966                 ret->check_utf8 = ret->anchored_utf8;
12967             } else {
12968                 assert(r->check_substr == r->float_substr);
12969                 assert(r->check_utf8 == r->float_utf8);
12970                 ret->check_substr = ret->float_substr;
12971                 ret->check_utf8 = ret->float_utf8;
12972             }
12973         } else if (ret->check_utf8) {
12974             if (anchored) {
12975                 ret->check_utf8 = ret->anchored_utf8;
12976             } else {
12977                 ret->check_utf8 = ret->float_utf8;
12978             }
12979         }
12980     }
12981
12982     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12983
12984     if (ret->pprivate)
12985         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12986
12987     if (RX_MATCH_COPIED(dstr))
12988         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
12989     else
12990         ret->subbeg = NULL;
12991 #ifdef PERL_OLD_COPY_ON_WRITE
12992     ret->saved_copy = NULL;
12993 #endif
12994
12995     if (ret->mother_re) {
12996         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12997             /* Our storage points directly to our mother regexp, but that's
12998                1: a buffer in a different thread
12999                2: something we no longer hold a reference on
13000                so we need to copy it locally.  */
13001             /* Note we need to use SvCUR(), rather than
13002                SvLEN(), on our mother_re, because it, in
13003                turn, may well be pointing to its own mother_re.  */
13004             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13005                                    SvCUR(ret->mother_re)+1));
13006             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13007         }
13008         ret->mother_re      = NULL;
13009     }
13010     ret->gofs = 0;
13011 }
13012 #endif /* PERL_IN_XSUB_RE */
13013
13014 /*
13015    regdupe_internal()
13016
13017    This is the internal complement to regdupe() which is used to copy
13018    the structure pointed to by the *pprivate pointer in the regexp.
13019    This is the core version of the extension overridable cloning hook.
13020    The regexp structure being duplicated will be copied by perl prior
13021    to this and will be provided as the regexp *r argument, however
13022    with the /old/ structures pprivate pointer value. Thus this routine
13023    may override any copying normally done by perl.
13024
13025    It returns a pointer to the new regexp_internal structure.
13026 */
13027
13028 void *
13029 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13030 {
13031     dVAR;
13032     struct regexp *const r = (struct regexp *)SvANY(rx);
13033     regexp_internal *reti;
13034     int len;
13035     RXi_GET_DECL(r,ri);
13036
13037     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13038
13039     len = ProgLen(ri);
13040
13041     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13042     Copy(ri->program, reti->program, len+1, regnode);
13043
13044
13045     reti->regstclass = NULL;
13046
13047     if (ri->data) {
13048         struct reg_data *d;
13049         const int count = ri->data->count;
13050         int i;
13051
13052         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13053                 char, struct reg_data);
13054         Newx(d->what, count, U8);
13055
13056         d->count = count;
13057         for (i = 0; i < count; i++) {
13058             d->what[i] = ri->data->what[i];
13059             switch (d->what[i]) {
13060                 /* legal options are one of: sSfpontTua
13061                    see also regcomp.h and pregfree() */
13062             case 'a': /* actually an AV, but the dup function is identical.  */
13063             case 's':
13064             case 'S':
13065             case 'p': /* actually an AV, but the dup function is identical.  */
13066             case 'u': /* actually an HV, but the dup function is identical.  */
13067                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13068                 break;
13069             case 'f':
13070                 /* This is cheating. */
13071                 Newx(d->data[i], 1, struct regnode_charclass_class);
13072                 StructCopy(ri->data->data[i], d->data[i],
13073                             struct regnode_charclass_class);
13074                 reti->regstclass = (regnode*)d->data[i];
13075                 break;
13076             case 'o':
13077                 /* Compiled op trees are readonly and in shared memory,
13078                    and can thus be shared without duplication. */
13079                 OP_REFCNT_LOCK;
13080                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
13081                 OP_REFCNT_UNLOCK;
13082                 break;
13083             case 'T':
13084                 /* Trie stclasses are readonly and can thus be shared
13085                  * without duplication. We free the stclass in pregfree
13086                  * when the corresponding reg_ac_data struct is freed.
13087                  */
13088                 reti->regstclass= ri->regstclass;
13089                 /* Fall through */
13090             case 't':
13091                 OP_REFCNT_LOCK;
13092                 ((reg_trie_data*)ri->data->data[i])->refcount++;
13093                 OP_REFCNT_UNLOCK;
13094                 /* Fall through */
13095             case 'n':
13096                 d->data[i] = ri->data->data[i];
13097                 break;
13098             default:
13099                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13100             }
13101         }
13102
13103         reti->data = d;
13104     }
13105     else
13106         reti->data = NULL;
13107
13108     reti->name_list_idx = ri->name_list_idx;
13109
13110 #ifdef RE_TRACK_PATTERN_OFFSETS
13111     if (ri->u.offsets) {
13112         Newx(reti->u.offsets, 2*len+1, U32);
13113         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13114     }
13115 #else
13116     SetProgLen(reti,len);
13117 #endif
13118
13119     return (void*)reti;
13120 }
13121
13122 #endif    /* USE_ITHREADS */
13123
13124 #ifndef PERL_IN_XSUB_RE
13125
13126 /*
13127  - regnext - dig the "next" pointer out of a node
13128  */
13129 regnode *
13130 Perl_regnext(pTHX_ register regnode *p)
13131 {
13132     dVAR;
13133     register I32 offset;
13134
13135     if (!p)
13136         return(NULL);
13137
13138     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
13139         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13140     }
13141
13142     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13143     if (offset == 0)
13144         return(NULL);
13145
13146     return(p+offset);
13147 }
13148 #endif
13149
13150 STATIC void
13151 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13152 {
13153     va_list args;
13154     STRLEN l1 = strlen(pat1);
13155     STRLEN l2 = strlen(pat2);
13156     char buf[512];
13157     SV *msv;
13158     const char *message;
13159
13160     PERL_ARGS_ASSERT_RE_CROAK2;
13161
13162     if (l1 > 510)
13163         l1 = 510;
13164     if (l1 + l2 > 510)
13165         l2 = 510 - l1;
13166     Copy(pat1, buf, l1 , char);
13167     Copy(pat2, buf + l1, l2 , char);
13168     buf[l1 + l2] = '\n';
13169     buf[l1 + l2 + 1] = '\0';
13170 #ifdef I_STDARG
13171     /* ANSI variant takes additional second argument */
13172     va_start(args, pat2);
13173 #else
13174     va_start(args);
13175 #endif
13176     msv = vmess(buf, &args);
13177     va_end(args);
13178     message = SvPV_const(msv,l1);
13179     if (l1 > 512)
13180         l1 = 512;
13181     Copy(message, buf, l1 , char);
13182     buf[l1-1] = '\0';                   /* Overwrite \n */
13183     Perl_croak(aTHX_ "%s", buf);
13184 }
13185
13186 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13187
13188 #ifndef PERL_IN_XSUB_RE
13189 void
13190 Perl_save_re_context(pTHX)
13191 {
13192     dVAR;
13193
13194     struct re_save_state *state;
13195
13196     SAVEVPTR(PL_curcop);
13197     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13198
13199     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13200     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13201     SSPUSHUV(SAVEt_RE_STATE);
13202
13203     Copy(&PL_reg_state, state, 1, struct re_save_state);
13204
13205     PL_reg_start_tmp = 0;
13206     PL_reg_start_tmpl = 0;
13207     PL_reg_oldsaved = NULL;
13208     PL_reg_oldsavedlen = 0;
13209     PL_reg_maxiter = 0;
13210     PL_reg_leftiter = 0;
13211     PL_reg_poscache = NULL;
13212     PL_reg_poscache_size = 0;
13213 #ifdef PERL_OLD_COPY_ON_WRITE
13214     PL_nrs = NULL;
13215 #endif
13216
13217     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13218     if (PL_curpm) {
13219         const REGEXP * const rx = PM_GETRE(PL_curpm);
13220         if (rx) {
13221             U32 i;
13222             for (i = 1; i <= RX_NPARENS(rx); i++) {
13223                 char digits[TYPE_CHARS(long)];
13224                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13225                 GV *const *const gvp
13226                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13227
13228                 if (gvp) {
13229                     GV * const gv = *gvp;
13230                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13231                         save_scalar(gv);
13232                 }
13233             }
13234         }
13235     }
13236 }
13237 #endif
13238
13239 static void
13240 clear_re(pTHX_ void *r)
13241 {
13242     dVAR;
13243     ReREFCNT_dec((REGEXP *)r);
13244 }
13245
13246 #ifdef DEBUGGING
13247
13248 STATIC void
13249 S_put_byte(pTHX_ SV *sv, int c)
13250 {
13251     PERL_ARGS_ASSERT_PUT_BYTE;
13252
13253     /* Our definition of isPRINT() ignores locales, so only bytes that are
13254        not part of UTF-8 are considered printable. I assume that the same
13255        holds for UTF-EBCDIC.
13256        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13257        which Wikipedia says:
13258
13259        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13260        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13261        identical, to the ASCII delete (DEL) or rubout control character.
13262        ) So the old condition can be simplified to !isPRINT(c)  */
13263     if (!isPRINT(c)) {
13264         if (c < 256) {
13265             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13266         }
13267         else {
13268             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13269         }
13270     }
13271     else {
13272         const char string = c;
13273         if (c == '-' || c == ']' || c == '\\' || c == '^')
13274             sv_catpvs(sv, "\\");
13275         sv_catpvn(sv, &string, 1);
13276     }
13277 }
13278
13279
13280 #define CLEAR_OPTSTART \
13281     if (optstart) STMT_START { \
13282             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13283             optstart=NULL; \
13284     } STMT_END
13285
13286 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13287
13288 STATIC const regnode *
13289 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13290             const regnode *last, const regnode *plast,
13291             SV* sv, I32 indent, U32 depth)
13292 {
13293     dVAR;
13294     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
13295     register const regnode *next;
13296     const regnode *optstart= NULL;
13297
13298     RXi_GET_DECL(r,ri);
13299     GET_RE_DEBUG_FLAGS_DECL;
13300
13301     PERL_ARGS_ASSERT_DUMPUNTIL;
13302
13303 #ifdef DEBUG_DUMPUNTIL
13304     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13305         last ? last-start : 0,plast ? plast-start : 0);
13306 #endif
13307
13308     if (plast && plast < last)
13309         last= plast;
13310
13311     while (PL_regkind[op] != END && (!last || node < last)) {
13312         /* While that wasn't END last time... */
13313         NODE_ALIGN(node);
13314         op = OP(node);
13315         if (op == CLOSE || op == WHILEM)
13316             indent--;
13317         next = regnext((regnode *)node);
13318
13319         /* Where, what. */
13320         if (OP(node) == OPTIMIZED) {
13321             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13322                 optstart = node;
13323             else
13324                 goto after_print;
13325         } else
13326             CLEAR_OPTSTART;
13327
13328         regprop(r, sv, node);
13329         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13330                       (int)(2*indent + 1), "", SvPVX_const(sv));
13331
13332         if (OP(node) != OPTIMIZED) {
13333             if (next == NULL)           /* Next ptr. */
13334                 PerlIO_printf(Perl_debug_log, " (0)");
13335             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13336                 PerlIO_printf(Perl_debug_log, " (FAIL)");
13337             else
13338                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13339             (void)PerlIO_putc(Perl_debug_log, '\n');
13340         }
13341
13342       after_print:
13343         if (PL_regkind[(U8)op] == BRANCHJ) {
13344             assert(next);
13345             {
13346                 register const regnode *nnode = (OP(next) == LONGJMP
13347                                              ? regnext((regnode *)next)
13348                                              : next);
13349                 if (last && nnode > last)
13350                     nnode = last;
13351                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13352             }
13353         }
13354         else if (PL_regkind[(U8)op] == BRANCH) {
13355             assert(next);
13356             DUMPUNTIL(NEXTOPER(node), next);
13357         }
13358         else if ( PL_regkind[(U8)op]  == TRIE ) {
13359             const regnode *this_trie = node;
13360             const char op = OP(node);
13361             const U32 n = ARG(node);
13362             const reg_ac_data * const ac = op>=AHOCORASICK ?
13363                (reg_ac_data *)ri->data->data[n] :
13364                NULL;
13365             const reg_trie_data * const trie =
13366                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13367 #ifdef DEBUGGING
13368             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13369 #endif
13370             const regnode *nextbranch= NULL;
13371             I32 word_idx;
13372             sv_setpvs(sv, "");
13373             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13374                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13375
13376                 PerlIO_printf(Perl_debug_log, "%*s%s ",
13377                    (int)(2*(indent+3)), "",
13378                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13379                             PL_colors[0], PL_colors[1],
13380                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13381                             PERL_PV_PRETTY_ELLIPSES    |
13382                             PERL_PV_PRETTY_LTGT
13383                             )
13384                             : "???"
13385                 );
13386                 if (trie->jump) {
13387                     U16 dist= trie->jump[word_idx+1];
13388                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13389                                   (UV)((dist ? this_trie + dist : next) - start));
13390                     if (dist) {
13391                         if (!nextbranch)
13392                             nextbranch= this_trie + trie->jump[0];
13393                         DUMPUNTIL(this_trie + dist, nextbranch);
13394                     }
13395                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13396                         nextbranch= regnext((regnode *)nextbranch);
13397                 } else {
13398                     PerlIO_printf(Perl_debug_log, "\n");
13399                 }
13400             }
13401             if (last && next > last)
13402                 node= last;
13403             else
13404                 node= next;
13405         }
13406         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
13407             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13408                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13409         }
13410         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13411             assert(next);
13412             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13413         }
13414         else if ( op == PLUS || op == STAR) {
13415             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13416         }
13417         else if (PL_regkind[(U8)op] == ANYOF) {
13418             /* arglen 1 + class block */
13419             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13420                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13421             node = NEXTOPER(node);
13422         }
13423         else if (PL_regkind[(U8)op] == EXACT) {
13424             /* Literal string, where present. */
13425             node += NODE_SZ_STR(node) - 1;
13426             node = NEXTOPER(node);
13427         }
13428         else {
13429             node = NEXTOPER(node);
13430             node += regarglen[(U8)op];
13431         }
13432         if (op == CURLYX || op == OPEN)
13433             indent++;
13434     }
13435     CLEAR_OPTSTART;
13436 #ifdef DEBUG_DUMPUNTIL
13437     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13438 #endif
13439     return node;
13440 }
13441
13442 #endif  /* DEBUGGING */
13443
13444 /*
13445  * Local variables:
13446  * c-indentation-style: bsd
13447  * c-basic-offset: 4
13448  * indent-tabs-mode: t
13449  * End:
13450  *
13451  * ex: set ts=8 sts=4 sw=4 noet:
13452  */