src/5017005/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #ifdef HAS_ISBLANK
  95 #   define hasISBLANK 1
  96 #else
  97 #   define hasISBLANK 0
  98 #endif
  99
 100 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 101 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 102 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 103
 104 #ifdef op
 105 #undef op
 106 #endif /* op */
 107
 108 #ifdef MSDOS
 109 #  if defined(BUGGY_MSC6)
 110  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 111 #    pragma optimize("a",off)
 112  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 113 #    pragma optimize("w",on )
 114 #  endif /* BUGGY_MSC6 */
 115 #endif /* MSDOS */
 116
 117 #ifndef STATIC
 118 #define STATIC  static
 119 #endif
 120
 121
 122 typedef struct RExC_state_t {
 123     U32         flags;                  /* RXf_* are we folding, multilining? */
 124     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 125     char        *precomp;               /* uncompiled string. */
 126     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 127     regexp      *rx;                    /* perl core regexp structure */
 128     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 129     char        *start;                 /* Start of input for compile */
 130     char        *end;                   /* End of input for compile */
 131     char        *parse;                 /* Input-scan pointer. */
 132     I32         whilem_seen;            /* number of WHILEM in this expr */
 133     regnode     *emit_start;            /* Start of emitted-code area */
 134     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 135     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 136     I32         naughty;                /* How bad is this pattern? */
 137     I32         sawback;                /* Did we see \1, ...? */
 138     U32         seen;
 139     I32         size;                   /* Code size. */
 140     I32         npar;                   /* Capture buffer count, (OPEN). */
 141     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 142     I32         nestroot;               /* root parens we are in - used by accept */
 143     I32         extralen;
 144     I32         seen_zerolen;
 145     regnode     **open_parens;          /* pointers to open parens */
 146     regnode     **close_parens;         /* pointers to close parens */
 147     regnode     *opend;                 /* END node in program */
 148     I32         utf8;           /* whether the pattern is utf8 or not */
 149     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 150                                 /* XXX use this for future optimisation of case
 151                                  * where pattern must be upgraded to utf8. */
 152     I32         uni_semantics;  /* If a d charset modifier should use unicode
 153                                    rules, even if the pattern is not in
 154                                    utf8 */
 155     HV          *paren_names;           /* Paren names */
 156
 157     regnode     **recurse;              /* Recurse regops */
 158     I32         recurse_count;          /* Number of recurse regops */
 159     I32         in_lookbehind;
 160     I32         contains_locale;
 161     I32         override_recoding;
 162     I32         in_multi_char_class;
 163     struct reg_code_block *code_blocks; /* positions of literal (?{})
 164                                             within pattern */
 165     int         num_code_blocks;        /* size of code_blocks[] */
 166     int         code_index;             /* next code_blocks[] slot */
 167 #if ADD_TO_REGEXEC
 168     char        *starttry;              /* -Dr: where regtry was called. */
 169 #define RExC_starttry   (pRExC_state->starttry)
 170 #endif
 171     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 172 #ifdef DEBUGGING
 173     const char  *lastparse;
 174     I32         lastnum;
 175     AV          *paren_name_list;       /* idx -> name */
 176 #define RExC_lastparse  (pRExC_state->lastparse)
 177 #define RExC_lastnum    (pRExC_state->lastnum)
 178 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 179 #endif
 180 } RExC_state_t;
 181
 182 #define RExC_flags      (pRExC_state->flags)
 183 #define RExC_pm_flags   (pRExC_state->pm_flags)
 184 #define RExC_precomp    (pRExC_state->precomp)
 185 #define RExC_rx_sv      (pRExC_state->rx_sv)
 186 #define RExC_rx         (pRExC_state->rx)
 187 #define RExC_rxi        (pRExC_state->rxi)
 188 #define RExC_start      (pRExC_state->start)
 189 #define RExC_end        (pRExC_state->end)
 190 #define RExC_parse      (pRExC_state->parse)
 191 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 192 #ifdef RE_TRACK_PATTERN_OFFSETS
 193 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 194 #endif
 195 #define RExC_emit       (pRExC_state->emit)
 196 #define RExC_emit_start (pRExC_state->emit_start)
 197 #define RExC_emit_bound (pRExC_state->emit_bound)
 198 #define RExC_naughty    (pRExC_state->naughty)
 199 #define RExC_sawback    (pRExC_state->sawback)
 200 #define RExC_seen       (pRExC_state->seen)
 201 #define RExC_size       (pRExC_state->size)
 202 #define RExC_npar       (pRExC_state->npar)
 203 #define RExC_nestroot   (pRExC_state->nestroot)
 204 #define RExC_extralen   (pRExC_state->extralen)
 205 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 206 #define RExC_utf8       (pRExC_state->utf8)
 207 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 208 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 209 #define RExC_open_parens        (pRExC_state->open_parens)
 210 #define RExC_close_parens       (pRExC_state->close_parens)
 211 #define RExC_opend      (pRExC_state->opend)
 212 #define RExC_paren_names        (pRExC_state->paren_names)
 213 #define RExC_recurse    (pRExC_state->recurse)
 214 #define RExC_recurse_count      (pRExC_state->recurse_count)
 215 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 216 #define RExC_contains_locale    (pRExC_state->contains_locale)
 217 #define RExC_override_recoding (pRExC_state->override_recoding)
 218 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 219
 220
 221 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 222 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 223         ((*s) == '{' && regcurly(s)))
 224
 225 #ifdef SPSTART
 226 #undef SPSTART          /* dratted cpp namespace... */
 227 #endif
 228 /*
 229  * Flags to be passed up and down.
 230  */
 231 #define WORST           0       /* Worst case. */
 232 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 233
 234 /* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
 235  * character.  Note that this is not the same thing as REGNODE_SIMPLE */
 236 #define SIMPLE          0x02
 237 #define SPSTART         0x04    /* Starts with * or +. */
 238 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 239 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 240
 241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 242
 243 /* whether trie related optimizations are enabled */
 244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 245 #define TRIE_STUDY_OPT
 246 #define FULL_TRIE_STUDY
 247 #define TRIE_STCLASS
 248 #endif
 249
 250
 251
 252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 253 #define PBITVAL(paren) (1 << ((paren) & 7))
 254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 257
 258 /* If not already in utf8, do a longjmp back to the beginning */
 259 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 260 #define REQUIRE_UTF8    STMT_START {                                       \
 261                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 262                         } STMT_END
 263
 264 /* About scan_data_t.
 265
 266   During optimisation we recurse through the regexp program performing
 267   various inplace (keyhole style) optimisations. In addition study_chunk
 268   and scan_commit populate this data structure with information about
 269   what strings MUST appear in the pattern. We look for the longest
 270   string that must appear at a fixed location, and we look for the
 271   longest string that may appear at a floating location. So for instance
 272   in the pattern:
 273
 274     /FOO[xX]A.*B[xX]BAR/
 275
 276   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 277   strings (because they follow a .* construct). study_chunk will identify
 278   both FOO and BAR as being the longest fixed and floating strings respectively.
 279
 280   The strings can be composites, for instance
 281
 282      /(f)(o)(o)/
 283
 284   will result in a composite fixed substring 'foo'.
 285
 286   For each string some basic information is maintained:
 287
 288   - offset or min_offset
 289     This is the position the string must appear at, or not before.
 290     It also implicitly (when combined with minlenp) tells us how many
 291     characters must match before the string we are searching for.
 292     Likewise when combined with minlenp and the length of the string it
 293     tells us how many characters must appear after the string we have
 294     found.
 295
 296   - max_offset
 297     Only used for floating strings. This is the rightmost point that
 298     the string can appear at. If set to I32 max it indicates that the
 299     string can occur infinitely far to the right.
 300
 301   - minlenp
 302     A pointer to the minimum number of characters of the pattern that the
 303     string was found inside. This is important as in the case of positive
 304     lookahead or positive lookbehind we can have multiple patterns
 305     involved. Consider
 306
 307     /(?=FOO).*F/
 308
 309     The minimum length of the pattern overall is 3, the minimum length
 310     of the lookahead part is 3, but the minimum length of the part that
 311     will actually match is 1. So 'FOO's minimum length is 3, but the
 312     minimum length for the F is 1. This is important as the minimum length
 313     is used to determine offsets in front of and behind the string being
 314     looked for.  Since strings can be composites this is the length of the
 315     pattern at the time it was committed with a scan_commit. Note that
 316     the length is calculated by study_chunk, so that the minimum lengths
 317     are not known until the full pattern has been compiled, thus the
 318     pointer to the value.
 319
 320   - lookbehind
 321
 322     In the case of lookbehind the string being searched for can be
 323     offset past the start point of the final matching string.
 324     If this value was just blithely removed from the min_offset it would
 325     invalidate some of the calculations for how many chars must match
 326     before or after (as they are derived from min_offset and minlen and
 327     the length of the string being searched for).
 328     When the final pattern is compiled and the data is moved from the
 329     scan_data_t structure into the regexp structure the information
 330     about lookbehind is factored in, with the information that would
 331     have been lost precalculated in the end_shift field for the
 332     associated string.
 333
 334   The fields pos_min and pos_delta are used to store the minimum offset
 335   and the delta to the maximum offset at the current point in the pattern.
 336
 337 */
 338
 339 typedef struct scan_data_t {
 340     /*I32 len_min;      unused */
 341     /*I32 len_delta;    unused */
 342     I32 pos_min;
 343     I32 pos_delta;
 344     SV *last_found;
 345     I32 last_end;           /* min value, <0 unless valid. */
 346     I32 last_start_min;
 347     I32 last_start_max;
 348     SV **longest;           /* Either &l_fixed, or &l_float. */
 349     SV *longest_fixed;      /* longest fixed string found in pattern */
 350     I32 offset_fixed;       /* offset where it starts */
 351     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 352     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 353     SV *longest_float;      /* longest floating string found in pattern */
 354     I32 offset_float_min;   /* earliest point in string it can appear */
 355     I32 offset_float_max;   /* latest point in string it can appear */
 356     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 357     I32 lookbehind_float;   /* is the position of the string modified by LB */
 358     I32 flags;
 359     I32 whilem_c;
 360     I32 *last_closep;
 361     struct regnode_charclass_class *start_class;
 362 } scan_data_t;
 363
 364 /*
 365  * Forward declarations for pregcomp()'s friends.
 366  */
 367
 368 static const scan_data_t zero_scan_data =
 369   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 370
 371 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 372 #define SF_BEFORE_SEOL          0x0001
 373 #define SF_BEFORE_MEOL          0x0002
 374 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 375 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 376
 377 #ifdef NO_UNARY_PLUS
 378 #  define SF_FIX_SHIFT_EOL      (0+2)
 379 #  define SF_FL_SHIFT_EOL               (0+4)
 380 #else
 381 #  define SF_FIX_SHIFT_EOL      (+2)
 382 #  define SF_FL_SHIFT_EOL               (+4)
 383 #endif
 384
 385 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 386 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 387
 388 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 389 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 390 #define SF_IS_INF               0x0040
 391 #define SF_HAS_PAR              0x0080
 392 #define SF_IN_PAR               0x0100
 393 #define SF_HAS_EVAL             0x0200
 394 #define SCF_DO_SUBSTR           0x0400
 395 #define SCF_DO_STCLASS_AND      0x0800
 396 #define SCF_DO_STCLASS_OR       0x1000
 397 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 398 #define SCF_WHILEM_VISITED_POS  0x2000
 399
 400 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 401 #define SCF_SEEN_ACCEPT         0x8000
 402
 403 #define UTF cBOOL(RExC_utf8)
 404
 405 /* The enums for all these are ordered so things work out correctly */
 406 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 407 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 408 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 409 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 410 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 411 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 412 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 413
 414 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 415
 416 #define OOB_NAMEDCLASS          -1
 417
 418 /* There is no code point that is out-of-bounds, so this is problematic.  But
 419  * its only current use is to initialize a variable that is always set before
 420  * looked at. */
 421 #define OOB_UNICODE             0xDEADBEEF
 422
 423 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 424 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 425
 426
 427 /* length of regex to show in messages that don't mark a position within */
 428 #define RegexLengthToShowInErrorMessages 127
 429
 430 /*
 431  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 432  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 433  * op/pragma/warn/regcomp.
 434  */
 435 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 436 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 437
 438 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 439
 440 /*
 441  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 442  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 443  * "...".
 444  */
 445 #define _FAIL(code) STMT_START {                                        \
 446     const char *ellipses = "";                                          \
 447     IV len = RExC_end - RExC_precomp;                                   \
 448                                                                         \
 449     if (!SIZE_ONLY)                                                     \
 450         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 451     if (len > RegexLengthToShowInErrorMessages) {                       \
 452         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 453         len = RegexLengthToShowInErrorMessages - 10;                    \
 454         ellipses = "...";                                               \
 455     }                                                                   \
 456     code;                                                               \
 457 } STMT_END
 458
 459 #define FAIL(msg) _FAIL(                            \
 460     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 461             msg, (int)len, RExC_precomp, ellipses))
 462
 463 #define FAIL2(msg,arg) _FAIL(                       \
 464     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 465             arg, (int)len, RExC_precomp, ellipses))
 466
 467 /*
 468  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 469  */
 470 #define Simple_vFAIL(m) STMT_START {                                    \
 471     const IV offset = RExC_parse - RExC_precomp;                        \
 472     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 473             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 474 } STMT_END
 475
 476 /*
 477  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 478  */
 479 #define vFAIL(m) STMT_START {                           \
 480     if (!SIZE_ONLY)                                     \
 481         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 482     Simple_vFAIL(m);                                    \
 483 } STMT_END
 484
 485 /*
 486  * Like Simple_vFAIL(), but accepts two arguments.
 487  */
 488 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 489     const IV offset = RExC_parse - RExC_precomp;                        \
 490     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 491             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 492 } STMT_END
 493
 494 /*
 495  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 496  */
 497 #define vFAIL2(m,a1) STMT_START {                       \
 498     if (!SIZE_ONLY)                                     \
 499         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 500     Simple_vFAIL2(m, a1);                               \
 501 } STMT_END
 502
 503
 504 /*
 505  * Like Simple_vFAIL(), but accepts three arguments.
 506  */
 507 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 508     const IV offset = RExC_parse - RExC_precomp;                \
 509     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 510             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 511 } STMT_END
 512
 513 /*
 514  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 515  */
 516 #define vFAIL3(m,a1,a2) STMT_START {                    \
 517     if (!SIZE_ONLY)                                     \
 518         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 519     Simple_vFAIL3(m, a1, a2);                           \
 520 } STMT_END
 521
 522 /*
 523  * Like Simple_vFAIL(), but accepts four arguments.
 524  */
 525 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 526     const IV offset = RExC_parse - RExC_precomp;                \
 527     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 528             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 529 } STMT_END
 530
 531 #define ckWARNreg(loc,m) STMT_START {                                   \
 532     const IV offset = loc - RExC_precomp;                               \
 533     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 534             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 535 } STMT_END
 536
 537 #define ckWARNregdep(loc,m) STMT_START {                                \
 538     const IV offset = loc - RExC_precomp;                               \
 539     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 540             m REPORT_LOCATION,                                          \
 541             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 542 } STMT_END
 543
 544 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 545     const IV offset = loc - RExC_precomp;                               \
 546     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 547             m REPORT_LOCATION,                                          \
 548             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 549 } STMT_END
 550
 551 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 552     const IV offset = loc - RExC_precomp;                               \
 553     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 554             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 555 } STMT_END
 556
 557 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 558     const IV offset = loc - RExC_precomp;                               \
 559     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 560             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 561 } STMT_END
 562
 563 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 564     const IV offset = loc - RExC_precomp;                               \
 565     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 566             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 567 } STMT_END
 568
 569 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 570     const IV offset = loc - RExC_precomp;                               \
 571     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 572             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 573 } STMT_END
 574
 575 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 576     const IV offset = loc - RExC_precomp;                               \
 577     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 578             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 579 } STMT_END
 580
 581 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 582     const IV offset = loc - RExC_precomp;                               \
 583     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 584             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 585 } STMT_END
 586
 587
 588 /* Allow for side effects in s */
 589 #define REGC(c,s) STMT_START {                  \
 590     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 591 } STMT_END
 592
 593 /* Macros for recording node offsets.   20001227 mjd@plover.com
 594  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 595  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 596  * Element 0 holds the number n.
 597  * Position is 1 indexed.
 598  */
 599 #ifndef RE_TRACK_PATTERN_OFFSETS
 600 #define Set_Node_Offset_To_R(node,byte)
 601 #define Set_Node_Offset(node,byte)
 602 #define Set_Cur_Node_Offset
 603 #define Set_Node_Length_To_R(node,len)
 604 #define Set_Node_Length(node,len)
 605 #define Set_Node_Cur_Length(node)
 606 #define Node_Offset(n)
 607 #define Node_Length(n)
 608 #define Set_Node_Offset_Length(node,offset,len)
 609 #define ProgLen(ri) ri->u.proglen
 610 #define SetProgLen(ri,x) ri->u.proglen = x
 611 #else
 612 #define ProgLen(ri) ri->u.offsets[0]
 613 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 614 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 615     if (! SIZE_ONLY) {                                                  \
 616         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 617                     __LINE__, (int)(node), (int)(byte)));               \
 618         if((node) < 0) {                                                \
 619             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 620         } else {                                                        \
 621             RExC_offsets[2*(node)-1] = (byte);                          \
 622         }                                                               \
 623     }                                                                   \
 624 } STMT_END
 625
 626 #define Set_Node_Offset(node,byte) \
 627     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 628 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 629
 630 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 631     if (! SIZE_ONLY) {                                                  \
 632         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 633                 __LINE__, (int)(node), (int)(len)));                    \
 634         if((node) < 0) {                                                \
 635             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 636         } else {                                                        \
 637             RExC_offsets[2*(node)] = (len);                             \
 638         }                                                               \
 639     }                                                                   \
 640 } STMT_END
 641
 642 #define Set_Node_Length(node,len) \
 643     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 644 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 645 #define Set_Node_Cur_Length(node) \
 646     Set_Node_Length(node, RExC_parse - parse_start)
 647
 648 /* Get offsets and lengths */
 649 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 650 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 651
 652 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 653     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 654     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 655 } STMT_END
 656 #endif
 657
 658 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 659 #define EXPERIMENTAL_INPLACESCAN
 660 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 661
 662 #define DEBUG_STUDYDATA(str,data,depth)                              \
 663 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 664     PerlIO_printf(Perl_debug_log,                                    \
 665         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 666         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 667         (int)(depth)*2, "",                                          \
 668         (IV)((data)->pos_min),                                       \
 669         (IV)((data)->pos_delta),                                     \
 670         (UV)((data)->flags),                                         \
 671         (IV)((data)->whilem_c),                                      \
 672         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 673         is_inf ? "INF " : ""                                         \
 674     );                                                               \
 675     if ((data)->last_found)                                          \
 676         PerlIO_printf(Perl_debug_log,                                \
 677             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 678             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 679             SvPVX_const((data)->last_found),                         \
 680             (IV)((data)->last_end),                                  \
 681             (IV)((data)->last_start_min),                            \
 682             (IV)((data)->last_start_max),                            \
 683             ((data)->longest &&                                      \
 684              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 685             SvPVX_const((data)->longest_fixed),                      \
 686             (IV)((data)->offset_fixed),                              \
 687             ((data)->longest &&                                      \
 688              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 689             SvPVX_const((data)->longest_float),                      \
 690             (IV)((data)->offset_float_min),                          \
 691             (IV)((data)->offset_float_max)                           \
 692         );                                                           \
 693     PerlIO_printf(Perl_debug_log,"\n");                              \
 694 });
 695
 696 static void clear_re(pTHX_ void *r);
 697
 698 /* Mark that we cannot extend a found fixed substring at this point.
 699    Update the longest found anchored substring and the longest found
 700    floating substrings if needed. */
 701
 702 STATIC void
 703 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 704 {
 705     const STRLEN l = CHR_SVLEN(data->last_found);
 706     const STRLEN old_l = CHR_SVLEN(*data->longest);
 707     GET_RE_DEBUG_FLAGS_DECL;
 708
 709     PERL_ARGS_ASSERT_SCAN_COMMIT;
 710
 711     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 712         SvSetMagicSV(*data->longest, data->last_found);
 713         if (*data->longest == data->longest_fixed) {
 714             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 715             if (data->flags & SF_BEFORE_EOL)
 716                 data->flags
 717                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 718             else
 719                 data->flags &= ~SF_FIX_BEFORE_EOL;
 720             data->minlen_fixed=minlenp;
 721             data->lookbehind_fixed=0;
 722         }
 723         else { /* *data->longest == data->longest_float */
 724             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 725             data->offset_float_max = (l
 726                                       ? data->last_start_max
 727                                       : data->pos_min + data->pos_delta);
 728             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 729                 data->offset_float_max = I32_MAX;
 730             if (data->flags & SF_BEFORE_EOL)
 731                 data->flags
 732                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 733             else
 734                 data->flags &= ~SF_FL_BEFORE_EOL;
 735             data->minlen_float=minlenp;
 736             data->lookbehind_float=0;
 737         }
 738     }
 739     SvCUR_set(data->last_found, 0);
 740     {
 741         SV * const sv = data->last_found;
 742         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 743             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 744             if (mg)
 745                 mg->mg_len = 0;
 746         }
 747     }
 748     data->last_end = -1;
 749     data->flags &= ~SF_BEFORE_EOL;
 750     DEBUG_STUDYDATA("commit: ",data,0);
 751 }
 752
 753 /* Can match anything (initialization) */
 754 STATIC void
 755 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 756 {
 757     PERL_ARGS_ASSERT_CL_ANYTHING;
 758
 759     ANYOF_BITMAP_SETALL(cl);
 760     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 761                 |ANYOF_NON_UTF8_LATIN1_ALL;
 762
 763     /* If any portion of the regex is to operate under locale rules,
 764      * initialization includes it.  The reason this isn't done for all regexes
 765      * is that the optimizer was written under the assumption that locale was
 766      * all-or-nothing.  Given the complexity and lack of documentation in the
 767      * optimizer, and that there are inadequate test cases for locale, so many
 768      * parts of it may not work properly, it is safest to avoid locale unless
 769      * necessary. */
 770     if (RExC_contains_locale) {
 771         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 772         cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
 773     }
 774     else {
 775         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 776     }
 777 }
 778
 779 /* Can match anything (initialization) */
 780 STATIC int
 781 S_cl_is_anything(const struct regnode_charclass_class *cl)
 782 {
 783     int value;
 784
 785     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 786
 787     for (value = 0; value <= ANYOF_MAX; value += 2)
 788         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 789             return 1;
 790     if (!(cl->flags & ANYOF_UNICODE_ALL))
 791         return 0;
 792     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 793         return 0;
 794     return 1;
 795 }
 796
 797 /* Can match anything (initialization) */
 798 STATIC void
 799 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 800 {
 801     PERL_ARGS_ASSERT_CL_INIT;
 802
 803     Zero(cl, 1, struct regnode_charclass_class);
 804     cl->type = ANYOF;
 805     cl_anything(pRExC_state, cl);
 806     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 807 }
 808
 809 /* These two functions currently do the exact same thing */
 810 #define cl_init_zero            S_cl_init
 811
 812 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 813  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 814  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 815 STATIC void
 816 S_cl_and(struct regnode_charclass_class *cl,
 817         const struct regnode_charclass_class *and_with)
 818 {
 819     PERL_ARGS_ASSERT_CL_AND;
 820
 821     assert(and_with->type == ANYOF);
 822
 823     /* I (khw) am not sure all these restrictions are necessary XXX */
 824     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 825         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 826         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 827         && !(and_with->flags & ANYOF_LOC_FOLD)
 828         && !(cl->flags & ANYOF_LOC_FOLD)) {
 829         int i;
 830
 831         if (and_with->flags & ANYOF_INVERT)
 832             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 833                 cl->bitmap[i] &= ~and_with->bitmap[i];
 834         else
 835             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 836                 cl->bitmap[i] &= and_with->bitmap[i];
 837     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 838
 839     if (and_with->flags & ANYOF_INVERT) {
 840
 841         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 842          * aren't affected by the inversion.  Those that are affected are
 843          * handled individually below */
 844         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 845         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 846         cl->flags |= affected_flags;
 847
 848         /* We currently don't know how to deal with things that aren't in the
 849          * bitmap, but we know that the intersection is no greater than what
 850          * is already in cl, so let there be false positives that get sorted
 851          * out after the synthetic start class succeeds, and the node is
 852          * matched for real. */
 853
 854         /* The inversion of these two flags indicate that the resulting
 855          * intersection doesn't have them */
 856         if (and_with->flags & ANYOF_UNICODE_ALL) {
 857             cl->flags &= ~ANYOF_UNICODE_ALL;
 858         }
 859         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 860             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 861         }
 862     }
 863     else {   /* and'd node is not inverted */
 864         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 865
 866         if (! ANYOF_NONBITMAP(and_with)) {
 867
 868             /* Here 'and_with' doesn't match anything outside the bitmap
 869              * (except possibly ANYOF_UNICODE_ALL), which means the
 870              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 871              * which case we don't know what the intersection is, but it's no
 872              * greater than what cl already has, so can just leave it alone,
 873              * with possible false positives */
 874             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 875                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 876                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 877             }
 878         }
 879         else if (! ANYOF_NONBITMAP(cl)) {
 880
 881             /* Here, 'and_with' does match something outside the bitmap, and cl
 882              * doesn't have a list of things to match outside the bitmap.  If
 883              * cl can match all code points above 255, the intersection will
 884              * be those above-255 code points that 'and_with' matches.  If cl
 885              * can't match all Unicode code points, it means that it can't
 886              * match anything outside the bitmap (since the 'if' that got us
 887              * into this block tested for that), so we leave the bitmap empty.
 888              */
 889             if (cl->flags & ANYOF_UNICODE_ALL) {
 890                 ARG_SET(cl, ARG(and_with));
 891
 892                 /* and_with's ARG may match things that don't require UTF8.
 893                  * And now cl's will too, in spite of this being an 'and'.  See
 894                  * the comments below about the kludge */
 895                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 896             }
 897         }
 898         else {
 899             /* Here, both 'and_with' and cl match something outside the
 900              * bitmap.  Currently we do not do the intersection, so just match
 901              * whatever cl had at the beginning.  */
 902         }
 903
 904
 905         /* Take the intersection of the two sets of flags.  However, the
 906          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 907          * kludge around the fact that this flag is not treated like the others
 908          * which are initialized in cl_anything().  The way the optimizer works
 909          * is that the synthetic start class (SSC) is initialized to match
 910          * anything, and then the first time a real node is encountered, its
 911          * values are AND'd with the SSC's with the result being the values of
 912          * the real node.  However, there are paths through the optimizer where
 913          * the AND never gets called, so those initialized bits are set
 914          * inappropriately, which is not usually a big deal, as they just cause
 915          * false positives in the SSC, which will just mean a probably
 916          * imperceptible slow down in execution.  However this bit has a
 917          * higher false positive consequence in that it can cause utf8.pm,
 918          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 919          * bigger slowdown and also causes significant extra memory to be used.
 920          * In order to prevent this, the code now takes a different tack.  The
 921          * bit isn't set unless some part of the regular expression needs it,
 922          * but once set it won't get cleared.  This means that these extra
 923          * modules won't get loaded unless there was some path through the
 924          * pattern that would have required them anyway, and  so any false
 925          * positives that occur by not ANDing them out when they could be
 926          * aren't as severe as they would be if we treated this bit like all
 927          * the others */
 928         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 929                                       & ANYOF_NONBITMAP_NON_UTF8;
 930         cl->flags &= and_with->flags;
 931         cl->flags |= outside_bitmap_but_not_utf8;
 932     }
 933 }
 934
 935 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 936  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 937  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 938 STATIC void
 939 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 940 {
 941     PERL_ARGS_ASSERT_CL_OR;
 942
 943     if (or_with->flags & ANYOF_INVERT) {
 944
 945         /* Here, the or'd node is to be inverted.  This means we take the
 946          * complement of everything not in the bitmap, but currently we don't
 947          * know what that is, so give up and match anything */
 948         if (ANYOF_NONBITMAP(or_with)) {
 949             cl_anything(pRExC_state, cl);
 950         }
 951         /* We do not use
 952          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 953          *   <= (B1 | !B2) | (CL1 | !CL2)
 954          * which is wasteful if CL2 is small, but we ignore CL2:
 955          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 956          * XXXX Can we handle case-fold?  Unclear:
 957          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 958          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 959          */
 960         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 961              && !(or_with->flags & ANYOF_LOC_FOLD)
 962              && !(cl->flags & ANYOF_LOC_FOLD) ) {
 963             int i;
 964
 965             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 966                 cl->bitmap[i] |= ~or_with->bitmap[i];
 967         } /* XXXX: logic is complicated otherwise */
 968         else {
 969             cl_anything(pRExC_state, cl);
 970         }
 971
 972         /* And, we can just take the union of the flags that aren't affected
 973          * by the inversion */
 974         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 975
 976         /* For the remaining flags:
 977             ANYOF_UNICODE_ALL and inverted means to not match anything above
 978                     255, which means that the union with cl should just be
 979                     what cl has in it, so can ignore this flag
 980             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 981                     is 127-255 to match them, but then invert that, so the
 982                     union with cl should just be what cl has in it, so can
 983                     ignore this flag
 984          */
 985     } else {    /* 'or_with' is not inverted */
 986         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 987         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 988              && (!(or_with->flags & ANYOF_LOC_FOLD)
 989                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
 990             int i;
 991
 992             /* OR char bitmap and class bitmap separately */
 993             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 994                 cl->bitmap[i] |= or_with->bitmap[i];
 995             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 996                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 997                     cl->classflags[i] |= or_with->classflags[i];
 998                 cl->flags |= ANYOF_CLASS;
 999             }
1000         }
1001         else { /* XXXX: logic is complicated, leave it along for a moment. */
1002             cl_anything(pRExC_state, cl);
1003         }
1004
1005         if (ANYOF_NONBITMAP(or_with)) {
1006
1007             /* Use the added node's outside-the-bit-map match if there isn't a
1008              * conflict.  If there is a conflict (both nodes match something
1009              * outside the bitmap, but what they match outside is not the same
1010              * pointer, and hence not easily compared until XXX we extend
1011              * inversion lists this far), give up and allow the start class to
1012              * match everything outside the bitmap.  If that stuff is all above
1013              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1014             if (! ANYOF_NONBITMAP(cl)) {
1015                 ARG_SET(cl, ARG(or_with));
1016             }
1017             else if (ARG(cl) != ARG(or_with)) {
1018
1019                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1020                     cl_anything(pRExC_state, cl);
1021                 }
1022                 else {
1023                     cl->flags |= ANYOF_UNICODE_ALL;
1024                 }
1025             }
1026         }
1027
1028         /* Take the union */
1029         cl->flags |= or_with->flags;
1030     }
1031 }
1032
1033 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1034 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1035 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1036 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1037
1038
1039 #ifdef DEBUGGING
1040 /*
1041    dump_trie(trie,widecharmap,revcharmap)
1042    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1043    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1044
1045    These routines dump out a trie in a somewhat readable format.
1046    The _interim_ variants are used for debugging the interim
1047    tables that are used to generate the final compressed
1048    representation which is what dump_trie expects.
1049
1050    Part of the reason for their existence is to provide a form
1051    of documentation as to how the different representations function.
1052
1053 */
1054
1055 /*
1056   Dumps the final compressed table form of the trie to Perl_debug_log.
1057   Used for debugging make_trie().
1058 */
1059
1060 STATIC void
1061 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1062             AV *revcharmap, U32 depth)
1063 {
1064     U32 state;
1065     SV *sv=sv_newmortal();
1066     int colwidth= widecharmap ? 6 : 4;
1067     U16 word;
1068     GET_RE_DEBUG_FLAGS_DECL;
1069
1070     PERL_ARGS_ASSERT_DUMP_TRIE;
1071
1072     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1073         (int)depth * 2 + 2,"",
1074         "Match","Base","Ofs" );
1075
1076     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1077         SV ** const tmp = av_fetch( revcharmap, state, 0);
1078         if ( tmp ) {
1079             PerlIO_printf( Perl_debug_log, "%*s",
1080                 colwidth,
1081                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1082                             PL_colors[0], PL_colors[1],
1083                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1084                             PERL_PV_ESCAPE_FIRSTCHAR
1085                 )
1086             );
1087         }
1088     }
1089     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1090         (int)depth * 2 + 2,"");
1091
1092     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1093         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1094     PerlIO_printf( Perl_debug_log, "\n");
1095
1096     for( state = 1 ; state < trie->statecount ; state++ ) {
1097         const U32 base = trie->states[ state ].trans.base;
1098
1099         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1100
1101         if ( trie->states[ state ].wordnum ) {
1102             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1103         } else {
1104             PerlIO_printf( Perl_debug_log, "%6s", "" );
1105         }
1106
1107         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1108
1109         if ( base ) {
1110             U32 ofs = 0;
1111
1112             while( ( base + ofs  < trie->uniquecharcount ) ||
1113                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1114                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1115                     ofs++;
1116
1117             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1118
1119             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1120                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1121                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1122                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1123                 {
1124                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1125                     colwidth,
1126                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1127                 } else {
1128                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1129                 }
1130             }
1131
1132             PerlIO_printf( Perl_debug_log, "]");
1133
1134         }
1135         PerlIO_printf( Perl_debug_log, "\n" );
1136     }
1137     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1138     for (word=1; word <= trie->wordcount; word++) {
1139         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1140             (int)word, (int)(trie->wordinfo[word].prev),
1141             (int)(trie->wordinfo[word].len));
1142     }
1143     PerlIO_printf(Perl_debug_log, "\n" );
1144 }
1145 /*
1146   Dumps a fully constructed but uncompressed trie in list form.
1147   List tries normally only are used for construction when the number of
1148   possible chars (trie->uniquecharcount) is very high.
1149   Used for debugging make_trie().
1150 */
1151 STATIC void
1152 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1153                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1154                          U32 depth)
1155 {
1156     U32 state;
1157     SV *sv=sv_newmortal();
1158     int colwidth= widecharmap ? 6 : 4;
1159     GET_RE_DEBUG_FLAGS_DECL;
1160
1161     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1162
1163     /* print out the table precompression.  */
1164     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1165         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1166         "------:-----+-----------------\n" );
1167
1168     for( state=1 ; state < next_alloc ; state ++ ) {
1169         U16 charid;
1170
1171         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1172             (int)depth * 2 + 2,"", (UV)state  );
1173         if ( ! trie->states[ state ].wordnum ) {
1174             PerlIO_printf( Perl_debug_log, "%5s| ","");
1175         } else {
1176             PerlIO_printf( Perl_debug_log, "W%4x| ",
1177                 trie->states[ state ].wordnum
1178             );
1179         }
1180         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1181             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1182             if ( tmp ) {
1183                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1184                     colwidth,
1185                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1186                             PL_colors[0], PL_colors[1],
1187                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1188                             PERL_PV_ESCAPE_FIRSTCHAR
1189                     ) ,
1190                     TRIE_LIST_ITEM(state,charid).forid,
1191                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1192                 );
1193                 if (!(charid % 10))
1194                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1195                         (int)((depth * 2) + 14), "");
1196             }
1197         }
1198         PerlIO_printf( Perl_debug_log, "\n");
1199     }
1200 }
1201
1202 /*
1203   Dumps a fully constructed but uncompressed trie in table form.
1204   This is the normal DFA style state transition table, with a few
1205   twists to facilitate compression later.
1206   Used for debugging make_trie().
1207 */
1208 STATIC void
1209 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1210                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1211                           U32 depth)
1212 {
1213     U32 state;
1214     U16 charid;
1215     SV *sv=sv_newmortal();
1216     int colwidth= widecharmap ? 6 : 4;
1217     GET_RE_DEBUG_FLAGS_DECL;
1218
1219     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1220
1221     /*
1222        print out the table precompression so that we can do a visual check
1223        that they are identical.
1224      */
1225
1226     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1227
1228     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1229         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1230         if ( tmp ) {
1231             PerlIO_printf( Perl_debug_log, "%*s",
1232                 colwidth,
1233                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1234                             PL_colors[0], PL_colors[1],
1235                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1236                             PERL_PV_ESCAPE_FIRSTCHAR
1237                 )
1238             );
1239         }
1240     }
1241
1242     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1243
1244     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1245         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1246     }
1247
1248     PerlIO_printf( Perl_debug_log, "\n" );
1249
1250     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1251
1252         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1253             (int)depth * 2 + 2,"",
1254             (UV)TRIE_NODENUM( state ) );
1255
1256         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1257             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1258             if (v)
1259                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1260             else
1261                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1262         }
1263         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1264             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1265         } else {
1266             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1267             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1268         }
1269     }
1270 }
1271
1272 #endif
1273
1274
1275 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1276   startbranch: the first branch in the whole branch sequence
1277   first      : start branch of sequence of branch-exact nodes.
1278                May be the same as startbranch
1279   last       : Thing following the last branch.
1280                May be the same as tail.
1281   tail       : item following the branch sequence
1282   count      : words in the sequence
1283   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1284   depth      : indent depth
1285
1286 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1287
1288 A trie is an N'ary tree where the branches are determined by digital
1289 decomposition of the key. IE, at the root node you look up the 1st character and
1290 follow that branch repeat until you find the end of the branches. Nodes can be
1291 marked as "accepting" meaning they represent a complete word. Eg:
1292
1293   /he|she|his|hers/
1294
1295 would convert into the following structure. Numbers represent states, letters
1296 following numbers represent valid transitions on the letter from that state, if
1297 the number is in square brackets it represents an accepting state, otherwise it
1298 will be in parenthesis.
1299
1300       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1301       |    |
1302       |   (2)
1303       |    |
1304      (1)   +-i->(6)-+-s->[7]
1305       |
1306       +-s->(3)-+-h->(4)-+-e->[5]
1307
1308       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1309
1310 This shows that when matching against the string 'hers' we will begin at state 1
1311 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1312 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1313 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1314 single traverse. We store a mapping from accepting to state to which word was
1315 matched, and then when we have multiple possibilities we try to complete the
1316 rest of the regex in the order in which they occured in the alternation.
1317
1318 The only prior NFA like behaviour that would be changed by the TRIE support is
1319 the silent ignoring of duplicate alternations which are of the form:
1320
1321  / (DUPE|DUPE) X? (?{ ... }) Y /x
1322
1323 Thus EVAL blocks following a trie may be called a different number of times with
1324 and without the optimisation. With the optimisations dupes will be silently
1325 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1326 the following demonstrates:
1327
1328  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1329
1330 which prints out 'word' three times, but
1331
1332  'words'=~/(word|word|word)(?{ print $1 })S/
1333
1334 which doesnt print it out at all. This is due to other optimisations kicking in.
1335
1336 Example of what happens on a structural level:
1337
1338 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1339
1340    1: CURLYM[1] {1,32767}(18)
1341    5:   BRANCH(8)
1342    6:     EXACT <ac>(16)
1343    8:   BRANCH(11)
1344    9:     EXACT <ad>(16)
1345   11:   BRANCH(14)
1346   12:     EXACT <ab>(16)
1347   16:   SUCCEED(0)
1348   17:   NOTHING(18)
1349   18: END(0)
1350
1351 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1352 and should turn into:
1353
1354    1: CURLYM[1] {1,32767}(18)
1355    5:   TRIE(16)
1356         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1357           <ac>
1358           <ad>
1359           <ab>
1360   16:   SUCCEED(0)
1361   17:   NOTHING(18)
1362   18: END(0)
1363
1364 Cases where tail != last would be like /(?foo|bar)baz/:
1365
1366    1: BRANCH(4)
1367    2:   EXACT <foo>(8)
1368    4: BRANCH(7)
1369    5:   EXACT <bar>(8)
1370    7: TAIL(8)
1371    8: EXACT <baz>(10)
1372   10: END(0)
1373
1374 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1375 and would end up looking like:
1376
1377     1: TRIE(8)
1378       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1379         <foo>
1380         <bar>
1381    7: TAIL(8)
1382    8: EXACT <baz>(10)
1383   10: END(0)
1384
1385     d = uvuni_to_utf8_flags(d, uv, 0);
1386
1387 is the recommended Unicode-aware way of saying
1388
1389     *(d++) = uv;
1390 */
1391
1392 #define TRIE_STORE_REVCHAR(val)                                            \
1393     STMT_START {                                                           \
1394         if (UTF) {                                                         \
1395             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1396             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1397             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1398             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1399             SvPOK_on(zlopp);                                               \
1400             SvUTF8_on(zlopp);                                              \
1401             av_push(revcharmap, zlopp);                                    \
1402         } else {                                                           \
1403             char ooooff = (char)val;                                           \
1404             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1405         }                                                                  \
1406         } STMT_END
1407
1408 #define TRIE_READ_CHAR STMT_START {                                                     \
1409     wordlen++;                                                                          \
1410     if ( UTF ) {                                                                        \
1411         /* if it is UTF then it is either already folded, or does not need folding */   \
1412         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1413     }                                                                                   \
1414     else if (folder == PL_fold_latin1) {                                                \
1415         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1416         if ( foldlen > 0 ) {                                                            \
1417            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1418            foldlen -= len;                                                              \
1419            scan += len;                                                                 \
1420            len = 0;                                                                     \
1421         } else {                                                                        \
1422             len = 1;                                                                    \
1423             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1424             skiplen = UNISKIP(uvc);                                                     \
1425             foldlen -= skiplen;                                                         \
1426             scan = foldbuf + skiplen;                                                   \
1427         }                                                                               \
1428     } else {                                                                            \
1429         /* raw data, will be folded later if needed */                                  \
1430         uvc = (U32)*uc;                                                                 \
1431         len = 1;                                                                        \
1432     }                                                                                   \
1433 } STMT_END
1434
1435
1436
1437 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1438     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1439         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1440         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1441     }                                                           \
1442     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1443     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1444     TRIE_LIST_CUR( state )++;                                   \
1445 } STMT_END
1446
1447 #define TRIE_LIST_NEW(state) STMT_START {                       \
1448     Newxz( trie->states[ state ].trans.list,               \
1449         4, reg_trie_trans_le );                                 \
1450      TRIE_LIST_CUR( state ) = 1;                                \
1451      TRIE_LIST_LEN( state ) = 4;                                \
1452 } STMT_END
1453
1454 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1455     U16 dupe= trie->states[ state ].wordnum;                    \
1456     regnode * const noper_next = regnext( noper );              \
1457                                                                 \
1458     DEBUG_r({                                                   \
1459         /* store the word for dumping */                        \
1460         SV* tmp;                                                \
1461         if (OP(noper) != NOTHING)                               \
1462             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1463         else                                                    \
1464             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1465         av_push( trie_words, tmp );                             \
1466     });                                                         \
1467                                                                 \
1468     curword++;                                                  \
1469     trie->wordinfo[curword].prev   = 0;                         \
1470     trie->wordinfo[curword].len    = wordlen;                   \
1471     trie->wordinfo[curword].accept = state;                     \
1472                                                                 \
1473     if ( noper_next < tail ) {                                  \
1474         if (!trie->jump)                                        \
1475             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1476         trie->jump[curword] = (U16)(noper_next - convert);      \
1477         if (!jumper)                                            \
1478             jumper = noper_next;                                \
1479         if (!nextbranch)                                        \
1480             nextbranch= regnext(cur);                           \
1481     }                                                           \
1482                                                                 \
1483     if ( dupe ) {                                               \
1484         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1485         /* chain, so that when the bits of chain are later    */\
1486         /* linked together, the dups appear in the chain      */\
1487         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1488         trie->wordinfo[dupe].prev = curword;                    \
1489     } else {                                                    \
1490         /* we haven't inserted this word yet.                */ \
1491         trie->states[ state ].wordnum = curword;                \
1492     }                                                           \
1493 } STMT_END
1494
1495
1496 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1497      ( ( base + charid >=  ucharcount                                   \
1498          && base + charid < ubound                                      \
1499          && state == trie->trans[ base - ucharcount + charid ].check    \
1500          && trie->trans[ base - ucharcount + charid ].next )            \
1501            ? trie->trans[ base - ucharcount + charid ].next             \
1502            : ( state==1 ? special : 0 )                                 \
1503       )
1504
1505 #define MADE_TRIE       1
1506 #define MADE_JUMP_TRIE  2
1507 #define MADE_EXACT_TRIE 4
1508
1509 STATIC I32
1510 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1511 {
1512     dVAR;
1513     /* first pass, loop through and scan words */
1514     reg_trie_data *trie;
1515     HV *widecharmap = NULL;
1516     AV *revcharmap = newAV();
1517     regnode *cur;
1518     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1519     STRLEN len = 0;
1520     UV uvc = 0;
1521     U16 curword = 0;
1522     U32 next_alloc = 0;
1523     regnode *jumper = NULL;
1524     regnode *nextbranch = NULL;
1525     regnode *convert = NULL;
1526     U32 *prev_states; /* temp array mapping each state to previous one */
1527     /* we just use folder as a flag in utf8 */
1528     const U8 * folder = NULL;
1529
1530 #ifdef DEBUGGING
1531     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1532     AV *trie_words = NULL;
1533     /* along with revcharmap, this only used during construction but both are
1534      * useful during debugging so we store them in the struct when debugging.
1535      */
1536 #else
1537     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1538     STRLEN trie_charcount=0;
1539 #endif
1540     SV *re_trie_maxbuff;
1541     GET_RE_DEBUG_FLAGS_DECL;
1542
1543     PERL_ARGS_ASSERT_MAKE_TRIE;
1544 #ifndef DEBUGGING
1545     PERL_UNUSED_ARG(depth);
1546 #endif
1547
1548     switch (flags) {
1549         case EXACT: break;
1550         case EXACTFA:
1551         case EXACTFU_SS:
1552         case EXACTFU_TRICKYFOLD:
1553         case EXACTFU: folder = PL_fold_latin1; break;
1554         case EXACTF:  folder = PL_fold; break;
1555         case EXACTFL: folder = PL_fold_locale; break;
1556         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1557     }
1558
1559     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1560     trie->refcount = 1;
1561     trie->startstate = 1;
1562     trie->wordcount = word_count;
1563     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1564     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1565     if (flags == EXACT)
1566         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1567     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1568                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1569
1570     DEBUG_r({
1571         trie_words = newAV();
1572     });
1573
1574     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1575     if (!SvIOK(re_trie_maxbuff)) {
1576         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1577     }
1578     DEBUG_TRIE_COMPILE_r({
1579                 PerlIO_printf( Perl_debug_log,
1580                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1581                   (int)depth * 2 + 2, "",
1582                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1583                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1584                   (int)depth);
1585     });
1586
1587    /* Find the node we are going to overwrite */
1588     if ( first == startbranch && OP( last ) != BRANCH ) {
1589         /* whole branch chain */
1590         convert = first;
1591     } else {
1592         /* branch sub-chain */
1593         convert = NEXTOPER( first );
1594     }
1595
1596     /*  -- First loop and Setup --
1597
1598        We first traverse the branches and scan each word to determine if it
1599        contains widechars, and how many unique chars there are, this is
1600        important as we have to build a table with at least as many columns as we
1601        have unique chars.
1602
1603        We use an array of integers to represent the character codes 0..255
1604        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1605        native representation of the character value as the key and IV's for the
1606        coded index.
1607
1608        *TODO* If we keep track of how many times each character is used we can
1609        remap the columns so that the table compression later on is more
1610        efficient in terms of memory by ensuring the most common value is in the
1611        middle and the least common are on the outside.  IMO this would be better
1612        than a most to least common mapping as theres a decent chance the most
1613        common letter will share a node with the least common, meaning the node
1614        will not be compressible. With a middle is most common approach the worst
1615        case is when we have the least common nodes twice.
1616
1617      */
1618
1619     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1620         regnode *noper = NEXTOPER( cur );
1621         const U8 *uc = (U8*)STRING( noper );
1622         const U8 *e  = uc + STR_LEN( noper );
1623         STRLEN foldlen = 0;
1624         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1625         STRLEN skiplen = 0;
1626         const U8 *scan = (U8*)NULL;
1627         U32 wordlen      = 0;         /* required init */
1628         STRLEN chars = 0;
1629         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1630
1631         if (OP(noper) == NOTHING) {
1632             regnode *noper_next= regnext(noper);
1633             if (noper_next != tail && OP(noper_next) == flags) {
1634                 noper = noper_next;
1635                 uc= (U8*)STRING(noper);
1636                 e= uc + STR_LEN(noper);
1637                 trie->minlen= STR_LEN(noper);
1638             } else {
1639                 trie->minlen= 0;
1640                 continue;
1641             }
1642         }
1643
1644         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1645             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1646                                           regardless of encoding */
1647             if (OP( noper ) == EXACTFU_SS) {
1648                 /* false positives are ok, so just set this */
1649                 TRIE_BITMAP_SET(trie,0xDF);
1650             }
1651         }
1652         for ( ; uc < e ; uc += len ) {
1653             TRIE_CHARCOUNT(trie)++;
1654             TRIE_READ_CHAR;
1655             chars++;
1656             if ( uvc < 256 ) {
1657                 if ( folder ) {
1658                     U8 folded= folder[ (U8) uvc ];
1659                     if ( !trie->charmap[ folded ] ) {
1660                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1661                         TRIE_STORE_REVCHAR( folded );
1662                     }
1663                 }
1664                 if ( !trie->charmap[ uvc ] ) {
1665                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1666                     TRIE_STORE_REVCHAR( uvc );
1667                 }
1668                 if ( set_bit ) {
1669                     /* store the codepoint in the bitmap, and its folded
1670                      * equivalent. */
1671                     TRIE_BITMAP_SET(trie, uvc);
1672
1673                     /* store the folded codepoint */
1674                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1675
1676                     if ( !UTF ) {
1677                         /* store first byte of utf8 representation of
1678                            variant codepoints */
1679                         if (! UNI_IS_INVARIANT(uvc)) {
1680                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1681                         }
1682                     }
1683                     set_bit = 0; /* We've done our bit :-) */
1684                 }
1685             } else {
1686                 SV** svpp;
1687                 if ( !widecharmap )
1688                     widecharmap = newHV();
1689
1690                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1691
1692                 if ( !svpp )
1693                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1694
1695                 if ( !SvTRUE( *svpp ) ) {
1696                     sv_setiv( *svpp, ++trie->uniquecharcount );
1697                     TRIE_STORE_REVCHAR(uvc);
1698                 }
1699             }
1700         }
1701         if( cur == first ) {
1702             trie->minlen = chars;
1703             trie->maxlen = chars;
1704         } else if (chars < trie->minlen) {
1705             trie->minlen = chars;
1706         } else if (chars > trie->maxlen) {
1707             trie->maxlen = chars;
1708         }
1709         if (OP( noper ) == EXACTFU_SS) {
1710             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1711             if (trie->minlen > 1)
1712                 trie->minlen= 1;
1713         }
1714         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1715             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1716              *                - We assume that any such sequence might match a 2 byte string */
1717             if (trie->minlen > 2 )
1718                 trie->minlen= 2;
1719         }
1720
1721     } /* end first pass */
1722     DEBUG_TRIE_COMPILE_r(
1723         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1724                 (int)depth * 2 + 2,"",
1725                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1726                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1727                 (int)trie->minlen, (int)trie->maxlen )
1728     );
1729
1730     /*
1731         We now know what we are dealing with in terms of unique chars and
1732         string sizes so we can calculate how much memory a naive
1733         representation using a flat table  will take. If it's over a reasonable
1734         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1735         conservative but potentially much slower representation using an array
1736         of lists.
1737
1738         At the end we convert both representations into the same compressed
1739         form that will be used in regexec.c for matching with. The latter
1740         is a form that cannot be used to construct with but has memory
1741         properties similar to the list form and access properties similar
1742         to the table form making it both suitable for fast searches and
1743         small enough that its feasable to store for the duration of a program.
1744
1745         See the comment in the code where the compressed table is produced
1746         inplace from the flat tabe representation for an explanation of how
1747         the compression works.
1748
1749     */
1750
1751
1752     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1753     prev_states[1] = 0;
1754
1755     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1756         /*
1757             Second Pass -- Array Of Lists Representation
1758
1759             Each state will be represented by a list of charid:state records
1760             (reg_trie_trans_le) the first such element holds the CUR and LEN
1761             points of the allocated array. (See defines above).
1762
1763             We build the initial structure using the lists, and then convert
1764             it into the compressed table form which allows faster lookups
1765             (but cant be modified once converted).
1766         */
1767
1768         STRLEN transcount = 1;
1769
1770         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1771             "%*sCompiling trie using list compiler\n",
1772             (int)depth * 2 + 2, ""));
1773
1774         trie->states = (reg_trie_state *)
1775             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1776                                   sizeof(reg_trie_state) );
1777         TRIE_LIST_NEW(1);
1778         next_alloc = 2;
1779
1780         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1781
1782             regnode *noper   = NEXTOPER( cur );
1783             U8 *uc           = (U8*)STRING( noper );
1784             const U8 *e      = uc + STR_LEN( noper );
1785             U32 state        = 1;         /* required init */
1786             U16 charid       = 0;         /* sanity init */
1787             U8 *scan         = (U8*)NULL; /* sanity init */
1788             STRLEN foldlen   = 0;         /* required init */
1789             U32 wordlen      = 0;         /* required init */
1790             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1791             STRLEN skiplen   = 0;
1792
1793             if (OP(noper) == NOTHING) {
1794                 regnode *noper_next= regnext(noper);
1795                 if (noper_next != tail && OP(noper_next) == flags) {
1796                     noper = noper_next;
1797                     uc= (U8*)STRING(noper);
1798                     e= uc + STR_LEN(noper);
1799                 }
1800             }
1801
1802             if (OP(noper) != NOTHING) {
1803                 for ( ; uc < e ; uc += len ) {
1804
1805                     TRIE_READ_CHAR;
1806
1807                     if ( uvc < 256 ) {
1808                         charid = trie->charmap[ uvc ];
1809                     } else {
1810                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1811                         if ( !svpp ) {
1812                             charid = 0;
1813                         } else {
1814                             charid=(U16)SvIV( *svpp );
1815                         }
1816                     }
1817                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1818                     if ( charid ) {
1819
1820                         U16 check;
1821                         U32 newstate = 0;
1822
1823                         charid--;
1824                         if ( !trie->states[ state ].trans.list ) {
1825                             TRIE_LIST_NEW( state );
1826                         }
1827                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1828                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1829                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1830                                 break;
1831                             }
1832                         }
1833                         if ( ! newstate ) {
1834                             newstate = next_alloc++;
1835                             prev_states[newstate] = state;
1836                             TRIE_LIST_PUSH( state, charid, newstate );
1837                             transcount++;
1838                         }
1839                         state = newstate;
1840                     } else {
1841                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1842                     }
1843                 }
1844             }
1845             TRIE_HANDLE_WORD(state);
1846
1847         } /* end second pass */
1848
1849         /* next alloc is the NEXT state to be allocated */
1850         trie->statecount = next_alloc;
1851         trie->states = (reg_trie_state *)
1852             PerlMemShared_realloc( trie->states,
1853                                    next_alloc
1854                                    * sizeof(reg_trie_state) );
1855
1856         /* and now dump it out before we compress it */
1857         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1858                                                          revcharmap, next_alloc,
1859                                                          depth+1)
1860         );
1861
1862         trie->trans = (reg_trie_trans *)
1863             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1864         {
1865             U32 state;
1866             U32 tp = 0;
1867             U32 zp = 0;
1868
1869
1870             for( state=1 ; state < next_alloc ; state ++ ) {
1871                 U32 base=0;
1872
1873                 /*
1874                 DEBUG_TRIE_COMPILE_MORE_r(
1875                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1876                 );
1877                 */
1878
1879                 if (trie->states[state].trans.list) {
1880                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1881                     U16 maxid=minid;
1882                     U16 idx;
1883
1884                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1885                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1886                         if ( forid < minid ) {
1887                             minid=forid;
1888                         } else if ( forid > maxid ) {
1889                             maxid=forid;
1890                         }
1891                     }
1892                     if ( transcount < tp + maxid - minid + 1) {
1893                         transcount *= 2;
1894                         trie->trans = (reg_trie_trans *)
1895                             PerlMemShared_realloc( trie->trans,
1896                                                      transcount
1897                                                      * sizeof(reg_trie_trans) );
1898                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1899                     }
1900                     base = trie->uniquecharcount + tp - minid;
1901                     if ( maxid == minid ) {
1902                         U32 set = 0;
1903                         for ( ; zp < tp ; zp++ ) {
1904                             if ( ! trie->trans[ zp ].next ) {
1905                                 base = trie->uniquecharcount + zp - minid;
1906                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1907                                 trie->trans[ zp ].check = state;
1908                                 set = 1;
1909                                 break;
1910                             }
1911                         }
1912                         if ( !set ) {
1913                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1914                             trie->trans[ tp ].check = state;
1915                             tp++;
1916                             zp = tp;
1917                         }
1918                     } else {
1919                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1920                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1921                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1922                             trie->trans[ tid ].check = state;
1923                         }
1924                         tp += ( maxid - minid + 1 );
1925                     }
1926                     Safefree(trie->states[ state ].trans.list);
1927                 }
1928                 /*
1929                 DEBUG_TRIE_COMPILE_MORE_r(
1930                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1931                 );
1932                 */
1933                 trie->states[ state ].trans.base=base;
1934             }
1935             trie->lasttrans = tp + 1;
1936         }
1937     } else {
1938         /*
1939            Second Pass -- Flat Table Representation.
1940
1941            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1942            We know that we will need Charcount+1 trans at most to store the data
1943            (one row per char at worst case) So we preallocate both structures
1944            assuming worst case.
1945
1946            We then construct the trie using only the .next slots of the entry
1947            structs.
1948
1949            We use the .check field of the first entry of the node temporarily to
1950            make compression both faster and easier by keeping track of how many non
1951            zero fields are in the node.
1952
1953            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1954            transition.
1955
1956            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1957            number representing the first entry of the node, and state as a
1958            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1959            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1960            are 2 entrys per node. eg:
1961
1962              A B       A B
1963           1. 2 4    1. 3 7
1964           2. 0 3    3. 0 5
1965           3. 0 0    5. 0 0
1966           4. 0 0    7. 0 0
1967
1968            The table is internally in the right hand, idx form. However as we also
1969            have to deal with the states array which is indexed by nodenum we have to
1970            use TRIE_NODENUM() to convert.
1971
1972         */
1973         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1974             "%*sCompiling trie using table compiler\n",
1975             (int)depth * 2 + 2, ""));
1976
1977         trie->trans = (reg_trie_trans *)
1978             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1979                                   * trie->uniquecharcount + 1,
1980                                   sizeof(reg_trie_trans) );
1981         trie->states = (reg_trie_state *)
1982             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1983                                   sizeof(reg_trie_state) );
1984         next_alloc = trie->uniquecharcount + 1;
1985
1986
1987         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1988
1989             regnode *noper   = NEXTOPER( cur );
1990             const U8 *uc     = (U8*)STRING( noper );
1991             const U8 *e      = uc + STR_LEN( noper );
1992
1993             U32 state        = 1;         /* required init */
1994
1995             U16 charid       = 0;         /* sanity init */
1996             U32 accept_state = 0;         /* sanity init */
1997             U8 *scan         = (U8*)NULL; /* sanity init */
1998
1999             STRLEN foldlen   = 0;         /* required init */
2000             U32 wordlen      = 0;         /* required init */
2001             STRLEN skiplen   = 0;
2002             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2003
2004             if (OP(noper) == NOTHING) {
2005                 regnode *noper_next= regnext(noper);
2006                 if (noper_next != tail && OP(noper_next) == flags) {
2007                     noper = noper_next;
2008                     uc= (U8*)STRING(noper);
2009                     e= uc + STR_LEN(noper);
2010                 }
2011             }
2012
2013             if ( OP(noper) != NOTHING ) {
2014                 for ( ; uc < e ; uc += len ) {
2015
2016                     TRIE_READ_CHAR;
2017
2018                     if ( uvc < 256 ) {
2019                         charid = trie->charmap[ uvc ];
2020                     } else {
2021                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2022                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2023                     }
2024                     if ( charid ) {
2025                         charid--;
2026                         if ( !trie->trans[ state + charid ].next ) {
2027                             trie->trans[ state + charid ].next = next_alloc;
2028                             trie->trans[ state ].check++;
2029                             prev_states[TRIE_NODENUM(next_alloc)]
2030                                     = TRIE_NODENUM(state);
2031                             next_alloc += trie->uniquecharcount;
2032                         }
2033                         state = trie->trans[ state + charid ].next;
2034                     } else {
2035                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2036                     }
2037                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2038                 }
2039             }
2040             accept_state = TRIE_NODENUM( state );
2041             TRIE_HANDLE_WORD(accept_state);
2042
2043         } /* end second pass */
2044
2045         /* and now dump it out before we compress it */
2046         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2047                                                           revcharmap,
2048                                                           next_alloc, depth+1));
2049
2050         {
2051         /*
2052            * Inplace compress the table.*
2053
2054            For sparse data sets the table constructed by the trie algorithm will
2055            be mostly 0/FAIL transitions or to put it another way mostly empty.
2056            (Note that leaf nodes will not contain any transitions.)
2057
2058            This algorithm compresses the tables by eliminating most such
2059            transitions, at the cost of a modest bit of extra work during lookup:
2060
2061            - Each states[] entry contains a .base field which indicates the
2062            index in the state[] array wheres its transition data is stored.
2063
2064            - If .base is 0 there are no valid transitions from that node.
2065
2066            - If .base is nonzero then charid is added to it to find an entry in
2067            the trans array.
2068
2069            -If trans[states[state].base+charid].check!=state then the
2070            transition is taken to be a 0/Fail transition. Thus if there are fail
2071            transitions at the front of the node then the .base offset will point
2072            somewhere inside the previous nodes data (or maybe even into a node
2073            even earlier), but the .check field determines if the transition is
2074            valid.
2075
2076            XXX - wrong maybe?
2077            The following process inplace converts the table to the compressed
2078            table: We first do not compress the root node 1,and mark all its
2079            .check pointers as 1 and set its .base pointer as 1 as well. This
2080            allows us to do a DFA construction from the compressed table later,
2081            and ensures that any .base pointers we calculate later are greater
2082            than 0.
2083
2084            - We set 'pos' to indicate the first entry of the second node.
2085
2086            - We then iterate over the columns of the node, finding the first and
2087            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2088            and set the .check pointers accordingly, and advance pos
2089            appropriately and repreat for the next node. Note that when we copy
2090            the next pointers we have to convert them from the original
2091            NODEIDX form to NODENUM form as the former is not valid post
2092            compression.
2093
2094            - If a node has no transitions used we mark its base as 0 and do not
2095            advance the pos pointer.
2096
2097            - If a node only has one transition we use a second pointer into the
2098            structure to fill in allocated fail transitions from other states.
2099            This pointer is independent of the main pointer and scans forward
2100            looking for null transitions that are allocated to a state. When it
2101            finds one it writes the single transition into the "hole".  If the
2102            pointer doesnt find one the single transition is appended as normal.
2103
2104            - Once compressed we can Renew/realloc the structures to release the
2105            excess space.
2106
2107            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2108            specifically Fig 3.47 and the associated pseudocode.
2109
2110            demq
2111         */
2112         const U32 laststate = TRIE_NODENUM( next_alloc );
2113         U32 state, charid;
2114         U32 pos = 0, zp=0;
2115         trie->statecount = laststate;
2116
2117         for ( state = 1 ; state < laststate ; state++ ) {
2118             U8 flag = 0;
2119             const U32 stateidx = TRIE_NODEIDX( state );
2120             const U32 o_used = trie->trans[ stateidx ].check;
2121             U32 used = trie->trans[ stateidx ].check;
2122             trie->trans[ stateidx ].check = 0;
2123
2124             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2125                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2126                     if ( trie->trans[ stateidx + charid ].next ) {
2127                         if (o_used == 1) {
2128                             for ( ; zp < pos ; zp++ ) {
2129                                 if ( ! trie->trans[ zp ].next ) {
2130                                     break;
2131                                 }
2132                             }
2133                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2134                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2135                             trie->trans[ zp ].check = state;
2136                             if ( ++zp > pos ) pos = zp;
2137                             break;
2138                         }
2139                         used--;
2140                     }
2141                     if ( !flag ) {
2142                         flag = 1;
2143                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2144                     }
2145                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2146                     trie->trans[ pos ].check = state;
2147                     pos++;
2148                 }
2149             }
2150         }
2151         trie->lasttrans = pos + 1;
2152         trie->states = (reg_trie_state *)
2153             PerlMemShared_realloc( trie->states, laststate
2154                                    * sizeof(reg_trie_state) );
2155         DEBUG_TRIE_COMPILE_MORE_r(
2156                 PerlIO_printf( Perl_debug_log,
2157                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2158                     (int)depth * 2 + 2,"",
2159                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2160                     (IV)next_alloc,
2161                     (IV)pos,
2162                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2163             );
2164
2165         } /* end table compress */
2166     }
2167     DEBUG_TRIE_COMPILE_MORE_r(
2168             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2169                 (int)depth * 2 + 2, "",
2170                 (UV)trie->statecount,
2171                 (UV)trie->lasttrans)
2172     );
2173     /* resize the trans array to remove unused space */
2174     trie->trans = (reg_trie_trans *)
2175         PerlMemShared_realloc( trie->trans, trie->lasttrans
2176                                * sizeof(reg_trie_trans) );
2177
2178     {   /* Modify the program and insert the new TRIE node */
2179         U8 nodetype =(U8)(flags & 0xFF);
2180         char *str=NULL;
2181
2182 #ifdef DEBUGGING
2183         regnode *optimize = NULL;
2184 #ifdef RE_TRACK_PATTERN_OFFSETS
2185
2186         U32 mjd_offset = 0;
2187         U32 mjd_nodelen = 0;
2188 #endif /* RE_TRACK_PATTERN_OFFSETS */
2189 #endif /* DEBUGGING */
2190         /*
2191            This means we convert either the first branch or the first Exact,
2192            depending on whether the thing following (in 'last') is a branch
2193            or not and whther first is the startbranch (ie is it a sub part of
2194            the alternation or is it the whole thing.)
2195            Assuming its a sub part we convert the EXACT otherwise we convert
2196            the whole branch sequence, including the first.
2197          */
2198         /* Find the node we are going to overwrite */
2199         if ( first != startbranch || OP( last ) == BRANCH ) {
2200             /* branch sub-chain */
2201             NEXT_OFF( first ) = (U16)(last - first);
2202 #ifdef RE_TRACK_PATTERN_OFFSETS
2203             DEBUG_r({
2204                 mjd_offset= Node_Offset((convert));
2205                 mjd_nodelen= Node_Length((convert));
2206             });
2207 #endif
2208             /* whole branch chain */
2209         }
2210 #ifdef RE_TRACK_PATTERN_OFFSETS
2211         else {
2212             DEBUG_r({
2213                 const  regnode *nop = NEXTOPER( convert );
2214                 mjd_offset= Node_Offset((nop));
2215                 mjd_nodelen= Node_Length((nop));
2216             });
2217         }
2218         DEBUG_OPTIMISE_r(
2219             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2220                 (int)depth * 2 + 2, "",
2221                 (UV)mjd_offset, (UV)mjd_nodelen)
2222         );
2223 #endif
2224         /* But first we check to see if there is a common prefix we can
2225            split out as an EXACT and put in front of the TRIE node.  */
2226         trie->startstate= 1;
2227         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2228             U32 state;
2229             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2230                 U32 ofs = 0;
2231                 I32 idx = -1;
2232                 U32 count = 0;
2233                 const U32 base = trie->states[ state ].trans.base;
2234
2235                 if ( trie->states[state].wordnum )
2236                         count = 1;
2237
2238                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2239                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2240                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2241                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2242                     {
2243                         if ( ++count > 1 ) {
2244                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2245                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2246                             if ( state == 1 ) break;
2247                             if ( count == 2 ) {
2248                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2249                                 DEBUG_OPTIMISE_r(
2250                                     PerlIO_printf(Perl_debug_log,
2251                                         "%*sNew Start State=%"UVuf" Class: [",
2252                                         (int)depth * 2 + 2, "",
2253                                         (UV)state));
2254                                 if (idx >= 0) {
2255                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2256                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2257
2258                                     TRIE_BITMAP_SET(trie,*ch);
2259                                     if ( folder )
2260                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2261                                     DEBUG_OPTIMISE_r(
2262                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2263                                     );
2264                                 }
2265                             }
2266                             TRIE_BITMAP_SET(trie,*ch);
2267                             if ( folder )
2268                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2269                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2270                         }
2271                         idx = ofs;
2272                     }
2273                 }
2274                 if ( count == 1 ) {
2275                     SV **tmp = av_fetch( revcharmap, idx, 0);
2276                     STRLEN len;
2277                     char *ch = SvPV( *tmp, len );
2278                     DEBUG_OPTIMISE_r({
2279                         SV *sv=sv_newmortal();
2280                         PerlIO_printf( Perl_debug_log,
2281                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2282                             (int)depth * 2 + 2, "",
2283                             (UV)state, (UV)idx,
2284                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2285                                 PL_colors[0], PL_colors[1],
2286                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2287                                 PERL_PV_ESCAPE_FIRSTCHAR
2288                             )
2289                         );
2290                     });
2291                     if ( state==1 ) {
2292                         OP( convert ) = nodetype;
2293                         str=STRING(convert);
2294                         STR_LEN(convert)=0;
2295                     }
2296                     STR_LEN(convert) += len;
2297                     while (len--)
2298                         *str++ = *ch++;
2299                 } else {
2300 #ifdef DEBUGGING
2301                     if (state>1)
2302                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2303 #endif
2304                     break;
2305                 }
2306             }
2307             trie->prefixlen = (state-1);
2308             if (str) {
2309                 regnode *n = convert+NODE_SZ_STR(convert);
2310                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2311                 trie->startstate = state;
2312                 trie->minlen -= (state - 1);
2313                 trie->maxlen -= (state - 1);
2314 #ifdef DEBUGGING
2315                /* At least the UNICOS C compiler choked on this
2316                 * being argument to DEBUG_r(), so let's just have
2317                 * it right here. */
2318                if (
2319 #ifdef PERL_EXT_RE_BUILD
2320                    1
2321 #else
2322                    DEBUG_r_TEST
2323 #endif
2324                    ) {
2325                    regnode *fix = convert;
2326                    U32 word = trie->wordcount;
2327                    mjd_nodelen++;
2328                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2329                    while( ++fix < n ) {
2330                        Set_Node_Offset_Length(fix, 0, 0);
2331                    }
2332                    while (word--) {
2333                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2334                        if (tmp) {
2335                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2336                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2337                            else
2338                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2339                        }
2340                    }
2341                }
2342 #endif
2343                 if (trie->maxlen) {
2344                     convert = n;
2345                 } else {
2346                     NEXT_OFF(convert) = (U16)(tail - convert);
2347                     DEBUG_r(optimize= n);
2348                 }
2349             }
2350         }
2351         if (!jumper)
2352             jumper = last;
2353         if ( trie->maxlen ) {
2354             NEXT_OFF( convert ) = (U16)(tail - convert);
2355             ARG_SET( convert, data_slot );
2356             /* Store the offset to the first unabsorbed branch in
2357                jump[0], which is otherwise unused by the jump logic.
2358                We use this when dumping a trie and during optimisation. */
2359             if (trie->jump)
2360                 trie->jump[0] = (U16)(nextbranch - convert);
2361
2362             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2363              *   and there is a bitmap
2364              *   and the first "jump target" node we found leaves enough room
2365              * then convert the TRIE node into a TRIEC node, with the bitmap
2366              * embedded inline in the opcode - this is hypothetically faster.
2367              */
2368             if ( !trie->states[trie->startstate].wordnum
2369                  && trie->bitmap
2370                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2371             {
2372                 OP( convert ) = TRIEC;
2373                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2374                 PerlMemShared_free(trie->bitmap);
2375                 trie->bitmap= NULL;
2376             } else
2377                 OP( convert ) = TRIE;
2378
2379             /* store the type in the flags */
2380             convert->flags = nodetype;
2381             DEBUG_r({
2382             optimize = convert
2383                       + NODE_STEP_REGNODE
2384                       + regarglen[ OP( convert ) ];
2385             });
2386             /* XXX We really should free up the resource in trie now,
2387                    as we won't use them - (which resources?) dmq */
2388         }
2389         /* needed for dumping*/
2390         DEBUG_r(if (optimize) {
2391             regnode *opt = convert;
2392
2393             while ( ++opt < optimize) {
2394                 Set_Node_Offset_Length(opt,0,0);
2395             }
2396             /*
2397                 Try to clean up some of the debris left after the
2398                 optimisation.
2399              */
2400             while( optimize < jumper ) {
2401                 mjd_nodelen += Node_Length((optimize));
2402                 OP( optimize ) = OPTIMIZED;
2403                 Set_Node_Offset_Length(optimize,0,0);
2404                 optimize++;
2405             }
2406             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2407         });
2408     } /* end node insert */
2409
2410     /*  Finish populating the prev field of the wordinfo array.  Walk back
2411      *  from each accept state until we find another accept state, and if
2412      *  so, point the first word's .prev field at the second word. If the
2413      *  second already has a .prev field set, stop now. This will be the
2414      *  case either if we've already processed that word's accept state,
2415      *  or that state had multiple words, and the overspill words were
2416      *  already linked up earlier.
2417      */
2418     {
2419         U16 word;
2420         U32 state;
2421         U16 prev;
2422
2423         for (word=1; word <= trie->wordcount; word++) {
2424             prev = 0;
2425             if (trie->wordinfo[word].prev)
2426                 continue;
2427             state = trie->wordinfo[word].accept;
2428             while (state) {
2429                 state = prev_states[state];
2430                 if (!state)
2431                     break;
2432                 prev = trie->states[state].wordnum;
2433                 if (prev)
2434                     break;
2435             }
2436             trie->wordinfo[word].prev = prev;
2437         }
2438         Safefree(prev_states);
2439     }
2440
2441
2442     /* and now dump out the compressed format */
2443     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2444
2445     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2446 #ifdef DEBUGGING
2447     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2448     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2449 #else
2450     SvREFCNT_dec(revcharmap);
2451 #endif
2452     return trie->jump
2453            ? MADE_JUMP_TRIE
2454            : trie->startstate>1
2455              ? MADE_EXACT_TRIE
2456              : MADE_TRIE;
2457 }
2458
2459 STATIC void
2460 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2461 {
2462 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2463
2464    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2465    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2466    ISBN 0-201-10088-6
2467
2468    We find the fail state for each state in the trie, this state is the longest proper
2469    suffix of the current state's 'word' that is also a proper prefix of another word in our
2470    trie. State 1 represents the word '' and is thus the default fail state. This allows
2471    the DFA not to have to restart after its tried and failed a word at a given point, it
2472    simply continues as though it had been matching the other word in the first place.
2473    Consider
2474       'abcdgu'=~/abcdefg|cdgu/
2475    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2476    fail, which would bring us to the state representing 'd' in the second word where we would
2477    try 'g' and succeed, proceeding to match 'cdgu'.
2478  */
2479  /* add a fail transition */
2480     const U32 trie_offset = ARG(source);
2481     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2482     U32 *q;
2483     const U32 ucharcount = trie->uniquecharcount;
2484     const U32 numstates = trie->statecount;
2485     const U32 ubound = trie->lasttrans + ucharcount;
2486     U32 q_read = 0;
2487     U32 q_write = 0;
2488     U32 charid;
2489     U32 base = trie->states[ 1 ].trans.base;
2490     U32 *fail;
2491     reg_ac_data *aho;
2492     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2493     GET_RE_DEBUG_FLAGS_DECL;
2494
2495     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2496 #ifndef DEBUGGING
2497     PERL_UNUSED_ARG(depth);
2498 #endif
2499
2500
2501     ARG_SET( stclass, data_slot );
2502     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2503     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2504     aho->trie=trie_offset;
2505     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2506     Copy( trie->states, aho->states, numstates, reg_trie_state );
2507     Newxz( q, numstates, U32);
2508     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2509     aho->refcount = 1;
2510     fail = aho->fail;
2511     /* initialize fail[0..1] to be 1 so that we always have
2512        a valid final fail state */
2513     fail[ 0 ] = fail[ 1 ] = 1;
2514
2515     for ( charid = 0; charid < ucharcount ; charid++ ) {
2516         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2517         if ( newstate ) {
2518             q[ q_write ] = newstate;
2519             /* set to point at the root */
2520             fail[ q[ q_write++ ] ]=1;
2521         }
2522     }
2523     while ( q_read < q_write) {
2524         const U32 cur = q[ q_read++ % numstates ];
2525         base = trie->states[ cur ].trans.base;
2526
2527         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2528             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2529             if (ch_state) {
2530                 U32 fail_state = cur;
2531                 U32 fail_base;
2532                 do {
2533                     fail_state = fail[ fail_state ];
2534                     fail_base = aho->states[ fail_state ].trans.base;
2535                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2536
2537                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2538                 fail[ ch_state ] = fail_state;
2539                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2540                 {
2541                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2542                 }
2543                 q[ q_write++ % numstates] = ch_state;
2544             }
2545         }
2546     }
2547     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2548        when we fail in state 1, this allows us to use the
2549        charclass scan to find a valid start char. This is based on the principle
2550        that theres a good chance the string being searched contains lots of stuff
2551        that cant be a start char.
2552      */
2553     fail[ 0 ] = fail[ 1 ] = 0;
2554     DEBUG_TRIE_COMPILE_r({
2555         PerlIO_printf(Perl_debug_log,
2556                       "%*sStclass Failtable (%"UVuf" states): 0",
2557                       (int)(depth * 2), "", (UV)numstates
2558         );
2559         for( q_read=1; q_read<numstates; q_read++ ) {
2560             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2561         }
2562         PerlIO_printf(Perl_debug_log, "\n");
2563     });
2564     Safefree(q);
2565     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2566 }
2567
2568
2569 /*
2570  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2571  * These need to be revisited when a newer toolchain becomes available.
2572  */
2573 #if defined(__sparc64__) && defined(__GNUC__)
2574 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2575 #       undef  SPARC64_GCC_WORKAROUND
2576 #       define SPARC64_GCC_WORKAROUND 1
2577 #   endif
2578 #endif
2579
2580 #define DEBUG_PEEP(str,scan,depth) \
2581     DEBUG_OPTIMISE_r({if (scan){ \
2582        SV * const mysv=sv_newmortal(); \
2583        regnode *Next = regnext(scan); \
2584        regprop(RExC_rx, mysv, scan); \
2585        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2586        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2587        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2588    }});
2589
2590
2591 /* The below joins as many adjacent EXACTish nodes as possible into a single
2592  * one.  The regop may be changed if the node(s) contain certain sequences that
2593  * require special handling.  The joining is only done if:
2594  * 1) there is room in the current conglomerated node to entirely contain the
2595  *    next one.
2596  * 2) they are the exact same node type
2597  *
2598  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2599  * these get optimized out
2600  *
2601  * If a node is to match under /i (folded), the number of characters it matches
2602  * can be different than its character length if it contains a multi-character
2603  * fold.  *min_subtract is set to the total delta of the input nodes.
2604  *
2605  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2606  * and contains LATIN SMALL LETTER SHARP S
2607  *
2608  * This is as good a place as any to discuss the design of handling these
2609  * multi-character fold sequences.  It's been wrong in Perl for a very long
2610  * time.  There are three code points in Unicode whose multi-character folds
2611  * were long ago discovered to mess things up.  The previous designs for
2612  * dealing with these involved assigning a special node for them.  This
2613  * approach doesn't work, as evidenced by this example:
2614  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2615  * Both these fold to "sss", but if the pattern is parsed to create a node that
2616  * would match just the \xDF, it won't be able to handle the case where a
2617  * successful match would have to cross the node's boundary.  The new approach
2618  * that hopefully generally solves the problem generates an EXACTFU_SS node
2619  * that is "sss".
2620  *
2621  * It turns out that there are problems with all multi-character folds, and not
2622  * just these three.  Now the code is general, for all such cases, but the
2623  * three still have some special handling.  The approach taken is:
2624  * 1)   This routine examines each EXACTFish node that could contain multi-
2625  *      character fold sequences.  It returns in *min_subtract how much to
2626  *      subtract from the the actual length of the string to get a real minimum
2627  *      match length; it is 0 if there are no multi-char folds.  This delta is
2628  *      used by the caller to adjust the min length of the match, and the delta
2629  *      between min and max, so that the optimizer doesn't reject these
2630  *      possibilities based on size constraints.
2631  * 2)   Certain of these sequences require special handling by the trie code,
2632  *      so, if found, this code changes the joined node type to special ops:
2633  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2634  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2635  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2636  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2637  *      there is a possible fold length change.  That means that a regular
2638  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2639  *      with length changes, and so can be processed faster.  regexec.c takes
2640  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2641  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2642  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2643  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2644  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2645  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2646  *      possibilities for the non-UTF8 patterns are quite simple, except for
2647  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2648  *      members of a fold-pair, and arrays are set up for all of them so that
2649  *      the other member of the pair can be found quickly.  Code elsewhere in
2650  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2651  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2652  *      described in the next item.
2653  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2654  *      'ss' or not is not knowable at compile time.  It will match iff the
2655  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2656  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2657  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2658  *      described in item 3).  An assumption that the optimizer part of
2659  *      regexec.c (probably unwittingly) makes is that a character in the
2660  *      pattern corresponds to at most a single character in the target string.
2661  *      (And I do mean character, and not byte here, unlike other parts of the
2662  *      documentation that have never been updated to account for multibyte
2663  *      Unicode.)  This assumption is wrong only in this case, as all other
2664  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2665  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2666  *      reluctant to try to change this assumption, so instead the code punts.
2667  *      This routine examines EXACTF nodes for the sharp s, and returns a
2668  *      boolean indicating whether or not the node is an EXACTF node that
2669  *      contains a sharp s.  When it is true, the caller sets a flag that later
2670  *      causes the optimizer in this file to not set values for the floating
2671  *      and fixed string lengths, and thus avoids the optimizer code in
2672  *      regexec.c that makes the invalid assumption.  Thus, there is no
2673  *      optimization based on string lengths for EXACTF nodes that contain the
2674  *      sharp s.  This only happens for /id rules (which means the pattern
2675  *      isn't in UTF-8).
2676  */
2677
2678 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2679     if (PL_regkind[OP(scan)] == EXACT) \
2680         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2681
2682 STATIC U32
2683 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2684     /* Merge several consecutive EXACTish nodes into one. */
2685     regnode *n = regnext(scan);
2686     U32 stringok = 1;
2687     regnode *next = scan + NODE_SZ_STR(scan);
2688     U32 merged = 0;
2689     U32 stopnow = 0;
2690 #ifdef DEBUGGING
2691     regnode *stop = scan;
2692     GET_RE_DEBUG_FLAGS_DECL;
2693 #else
2694     PERL_UNUSED_ARG(depth);
2695 #endif
2696
2697     PERL_ARGS_ASSERT_JOIN_EXACT;
2698 #ifndef EXPERIMENTAL_INPLACESCAN
2699     PERL_UNUSED_ARG(flags);
2700     PERL_UNUSED_ARG(val);
2701 #endif
2702     DEBUG_PEEP("join",scan,depth);
2703
2704     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2705      * EXACT ones that are mergeable to the current one. */
2706     while (n
2707            && (PL_regkind[OP(n)] == NOTHING
2708                || (stringok && OP(n) == OP(scan)))
2709            && NEXT_OFF(n)
2710            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2711     {
2712
2713         if (OP(n) == TAIL || n > next)
2714             stringok = 0;
2715         if (PL_regkind[OP(n)] == NOTHING) {
2716             DEBUG_PEEP("skip:",n,depth);
2717             NEXT_OFF(scan) += NEXT_OFF(n);
2718             next = n + NODE_STEP_REGNODE;
2719 #ifdef DEBUGGING
2720             if (stringok)
2721                 stop = n;
2722 #endif
2723             n = regnext(n);
2724         }
2725         else if (stringok) {
2726             const unsigned int oldl = STR_LEN(scan);
2727             regnode * const nnext = regnext(n);
2728
2729             /* XXX I (khw) kind of doubt that this works on platforms where
2730              * U8_MAX is above 255 because of lots of other assumptions */
2731             if (oldl + STR_LEN(n) > U8_MAX)
2732                 break;
2733
2734             DEBUG_PEEP("merg",n,depth);
2735             merged++;
2736
2737             NEXT_OFF(scan) += NEXT_OFF(n);
2738             STR_LEN(scan) += STR_LEN(n);
2739             next = n + NODE_SZ_STR(n);
2740             /* Now we can overwrite *n : */
2741             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2742 #ifdef DEBUGGING
2743             stop = next - 1;
2744 #endif
2745             n = nnext;
2746             if (stopnow) break;
2747         }
2748
2749 #ifdef EXPERIMENTAL_INPLACESCAN
2750         if (flags && !NEXT_OFF(n)) {
2751             DEBUG_PEEP("atch", val, depth);
2752             if (reg_off_by_arg[OP(n)]) {
2753                 ARG_SET(n, val - n);
2754             }
2755             else {
2756                 NEXT_OFF(n) = val - n;
2757             }
2758             stopnow = 1;
2759         }
2760 #endif
2761     }
2762
2763     *min_subtract = 0;
2764     *has_exactf_sharp_s = FALSE;
2765
2766     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2767      * can now analyze for sequences of problematic code points.  (Prior to
2768      * this final joining, sequences could have been split over boundaries, and
2769      * hence missed).  The sequences only happen in folding, hence for any
2770      * non-EXACT EXACTish node */
2771     if (OP(scan) != EXACT) {
2772         const U8 * const s0 = (U8*) STRING(scan);
2773         const U8 * s = s0;
2774         const U8 * const s_end = s0 + STR_LEN(scan);
2775
2776         /* One pass is made over the node's string looking for all the
2777          * possibilities.  to avoid some tests in the loop, there are two main
2778          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2779          * non-UTF-8 */
2780         if (UTF) {
2781
2782             /* Examine the string for a multi-character fold sequence.  UTF-8
2783              * patterns have all characters pre-folded by the time this code is
2784              * executed */
2785             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2786                                      length sequence we are looking for is 2 */
2787             {
2788                 int count = 0;
2789                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2790                 if (! len) {    /* Not a multi-char fold: get next char */
2791                     s += UTF8SKIP(s);
2792                     continue;
2793                 }
2794
2795                 /* Nodes with 'ss' require special handling, except for EXACTFL
2796                  * and EXACTFA for which there is no multi-char fold to this */
2797                 if (len == 2 && *s == 's' && *(s+1) == 's'
2798                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2799                 {
2800                     count = 2;
2801                     OP(scan) = EXACTFU_SS;
2802                     s += 2;
2803                 }
2804                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2805                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2806                                       COMBINING_DIAERESIS_UTF8
2807                                       COMBINING_ACUTE_ACCENT_UTF8,
2808                                    6)
2809                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2810                                          COMBINING_DIAERESIS_UTF8
2811                                          COMBINING_ACUTE_ACCENT_UTF8,
2812                                      6)))
2813                 {
2814                     count = 3;
2815
2816                     /* These two folds require special handling by trie's, so
2817                      * change the node type to indicate this.  If EXACTFA and
2818                      * EXACTFL were ever to be handled by trie's, this would
2819                      * have to be changed.  If this node has already been
2820                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2821                      * (khw) think it doesn't matter in regexec.c for UTF
2822                      * patterns, but no need to change it */
2823                     if (OP(scan) == EXACTFU) {
2824                         OP(scan) = EXACTFU_TRICKYFOLD;
2825                     }
2826                     s += 6;
2827                 }
2828                 else { /* Here is a generic multi-char fold. */
2829                     const U8* multi_end  = s + len;
2830
2831                     /* Count how many characters in it.  In the case of /l and
2832                      * /aa, no folds which contain ASCII code points are
2833                      * allowed, so check for those, and skip if found.  (In
2834                      * EXACTFL, no folds are allowed to any Latin1 code point,
2835                      * not just ASCII.  But there aren't any of these
2836                      * currently, nor ever likely, so don't take the time to
2837                      * test for them.  The code that generates the
2838                      * is_MULTI_foo() macros croaks should one actually get put
2839                      * into Unicode .) */
2840                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2841                         count = utf8_length(s, multi_end);
2842                         s = multi_end;
2843                     }
2844                     else {
2845                         while (s < multi_end) {
2846                             if (isASCII(*s)) {
2847                                 s++;
2848                                 goto next_iteration;
2849                             }
2850                             else {
2851                                 s += UTF8SKIP(s);
2852                             }
2853                             count++;
2854                         }
2855                     }
2856                 }
2857
2858                 /* The delta is how long the sequence is minus 1 (1 is how long
2859                  * the character that folds to the sequence is) */
2860                 *min_subtract += count - 1;
2861             next_iteration: ;
2862             }
2863         }
2864         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2865
2866             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2867              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2868              * nodes can't have multi-char folds to this range (and there are
2869              * no existing ones in the upper latin1 range).  In the EXACTF
2870              * case we look also for the sharp s, which can be in the final
2871              * position.  Otherwise we can stop looking 1 byte earlier because
2872              * have to find at least two characters for a multi-fold */
2873             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2874
2875             /* The below is perhaps overboard, but this allows us to save a
2876              * test each time through the loop at the expense of a mask.  This
2877              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2878              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2879              * are 64.  This uses an exclusive 'or' to find that bit and then
2880              * inverts it to form a mask, with just a single 0, in the bit
2881              * position where 'S' and 's' differ. */
2882             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2883             const U8 s_masked = 's' & S_or_s_mask;
2884
2885             while (s < upper) {
2886                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2887                 if (! len) {    /* Not a multi-char fold. */
2888                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2889                     {
2890                         *has_exactf_sharp_s = TRUE;
2891                     }
2892                     s++;
2893                     continue;
2894                 }
2895
2896                 if (len == 2
2897                     && ((*s & S_or_s_mask) == s_masked)
2898                     && ((*(s+1) & S_or_s_mask) == s_masked))
2899                 {
2900
2901                     /* EXACTF nodes need to know that the minimum length
2902                      * changed so that a sharp s in the string can match this
2903                      * ss in the pattern, but they remain EXACTF nodes, as they
2904                      * won't match this unless the target string is is UTF-8,
2905                      * which we don't know until runtime */
2906                     if (OP(scan) != EXACTF) {
2907                         OP(scan) = EXACTFU_SS;
2908                     }
2909                 }
2910
2911                 *min_subtract += len - 1;
2912                 s += len;
2913             }
2914         }
2915     }
2916
2917 #ifdef DEBUGGING
2918     /* Allow dumping but overwriting the collection of skipped
2919      * ops and/or strings with fake optimized ops */
2920     n = scan + NODE_SZ_STR(scan);
2921     while (n <= stop) {
2922         OP(n) = OPTIMIZED;
2923         FLAGS(n) = 0;
2924         NEXT_OFF(n) = 0;
2925         n++;
2926     }
2927 #endif
2928     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2929     return stopnow;
2930 }
2931
2932 /* REx optimizer.  Converts nodes into quicker variants "in place".
2933    Finds fixed substrings.  */
2934
2935 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2936    to the position after last scanned or to NULL. */
2937
2938 #define INIT_AND_WITHP \
2939     assert(!and_withp); \
2940     Newx(and_withp,1,struct regnode_charclass_class); \
2941     SAVEFREEPV(and_withp)
2942
2943 /* this is a chain of data about sub patterns we are processing that
2944    need to be handled separately/specially in study_chunk. Its so
2945    we can simulate recursion without losing state.  */
2946 struct scan_frame;
2947 typedef struct scan_frame {
2948     regnode *last;  /* last node to process in this frame */
2949     regnode *next;  /* next node to process when last is reached */
2950     struct scan_frame *prev; /*previous frame*/
2951     I32 stop; /* what stopparen do we use */
2952 } scan_frame;
2953
2954
2955 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2956
2957 #define CASE_SYNST_FNC(nAmE)                                       \
2958 case nAmE:                                                         \
2959     if (flags & SCF_DO_STCLASS_AND) {                              \
2960             for (value = 0; value < 256; value++)                  \
2961                 if (!is_ ## nAmE ## _cp(value))                       \
2962                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2963     }                                                              \
2964     else {                                                         \
2965             for (value = 0; value < 256; value++)                  \
2966                 if (is_ ## nAmE ## _cp(value))                        \
2967                     ANYOF_BITMAP_SET(data->start_class, value);    \
2968     }                                                              \
2969     break;                                                         \
2970 case N ## nAmE:                                                    \
2971     if (flags & SCF_DO_STCLASS_AND) {                              \
2972             for (value = 0; value < 256; value++)                   \
2973                 if (is_ ## nAmE ## _cp(value))                         \
2974                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2975     }                                                               \
2976     else {                                                          \
2977             for (value = 0; value < 256; value++)                   \
2978                 if (!is_ ## nAmE ## _cp(value))                        \
2979                     ANYOF_BITMAP_SET(data->start_class, value);     \
2980     }                                                               \
2981     break
2982
2983
2984
2985 STATIC I32
2986 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2987                         I32 *minlenp, I32 *deltap,
2988                         regnode *last,
2989                         scan_data_t *data,
2990                         I32 stopparen,
2991                         U8* recursed,
2992                         struct regnode_charclass_class *and_withp,
2993                         U32 flags, U32 depth)
2994                         /* scanp: Start here (read-write). */
2995                         /* deltap: Write maxlen-minlen here. */
2996                         /* last: Stop before this one. */
2997                         /* data: string data about the pattern */
2998                         /* stopparen: treat close N as END */
2999                         /* recursed: which subroutines have we recursed into */
3000                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3001 {
3002     dVAR;
3003     I32 min = 0;    /* There must be at least this number of characters to match */
3004     I32 pars = 0, code;
3005     regnode *scan = *scanp, *next;
3006     I32 delta = 0;
3007     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3008     int is_inf_internal = 0;            /* The studied chunk is infinite */
3009     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3010     scan_data_t data_fake;
3011     SV *re_trie_maxbuff = NULL;
3012     regnode *first_non_open = scan;
3013     I32 stopmin = I32_MAX;
3014     scan_frame *frame = NULL;
3015     GET_RE_DEBUG_FLAGS_DECL;
3016
3017     PERL_ARGS_ASSERT_STUDY_CHUNK;
3018
3019 #ifdef DEBUGGING
3020     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3021 #endif
3022
3023     if ( depth == 0 ) {
3024         while (first_non_open && OP(first_non_open) == OPEN)
3025             first_non_open=regnext(first_non_open);
3026     }
3027
3028
3029   fake_study_recurse:
3030     while ( scan && OP(scan) != END && scan < last ){
3031         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3032                                    node length to get a real minimum (because
3033                                    the folded version may be shorter) */
3034         bool has_exactf_sharp_s = FALSE;
3035         /* Peephole optimizer: */
3036         DEBUG_STUDYDATA("Peep:", data,depth);
3037         DEBUG_PEEP("Peep",scan,depth);
3038
3039         /* Its not clear to khw or hv why this is done here, and not in the
3040          * clauses that deal with EXACT nodes.  khw's guess is that it's
3041          * because of a previous design */
3042         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3043
3044         /* Follow the next-chain of the current node and optimize
3045            away all the NOTHINGs from it.  */
3046         if (OP(scan) != CURLYX) {
3047             const int max = (reg_off_by_arg[OP(scan)]
3048                        ? I32_MAX
3049                        /* I32 may be smaller than U16 on CRAYs! */
3050                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3051             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3052             int noff;
3053             regnode *n = scan;
3054
3055             /* Skip NOTHING and LONGJMP. */
3056             while ((n = regnext(n))
3057                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3058                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3059                    && off + noff < max)
3060                 off += noff;
3061             if (reg_off_by_arg[OP(scan)])
3062                 ARG(scan) = off;
3063             else
3064                 NEXT_OFF(scan) = off;
3065         }
3066
3067
3068
3069         /* The principal pseudo-switch.  Cannot be a switch, since we
3070            look into several different things.  */
3071         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3072                    || OP(scan) == IFTHEN) {
3073             next = regnext(scan);
3074             code = OP(scan);
3075             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3076
3077             if (OP(next) == code || code == IFTHEN) {
3078                 /* NOTE - There is similar code to this block below for handling
3079                    TRIE nodes on a re-study.  If you change stuff here check there
3080                    too. */
3081                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3082                 struct regnode_charclass_class accum;
3083                 regnode * const startbranch=scan;
3084
3085                 if (flags & SCF_DO_SUBSTR)
3086                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3087                 if (flags & SCF_DO_STCLASS)
3088                     cl_init_zero(pRExC_state, &accum);
3089
3090                 while (OP(scan) == code) {
3091                     I32 deltanext, minnext, f = 0, fake;
3092                     struct regnode_charclass_class this_class;
3093
3094                     num++;
3095                     data_fake.flags = 0;
3096                     if (data) {
3097                         data_fake.whilem_c = data->whilem_c;
3098                         data_fake.last_closep = data->last_closep;
3099                     }
3100                     else
3101                         data_fake.last_closep = &fake;
3102
3103                     data_fake.pos_delta = delta;
3104                     next = regnext(scan);
3105                     scan = NEXTOPER(scan);
3106                     if (code != BRANCH)
3107                         scan = NEXTOPER(scan);
3108                     if (flags & SCF_DO_STCLASS) {
3109                         cl_init(pRExC_state, &this_class);
3110                         data_fake.start_class = &this_class;
3111                         f = SCF_DO_STCLASS_AND;
3112                     }
3113                     if (flags & SCF_WHILEM_VISITED_POS)
3114                         f |= SCF_WHILEM_VISITED_POS;
3115
3116                     /* we suppose the run is continuous, last=next...*/
3117                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3118                                           next, &data_fake,
3119                                           stopparen, recursed, NULL, f,depth+1);
3120                     if (min1 > minnext)
3121                         min1 = minnext;
3122                     if (max1 < minnext + deltanext)
3123                         max1 = minnext + deltanext;
3124                     if (deltanext == I32_MAX)
3125                         is_inf = is_inf_internal = 1;
3126                     scan = next;
3127                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3128                         pars++;
3129                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3130                         if ( stopmin > minnext)
3131                             stopmin = min + min1;
3132                         flags &= ~SCF_DO_SUBSTR;
3133                         if (data)
3134                             data->flags |= SCF_SEEN_ACCEPT;
3135                     }
3136                     if (data) {
3137                         if (data_fake.flags & SF_HAS_EVAL)
3138                             data->flags |= SF_HAS_EVAL;
3139                         data->whilem_c = data_fake.whilem_c;
3140                     }
3141                     if (flags & SCF_DO_STCLASS)
3142                         cl_or(pRExC_state, &accum, &this_class);
3143                 }
3144                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3145                     min1 = 0;
3146                 if (flags & SCF_DO_SUBSTR) {
3147                     data->pos_min += min1;
3148                     data->pos_delta += max1 - min1;
3149                     if (max1 != min1 || is_inf)
3150                         data->longest = &(data->longest_float);
3151                 }
3152                 min += min1;
3153                 delta += max1 - min1;
3154                 if (flags & SCF_DO_STCLASS_OR) {
3155                     cl_or(pRExC_state, data->start_class, &accum);
3156                     if (min1) {
3157                         cl_and(data->start_class, and_withp);
3158                         flags &= ~SCF_DO_STCLASS;
3159                     }
3160                 }
3161                 else if (flags & SCF_DO_STCLASS_AND) {
3162                     if (min1) {
3163                         cl_and(data->start_class, &accum);
3164                         flags &= ~SCF_DO_STCLASS;
3165                     }
3166                     else {
3167                         /* Switch to OR mode: cache the old value of
3168                          * data->start_class */
3169                         INIT_AND_WITHP;
3170                         StructCopy(data->start_class, and_withp,
3171                                    struct regnode_charclass_class);
3172                         flags &= ~SCF_DO_STCLASS_AND;
3173                         StructCopy(&accum, data->start_class,
3174                                    struct regnode_charclass_class);
3175                         flags |= SCF_DO_STCLASS_OR;
3176                         data->start_class->flags |= ANYOF_EOS;
3177                     }
3178                 }
3179
3180                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3181                 /* demq.
3182
3183                    Assuming this was/is a branch we are dealing with: 'scan' now
3184                    points at the item that follows the branch sequence, whatever
3185                    it is. We now start at the beginning of the sequence and look
3186                    for subsequences of
3187
3188                    BRANCH->EXACT=>x1
3189                    BRANCH->EXACT=>x2
3190                    tail
3191
3192                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3193
3194                    If we can find such a subsequence we need to turn the first
3195                    element into a trie and then add the subsequent branch exact
3196                    strings to the trie.
3197
3198                    We have two cases
3199
3200                      1. patterns where the whole set of branches can be converted.
3201
3202                      2. patterns where only a subset can be converted.
3203
3204                    In case 1 we can replace the whole set with a single regop
3205                    for the trie. In case 2 we need to keep the start and end
3206                    branches so
3207
3208                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3209                      becomes BRANCH TRIE; BRANCH X;
3210
3211                   There is an additional case, that being where there is a
3212                   common prefix, which gets split out into an EXACT like node
3213                   preceding the TRIE node.
3214
3215                   If x(1..n)==tail then we can do a simple trie, if not we make
3216                   a "jump" trie, such that when we match the appropriate word
3217                   we "jump" to the appropriate tail node. Essentially we turn
3218                   a nested if into a case structure of sorts.
3219
3220                 */
3221
3222                     int made=0;
3223                     if (!re_trie_maxbuff) {
3224                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3225                         if (!SvIOK(re_trie_maxbuff))
3226                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3227                     }
3228                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3229                         regnode *cur;
3230                         regnode *first = (regnode *)NULL;
3231                         regnode *last = (regnode *)NULL;
3232                         regnode *tail = scan;
3233                         U8 trietype = 0;
3234                         U32 count=0;
3235
3236 #ifdef DEBUGGING
3237                         SV * const mysv = sv_newmortal();       /* for dumping */
3238 #endif
3239                         /* var tail is used because there may be a TAIL
3240                            regop in the way. Ie, the exacts will point to the
3241                            thing following the TAIL, but the last branch will
3242                            point at the TAIL. So we advance tail. If we
3243                            have nested (?:) we may have to move through several
3244                            tails.
3245                          */
3246
3247                         while ( OP( tail ) == TAIL ) {
3248                             /* this is the TAIL generated by (?:) */
3249                             tail = regnext( tail );
3250                         }
3251
3252
3253                         DEBUG_TRIE_COMPILE_r({
3254                             regprop(RExC_rx, mysv, tail );
3255                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3256                                 (int)depth * 2 + 2, "",
3257                                 "Looking for TRIE'able sequences. Tail node is: ",
3258                                 SvPV_nolen_const( mysv )
3259                             );
3260                         });
3261
3262                         /*
3263
3264                             Step through the branches
3265                                 cur represents each branch,
3266                                 noper is the first thing to be matched as part of that branch
3267                                 noper_next is the regnext() of that node.
3268
3269                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3270                             via a "jump trie" but we also support building with NOJUMPTRIE,
3271                             which restricts the trie logic to structures like /FOO|BAR/.
3272
3273                             If noper is a trieable nodetype then the branch is a possible optimization
3274                             target. If we are building under NOJUMPTRIE then we require that noper_next
3275                             is the same as scan (our current position in the regex program).
3276
3277                             Once we have two or more consecutive such branches we can create a
3278                             trie of the EXACT's contents and stitch it in place into the program.
3279
3280                             If the sequence represents all of the branches in the alternation we
3281                             replace the entire thing with a single TRIE node.
3282
3283                             Otherwise when it is a subsequence we need to stitch it in place and
3284                             replace only the relevant branches. This means the first branch has
3285                             to remain as it is used by the alternation logic, and its next pointer,
3286                             and needs to be repointed at the item on the branch chain following
3287                             the last branch we have optimized away.
3288
3289                             This could be either a BRANCH, in which case the subsequence is internal,
3290                             or it could be the item following the branch sequence in which case the
3291                             subsequence is at the end (which does not necessarily mean the first node
3292                             is the start of the alternation).
3293
3294                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3295
3296                                 optype          |  trietype
3297                                 ----------------+-----------
3298                                 NOTHING         | NOTHING
3299                                 EXACT           | EXACT
3300                                 EXACTFU         | EXACTFU
3301                                 EXACTFU_SS      | EXACTFU
3302                                 EXACTFU_TRICKYFOLD | EXACTFU
3303                                 EXACTFA         | 0
3304
3305
3306                         */
3307 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3308                        ( EXACT == (X) )   ? EXACT :        \
3309                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3310                        0 )
3311
3312                         /* dont use tail as the end marker for this traverse */
3313                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3314                             regnode * const noper = NEXTOPER( cur );
3315                             U8 noper_type = OP( noper );
3316                             U8 noper_trietype = TRIE_TYPE( noper_type );
3317 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3318                             regnode * const noper_next = regnext( noper );
3319                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3320                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3321 #endif
3322
3323                             DEBUG_TRIE_COMPILE_r({
3324                                 regprop(RExC_rx, mysv, cur);
3325                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3326                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3327
3328                                 regprop(RExC_rx, mysv, noper);
3329                                 PerlIO_printf( Perl_debug_log, " -> %s",
3330                                     SvPV_nolen_const(mysv));
3331
3332                                 if ( noper_next ) {
3333                                   regprop(RExC_rx, mysv, noper_next );
3334                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3335                                     SvPV_nolen_const(mysv));
3336                                 }
3337                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3338                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3339                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3340                                 );
3341                             });
3342
3343                             /* Is noper a trieable nodetype that can be merged with the
3344                              * current trie (if there is one)? */
3345                             if ( noper_trietype
3346                                   &&
3347                                   (
3348                                         ( noper_trietype == NOTHING)
3349                                         || ( trietype == NOTHING )
3350                                         || ( trietype == noper_trietype )
3351                                   )
3352 #ifdef NOJUMPTRIE
3353                                   && noper_next == tail
3354 #endif
3355                                   && count < U16_MAX)
3356                             {
3357                                 /* Handle mergable triable node
3358                                  * Either we are the first node in a new trieable sequence,
3359                                  * in which case we do some bookkeeping, otherwise we update
3360                                  * the end pointer. */
3361                                 if ( !first ) {
3362                                     first = cur;
3363                                     if ( noper_trietype == NOTHING ) {
3364 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3365                                         regnode * const noper_next = regnext( noper );
3366                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3367                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3368 #endif
3369
3370                                         if ( noper_next_trietype ) {
3371                                             trietype = noper_next_trietype;
3372                                         } else if (noper_next_type)  {
3373                                             /* a NOTHING regop is 1 regop wide. We need at least two
3374                                              * for a trie so we can't merge this in */
3375                                             first = NULL;
3376                                         }
3377                                     } else {
3378                                         trietype = noper_trietype;
3379                                     }
3380                                 } else {
3381                                     if ( trietype == NOTHING )
3382                                         trietype = noper_trietype;
3383                                     last = cur;
3384                                 }
3385                                 if (first)
3386                                     count++;
3387                             } /* end handle mergable triable node */
3388                             else {
3389                                 /* handle unmergable node -
3390                                  * noper may either be a triable node which can not be tried
3391                                  * together with the current trie, or a non triable node */
3392                                 if ( last ) {
3393                                     /* If last is set and trietype is not NOTHING then we have found
3394                                      * at least two triable branch sequences in a row of a similar
3395                                      * trietype so we can turn them into a trie. If/when we
3396                                      * allow NOTHING to start a trie sequence this condition will be
3397                                      * required, and it isn't expensive so we leave it in for now. */
3398                                     if ( trietype && trietype != NOTHING )
3399                                         make_trie( pRExC_state,
3400                                                 startbranch, first, cur, tail, count,
3401                                                 trietype, depth+1 );
3402                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3403                                 }
3404                                 if ( noper_trietype
3405 #ifdef NOJUMPTRIE
3406                                      && noper_next == tail
3407 #endif
3408                                 ){
3409                                     /* noper is triable, so we can start a new trie sequence */
3410                                     count = 1;
3411                                     first = cur;
3412                                     trietype = noper_trietype;
3413                                 } else if (first) {
3414                                     /* if we already saw a first but the current node is not triable then we have
3415                                      * to reset the first information. */
3416                                     count = 0;
3417                                     first = NULL;
3418                                     trietype = 0;
3419                                 }
3420                             } /* end handle unmergable node */
3421                         } /* loop over branches */
3422                         DEBUG_TRIE_COMPILE_r({
3423                             regprop(RExC_rx, mysv, cur);
3424                             PerlIO_printf( Perl_debug_log,
3425                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3426                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3427
3428                         });
3429                         if ( last && trietype ) {
3430                             if ( trietype != NOTHING ) {
3431                                 /* the last branch of the sequence was part of a trie,
3432                                  * so we have to construct it here outside of the loop
3433                                  */
3434                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3435 #ifdef TRIE_STUDY_OPT
3436                                 if ( ((made == MADE_EXACT_TRIE &&
3437                                      startbranch == first)
3438                                      || ( first_non_open == first )) &&
3439                                      depth==0 ) {
3440                                     flags |= SCF_TRIE_RESTUDY;
3441                                     if ( startbranch == first
3442                                          && scan == tail )
3443                                     {
3444                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3445                                     }
3446                                 }
3447 #endif
3448                             } else {
3449                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3450                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3451                                  */
3452                                 if ( startbranch == first ) {
3453                                     regnode *opt;
3454                                     /* the entire thing is a NOTHING sequence, something like this:
3455                                      * (?:|) So we can turn it into a plain NOTHING op. */
3456                                     DEBUG_TRIE_COMPILE_r({
3457                                         regprop(RExC_rx, mysv, cur);
3458                                         PerlIO_printf( Perl_debug_log,
3459                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3460                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3461
3462                                     });
3463                                     OP(startbranch)= NOTHING;
3464                                     NEXT_OFF(startbranch)= tail - startbranch;
3465                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3466                                         OP(opt)= OPTIMIZED;
3467                                 }
3468                             }
3469                         } /* end if ( last) */
3470                     } /* TRIE_MAXBUF is non zero */
3471
3472                 } /* do trie */
3473
3474             }
3475             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3476                 scan = NEXTOPER(NEXTOPER(scan));
3477             } else                      /* single branch is optimized. */
3478                 scan = NEXTOPER(scan);
3479             continue;
3480         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3481             scan_frame *newframe = NULL;
3482             I32 paren;
3483             regnode *start;
3484             regnode *end;
3485
3486             if (OP(scan) != SUSPEND) {
3487             /* set the pointer */
3488                 if (OP(scan) == GOSUB) {
3489                     paren = ARG(scan);
3490                     RExC_recurse[ARG2L(scan)] = scan;
3491                     start = RExC_open_parens[paren-1];
3492                     end   = RExC_close_parens[paren-1];
3493                 } else {
3494                     paren = 0;
3495                     start = RExC_rxi->program + 1;
3496                     end   = RExC_opend;
3497                 }
3498                 if (!recursed) {
3499                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3500                     SAVEFREEPV(recursed);
3501                 }
3502                 if (!PAREN_TEST(recursed,paren+1)) {
3503                     PAREN_SET(recursed,paren+1);
3504                     Newx(newframe,1,scan_frame);
3505                 } else {
3506                     if (flags & SCF_DO_SUBSTR) {
3507                         SCAN_COMMIT(pRExC_state,data,minlenp);
3508                         data->longest = &(data->longest_float);
3509                     }
3510                     is_inf = is_inf_internal = 1;
3511                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3512                         cl_anything(pRExC_state, data->start_class);
3513                     flags &= ~SCF_DO_STCLASS;
3514                 }
3515             } else {
3516                 Newx(newframe,1,scan_frame);
3517                 paren = stopparen;
3518                 start = scan+2;
3519                 end = regnext(scan);
3520             }
3521             if (newframe) {
3522                 assert(start);
3523                 assert(end);
3524                 SAVEFREEPV(newframe);
3525                 newframe->next = regnext(scan);
3526                 newframe->last = last;
3527                 newframe->stop = stopparen;
3528                 newframe->prev = frame;
3529
3530                 frame = newframe;
3531                 scan =  start;
3532                 stopparen = paren;
3533                 last = end;
3534
3535                 continue;
3536             }
3537         }
3538         else if (OP(scan) == EXACT) {
3539             I32 l = STR_LEN(scan);
3540             UV uc;
3541             if (UTF) {
3542                 const U8 * const s = (U8*)STRING(scan);
3543                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3544                 l = utf8_length(s, s + l);
3545             } else {
3546                 uc = *((U8*)STRING(scan));
3547             }
3548             min += l;
3549             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3550                 /* The code below prefers earlier match for fixed
3551                    offset, later match for variable offset.  */
3552                 if (data->last_end == -1) { /* Update the start info. */
3553                     data->last_start_min = data->pos_min;
3554                     data->last_start_max = is_inf
3555                         ? I32_MAX : data->pos_min + data->pos_delta;
3556                 }
3557                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3558                 if (UTF)
3559                     SvUTF8_on(data->last_found);
3560                 {
3561                     SV * const sv = data->last_found;
3562                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3563                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3564                     if (mg && mg->mg_len >= 0)
3565                         mg->mg_len += utf8_length((U8*)STRING(scan),
3566                                                   (U8*)STRING(scan)+STR_LEN(scan));
3567                 }
3568                 data->last_end = data->pos_min + l;
3569                 data->pos_min += l; /* As in the first entry. */
3570                 data->flags &= ~SF_BEFORE_EOL;
3571             }
3572             if (flags & SCF_DO_STCLASS_AND) {
3573                 /* Check whether it is compatible with what we know already! */
3574                 int compat = 1;
3575
3576
3577                 /* If compatible, we or it in below.  It is compatible if is
3578                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3579                  * it's for a locale.  Even if there isn't unicode semantics
3580                  * here, at runtime there may be because of matching against a
3581                  * utf8 string, so accept a possible false positive for
3582                  * latin1-range folds */
3583                 if (uc >= 0x100 ||
3584                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3585                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3586                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3587                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3588                     )
3589                 {
3590                     compat = 0;
3591                 }
3592                 ANYOF_CLASS_ZERO(data->start_class);
3593                 ANYOF_BITMAP_ZERO(data->start_class);
3594                 if (compat)
3595                     ANYOF_BITMAP_SET(data->start_class, uc);
3596                 else if (uc >= 0x100) {
3597                     int i;
3598
3599                     /* Some Unicode code points fold to the Latin1 range; as
3600                      * XXX temporary code, instead of figuring out if this is
3601                      * one, just assume it is and set all the start class bits
3602                      * that could be some such above 255 code point's fold
3603                      * which will generate fals positives.  As the code
3604                      * elsewhere that does compute the fold settles down, it
3605                      * can be extracted out and re-used here */
3606                     for (i = 0; i < 256; i++){
3607                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3608                             ANYOF_BITMAP_SET(data->start_class, i);
3609                         }
3610                     }
3611                 }
3612                 data->start_class->flags &= ~ANYOF_EOS;
3613                 if (uc < 0x100)
3614                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3615             }
3616             else if (flags & SCF_DO_STCLASS_OR) {
3617                 /* false positive possible if the class is case-folded */
3618                 if (uc < 0x100)
3619                     ANYOF_BITMAP_SET(data->start_class, uc);
3620                 else
3621                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3622                 data->start_class->flags &= ~ANYOF_EOS;
3623                 cl_and(data->start_class, and_withp);
3624             }
3625             flags &= ~SCF_DO_STCLASS;
3626         }
3627         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3628             I32 l = STR_LEN(scan);
3629             UV uc = *((U8*)STRING(scan));
3630
3631             /* Search for fixed substrings supports EXACT only. */
3632             if (flags & SCF_DO_SUBSTR) {
3633                 assert(data);
3634                 SCAN_COMMIT(pRExC_state, data, minlenp);
3635             }
3636             if (UTF) {
3637                 const U8 * const s = (U8 *)STRING(scan);
3638                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3639                 l = utf8_length(s, s + l);
3640             }
3641             if (has_exactf_sharp_s) {
3642                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3643             }
3644             min += l - min_subtract;
3645             assert (min >= 0);
3646             delta += min_subtract;
3647             if (flags & SCF_DO_SUBSTR) {
3648                 data->pos_min += l - min_subtract;
3649                 if (data->pos_min < 0) {
3650                     data->pos_min = 0;
3651                 }
3652                 data->pos_delta += min_subtract;
3653                 if (min_subtract) {
3654                     data->longest = &(data->longest_float);
3655                 }
3656             }
3657             if (flags & SCF_DO_STCLASS_AND) {
3658                 /* Check whether it is compatible with what we know already! */
3659                 int compat = 1;
3660                 if (uc >= 0x100 ||
3661                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3662                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3663                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3664                 {
3665                     compat = 0;
3666                 }
3667                 ANYOF_CLASS_ZERO(data->start_class);
3668                 ANYOF_BITMAP_ZERO(data->start_class);
3669                 if (compat) {
3670                     ANYOF_BITMAP_SET(data->start_class, uc);
3671                     data->start_class->flags &= ~ANYOF_EOS;
3672                     if (OP(scan) == EXACTFL) {
3673                         /* XXX This set is probably no longer necessary, and
3674                          * probably wrong as LOCALE now is on in the initial
3675                          * state */
3676                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3677                     }
3678                     else {
3679
3680                         /* Also set the other member of the fold pair.  In case
3681                          * that unicode semantics is called for at runtime, use
3682                          * the full latin1 fold.  (Can't do this for locale,
3683                          * because not known until runtime) */
3684                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3685
3686                         /* All other (EXACTFL handled above) folds except under
3687                          * /iaa that include s, S, and sharp_s also may include
3688                          * the others */
3689                         if (OP(scan) != EXACTFA) {
3690                             if (uc == 's' || uc == 'S') {
3691                                 ANYOF_BITMAP_SET(data->start_class,
3692                                                  LATIN_SMALL_LETTER_SHARP_S);
3693                             }
3694                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3695                                 ANYOF_BITMAP_SET(data->start_class, 's');
3696                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3697                             }
3698                         }
3699                     }
3700                 }
3701                 else if (uc >= 0x100) {
3702                     int i;
3703                     for (i = 0; i < 256; i++){
3704                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3705                             ANYOF_BITMAP_SET(data->start_class, i);
3706                         }
3707                     }
3708                 }
3709             }
3710             else if (flags & SCF_DO_STCLASS_OR) {
3711                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3712                     /* false positive possible if the class is case-folded.
3713                        Assume that the locale settings are the same... */
3714                     if (uc < 0x100) {
3715                         ANYOF_BITMAP_SET(data->start_class, uc);
3716                         if (OP(scan) != EXACTFL) {
3717
3718                             /* And set the other member of the fold pair, but
3719                              * can't do that in locale because not known until
3720                              * run-time */
3721                             ANYOF_BITMAP_SET(data->start_class,
3722                                              PL_fold_latin1[uc]);
3723
3724                             /* All folds except under /iaa that include s, S,
3725                              * and sharp_s also may include the others */
3726                             if (OP(scan) != EXACTFA) {
3727                                 if (uc == 's' || uc == 'S') {
3728                                     ANYOF_BITMAP_SET(data->start_class,
3729                                                    LATIN_SMALL_LETTER_SHARP_S);
3730                                 }
3731                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3732                                     ANYOF_BITMAP_SET(data->start_class, 's');
3733                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3734                                 }
3735                             }
3736                         }
3737                     }
3738                     data->start_class->flags &= ~ANYOF_EOS;
3739                 }
3740                 cl_and(data->start_class, and_withp);
3741             }
3742             flags &= ~SCF_DO_STCLASS;
3743         }
3744         else if (REGNODE_VARIES(OP(scan))) {
3745             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3746             I32 f = flags, pos_before = 0;
3747             regnode * const oscan = scan;
3748             struct regnode_charclass_class this_class;
3749             struct regnode_charclass_class *oclass = NULL;
3750             I32 next_is_eval = 0;
3751
3752             switch (PL_regkind[OP(scan)]) {
3753             case WHILEM:                /* End of (?:...)* . */
3754                 scan = NEXTOPER(scan);
3755                 goto finish;
3756             case PLUS:
3757                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3758                     next = NEXTOPER(scan);
3759                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3760                         mincount = 1;
3761                         maxcount = REG_INFTY;
3762                         next = regnext(scan);
3763                         scan = NEXTOPER(scan);
3764                         goto do_curly;
3765                     }
3766                 }
3767                 if (flags & SCF_DO_SUBSTR)
3768                     data->pos_min++;
3769                 min++;
3770                 /* Fall through. */
3771             case STAR:
3772                 if (flags & SCF_DO_STCLASS) {
3773                     mincount = 0;
3774                     maxcount = REG_INFTY;
3775                     next = regnext(scan);
3776                     scan = NEXTOPER(scan);
3777                     goto do_curly;
3778                 }
3779                 is_inf = is_inf_internal = 1;
3780                 scan = regnext(scan);
3781                 if (flags & SCF_DO_SUBSTR) {
3782                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3783                     data->longest = &(data->longest_float);
3784                 }
3785                 goto optimize_curly_tail;
3786             case CURLY:
3787                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3788                     && (scan->flags == stopparen))
3789                 {
3790                     mincount = 1;
3791                     maxcount = 1;
3792                 } else {
3793                     mincount = ARG1(scan);
3794                     maxcount = ARG2(scan);
3795                 }
3796                 next = regnext(scan);
3797                 if (OP(scan) == CURLYX) {
3798                     I32 lp = (data ? *(data->last_closep) : 0);
3799                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3800                 }
3801                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3802                 next_is_eval = (OP(scan) == EVAL);
3803               do_curly:
3804                 if (flags & SCF_DO_SUBSTR) {
3805                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3806                     pos_before = data->pos_min;
3807                 }
3808                 if (data) {
3809                     fl = data->flags;
3810                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3811                     if (is_inf)
3812                         data->flags |= SF_IS_INF;
3813                 }
3814                 if (flags & SCF_DO_STCLASS) {
3815                     cl_init(pRExC_state, &this_class);
3816                     oclass = data->start_class;
3817                     data->start_class = &this_class;
3818                     f |= SCF_DO_STCLASS_AND;
3819                     f &= ~SCF_DO_STCLASS_OR;
3820                 }
3821                 /* Exclude from super-linear cache processing any {n,m}
3822                    regops for which the combination of input pos and regex
3823                    pos is not enough information to determine if a match
3824                    will be possible.
3825
3826                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3827                    regex pos at the \s*, the prospects for a match depend not
3828                    only on the input position but also on how many (bar\s*)
3829                    repeats into the {4,8} we are. */
3830                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3831                     f &= ~SCF_WHILEM_VISITED_POS;
3832
3833                 /* This will finish on WHILEM, setting scan, or on NULL: */
3834                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3835                                       last, data, stopparen, recursed, NULL,
3836                                       (mincount == 0
3837                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3838
3839                 if (flags & SCF_DO_STCLASS)
3840                     data->start_class = oclass;
3841                 if (mincount == 0 || minnext == 0) {
3842                     if (flags & SCF_DO_STCLASS_OR) {
3843                         cl_or(pRExC_state, data->start_class, &this_class);
3844                     }
3845                     else if (flags & SCF_DO_STCLASS_AND) {
3846                         /* Switch to OR mode: cache the old value of
3847                          * data->start_class */
3848                         INIT_AND_WITHP;
3849                         StructCopy(data->start_class, and_withp,
3850                                    struct regnode_charclass_class);
3851                         flags &= ~SCF_DO_STCLASS_AND;
3852                         StructCopy(&this_class, data->start_class,
3853                                    struct regnode_charclass_class);
3854                         flags |= SCF_DO_STCLASS_OR;
3855                         data->start_class->flags |= ANYOF_EOS;
3856                     }
3857                 } else {                /* Non-zero len */
3858                     if (flags & SCF_DO_STCLASS_OR) {
3859                         cl_or(pRExC_state, data->start_class, &this_class);
3860                         cl_and(data->start_class, and_withp);
3861                     }
3862                     else if (flags & SCF_DO_STCLASS_AND)
3863                         cl_and(data->start_class, &this_class);
3864                     flags &= ~SCF_DO_STCLASS;
3865                 }
3866                 if (!scan)              /* It was not CURLYX, but CURLY. */
3867                     scan = next;
3868                 if ( /* ? quantifier ok, except for (?{ ... }) */
3869                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3870                     && (minnext == 0) && (deltanext == 0)
3871                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3872                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3873                 {
3874                     ckWARNreg(RExC_parse,
3875                               "Quantifier unexpected on zero-length expression");
3876                 }
3877
3878                 min += minnext * mincount;
3879                 is_inf_internal |= ((maxcount == REG_INFTY
3880                                      && (minnext + deltanext) > 0)
3881                                     || deltanext == I32_MAX);
3882                 is_inf |= is_inf_internal;
3883                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3884
3885                 /* Try powerful optimization CURLYX => CURLYN. */
3886                 if (  OP(oscan) == CURLYX && data
3887                       && data->flags & SF_IN_PAR
3888                       && !(data->flags & SF_HAS_EVAL)
3889                       && !deltanext && minnext == 1 ) {
3890                     /* Try to optimize to CURLYN.  */
3891                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3892                     regnode * const nxt1 = nxt;
3893 #ifdef DEBUGGING
3894                     regnode *nxt2;
3895 #endif
3896
3897                     /* Skip open. */
3898                     nxt = regnext(nxt);
3899                     if (!REGNODE_SIMPLE(OP(nxt))
3900                         && !(PL_regkind[OP(nxt)] == EXACT
3901                              && STR_LEN(nxt) == 1))
3902                         goto nogo;
3903 #ifdef DEBUGGING
3904                     nxt2 = nxt;
3905 #endif
3906                     nxt = regnext(nxt);
3907                     if (OP(nxt) != CLOSE)
3908                         goto nogo;
3909                     if (RExC_open_parens) {
3910                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3911                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3912                     }
3913                     /* Now we know that nxt2 is the only contents: */
3914                     oscan->flags = (U8)ARG(nxt);
3915                     OP(oscan) = CURLYN;
3916                     OP(nxt1) = NOTHING; /* was OPEN. */
3917
3918 #ifdef DEBUGGING
3919                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3920                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3921                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3922                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3923                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3924                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3925 #endif
3926                 }
3927               nogo:
3928
3929                 /* Try optimization CURLYX => CURLYM. */
3930                 if (  OP(oscan) == CURLYX && data
3931                       && !(data->flags & SF_HAS_PAR)
3932                       && !(data->flags & SF_HAS_EVAL)
3933                       && !deltanext     /* atom is fixed width */
3934                       && minnext != 0   /* CURLYM can't handle zero width */
3935                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3936                 ) {
3937                     /* XXXX How to optimize if data == 0? */
3938                     /* Optimize to a simpler form.  */
3939                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3940                     regnode *nxt2;
3941
3942                     OP(oscan) = CURLYM;
3943                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3944                             && (OP(nxt2) != WHILEM))
3945                         nxt = nxt2;
3946                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3947                     /* Need to optimize away parenths. */
3948                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3949                         /* Set the parenth number.  */
3950                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3951
3952                         oscan->flags = (U8)ARG(nxt);
3953                         if (RExC_open_parens) {
3954                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3955                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3956                         }
3957                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3958                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3959
3960 #ifdef DEBUGGING
3961                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3962                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3963                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3964                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3965 #endif
3966 #if 0
3967                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3968                             regnode *nnxt = regnext(nxt1);
3969                             if (nnxt == nxt) {
3970                                 if (reg_off_by_arg[OP(nxt1)])
3971                                     ARG_SET(nxt1, nxt2 - nxt1);
3972                                 else if (nxt2 - nxt1 < U16_MAX)
3973                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3974                                 else
3975                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3976                             }
3977                             nxt1 = nnxt;
3978                         }
3979 #endif
3980                         /* Optimize again: */
3981                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3982                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3983                     }
3984                     else
3985                         oscan->flags = 0;
3986                 }
3987                 else if ((OP(oscan) == CURLYX)
3988                          && (flags & SCF_WHILEM_VISITED_POS)
3989                          /* See the comment on a similar expression above.
3990                             However, this time it's not a subexpression
3991                             we care about, but the expression itself. */
3992                          && (maxcount == REG_INFTY)
3993                          && data && ++data->whilem_c < 16) {
3994                     /* This stays as CURLYX, we can put the count/of pair. */
3995                     /* Find WHILEM (as in regexec.c) */
3996                     regnode *nxt = oscan + NEXT_OFF(oscan);
3997
3998                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3999                         nxt += ARG(nxt);
4000                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4001                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4002                 }
4003                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4004                     pars++;
4005                 if (flags & SCF_DO_SUBSTR) {
4006                     SV *last_str = NULL;
4007                     int counted = mincount != 0;
4008
4009                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4010 #if defined(SPARC64_GCC_WORKAROUND)
4011                         I32 b = 0;
4012                         STRLEN l = 0;
4013                         const char *s = NULL;
4014                         I32 old = 0;
4015
4016                         if (pos_before >= data->last_start_min)
4017                             b = pos_before;
4018                         else
4019                             b = data->last_start_min;
4020
4021                         l = 0;
4022                         s = SvPV_const(data->last_found, l);
4023                         old = b - data->last_start_min;
4024
4025 #else
4026                         I32 b = pos_before >= data->last_start_min
4027                             ? pos_before : data->last_start_min;
4028                         STRLEN l;
4029                         const char * const s = SvPV_const(data->last_found, l);
4030                         I32 old = b - data->last_start_min;
4031 #endif
4032
4033                         if (UTF)
4034                             old = utf8_hop((U8*)s, old) - (U8*)s;
4035                         l -= old;
4036                         /* Get the added string: */
4037                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4038                         if (deltanext == 0 && pos_before == b) {
4039                             /* What was added is a constant string */
4040                             if (mincount > 1) {
4041                                 SvGROW(last_str, (mincount * l) + 1);
4042                                 repeatcpy(SvPVX(last_str) + l,
4043                                           SvPVX_const(last_str), l, mincount - 1);
4044                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4045                                 /* Add additional parts. */
4046                                 SvCUR_set(data->last_found,
4047                                           SvCUR(data->last_found) - l);
4048                                 sv_catsv(data->last_found, last_str);
4049                                 {
4050                                     SV * sv = data->last_found;
4051                                     MAGIC *mg =
4052                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4053                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4054                                     if (mg && mg->mg_len >= 0)
4055                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4056                                 }
4057                                 data->last_end += l * (mincount - 1);
4058                             }
4059                         } else {
4060                             /* start offset must point into the last copy */
4061                             data->last_start_min += minnext * (mincount - 1);
4062                             data->last_start_max += is_inf ? I32_MAX
4063                                 : (maxcount - 1) * (minnext + data->pos_delta);
4064                         }
4065                     }
4066                     /* It is counted once already... */
4067                     data->pos_min += minnext * (mincount - counted);
4068                     data->pos_delta += - counted * deltanext +
4069                         (minnext + deltanext) * maxcount - minnext * mincount;
4070                     if (mincount != maxcount) {
4071                          /* Cannot extend fixed substrings found inside
4072                             the group.  */
4073                         SCAN_COMMIT(pRExC_state,data,minlenp);
4074                         if (mincount && last_str) {
4075                             SV * const sv = data->last_found;
4076                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4077                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4078
4079                             if (mg)
4080                                 mg->mg_len = -1;
4081                             sv_setsv(sv, last_str);
4082                             data->last_end = data->pos_min;
4083                             data->last_start_min =
4084                                 data->pos_min - CHR_SVLEN(last_str);
4085                             data->last_start_max = is_inf
4086                                 ? I32_MAX
4087                                 : data->pos_min + data->pos_delta
4088                                 - CHR_SVLEN(last_str);
4089                         }
4090                         data->longest = &(data->longest_float);
4091                     }
4092                     SvREFCNT_dec(last_str);
4093                 }
4094                 if (data && (fl & SF_HAS_EVAL))
4095                     data->flags |= SF_HAS_EVAL;
4096               optimize_curly_tail:
4097                 if (OP(oscan) != CURLYX) {
4098                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4099                            && NEXT_OFF(next))
4100                         NEXT_OFF(oscan) += NEXT_OFF(next);
4101                 }
4102                 continue;
4103             default:                    /* REF, ANYOFV, and CLUMP only? */
4104                 if (flags & SCF_DO_SUBSTR) {
4105                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4106                     data->longest = &(data->longest_float);
4107                 }
4108                 is_inf = is_inf_internal = 1;
4109                 if (flags & SCF_DO_STCLASS_OR)
4110                     cl_anything(pRExC_state, data->start_class);
4111                 flags &= ~SCF_DO_STCLASS;
4112                 break;
4113             }
4114         }
4115         else if (OP(scan) == LNBREAK) {
4116             if (flags & SCF_DO_STCLASS) {
4117                 int value = 0;
4118                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4119                 if (flags & SCF_DO_STCLASS_AND) {
4120                     for (value = 0; value < 256; value++)
4121                         if (!is_VERTWS_cp(value))
4122                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4123                 }
4124                 else {
4125                     for (value = 0; value < 256; value++)
4126                         if (is_VERTWS_cp(value))
4127                             ANYOF_BITMAP_SET(data->start_class, value);
4128                 }
4129                 if (flags & SCF_DO_STCLASS_OR)
4130                     cl_and(data->start_class, and_withp);
4131                 flags &= ~SCF_DO_STCLASS;
4132             }
4133             min++;
4134             delta++;    /* Because of the 2 char string cr-lf */
4135             if (flags & SCF_DO_SUBSTR) {
4136                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4137                 data->pos_min += 1;
4138                 data->pos_delta += 1;
4139                 data->longest = &(data->longest_float);
4140             }
4141         }
4142         else if (REGNODE_SIMPLE(OP(scan))) {
4143             int value = 0;
4144
4145             if (flags & SCF_DO_SUBSTR) {
4146                 SCAN_COMMIT(pRExC_state,data,minlenp);
4147                 data->pos_min++;
4148             }
4149             min++;
4150             if (flags & SCF_DO_STCLASS) {
4151                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4152
4153                 /* Some of the logic below assumes that switching
4154                    locale on will only add false positives. */
4155                 switch (PL_regkind[OP(scan)]) {
4156                 case SANY:
4157                 default:
4158                   do_default:
4159                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4160                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4161                         cl_anything(pRExC_state, data->start_class);
4162                     break;
4163                 case REG_ANY:
4164                     if (OP(scan) == SANY)
4165                         goto do_default;
4166                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4167                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4168                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4169                         cl_anything(pRExC_state, data->start_class);
4170                     }
4171                     if (flags & SCF_DO_STCLASS_AND || !value)
4172                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4173                     break;
4174                 case ANYOF:
4175                     if (flags & SCF_DO_STCLASS_AND)
4176                         cl_and(data->start_class,
4177                                (struct regnode_charclass_class*)scan);
4178                     else
4179                         cl_or(pRExC_state, data->start_class,
4180                               (struct regnode_charclass_class*)scan);
4181                     break;
4182                 case ALNUM:
4183                     if (flags & SCF_DO_STCLASS_AND) {
4184                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4185                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
4186                             if (OP(scan) == ALNUMU) {
4187                                 for (value = 0; value < 256; value++) {
4188                                     if (!isWORDCHAR_L1(value)) {
4189                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4190                                     }
4191                                 }
4192                             } else {
4193                                 for (value = 0; value < 256; value++) {
4194                                     if (!isALNUM(value)) {
4195                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4196                                     }
4197                                 }
4198                             }
4199                         }
4200                     }
4201                     else {
4202                         if (data->start_class->flags & ANYOF_LOCALE)
4203                             ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
4204
4205                         /* Even if under locale, set the bits for non-locale
4206                          * in case it isn't a true locale-node.  This will
4207                          * create false positives if it truly is locale */
4208                         if (OP(scan) == ALNUMU) {
4209                             for (value = 0; value < 256; value++) {
4210                                 if (isWORDCHAR_L1(value)) {
4211                                     ANYOF_BITMAP_SET(data->start_class, value);
4212                                 }
4213                             }
4214                         } else {
4215                             for (value = 0; value < 256; value++) {
4216                                 if (isALNUM(value)) {
4217                                     ANYOF_BITMAP_SET(data->start_class, value);
4218                                 }
4219                             }
4220                         }
4221                     }
4222                     break;
4223                 case NALNUM:
4224                     if (flags & SCF_DO_STCLASS_AND) {
4225                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4226                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
4227                             if (OP(scan) == NALNUMU) {
4228                                 for (value = 0; value < 256; value++) {
4229                                     if (isWORDCHAR_L1(value)) {
4230                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4231                                     }
4232                                 }
4233                             } else {
4234                                 for (value = 0; value < 256; value++) {
4235                                     if (isALNUM(value)) {
4236                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4237                                     }
4238                                 }
4239                             }
4240                         }
4241                     }
4242                     else {
4243                         if (data->start_class->flags & ANYOF_LOCALE)
4244                             ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
4245
4246                         /* Even if under locale, set the bits for non-locale in
4247                          * case it isn't a true locale-node.  This will create
4248                          * false positives if it truly is locale */
4249                         if (OP(scan) == NALNUMU) {
4250                             for (value = 0; value < 256; value++) {
4251                                 if (! isWORDCHAR_L1(value)) {
4252                                     ANYOF_BITMAP_SET(data->start_class, value);
4253                                 }
4254                             }
4255                         } else {
4256                             for (value = 0; value < 256; value++) {
4257                                 if (! isALNUM(value)) {
4258                                     ANYOF_BITMAP_SET(data->start_class, value);
4259                                 }
4260                             }
4261                         }
4262                     }
4263                     break;
4264                 case SPACE:
4265                     if (flags & SCF_DO_STCLASS_AND) {
4266                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4267                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4268                             if (OP(scan) == SPACEU) {
4269                                 for (value = 0; value < 256; value++) {
4270                                     if (!isSPACE_L1(value)) {
4271                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4272                                     }
4273                                 }
4274                             } else {
4275                                 for (value = 0; value < 256; value++) {
4276                                     if (!isSPACE(value)) {
4277                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4278                                     }
4279                                 }
4280                             }
4281                         }
4282                     }
4283                     else {
4284                         if (data->start_class->flags & ANYOF_LOCALE) {
4285                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4286                         }
4287                         if (OP(scan) == SPACEU) {
4288                             for (value = 0; value < 256; value++) {
4289                                 if (isSPACE_L1(value)) {
4290                                     ANYOF_BITMAP_SET(data->start_class, value);
4291                                 }
4292                             }
4293                         } else {
4294                             for (value = 0; value < 256; value++) {
4295                                 if (isSPACE(value)) {
4296                                     ANYOF_BITMAP_SET(data->start_class, value);
4297                                 }
4298                             }
4299                         }
4300                     }
4301                     break;
4302                 case NSPACE:
4303                     if (flags & SCF_DO_STCLASS_AND) {
4304                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4305                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4306                             if (OP(scan) == NSPACEU) {
4307                                 for (value = 0; value < 256; value++) {
4308                                     if (isSPACE_L1(value)) {
4309                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4310                                     }
4311                                 }
4312                             } else {
4313                                 for (value = 0; value < 256; value++) {
4314                                     if (isSPACE(value)) {
4315                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4316                                     }
4317                                 }
4318                             }
4319                         }
4320                     }
4321                     else {
4322                         if (data->start_class->flags & ANYOF_LOCALE)
4323                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4324                         if (OP(scan) == NSPACEU) {
4325                             for (value = 0; value < 256; value++) {
4326                                 if (!isSPACE_L1(value)) {
4327                                     ANYOF_BITMAP_SET(data->start_class, value);
4328                                 }
4329                             }
4330                         }
4331                         else {
4332                             for (value = 0; value < 256; value++) {
4333                                 if (!isSPACE(value)) {
4334                                     ANYOF_BITMAP_SET(data->start_class, value);
4335                                 }
4336                             }
4337                         }
4338                     }
4339                     break;
4340                 case DIGIT:
4341                     if (flags & SCF_DO_STCLASS_AND) {
4342                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4343                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4344                             for (value = 0; value < 256; value++)
4345                                 if (!isDIGIT(value))
4346                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4347                         }
4348                     }
4349                     else {
4350                         if (data->start_class->flags & ANYOF_LOCALE)
4351                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4352                         for (value = 0; value < 256; value++)
4353                             if (isDIGIT(value))
4354                                 ANYOF_BITMAP_SET(data->start_class, value);
4355                     }
4356                     break;
4357                 case NDIGIT:
4358                     if (flags & SCF_DO_STCLASS_AND) {
4359                         if (!(data->start_class->flags & ANYOF_LOCALE))
4360                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4361                         for (value = 0; value < 256; value++)
4362                             if (isDIGIT(value))
4363                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4364                     }
4365                     else {
4366                         if (data->start_class->flags & ANYOF_LOCALE)
4367                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4368                         for (value = 0; value < 256; value++)
4369                             if (!isDIGIT(value))
4370                                 ANYOF_BITMAP_SET(data->start_class, value);
4371                     }
4372                     break;
4373                 CASE_SYNST_FNC(VERTWS);
4374                 CASE_SYNST_FNC(HORIZWS);
4375
4376                 }
4377                 if (flags & SCF_DO_STCLASS_OR)
4378                     cl_and(data->start_class, and_withp);
4379                 flags &= ~SCF_DO_STCLASS;
4380             }
4381         }
4382         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4383             data->flags |= (OP(scan) == MEOL
4384                             ? SF_BEFORE_MEOL
4385                             : SF_BEFORE_SEOL);
4386             SCAN_COMMIT(pRExC_state, data, minlenp);
4387
4388         }
4389         else if (  PL_regkind[OP(scan)] == BRANCHJ
4390                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4391                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4392                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4393             if ( OP(scan) == UNLESSM &&
4394                  scan->flags == 0 &&
4395                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4396                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4397             ) {
4398                 regnode *opt;
4399                 regnode *upto= regnext(scan);
4400                 DEBUG_PARSE_r({
4401                     SV * const mysv_val=sv_newmortal();
4402                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4403
4404                     /*DEBUG_PARSE_MSG("opfail");*/
4405                     regprop(RExC_rx, mysv_val, upto);
4406                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4407                                   SvPV_nolen_const(mysv_val),
4408                                   (IV)REG_NODE_NUM(upto),
4409                                   (IV)(upto - scan)
4410                     );
4411                 });
4412                 OP(scan) = OPFAIL;
4413                 NEXT_OFF(scan) = upto - scan;
4414                 for (opt= scan + 1; opt < upto ; opt++)
4415                     OP(opt) = OPTIMIZED;
4416                 scan= upto;
4417                 continue;
4418             }
4419             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4420                 || OP(scan) == UNLESSM )
4421             {
4422                 /* Negative Lookahead/lookbehind
4423                    In this case we can't do fixed string optimisation.
4424                 */
4425
4426                 I32 deltanext, minnext, fake = 0;
4427                 regnode *nscan;
4428                 struct regnode_charclass_class intrnl;
4429                 int f = 0;
4430
4431                 data_fake.flags = 0;
4432                 if (data) {
4433                     data_fake.whilem_c = data->whilem_c;
4434                     data_fake.last_closep = data->last_closep;
4435                 }
4436                 else
4437                     data_fake.last_closep = &fake;
4438                 data_fake.pos_delta = delta;
4439                 if ( flags & SCF_DO_STCLASS && !scan->flags
4440                      && OP(scan) == IFMATCH ) { /* Lookahead */
4441                     cl_init(pRExC_state, &intrnl);
4442                     data_fake.start_class = &intrnl;
4443                     f |= SCF_DO_STCLASS_AND;
4444                 }
4445                 if (flags & SCF_WHILEM_VISITED_POS)
4446                     f |= SCF_WHILEM_VISITED_POS;
4447                 next = regnext(scan);
4448                 nscan = NEXTOPER(NEXTOPER(scan));
4449                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4450                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4451                 if (scan->flags) {
4452                     if (deltanext) {
4453                         FAIL("Variable length lookbehind not implemented");
4454                     }
4455                     else if (minnext > (I32)U8_MAX) {
4456                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4457                     }
4458                     scan->flags = (U8)minnext;
4459                 }
4460                 if (data) {
4461                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4462                         pars++;
4463                     if (data_fake.flags & SF_HAS_EVAL)
4464                         data->flags |= SF_HAS_EVAL;
4465                     data->whilem_c = data_fake.whilem_c;
4466                 }
4467                 if (f & SCF_DO_STCLASS_AND) {
4468                     if (flags & SCF_DO_STCLASS_OR) {
4469                         /* OR before, AND after: ideally we would recurse with
4470                          * data_fake to get the AND applied by study of the
4471                          * remainder of the pattern, and then derecurse;
4472                          * *** HACK *** for now just treat as "no information".
4473                          * See [perl #56690].
4474                          */
4475                         cl_init(pRExC_state, data->start_class);
4476                     }  else {
4477                         /* AND before and after: combine and continue */
4478                         const int was = (data->start_class->flags & ANYOF_EOS);
4479
4480                         cl_and(data->start_class, &intrnl);
4481                         if (was)
4482                             data->start_class->flags |= ANYOF_EOS;
4483                     }
4484                 }
4485             }
4486 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4487             else {
4488                 /* Positive Lookahead/lookbehind
4489                    In this case we can do fixed string optimisation,
4490                    but we must be careful about it. Note in the case of
4491                    lookbehind the positions will be offset by the minimum
4492                    length of the pattern, something we won't know about
4493                    until after the recurse.
4494                 */
4495                 I32 deltanext, fake = 0;
4496                 regnode *nscan;
4497                 struct regnode_charclass_class intrnl;
4498                 int f = 0;
4499                 /* We use SAVEFREEPV so that when the full compile
4500                     is finished perl will clean up the allocated
4501                     minlens when it's all done. This way we don't
4502                     have to worry about freeing them when we know
4503                     they wont be used, which would be a pain.
4504                  */
4505                 I32 *minnextp;
4506                 Newx( minnextp, 1, I32 );
4507                 SAVEFREEPV(minnextp);
4508
4509                 if (data) {
4510                     StructCopy(data, &data_fake, scan_data_t);
4511                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4512                         f |= SCF_DO_SUBSTR;
4513                         if (scan->flags)
4514                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4515                         data_fake.last_found=newSVsv(data->last_found);
4516                     }
4517                 }
4518                 else
4519                     data_fake.last_closep = &fake;
4520                 data_fake.flags = 0;
4521                 data_fake.pos_delta = delta;
4522                 if (is_inf)
4523                     data_fake.flags |= SF_IS_INF;
4524                 if ( flags & SCF_DO_STCLASS && !scan->flags
4525                      && OP(scan) == IFMATCH ) { /* Lookahead */
4526                     cl_init(pRExC_state, &intrnl);
4527                     data_fake.start_class = &intrnl;
4528                     f |= SCF_DO_STCLASS_AND;
4529                 }
4530                 if (flags & SCF_WHILEM_VISITED_POS)
4531                     f |= SCF_WHILEM_VISITED_POS;
4532                 next = regnext(scan);
4533                 nscan = NEXTOPER(NEXTOPER(scan));
4534
4535                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4536                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4537                 if (scan->flags) {
4538                     if (deltanext) {
4539                         FAIL("Variable length lookbehind not implemented");
4540                     }
4541                     else if (*minnextp > (I32)U8_MAX) {
4542                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4543                     }
4544                     scan->flags = (U8)*minnextp;
4545                 }
4546
4547                 *minnextp += min;
4548
4549                 if (f & SCF_DO_STCLASS_AND) {
4550                     const int was = (data->start_class->flags & ANYOF_EOS);
4551
4552                     cl_and(data->start_class, &intrnl);
4553                     if (was)
4554                         data->start_class->flags |= ANYOF_EOS;
4555                 }
4556                 if (data) {
4557                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4558                         pars++;
4559                     if (data_fake.flags & SF_HAS_EVAL)
4560                         data->flags |= SF_HAS_EVAL;
4561                     data->whilem_c = data_fake.whilem_c;
4562                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4563                         if (RExC_rx->minlen<*minnextp)
4564                             RExC_rx->minlen=*minnextp;
4565                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4566                         SvREFCNT_dec(data_fake.last_found);
4567
4568                         if ( data_fake.minlen_fixed != minlenp )
4569                         {
4570                             data->offset_fixed= data_fake.offset_fixed;
4571                             data->minlen_fixed= data_fake.minlen_fixed;
4572                             data->lookbehind_fixed+= scan->flags;
4573                         }
4574                         if ( data_fake.minlen_float != minlenp )
4575                         {
4576                             data->minlen_float= data_fake.minlen_float;
4577                             data->offset_float_min=data_fake.offset_float_min;
4578                             data->offset_float_max=data_fake.offset_float_max;
4579                             data->lookbehind_float+= scan->flags;
4580                         }
4581                     }
4582                 }
4583             }
4584 #endif
4585         }
4586         else if (OP(scan) == OPEN) {
4587             if (stopparen != (I32)ARG(scan))
4588                 pars++;
4589         }
4590         else if (OP(scan) == CLOSE) {
4591             if (stopparen == (I32)ARG(scan)) {
4592                 break;
4593             }
4594             if ((I32)ARG(scan) == is_par) {
4595                 next = regnext(scan);
4596
4597                 if ( next && (OP(next) != WHILEM) && next < last)
4598                     is_par = 0;         /* Disable optimization */
4599             }
4600             if (data)
4601                 *(data->last_closep) = ARG(scan);
4602         }
4603         else if (OP(scan) == EVAL) {
4604                 if (data)
4605                     data->flags |= SF_HAS_EVAL;
4606         }
4607         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4608             if (flags & SCF_DO_SUBSTR) {
4609                 SCAN_COMMIT(pRExC_state,data,minlenp);
4610                 flags &= ~SCF_DO_SUBSTR;
4611             }
4612             if (data && OP(scan)==ACCEPT) {
4613                 data->flags |= SCF_SEEN_ACCEPT;
4614                 if (stopmin > min)
4615                     stopmin = min;
4616             }
4617         }
4618         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4619         {
4620                 if (flags & SCF_DO_SUBSTR) {
4621                     SCAN_COMMIT(pRExC_state,data,minlenp);
4622                     data->longest = &(data->longest_float);
4623                 }
4624                 is_inf = is_inf_internal = 1;
4625                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4626                     cl_anything(pRExC_state, data->start_class);
4627                 flags &= ~SCF_DO_STCLASS;
4628         }
4629         else if (OP(scan) == GPOS) {
4630             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4631                 !(delta || is_inf || (data && data->pos_delta)))
4632             {
4633                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4634                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4635                 if (RExC_rx->gofs < (U32)min)
4636                     RExC_rx->gofs = min;
4637             } else {
4638                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4639                 RExC_rx->gofs = 0;
4640             }
4641         }
4642 #ifdef TRIE_STUDY_OPT
4643 #ifdef FULL_TRIE_STUDY
4644         else if (PL_regkind[OP(scan)] == TRIE) {
4645             /* NOTE - There is similar code to this block above for handling
4646                BRANCH nodes on the initial study.  If you change stuff here
4647                check there too. */
4648             regnode *trie_node= scan;
4649             regnode *tail= regnext(scan);
4650             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4651             I32 max1 = 0, min1 = I32_MAX;
4652             struct regnode_charclass_class accum;
4653
4654             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4655                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4656             if (flags & SCF_DO_STCLASS)
4657                 cl_init_zero(pRExC_state, &accum);
4658
4659             if (!trie->jump) {
4660                 min1= trie->minlen;
4661                 max1= trie->maxlen;
4662             } else {
4663                 const regnode *nextbranch= NULL;
4664                 U32 word;
4665
4666                 for ( word=1 ; word <= trie->wordcount ; word++)
4667                 {
4668                     I32 deltanext=0, minnext=0, f = 0, fake;
4669                     struct regnode_charclass_class this_class;
4670
4671                     data_fake.flags = 0;
4672                     if (data) {
4673                         data_fake.whilem_c = data->whilem_c;
4674                         data_fake.last_closep = data->last_closep;
4675                     }
4676                     else
4677                         data_fake.last_closep = &fake;
4678                     data_fake.pos_delta = delta;
4679                     if (flags & SCF_DO_STCLASS) {
4680                         cl_init(pRExC_state, &this_class);
4681                         data_fake.start_class = &this_class;
4682                         f = SCF_DO_STCLASS_AND;
4683                     }
4684                     if (flags & SCF_WHILEM_VISITED_POS)
4685                         f |= SCF_WHILEM_VISITED_POS;
4686
4687                     if (trie->jump[word]) {
4688                         if (!nextbranch)
4689                             nextbranch = trie_node + trie->jump[0];
4690                         scan= trie_node + trie->jump[word];
4691                         /* We go from the jump point to the branch that follows
4692                            it. Note this means we need the vestigal unused branches
4693                            even though they arent otherwise used.
4694                          */
4695                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4696                             &deltanext, (regnode *)nextbranch, &data_fake,
4697                             stopparen, recursed, NULL, f,depth+1);
4698                     }
4699                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4700                         nextbranch= regnext((regnode*)nextbranch);
4701
4702                     if (min1 > (I32)(minnext + trie->minlen))
4703                         min1 = minnext + trie->minlen;
4704                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4705                         max1 = minnext + deltanext + trie->maxlen;
4706                     if (deltanext == I32_MAX)
4707                         is_inf = is_inf_internal = 1;
4708
4709                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4710                         pars++;
4711                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4712                         if ( stopmin > min + min1)
4713                             stopmin = min + min1;
4714                         flags &= ~SCF_DO_SUBSTR;
4715                         if (data)
4716                             data->flags |= SCF_SEEN_ACCEPT;
4717                     }
4718                     if (data) {
4719                         if (data_fake.flags & SF_HAS_EVAL)
4720                             data->flags |= SF_HAS_EVAL;
4721                         data->whilem_c = data_fake.whilem_c;
4722                     }
4723                     if (flags & SCF_DO_STCLASS)
4724                         cl_or(pRExC_state, &accum, &this_class);
4725                 }
4726             }
4727             if (flags & SCF_DO_SUBSTR) {
4728                 data->pos_min += min1;
4729                 data->pos_delta += max1 - min1;
4730                 if (max1 != min1 || is_inf)
4731                     data->longest = &(data->longest_float);
4732             }
4733             min += min1;
4734             delta += max1 - min1;
4735             if (flags & SCF_DO_STCLASS_OR) {
4736                 cl_or(pRExC_state, data->start_class, &accum);
4737                 if (min1) {
4738                     cl_and(data->start_class, and_withp);
4739                     flags &= ~SCF_DO_STCLASS;
4740                 }
4741             }
4742             else if (flags & SCF_DO_STCLASS_AND) {
4743                 if (min1) {
4744                     cl_and(data->start_class, &accum);
4745                     flags &= ~SCF_DO_STCLASS;
4746                 }
4747                 else {
4748                     /* Switch to OR mode: cache the old value of
4749                      * data->start_class */
4750                     INIT_AND_WITHP;
4751                     StructCopy(data->start_class, and_withp,
4752                                struct regnode_charclass_class);
4753                     flags &= ~SCF_DO_STCLASS_AND;
4754                     StructCopy(&accum, data->start_class,
4755                                struct regnode_charclass_class);
4756                     flags |= SCF_DO_STCLASS_OR;
4757                     data->start_class->flags |= ANYOF_EOS;
4758                 }
4759             }
4760             scan= tail;
4761             continue;
4762         }
4763 #else
4764         else if (PL_regkind[OP(scan)] == TRIE) {
4765             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4766             U8*bang=NULL;
4767
4768             min += trie->minlen;
4769             delta += (trie->maxlen - trie->minlen);
4770             flags &= ~SCF_DO_STCLASS; /* xxx */
4771             if (flags & SCF_DO_SUBSTR) {
4772                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4773                 data->pos_min += trie->minlen;
4774                 data->pos_delta += (trie->maxlen - trie->minlen);
4775                 if (trie->maxlen != trie->minlen)
4776                     data->longest = &(data->longest_float);
4777             }
4778             if (trie->jump) /* no more substrings -- for now /grr*/
4779                 flags &= ~SCF_DO_SUBSTR;
4780         }
4781 #endif /* old or new */
4782 #endif /* TRIE_STUDY_OPT */
4783
4784         /* Else: zero-length, ignore. */
4785         scan = regnext(scan);
4786     }
4787     if (frame) {
4788         last = frame->last;
4789         scan = frame->next;
4790         stopparen = frame->stop;
4791         frame = frame->prev;
4792         goto fake_study_recurse;
4793     }
4794
4795   finish:
4796     assert(!frame);
4797     DEBUG_STUDYDATA("pre-fin:",data,depth);
4798
4799     *scanp = scan;
4800     *deltap = is_inf_internal ? I32_MAX : delta;
4801     if (flags & SCF_DO_SUBSTR && is_inf)
4802         data->pos_delta = I32_MAX - data->pos_min;
4803     if (is_par > (I32)U8_MAX)
4804         is_par = 0;
4805     if (is_par && pars==1 && data) {
4806         data->flags |= SF_IN_PAR;
4807         data->flags &= ~SF_HAS_PAR;
4808     }
4809     else if (pars && data) {
4810         data->flags |= SF_HAS_PAR;
4811         data->flags &= ~SF_IN_PAR;
4812     }
4813     if (flags & SCF_DO_STCLASS_OR)
4814         cl_and(data->start_class, and_withp);
4815     if (flags & SCF_TRIE_RESTUDY)
4816         data->flags |=  SCF_TRIE_RESTUDY;
4817
4818     DEBUG_STUDYDATA("post-fin:",data,depth);
4819
4820     return min < stopmin ? min : stopmin;
4821 }
4822
4823 STATIC U32
4824 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4825 {
4826     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4827
4828     PERL_ARGS_ASSERT_ADD_DATA;
4829
4830     Renewc(RExC_rxi->data,
4831            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4832            char, struct reg_data);
4833     if(count)
4834         Renew(RExC_rxi->data->what, count + n, U8);
4835     else
4836         Newx(RExC_rxi->data->what, n, U8);
4837     RExC_rxi->data->count = count + n;
4838     Copy(s, RExC_rxi->data->what + count, n, U8);
4839     return count;
4840 }
4841
4842 /*XXX: todo make this not included in a non debugging perl */
4843 #ifndef PERL_IN_XSUB_RE
4844 void
4845 Perl_reginitcolors(pTHX)
4846 {
4847     dVAR;
4848     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4849     if (s) {
4850         char *t = savepv(s);
4851         int i = 0;
4852         PL_colors[0] = t;
4853         while (++i < 6) {
4854             t = strchr(t, '\t');
4855             if (t) {
4856                 *t = '\0';
4857                 PL_colors[i] = ++t;
4858             }
4859             else
4860                 PL_colors[i] = t = (char *)"";
4861         }
4862     } else {
4863         int i = 0;
4864         while (i < 6)
4865             PL_colors[i++] = (char *)"";
4866     }
4867     PL_colorset = 1;
4868 }
4869 #endif
4870
4871
4872 #ifdef TRIE_STUDY_OPT
4873 #define CHECK_RESTUDY_GOTO                                  \
4874         if (                                                \
4875               (data.flags & SCF_TRIE_RESTUDY)               \
4876               && ! restudied++                              \
4877         )     goto reStudy
4878 #else
4879 #define CHECK_RESTUDY_GOTO
4880 #endif
4881
4882 /*
4883  * pregcomp - compile a regular expression into internal code
4884  *
4885  * Decides which engine's compiler to call based on the hint currently in
4886  * scope
4887  */
4888
4889 #ifndef PERL_IN_XSUB_RE
4890
4891 /* return the currently in-scope regex engine (or the default if none)  */
4892
4893 regexp_engine const *
4894 Perl_current_re_engine(pTHX)
4895 {
4896     dVAR;
4897
4898     if (IN_PERL_COMPILETIME) {
4899         HV * const table = GvHV(PL_hintgv);
4900         SV **ptr;
4901
4902         if (!table)
4903             return &PL_core_reg_engine;
4904         ptr = hv_fetchs(table, "regcomp", FALSE);
4905         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4906             return &PL_core_reg_engine;
4907         return INT2PTR(regexp_engine*,SvIV(*ptr));
4908     }
4909     else {
4910         SV *ptr;
4911         if (!PL_curcop->cop_hints_hash)
4912             return &PL_core_reg_engine;
4913         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4914         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4915             return &PL_core_reg_engine;
4916         return INT2PTR(regexp_engine*,SvIV(ptr));
4917     }
4918 }
4919
4920
4921 REGEXP *
4922 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4923 {
4924     dVAR;
4925     regexp_engine const *eng = current_re_engine();
4926     GET_RE_DEBUG_FLAGS_DECL;
4927
4928     PERL_ARGS_ASSERT_PREGCOMP;
4929
4930     /* Dispatch a request to compile a regexp to correct regexp engine. */
4931     DEBUG_COMPILE_r({
4932         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4933                         PTR2UV(eng));
4934     });
4935     return CALLREGCOMP_ENG(eng, pattern, flags);
4936 }
4937 #endif
4938
4939 /* public(ish) entry point for the perl core's own regex compiling code.
4940  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4941  * pattern rather than a list of OPs, and uses the internal engine rather
4942  * than the current one */
4943
4944 REGEXP *
4945 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4946 {
4947     SV *pat = pattern; /* defeat constness! */
4948     PERL_ARGS_ASSERT_RE_COMPILE;
4949     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4950 #ifdef PERL_IN_XSUB_RE
4951                                 &my_reg_engine,
4952 #else
4953                                 &PL_core_reg_engine,
4954 #endif
4955                                 NULL, NULL, rx_flags, 0);
4956 }
4957
4958 /* see if there are any run-time code blocks in the pattern.
4959  * False positives are allowed */
4960
4961 static bool
4962 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4963                     U32 pm_flags, char *pat, STRLEN plen)
4964 {
4965     int n = 0;
4966     STRLEN s;
4967
4968     /* avoid infinitely recursing when we recompile the pattern parcelled up
4969      * as qr'...'. A single constant qr// string can't have have any
4970      * run-time component in it, and thus, no runtime code. (A non-qr
4971      * string, however, can, e.g. $x =~ '(?{})') */
4972     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4973         return 0;
4974
4975     for (s = 0; s < plen; s++) {
4976         if (n < pRExC_state->num_code_blocks
4977             && s == pRExC_state->code_blocks[n].start)
4978         {
4979             s = pRExC_state->code_blocks[n].end;
4980             n++;
4981             continue;
4982         }
4983         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4984          * positives here */
4985         if (pat[s] == '(' && pat[s+1] == '?' &&
4986             (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4987         )
4988             return 1;
4989     }
4990     return 0;
4991 }
4992
4993 /* Handle run-time code blocks. We will already have compiled any direct
4994  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4995  * copy of it, but with any literal code blocks blanked out and
4996  * appropriate chars escaped; then feed it into
4997  *
4998  *    eval "qr'modified_pattern'"
4999  *
5000  * For example,
5001  *
5002  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5003  *
5004  * becomes
5005  *
5006  *    qr'a\\bc                       def\'ghi\\\\jkl(?{"this is runtime"})mno'
5007  *
5008  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5009  * and merge them with any code blocks of the original regexp.
5010  *
5011  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5012  * instead, just save the qr and return FALSE; this tells our caller that
5013  * the original pattern needs upgrading to utf8.
5014  */
5015
5016 static bool
5017 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5018     char *pat, STRLEN plen)
5019 {
5020     SV *qr;
5021
5022     GET_RE_DEBUG_FLAGS_DECL;
5023
5024     if (pRExC_state->runtime_code_qr) {
5025         /* this is the second time we've been called; this should
5026          * only happen if the main pattern got upgraded to utf8
5027          * during compilation; re-use the qr we compiled first time
5028          * round (which should be utf8 too)
5029          */
5030         qr = pRExC_state->runtime_code_qr;
5031         pRExC_state->runtime_code_qr = NULL;
5032         assert(RExC_utf8 && SvUTF8(qr));
5033     }
5034     else {
5035         int n = 0;
5036         STRLEN s;
5037         char *p, *newpat;
5038         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5039         SV *sv, *qr_ref;
5040         dSP;
5041
5042         /* determine how many extra chars we need for ' and \ escaping */
5043         for (s = 0; s < plen; s++) {
5044             if (pat[s] == '\'' || pat[s] == '\\')
5045                 newlen++;
5046         }
5047
5048         Newx(newpat, newlen, char);
5049         p = newpat;
5050         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5051
5052         for (s = 0; s < plen; s++) {
5053             if (n < pRExC_state->num_code_blocks
5054                 && s == pRExC_state->code_blocks[n].start)
5055             {
5056                 /* blank out literal code block */
5057                 assert(pat[s] == '(');
5058                 while (s <= pRExC_state->code_blocks[n].end) {
5059                     *p++ = ' ';
5060                     s++;
5061                 }
5062                 s--;
5063                 n++;
5064                 continue;
5065             }
5066             if (pat[s] == '\'' || pat[s] == '\\')
5067                 *p++ = '\\';
5068             *p++ = pat[s];
5069         }
5070         *p++ = '\'';
5071         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5072             *p++ = 'x';
5073         *p++ = '\0';
5074         DEBUG_COMPILE_r({
5075             PerlIO_printf(Perl_debug_log,
5076                 "%sre-parsing pattern for runtime code:%s %s\n",
5077                 PL_colors[4],PL_colors[5],newpat);
5078         });
5079
5080         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5081         Safefree(newpat);
5082
5083         ENTER;
5084         SAVETMPS;
5085         save_re_context();
5086         PUSHSTACKi(PERLSI_REQUIRE);
5087         /* this causes the toker to collapse \\ into \ when parsing
5088          * qr''; normally only q'' does this. It also alters hints
5089          * handling */
5090         PL_reg_state.re_reparsing = TRUE;
5091         eval_sv(sv, G_SCALAR);
5092         SvREFCNT_dec(sv);
5093         SPAGAIN;
5094         qr_ref = POPs;
5095         PUTBACK;
5096         if (SvTRUE(ERRSV))
5097             Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5098         assert(SvROK(qr_ref));
5099         qr = SvRV(qr_ref);
5100         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5101         /* the leaving below frees the tmp qr_ref.
5102          * Give qr a life of its own */
5103         SvREFCNT_inc(qr);
5104         POPSTACK;
5105         FREETMPS;
5106         LEAVE;
5107
5108     }
5109
5110     if (!RExC_utf8 && SvUTF8(qr)) {
5111         /* first time through; the pattern got upgraded; save the
5112          * qr for the next time through */
5113         assert(!pRExC_state->runtime_code_qr);
5114         pRExC_state->runtime_code_qr = qr;
5115         return 0;
5116     }
5117
5118
5119     /* extract any code blocks within the returned qr//  */
5120
5121
5122     /* merge the main (r1) and run-time (r2) code blocks into one */
5123     {
5124         RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5125         struct reg_code_block *new_block, *dst;
5126         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5127         int i1 = 0, i2 = 0;
5128
5129         if (!r2->num_code_blocks) /* we guessed wrong */
5130             return 1;
5131
5132         Newx(new_block,
5133             r1->num_code_blocks + r2->num_code_blocks,
5134             struct reg_code_block);
5135         dst = new_block;
5136
5137         while (    i1 < r1->num_code_blocks
5138                 || i2 < r2->num_code_blocks)
5139         {
5140             struct reg_code_block *src;
5141             bool is_qr = 0;
5142
5143             if (i1 == r1->num_code_blocks) {
5144                 src = &r2->code_blocks[i2++];
5145                 is_qr = 1;
5146             }
5147             else if (i2 == r2->num_code_blocks)
5148                 src = &r1->code_blocks[i1++];
5149             else if (  r1->code_blocks[i1].start
5150                      < r2->code_blocks[i2].start)
5151             {
5152                 src = &r1->code_blocks[i1++];
5153                 assert(src->end < r2->code_blocks[i2].start);
5154             }
5155             else {
5156                 assert(  r1->code_blocks[i1].start
5157                        > r2->code_blocks[i2].start);
5158                 src = &r2->code_blocks[i2++];
5159                 is_qr = 1;
5160                 assert(src->end < r1->code_blocks[i1].start);
5161             }
5162
5163             assert(pat[src->start] == '(');
5164             assert(pat[src->end]   == ')');
5165             dst->start      = src->start;
5166             dst->end        = src->end;
5167             dst->block      = src->block;
5168             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5169                                     : src->src_regex;
5170             dst++;
5171         }
5172         r1->num_code_blocks += r2->num_code_blocks;
5173         Safefree(r1->code_blocks);
5174         r1->code_blocks = new_block;
5175     }
5176
5177     SvREFCNT_dec(qr);
5178     return 1;
5179 }
5180
5181
5182 STATIC bool
5183 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5184 {
5185     /* This is the common code for setting up the floating and fixed length
5186      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5187      * as to whether succeeded or not */
5188
5189     I32 t,ml;
5190
5191     if (! (longest_length
5192            || (eol /* Can't have SEOL and MULTI */
5193                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5194           )
5195             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5196         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5197     {
5198         return FALSE;
5199     }
5200
5201     /* copy the information about the longest from the reg_scan_data
5202         over to the program. */
5203     if (SvUTF8(sv_longest)) {
5204         *rx_utf8 = sv_longest;
5205         *rx_substr = NULL;
5206     } else {
5207         *rx_substr = sv_longest;
5208         *rx_utf8 = NULL;
5209     }
5210     /* end_shift is how many chars that must be matched that
5211         follow this item. We calculate it ahead of time as once the
5212         lookbehind offset is added in we lose the ability to correctly
5213         calculate it.*/
5214     ml = minlen ? *(minlen) : (I32)longest_length;
5215     *rx_end_shift = ml - offset
5216         - longest_length + (SvTAIL(sv_longest) != 0)
5217         + lookbehind;
5218
5219     t = (eol/* Can't have SEOL and MULTI */
5220          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5221     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5222
5223     return TRUE;
5224 }
5225
5226 /*
5227  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5228  * regular expression into internal code.
5229  * The pattern may be passed either as:
5230  *    a list of SVs (patternp plus pat_count)
5231  *    a list of OPs (expr)
5232  * If both are passed, the SV list is used, but the OP list indicates
5233  * which SVs are actually pre-compiled code blocks
5234  *
5235  * The SVs in the list have magic and qr overloading applied to them (and
5236  * the list may be modified in-place with replacement SVs in the latter
5237  * case).
5238  *
5239  * If the pattern hasn't changed from old_re, then old_re will be
5240  * returned.
5241  *
5242  * eng is the current engine. If that engine has an op_comp method, then
5243  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5244  * do the initial concatenation of arguments and pass on to the external
5245  * engine.
5246  *
5247  * If is_bare_re is not null, set it to a boolean indicating whether the
5248  * arg list reduced (after overloading) to a single bare regex which has
5249  * been returned (i.e. /$qr/).
5250  *
5251  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5252  *
5253  * pm_flags contains the PMf_* flags, typically based on those from the
5254  * pm_flags field of the related PMOP. Currently we're only interested in
5255  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5256  *
5257  * We can't allocate space until we know how big the compiled form will be,
5258  * but we can't compile it (and thus know how big it is) until we've got a
5259  * place to put the code.  So we cheat:  we compile it twice, once with code
5260  * generation turned off and size counting turned on, and once "for real".
5261  * This also means that we don't allocate space until we are sure that the
5262  * thing really will compile successfully, and we never have to move the
5263  * code and thus invalidate pointers into it.  (Note that it has to be in
5264  * one piece because free() must be able to free it all.) [NB: not true in perl]
5265  *
5266  * Beware that the optimization-preparation code in here knows about some
5267  * of the structure of the compiled regexp.  [I'll say.]
5268  */
5269
5270 REGEXP *
5271 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5272                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5273                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5274 {
5275     dVAR;
5276     REGEXP *rx;
5277     struct regexp *r;
5278     regexp_internal *ri;
5279     STRLEN plen;
5280     char  * VOL exp;
5281     char* xend;
5282     regnode *scan;
5283     I32 flags;
5284     I32 minlen = 0;
5285     U32 rx_flags;
5286     SV * VOL pat;
5287
5288     /* these are all flags - maybe they should be turned
5289      * into a single int with different bit masks */
5290     I32 sawlookahead = 0;
5291     I32 sawplus = 0;
5292     I32 sawopen = 0;
5293     bool used_setjump = FALSE;
5294     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5295     bool code_is_utf8 = 0;
5296     bool VOL recompile = 0;
5297     bool runtime_code = 0;
5298     U8 jump_ret = 0;
5299     dJMPENV;
5300     scan_data_t data;
5301     RExC_state_t RExC_state;
5302     RExC_state_t * const pRExC_state = &RExC_state;
5303 #ifdef TRIE_STUDY_OPT
5304     int restudied;
5305     RExC_state_t copyRExC_state;
5306 #endif
5307     GET_RE_DEBUG_FLAGS_DECL;
5308
5309     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5310
5311     DEBUG_r(if (!PL_colorset) reginitcolors());
5312
5313 #ifndef PERL_IN_XSUB_RE
5314     /* Initialize these here instead of as-needed, as is quick and avoids
5315      * having to test them each time otherwise */
5316     if (! PL_AboveLatin1) {
5317         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5318         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5319         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5320
5321         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5322         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5323
5324         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5325         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5326
5327         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5328         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5329
5330         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5331
5332         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5333         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5334
5335         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5336
5337         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5338         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5339
5340         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5341         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5342
5343         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5344         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5345
5346         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5347         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5348
5349         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5350         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5351
5352         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5353         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5354
5355         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5356         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5357
5358         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5359
5360         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5361         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5362
5363         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5364         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5365
5366         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5367     }
5368 #endif
5369
5370     pRExC_state->code_blocks = NULL;
5371     pRExC_state->num_code_blocks = 0;
5372
5373     if (is_bare_re)
5374         *is_bare_re = FALSE;
5375
5376     if (expr && (expr->op_type == OP_LIST ||
5377                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5378
5379         /* is the source UTF8, and how many code blocks are there? */
5380         OP *o;
5381         int ncode = 0;
5382
5383         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5384             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5385                 code_is_utf8 = 1;
5386             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5387                 /* count of DO blocks */
5388                 ncode++;
5389         }
5390         if (ncode) {
5391             pRExC_state->num_code_blocks = ncode;
5392             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5393         }
5394     }
5395
5396     if (pat_count) {
5397         /* handle a list of SVs */
5398
5399         SV **svp;
5400
5401         /* apply magic and RE overloading to each arg */
5402         for (svp = patternp; svp < patternp + pat_count; svp++) {
5403             SV *rx = *svp;
5404             SvGETMAGIC(rx);
5405             if (SvROK(rx) && SvAMAGIC(rx)) {
5406                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5407                 if (sv) {
5408                     if (SvROK(sv))
5409                         sv = SvRV(sv);
5410                     if (SvTYPE(sv) != SVt_REGEXP)
5411                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5412                     *svp = sv;
5413                 }
5414             }
5415         }
5416
5417         if (pat_count > 1) {
5418             /* concat multiple args and find any code block indexes */
5419
5420             OP *o = NULL;
5421             int n = 0;
5422             bool utf8 = 0;
5423             STRLEN orig_patlen = 0;
5424
5425             if (pRExC_state->num_code_blocks) {
5426                 o = cLISTOPx(expr)->op_first;
5427                 assert(o->op_type == OP_PUSHMARK);
5428                 o = o->op_sibling;
5429             }
5430
5431             pat = newSVpvn("", 0);
5432             SAVEFREESV(pat);
5433
5434             /* determine if the pattern is going to be utf8 (needed
5435              * in advance to align code block indices correctly).
5436              * XXX This could fail to be detected for an arg with
5437              * overloading but not concat overloading; but the main effect
5438              * in this obscure case is to need a 'use re eval' for a
5439              * literal code block */
5440             for (svp = patternp; svp < patternp + pat_count; svp++) {
5441                 if (SvUTF8(*svp))
5442                     utf8 = 1;
5443             }
5444             if (utf8)
5445                 SvUTF8_on(pat);
5446
5447             for (svp = patternp; svp < patternp + pat_count; svp++) {
5448                 SV *sv, *msv = *svp;
5449                 SV *rx;
5450                 bool code = 0;
5451                 if (o) {
5452                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5453                         assert(n < pRExC_state->num_code_blocks);
5454                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5455                         pRExC_state->code_blocks[n].block = o;
5456                         pRExC_state->code_blocks[n].src_regex = NULL;
5457                         n++;
5458                         code = 1;
5459                         o = o->op_sibling; /* skip CONST */
5460                         assert(o);
5461                     }
5462                     o = o->op_sibling;;
5463                 }
5464
5465                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5466                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5467                 {
5468                     sv_setsv(pat, sv);
5469                     /* overloading involved: all bets are off over literal
5470                      * code. Pretend we haven't seen it */
5471                     pRExC_state->num_code_blocks -= n;
5472                     n = 0;
5473                     rx = NULL;
5474
5475                 }
5476                 else  {
5477                     while (SvAMAGIC(msv)
5478                             && (sv = AMG_CALLunary(msv, string_amg))
5479                             && sv != msv
5480                             &&  !(   SvROK(msv)
5481                                   && SvROK(sv)
5482                                   && SvRV(msv) == SvRV(sv))
5483                     ) {
5484                         msv = sv;
5485                         SvGETMAGIC(msv);
5486                     }
5487                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5488                         msv = SvRV(msv);
5489                     orig_patlen = SvCUR(pat);
5490                     sv_catsv_nomg(pat, msv);
5491                     rx = msv;
5492                     if (code)
5493                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5494                 }
5495
5496                 /* extract any code blocks within any embedded qr//'s */
5497                 if (rx && SvTYPE(rx) == SVt_REGEXP
5498                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5499                 {
5500
5501                     RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5502                     if (ri->num_code_blocks) {
5503                         int i;
5504                         /* the presence of an embedded qr// with code means
5505                          * we should always recompile: the text of the
5506                          * qr// may not have changed, but it may be a
5507                          * different closure than last time */
5508                         recompile = 1;
5509                         Renew(pRExC_state->code_blocks,
5510                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5511                             struct reg_code_block);
5512                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5513                         for (i=0; i < ri->num_code_blocks; i++) {
5514                             struct reg_code_block *src, *dst;
5515                             STRLEN offset =  orig_patlen
5516                                 + ((struct regexp *)SvANY(rx))->pre_prefix;
5517                             assert(n < pRExC_state->num_code_blocks);
5518                             src = &ri->code_blocks[i];
5519                             dst = &pRExC_state->code_blocks[n];
5520                             dst->start      = src->start + offset;
5521                             dst->end        = src->end   + offset;
5522                             dst->block      = src->block;
5523                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5524                                                     src->src_regex
5525                                                         ? src->src_regex
5526                                                         : (REGEXP*)rx);
5527                             n++;
5528                         }
5529                     }
5530                 }
5531             }
5532             SvSETMAGIC(pat);
5533         }
5534         else {
5535             SV *sv;
5536             pat = *patternp;
5537             while (SvAMAGIC(pat)
5538                     && (sv = AMG_CALLunary(pat, string_amg))
5539                     && sv != pat)
5540             {
5541                 pat = sv;
5542                 SvGETMAGIC(pat);
5543             }
5544         }
5545
5546         /* handle bare regex: foo =~ $re */
5547         {
5548             SV *re = pat;
5549             if (SvROK(re))
5550                 re = SvRV(re);
5551             if (SvTYPE(re) == SVt_REGEXP) {
5552                 if (is_bare_re)
5553                     *is_bare_re = TRUE;
5554                 SvREFCNT_inc(re);
5555                 Safefree(pRExC_state->code_blocks);
5556                 return (REGEXP*)re;
5557             }
5558         }
5559     }
5560     else {
5561         /* not a list of SVs, so must be a list of OPs */
5562         assert(expr);
5563         if (expr->op_type == OP_LIST) {
5564             int i = -1;
5565             bool is_code = 0;
5566             OP *o;
5567
5568             pat = newSVpvn("", 0);
5569             SAVEFREESV(pat);
5570             if (code_is_utf8)
5571                 SvUTF8_on(pat);
5572
5573             /* given a list of CONSTs and DO blocks in expr, append all
5574              * the CONSTs to pat, and record the start and end of each
5575              * code block in code_blocks[] (each DO{} op is followed by an
5576              * OP_CONST containing the corresponding literal '(?{...})
5577              * text)
5578              */
5579             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5580                 if (o->op_type == OP_CONST) {
5581                     sv_catsv(pat, cSVOPo_sv);
5582                     if (is_code) {
5583                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5584                         is_code = 0;
5585                     }
5586                 }
5587                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5588                     assert(i+1 < pRExC_state->num_code_blocks);
5589                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5590                     pRExC_state->code_blocks[i].block = o;
5591                     pRExC_state->code_blocks[i].src_regex = NULL;
5592                     is_code = 1;
5593                 }
5594             }
5595         }
5596         else {
5597             assert(expr->op_type == OP_CONST);
5598             pat = cSVOPx_sv(expr);
5599         }
5600     }
5601
5602     exp = SvPV_nomg(pat, plen);
5603
5604     if (!eng->op_comp) {
5605         if ((SvUTF8(pat) && IN_BYTES)
5606                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5607         {
5608             /* make a temporary copy; either to convert to bytes,
5609              * or to avoid repeating get-magic / overloaded stringify */
5610             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5611                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5612         }
5613         Safefree(pRExC_state->code_blocks);
5614         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5615     }
5616
5617     /* ignore the utf8ness if the pattern is 0 length */
5618     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5619     RExC_uni_semantics = 0;
5620     RExC_contains_locale = 0;
5621     pRExC_state->runtime_code_qr = NULL;
5622
5623     /****************** LONG JUMP TARGET HERE***********************/
5624     /* Longjmp back to here if have to switch in midstream to utf8 */
5625     if (! RExC_orig_utf8) {
5626         JMPENV_PUSH(jump_ret);
5627         used_setjump = TRUE;
5628     }
5629
5630     if (jump_ret == 0) {    /* First time through */
5631         xend = exp + plen;
5632
5633         DEBUG_COMPILE_r({
5634             SV *dsv= sv_newmortal();
5635             RE_PV_QUOTED_DECL(s, RExC_utf8,
5636                 dsv, exp, plen, 60);
5637             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5638                            PL_colors[4],PL_colors[5],s);
5639         });
5640     }
5641     else {  /* longjumped back */
5642         U8 *src, *dst;
5643         int n=0;
5644         STRLEN s = 0, d = 0;
5645         bool do_end = 0;
5646
5647         /* If the cause for the longjmp was other than changing to utf8, pop
5648          * our own setjmp, and longjmp to the correct handler */
5649         if (jump_ret != UTF8_LONGJMP) {
5650             JMPENV_POP;
5651             JMPENV_JUMP(jump_ret);
5652         }
5653
5654         GET_RE_DEBUG_FLAGS;
5655
5656         /* It's possible to write a regexp in ascii that represents Unicode
5657         codepoints outside of the byte range, such as via \x{100}. If we
5658         detect such a sequence we have to convert the entire pattern to utf8
5659         and then recompile, as our sizing calculation will have been based
5660         on 1 byte == 1 character, but we will need to use utf8 to encode
5661         at least some part of the pattern, and therefore must convert the whole
5662         thing.
5663         -- dmq */
5664         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5665             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5666
5667         /* upgrade pattern to UTF8, and if there are code blocks,
5668          * recalculate the indices.
5669          * This is essentially an unrolled Perl_bytes_to_utf8() */
5670
5671         src = (U8*)SvPV_nomg(pat, plen);
5672         Newx(dst, plen * 2 + 1, U8);
5673
5674         while (s < plen) {
5675             const UV uv = NATIVE_TO_ASCII(src[s]);
5676             if (UNI_IS_INVARIANT(uv))
5677                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5678             else {
5679                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5680                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5681             }
5682             if (n < pRExC_state->num_code_blocks) {
5683                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5684                     pRExC_state->code_blocks[n].start = d;
5685                     assert(dst[d] == '(');
5686                     do_end = 1;
5687                 }
5688                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5689                     pRExC_state->code_blocks[n].end = d;
5690                     assert(dst[d] == ')');
5691                     do_end = 0;
5692                     n++;
5693                 }
5694             }
5695             s++;
5696             d++;
5697         }
5698         dst[d] = '\0';
5699         plen = d;
5700         exp = (char*) dst;
5701         xend = exp + plen;
5702         SAVEFREEPV(exp);
5703         RExC_orig_utf8 = RExC_utf8 = 1;
5704     }
5705
5706     /* return old regex if pattern hasn't changed */
5707
5708     if (   old_re
5709         && !recompile
5710         && !!RX_UTF8(old_re) == !!RExC_utf8
5711         && RX_PRECOMP(old_re)
5712         && RX_PRELEN(old_re) == plen
5713         && memEQ(RX_PRECOMP(old_re), exp, plen))
5714     {
5715         /* with runtime code, always recompile */
5716         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5717                                             exp, plen);
5718         if (!runtime_code) {
5719             if (used_setjump) {
5720                 JMPENV_POP;
5721             }
5722             Safefree(pRExC_state->code_blocks);
5723             return old_re;
5724         }
5725     }
5726     else if ((pm_flags & PMf_USE_RE_EVAL)
5727                 /* this second condition covers the non-regex literal case,
5728                  * i.e.  $foo =~ '(?{})'. */
5729                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5730                     && (PL_hints & HINT_RE_EVAL))
5731     )
5732         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5733                             exp, plen);
5734
5735 #ifdef TRIE_STUDY_OPT
5736     restudied = 0;
5737 #endif
5738
5739     rx_flags = orig_rx_flags;
5740
5741     if (initial_charset == REGEX_LOCALE_CHARSET) {
5742         RExC_contains_locale = 1;
5743     }
5744     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5745
5746         /* Set to use unicode semantics if the pattern is in utf8 and has the
5747          * 'depends' charset specified, as it means unicode when utf8  */
5748         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5749     }
5750
5751     RExC_precomp = exp;
5752     RExC_flags = rx_flags;
5753     RExC_pm_flags = pm_flags;
5754
5755     if (runtime_code) {
5756         if (PL_tainting && PL_tainted)
5757             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5758
5759         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5760             /* whoops, we have a non-utf8 pattern, whilst run-time code
5761              * got compiled as utf8. Try again with a utf8 pattern */
5762              JMPENV_JUMP(UTF8_LONGJMP);
5763         }
5764     }
5765     assert(!pRExC_state->runtime_code_qr);
5766
5767     RExC_sawback = 0;
5768
5769     RExC_seen = 0;
5770     RExC_in_lookbehind = 0;
5771     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5772     RExC_extralen = 0;
5773     RExC_override_recoding = 0;
5774     RExC_in_multi_char_class = 0;
5775
5776     /* First pass: determine size, legality. */
5777     RExC_parse = exp;
5778     RExC_start = exp;
5779     RExC_end = xend;
5780     RExC_naughty = 0;
5781     RExC_npar = 1;
5782     RExC_nestroot = 0;
5783     RExC_size = 0L;
5784     RExC_emit = &PL_regdummy;
5785     RExC_whilem_seen = 0;
5786     RExC_open_parens = NULL;
5787     RExC_close_parens = NULL;
5788     RExC_opend = NULL;
5789     RExC_paren_names = NULL;
5790 #ifdef DEBUGGING
5791     RExC_paren_name_list = NULL;
5792 #endif
5793     RExC_recurse = NULL;
5794     RExC_recurse_count = 0;
5795     pRExC_state->code_index = 0;
5796
5797 #if 0 /* REGC() is (currently) a NOP at the first pass.
5798        * Clever compilers notice this and complain. --jhi */
5799     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5800 #endif
5801     DEBUG_PARSE_r(
5802         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5803         RExC_lastnum=0;
5804         RExC_lastparse=NULL;
5805     );
5806     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5807         RExC_precomp = NULL;
5808         Safefree(pRExC_state->code_blocks);
5809         return(NULL);
5810     }
5811
5812     /* Here, finished first pass.  Get rid of any added setjmp */
5813     if (used_setjump) {
5814         JMPENV_POP;
5815     }
5816
5817     DEBUG_PARSE_r({
5818         PerlIO_printf(Perl_debug_log,
5819             "Required size %"IVdf" nodes\n"
5820             "Starting second pass (creation)\n",
5821             (IV)RExC_size);
5822         RExC_lastnum=0;
5823         RExC_lastparse=NULL;
5824     });
5825
5826     /* The first pass could have found things that force Unicode semantics */
5827     if ((RExC_utf8 || RExC_uni_semantics)
5828          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5829     {
5830         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5831     }
5832
5833     /* Small enough for pointer-storage convention?
5834        If extralen==0, this means that we will not need long jumps. */
5835     if (RExC_size >= 0x10000L && RExC_extralen)
5836         RExC_size += RExC_extralen;
5837     else
5838         RExC_extralen = 0;
5839     if (RExC_whilem_seen > 15)
5840         RExC_whilem_seen = 15;
5841
5842     /* Allocate space and zero-initialize. Note, the two step process
5843        of zeroing when in debug mode, thus anything assigned has to
5844        happen after that */
5845     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5846     r = (struct regexp*)SvANY(rx);
5847     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5848          char, regexp_internal);
5849     if ( r == NULL || ri == NULL )
5850         FAIL("Regexp out of space");
5851 #ifdef DEBUGGING
5852     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5853     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5854 #else
5855     /* bulk initialize base fields with 0. */
5856     Zero(ri, sizeof(regexp_internal), char);
5857 #endif
5858
5859     /* non-zero initialization begins here */
5860     RXi_SET( r, ri );
5861     r->engine= eng;
5862     r->extflags = rx_flags;
5863     if (pm_flags & PMf_IS_QR) {
5864         ri->code_blocks = pRExC_state->code_blocks;
5865         ri->num_code_blocks = pRExC_state->num_code_blocks;
5866     }
5867     else
5868         SAVEFREEPV(pRExC_state->code_blocks);
5869
5870     {
5871         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5872         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5873
5874         /* The caret is output if there are any defaults: if not all the STD
5875          * flags are set, or if no character set specifier is needed */
5876         bool has_default =
5877                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5878                     || ! has_charset);
5879         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5880         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5881                             >> RXf_PMf_STD_PMMOD_SHIFT);
5882         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5883         char *p;
5884         /* Allocate for the worst case, which is all the std flags are turned
5885          * on.  If more precision is desired, we could do a population count of
5886          * the flags set.  This could be done with a small lookup table, or by
5887          * shifting, masking and adding, or even, when available, assembly
5888          * language for a machine-language population count.
5889          * We never output a minus, as all those are defaults, so are
5890          * covered by the caret */
5891         const STRLEN wraplen = plen + has_p + has_runon
5892             + has_default       /* If needs a caret */
5893
5894                 /* If needs a character set specifier */
5895             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5896             + (sizeof(STD_PAT_MODS) - 1)
5897             + (sizeof("(?:)") - 1);
5898
5899         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5900         SvPOK_on(rx);
5901         if (RExC_utf8)
5902             SvFLAGS(rx) |= SVf_UTF8;
5903         *p++='('; *p++='?';
5904
5905         /* If a default, cover it using the caret */
5906         if (has_default) {
5907             *p++= DEFAULT_PAT_MOD;
5908         }
5909         if (has_charset) {
5910             STRLEN len;
5911             const char* const name = get_regex_charset_name(r->extflags, &len);
5912             Copy(name, p, len, char);
5913             p += len;
5914         }
5915         if (has_p)
5916             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5917         {
5918             char ch;
5919             while((ch = *fptr++)) {
5920                 if(reganch & 1)
5921                     *p++ = ch;
5922                 reganch >>= 1;
5923             }
5924         }
5925
5926         *p++ = ':';
5927         Copy(RExC_precomp, p, plen, char);
5928         assert ((RX_WRAPPED(rx) - p) < 16);
5929         r->pre_prefix = p - RX_WRAPPED(rx);
5930         p += plen;
5931         if (has_runon)
5932             *p++ = '\n';
5933         *p++ = ')';
5934         *p = 0;
5935         SvCUR_set(rx, p - SvPVX_const(rx));
5936     }
5937
5938     r->intflags = 0;
5939     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5940
5941     if (RExC_seen & REG_SEEN_RECURSE) {
5942         Newxz(RExC_open_parens, RExC_npar,regnode *);
5943         SAVEFREEPV(RExC_open_parens);
5944         Newxz(RExC_close_parens,RExC_npar,regnode *);
5945         SAVEFREEPV(RExC_close_parens);
5946     }
5947
5948     /* Useful during FAIL. */
5949 #ifdef RE_TRACK_PATTERN_OFFSETS
5950     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5951     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5952                           "%s %"UVuf" bytes for offset annotations.\n",
5953                           ri->u.offsets ? "Got" : "Couldn't get",
5954                           (UV)((2*RExC_size+1) * sizeof(U32))));
5955 #endif
5956     SetProgLen(ri,RExC_size);
5957     RExC_rx_sv = rx;
5958     RExC_rx = r;
5959     RExC_rxi = ri;
5960
5961     /* Second pass: emit code. */
5962     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5963     RExC_pm_flags = pm_flags;
5964     RExC_parse = exp;
5965     RExC_end = xend;
5966     RExC_naughty = 0;
5967     RExC_npar = 1;
5968     RExC_emit_start = ri->program;
5969     RExC_emit = ri->program;
5970     RExC_emit_bound = ri->program + RExC_size + 1;
5971     pRExC_state->code_index = 0;
5972
5973     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5974     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5975         ReREFCNT_dec(rx);
5976         return(NULL);
5977     }
5978     /* XXXX To minimize changes to RE engine we always allocate
5979        3-units-long substrs field. */
5980     Newx(r->substrs, 1, struct reg_substr_data);
5981     if (RExC_recurse_count) {
5982         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5983         SAVEFREEPV(RExC_recurse);
5984     }
5985
5986 reStudy:
5987     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5988     Zero(r->substrs, 1, struct reg_substr_data);
5989
5990 #ifdef TRIE_STUDY_OPT
5991     if (!restudied) {
5992         StructCopy(&zero_scan_data, &data, scan_data_t);
5993         copyRExC_state = RExC_state;
5994     } else {
5995         U32 seen=RExC_seen;
5996         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5997
5998         RExC_state = copyRExC_state;
5999         if (seen & REG_TOP_LEVEL_BRANCHES)
6000             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6001         else
6002             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6003         if (data.last_found) {
6004             SvREFCNT_dec(data.longest_fixed);
6005             SvREFCNT_dec(data.longest_float);
6006             SvREFCNT_dec(data.last_found);
6007         }
6008         StructCopy(&zero_scan_data, &data, scan_data_t);
6009     }
6010 #else
6011     StructCopy(&zero_scan_data, &data, scan_data_t);
6012 #endif
6013
6014     /* Dig out information for optimizations. */
6015     r->extflags = RExC_flags; /* was pm_op */
6016     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6017
6018     if (UTF)
6019         SvUTF8_on(rx);  /* Unicode in it? */
6020     ri->regstclass = NULL;
6021     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6022         r->intflags |= PREGf_NAUGHTY;
6023     scan = ri->program + 1;             /* First BRANCH. */
6024
6025     /* testing for BRANCH here tells us whether there is "must appear"
6026        data in the pattern. If there is then we can use it for optimisations */
6027     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6028         I32 fake;
6029         STRLEN longest_float_length, longest_fixed_length;
6030         struct regnode_charclass_class ch_class; /* pointed to by data */
6031         int stclass_flag;
6032         I32 last_close = 0; /* pointed to by data */
6033         regnode *first= scan;
6034         regnode *first_next= regnext(first);
6035         /*
6036          * Skip introductions and multiplicators >= 1
6037          * so that we can extract the 'meat' of the pattern that must
6038          * match in the large if() sequence following.
6039          * NOTE that EXACT is NOT covered here, as it is normally
6040          * picked up by the optimiser separately.
6041          *
6042          * This is unfortunate as the optimiser isnt handling lookahead
6043          * properly currently.
6044          *
6045          */
6046         while ((OP(first) == OPEN && (sawopen = 1)) ||
6047                /* An OR of *one* alternative - should not happen now. */
6048             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6049             /* for now we can't handle lookbehind IFMATCH*/
6050             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6051             (OP(first) == PLUS) ||
6052             (OP(first) == MINMOD) ||
6053                /* An {n,m} with n>0 */
6054             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6055             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6056         {
6057                 /*
6058                  * the only op that could be a regnode is PLUS, all the rest
6059                  * will be regnode_1 or regnode_2.
6060                  *
6061                  */
6062                 if (OP(first) == PLUS)
6063                     sawplus = 1;
6064                 else
6065                     first += regarglen[OP(first)];
6066
6067                 first = NEXTOPER(first);
6068                 first_next= regnext(first);
6069         }
6070
6071         /* Starting-point info. */
6072       again:
6073         DEBUG_PEEP("first:",first,0);
6074         /* Ignore EXACT as we deal with it later. */
6075         if (PL_regkind[OP(first)] == EXACT) {
6076             if (OP(first) == EXACT)
6077                 NOOP;   /* Empty, get anchored substr later. */
6078             else
6079                 ri->regstclass = first;
6080         }
6081 #ifdef TRIE_STCLASS
6082         else if (PL_regkind[OP(first)] == TRIE &&
6083                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6084         {
6085             regnode *trie_op;
6086             /* this can happen only on restudy */
6087             if ( OP(first) == TRIE ) {
6088                 struct regnode_1 *trieop = (struct regnode_1 *)
6089                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6090                 StructCopy(first,trieop,struct regnode_1);
6091                 trie_op=(regnode *)trieop;
6092             } else {
6093                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6094                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6095                 StructCopy(first,trieop,struct regnode_charclass);
6096                 trie_op=(regnode *)trieop;
6097             }
6098             OP(trie_op)+=2;
6099             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6100             ri->regstclass = trie_op;
6101         }
6102 #endif
6103         else if (REGNODE_SIMPLE(OP(first)))
6104             ri->regstclass = first;
6105         else if (PL_regkind[OP(first)] == BOUND ||
6106                  PL_regkind[OP(first)] == NBOUND)
6107             ri->regstclass = first;
6108         else if (PL_regkind[OP(first)] == BOL) {
6109             r->extflags |= (OP(first) == MBOL
6110                            ? RXf_ANCH_MBOL
6111                            : (OP(first) == SBOL
6112                               ? RXf_ANCH_SBOL
6113                               : RXf_ANCH_BOL));
6114             first = NEXTOPER(first);
6115             goto again;
6116         }
6117         else if (OP(first) == GPOS) {
6118             r->extflags |= RXf_ANCH_GPOS;
6119             first = NEXTOPER(first);
6120             goto again;
6121         }
6122         else if ((!sawopen || !RExC_sawback) &&
6123             (OP(first) == STAR &&
6124             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6125             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6126         {
6127             /* turn .* into ^.* with an implied $*=1 */
6128             const int type =
6129                 (OP(NEXTOPER(first)) == REG_ANY)
6130                     ? RXf_ANCH_MBOL
6131                     : RXf_ANCH_SBOL;
6132             r->extflags |= type;
6133             r->intflags |= PREGf_IMPLICIT;
6134             first = NEXTOPER(first);
6135             goto again;
6136         }
6137         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6138             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6139             /* x+ must match at the 1st pos of run of x's */
6140             r->intflags |= PREGf_SKIP;
6141
6142         /* Scan is after the zeroth branch, first is atomic matcher. */
6143 #ifdef TRIE_STUDY_OPT
6144         DEBUG_PARSE_r(
6145             if (!restudied)
6146                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6147                               (IV)(first - scan + 1))
6148         );
6149 #else
6150         DEBUG_PARSE_r(
6151             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6152                 (IV)(first - scan + 1))
6153         );
6154 #endif
6155
6156
6157         /*
6158         * If there's something expensive in the r.e., find the
6159         * longest literal string that must appear and make it the
6160         * regmust.  Resolve ties in favor of later strings, since
6161         * the regstart check works with the beginning of the r.e.
6162         * and avoiding duplication strengthens checking.  Not a
6163         * strong reason, but sufficient in the absence of others.
6164         * [Now we resolve ties in favor of the earlier string if
6165         * it happens that c_offset_min has been invalidated, since the
6166         * earlier string may buy us something the later one won't.]
6167         */
6168
6169         data.longest_fixed = newSVpvs("");
6170         data.longest_float = newSVpvs("");
6171         data.last_found = newSVpvs("");
6172         data.longest = &(data.longest_fixed);
6173         first = scan;
6174         if (!ri->regstclass) {
6175             cl_init(pRExC_state, &ch_class);
6176             data.start_class = &ch_class;
6177             stclass_flag = SCF_DO_STCLASS_AND;
6178         } else                          /* XXXX Check for BOUND? */
6179             stclass_flag = 0;
6180         data.last_closep = &last_close;
6181
6182         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6183             &data, -1, NULL, NULL,
6184             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6185
6186
6187         CHECK_RESTUDY_GOTO;
6188
6189
6190         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6191              && data.last_start_min == 0 && data.last_end > 0
6192              && !RExC_seen_zerolen
6193              && !(RExC_seen & REG_SEEN_VERBARG)
6194              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6195             r->extflags |= RXf_CHECK_ALL;
6196         scan_commit(pRExC_state, &data,&minlen,0);
6197         SvREFCNT_dec(data.last_found);
6198
6199         longest_float_length = CHR_SVLEN(data.longest_float);
6200
6201         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6202                    && data.offset_fixed == data.offset_float_min
6203                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6204             && S_setup_longest (aTHX_ pRExC_state,
6205                                     data.longest_float,
6206                                     &(r->float_utf8),
6207                                     &(r->float_substr),
6208                                     &(r->float_end_shift),
6209                                     data.lookbehind_float,
6210                                     data.offset_float_min,
6211                                     data.minlen_float,
6212                                     longest_float_length,
6213                                     data.flags & SF_FL_BEFORE_EOL,
6214                                     data.flags & SF_FL_BEFORE_MEOL))
6215         {
6216             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6217             r->float_max_offset = data.offset_float_max;
6218             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6219                 r->float_max_offset -= data.lookbehind_float;
6220         }
6221         else {
6222             r->float_substr = r->float_utf8 = NULL;
6223             SvREFCNT_dec(data.longest_float);
6224             longest_float_length = 0;
6225         }
6226
6227         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6228
6229         if (S_setup_longest (aTHX_ pRExC_state,
6230                                 data.longest_fixed,
6231                                 &(r->anchored_utf8),
6232                                 &(r->anchored_substr),
6233                                 &(r->anchored_end_shift),
6234                                 data.lookbehind_fixed,
6235                                 data.offset_fixed,
6236                                 data.minlen_fixed,
6237                                 longest_fixed_length,
6238                                 data.flags & SF_FIX_BEFORE_EOL,
6239                                 data.flags & SF_FIX_BEFORE_MEOL))
6240         {
6241             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6242         }
6243         else {
6244             r->anchored_substr = r->anchored_utf8 = NULL;
6245             SvREFCNT_dec(data.longest_fixed);
6246             longest_fixed_length = 0;
6247         }
6248
6249         if (ri->regstclass
6250             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6251             ri->regstclass = NULL;
6252
6253         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6254             && stclass_flag
6255             && !(data.start_class->flags & ANYOF_EOS)
6256             && !cl_is_anything(data.start_class))
6257         {
6258             const U32 n = add_data(pRExC_state, 1, "f");
6259             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6260
6261             Newx(RExC_rxi->data->data[n], 1,
6262                 struct regnode_charclass_class);
6263             StructCopy(data.start_class,
6264                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6265                        struct regnode_charclass_class);
6266             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6267             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6268             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6269                       regprop(r, sv, (regnode*)data.start_class);
6270                       PerlIO_printf(Perl_debug_log,
6271                                     "synthetic stclass \"%s\".\n",
6272                                     SvPVX_const(sv));});
6273         }
6274
6275         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6276         if (longest_fixed_length > longest_float_length) {
6277             r->check_end_shift = r->anchored_end_shift;
6278             r->check_substr = r->anchored_substr;
6279             r->check_utf8 = r->anchored_utf8;
6280             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6281             if (r->extflags & RXf_ANCH_SINGLE)
6282                 r->extflags |= RXf_NOSCAN;
6283         }
6284         else {
6285             r->check_end_shift = r->float_end_shift;
6286             r->check_substr = r->float_substr;
6287             r->check_utf8 = r->float_utf8;
6288             r->check_offset_min = r->float_min_offset;
6289             r->check_offset_max = r->float_max_offset;
6290         }
6291         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6292            This should be changed ASAP!  */
6293         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6294             r->extflags |= RXf_USE_INTUIT;
6295             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6296                 r->extflags |= RXf_INTUIT_TAIL;
6297         }
6298         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6299         if ( (STRLEN)minlen < longest_float_length )
6300             minlen= longest_float_length;
6301         if ( (STRLEN)minlen < longest_fixed_length )
6302             minlen= longest_fixed_length;
6303         */
6304     }
6305     else {
6306         /* Several toplevels. Best we can is to set minlen. */
6307         I32 fake;
6308         struct regnode_charclass_class ch_class;
6309         I32 last_close = 0;
6310
6311         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6312
6313         scan = ri->program + 1;
6314         cl_init(pRExC_state, &ch_class);
6315         data.start_class = &ch_class;
6316         data.last_closep = &last_close;
6317
6318
6319         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6320             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6321
6322         CHECK_RESTUDY_GOTO;
6323
6324         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6325                 = r->float_substr = r->float_utf8 = NULL;
6326
6327         if (!(data.start_class->flags & ANYOF_EOS)
6328             && !cl_is_anything(data.start_class))
6329         {
6330             const U32 n = add_data(pRExC_state, 1, "f");
6331             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6332
6333             Newx(RExC_rxi->data->data[n], 1,
6334                 struct regnode_charclass_class);
6335             StructCopy(data.start_class,
6336                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6337                        struct regnode_charclass_class);
6338             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6339             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6340             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6341                       regprop(r, sv, (regnode*)data.start_class);
6342                       PerlIO_printf(Perl_debug_log,
6343                                     "synthetic stclass \"%s\".\n",
6344                                     SvPVX_const(sv));});
6345         }
6346     }
6347
6348     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6349        the "real" pattern. */
6350     DEBUG_OPTIMISE_r({
6351         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6352                       (IV)minlen, (IV)r->minlen);
6353     });
6354     r->minlenret = minlen;
6355     if (r->minlen < minlen)
6356         r->minlen = minlen;
6357
6358     if (RExC_seen & REG_SEEN_GPOS)
6359         r->extflags |= RXf_GPOS_SEEN;
6360     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6361         r->extflags |= RXf_LOOKBEHIND_SEEN;
6362     if (pRExC_state->num_code_blocks)
6363         r->extflags |= RXf_EVAL_SEEN;
6364     if (RExC_seen & REG_SEEN_CANY)
6365         r->extflags |= RXf_CANY_SEEN;
6366     if (RExC_seen & REG_SEEN_VERBARG)
6367     {
6368         r->intflags |= PREGf_VERBARG_SEEN;
6369         r->extflags |= RXf_MODIFIES_VARS;
6370     }
6371     if (RExC_seen & REG_SEEN_CUTGROUP)
6372         r->intflags |= PREGf_CUTGROUP_SEEN;
6373     if (pm_flags & PMf_USE_RE_EVAL)
6374         r->intflags |= PREGf_USE_RE_EVAL;
6375     if (RExC_paren_names)
6376         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6377     else
6378         RXp_PAREN_NAMES(r) = NULL;
6379
6380 #ifdef STUPID_PATTERN_CHECKS
6381     if (RX_PRELEN(rx) == 0)
6382         r->extflags |= RXf_NULL;
6383     if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6384         r->extflags |= RXf_WHITE;
6385     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6386         r->extflags |= RXf_START_ONLY;
6387 #else
6388     {
6389         regnode *first = ri->program + 1;
6390         U8 fop = OP(first);
6391
6392         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6393             r->extflags |= RXf_NULL;
6394         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6395             r->extflags |= RXf_START_ONLY;
6396         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6397                              && OP(regnext(first)) == END)
6398             r->extflags |= RXf_WHITE;
6399     }
6400 #endif
6401 #ifdef DEBUGGING
6402     if (RExC_paren_names) {
6403         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6404         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6405     } else
6406 #endif
6407         ri->name_list_idx = 0;
6408
6409     if (RExC_recurse_count) {
6410         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6411             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6412             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6413         }
6414     }
6415     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6416     /* assume we don't need to swap parens around before we match */
6417
6418     DEBUG_DUMP_r({
6419         PerlIO_printf(Perl_debug_log,"Final program:\n");
6420         regdump(r);
6421     });
6422 #ifdef RE_TRACK_PATTERN_OFFSETS
6423     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6424         const U32 len = ri->u.offsets[0];
6425         U32 i;
6426         GET_RE_DEBUG_FLAGS_DECL;
6427         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6428         for (i = 1; i <= len; i++) {
6429             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6430                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6431                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6432             }
6433         PerlIO_printf(Perl_debug_log, "\n");
6434     });
6435 #endif
6436     return rx;
6437 }
6438
6439
6440 SV*
6441 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6442                     const U32 flags)
6443 {
6444     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6445
6446     PERL_UNUSED_ARG(value);
6447
6448     if (flags & RXapif_FETCH) {
6449         return reg_named_buff_fetch(rx, key, flags);
6450     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6451         Perl_croak_no_modify(aTHX);
6452         return NULL;
6453     } else if (flags & RXapif_EXISTS) {
6454         return reg_named_buff_exists(rx, key, flags)
6455             ? &PL_sv_yes
6456             : &PL_sv_no;
6457     } else if (flags & RXapif_REGNAMES) {
6458         return reg_named_buff_all(rx, flags);
6459     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6460         return reg_named_buff_scalar(rx, flags);
6461     } else {
6462         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6463         return NULL;
6464     }
6465 }
6466
6467 SV*
6468 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6469                          const U32 flags)
6470 {
6471     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6472     PERL_UNUSED_ARG(lastkey);
6473
6474     if (flags & RXapif_FIRSTKEY)
6475         return reg_named_buff_firstkey(rx, flags);
6476     else if (flags & RXapif_NEXTKEY)
6477         return reg_named_buff_nextkey(rx, flags);
6478     else {
6479         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6480         return NULL;
6481     }
6482 }
6483
6484 SV*
6485 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6486                           const U32 flags)
6487 {
6488     AV *retarray = NULL;
6489     SV *ret;
6490     struct regexp *const rx = (struct regexp *)SvANY(r);
6491
6492     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6493
6494     if (flags & RXapif_ALL)
6495         retarray=newAV();
6496
6497     if (rx && RXp_PAREN_NAMES(rx)) {
6498         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6499         if (he_str) {
6500             IV i;
6501             SV* sv_dat=HeVAL(he_str);
6502             I32 *nums=(I32*)SvPVX(sv_dat);
6503             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6504                 if ((I32)(rx->nparens) >= nums[i]
6505                     && rx->offs[nums[i]].start != -1
6506                     && rx->offs[nums[i]].end != -1)
6507                 {
6508                     ret = newSVpvs("");
6509                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6510                     if (!retarray)
6511                         return ret;
6512                 } else {
6513                     if (retarray)
6514                         ret = newSVsv(&PL_sv_undef);
6515                 }
6516                 if (retarray)
6517                     av_push(retarray, ret);
6518             }
6519             if (retarray)
6520                 return newRV_noinc(MUTABLE_SV(retarray));
6521         }
6522     }
6523     return NULL;
6524 }
6525
6526 bool
6527 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6528                            const U32 flags)
6529 {
6530     struct regexp *const rx = (struct regexp *)SvANY(r);
6531
6532     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6533
6534     if (rx && RXp_PAREN_NAMES(rx)) {
6535         if (flags & RXapif_ALL) {
6536             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6537         } else {
6538             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6539             if (sv) {
6540                 SvREFCNT_dec(sv);
6541                 return TRUE;
6542             } else {
6543                 return FALSE;
6544             }
6545         }
6546     } else {
6547         return FALSE;
6548     }
6549 }
6550
6551 SV*
6552 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6553 {
6554     struct regexp *const rx = (struct regexp *)SvANY(r);
6555
6556     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6557
6558     if ( rx && RXp_PAREN_NAMES(rx) ) {
6559         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6560
6561         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6562     } else {
6563         return FALSE;
6564     }
6565 }
6566
6567 SV*
6568 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6569 {
6570     struct regexp *const rx = (struct regexp *)SvANY(r);
6571     GET_RE_DEBUG_FLAGS_DECL;
6572
6573     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6574
6575     if (rx && RXp_PAREN_NAMES(rx)) {
6576         HV *hv = RXp_PAREN_NAMES(rx);
6577         HE *temphe;
6578         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6579             IV i;
6580             IV parno = 0;
6581             SV* sv_dat = HeVAL(temphe);
6582             I32 *nums = (I32*)SvPVX(sv_dat);
6583             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6584                 if ((I32)(rx->lastparen) >= nums[i] &&
6585                     rx->offs[nums[i]].start != -1 &&
6586                     rx->offs[nums[i]].end != -1)
6587                 {
6588                     parno = nums[i];
6589                     break;
6590                 }
6591             }
6592             if (parno || flags & RXapif_ALL) {
6593                 return newSVhek(HeKEY_hek(temphe));
6594             }
6595         }
6596     }
6597     return NULL;
6598 }
6599
6600 SV*
6601 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6602 {
6603     SV *ret;
6604     AV *av;
6605     I32 length;
6606     struct regexp *const rx = (struct regexp *)SvANY(r);
6607
6608     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6609
6610     if (rx && RXp_PAREN_NAMES(rx)) {
6611         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6612             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6613         } else if (flags & RXapif_ONE) {
6614             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6615             av = MUTABLE_AV(SvRV(ret));
6616             length = av_len(av);
6617             SvREFCNT_dec(ret);
6618             return newSViv(length + 1);
6619         } else {
6620             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6621             return NULL;
6622         }
6623     }
6624     return &PL_sv_undef;
6625 }
6626
6627 SV*
6628 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6629 {
6630     struct regexp *const rx = (struct regexp *)SvANY(r);
6631     AV *av = newAV();
6632
6633     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6634
6635     if (rx && RXp_PAREN_NAMES(rx)) {
6636         HV *hv= RXp_PAREN_NAMES(rx);
6637         HE *temphe;
6638         (void)hv_iterinit(hv);
6639         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6640             IV i;
6641             IV parno = 0;
6642             SV* sv_dat = HeVAL(temphe);
6643             I32 *nums = (I32*)SvPVX(sv_dat);
6644             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6645                 if ((I32)(rx->lastparen) >= nums[i] &&
6646                     rx->offs[nums[i]].start != -1 &&
6647                     rx->offs[nums[i]].end != -1)
6648                 {
6649                     parno = nums[i];
6650                     break;
6651                 }
6652             }
6653             if (parno || flags & RXapif_ALL) {
6654                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6655             }
6656         }
6657     }
6658
6659     return newRV_noinc(MUTABLE_SV(av));
6660 }
6661
6662 void
6663 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6664                              SV * const sv)
6665 {
6666     struct regexp *const rx = (struct regexp *)SvANY(r);
6667     char *s = NULL;
6668     I32 i = 0;
6669     I32 s1, t1;
6670     I32 n = paren;
6671
6672     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6673
6674     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6675            || n == RX_BUFF_IDX_CARET_FULLMATCH
6676            || n == RX_BUFF_IDX_CARET_POSTMATCH
6677          )
6678          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6679     )
6680         goto ret_undef;
6681
6682     if (!rx->subbeg)
6683         goto ret_undef;
6684
6685     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6686         /* no need to distinguish between them any more */
6687         n = RX_BUFF_IDX_FULLMATCH;
6688
6689     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6690         && rx->offs[0].start != -1)
6691     {
6692         /* $`, ${^PREMATCH} */
6693         i = rx->offs[0].start;
6694         s = rx->subbeg;
6695     }
6696     else
6697     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6698         && rx->offs[0].end != -1)
6699     {
6700         /* $', ${^POSTMATCH} */
6701         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6702         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6703     }
6704     else
6705     if ( 0 <= n && n <= (I32)rx->nparens &&
6706         (s1 = rx->offs[n].start) != -1 &&
6707         (t1 = rx->offs[n].end) != -1)
6708     {
6709         /* $&, ${^MATCH},  $1 ... */
6710         i = t1 - s1;
6711         s = rx->subbeg + s1 - rx->suboffset;
6712     } else {
6713         goto ret_undef;
6714     }
6715
6716     assert(s >= rx->subbeg);
6717     assert(rx->sublen >= (s - rx->subbeg) + i );
6718     if (i >= 0) {
6719         const int oldtainted = PL_tainted;
6720         TAINT_NOT;
6721         sv_setpvn(sv, s, i);
6722         PL_tainted = oldtainted;
6723         if ( (rx->extflags & RXf_CANY_SEEN)
6724             ? (RXp_MATCH_UTF8(rx)
6725                         && (!i || is_utf8_string((U8*)s, i)))
6726             : (RXp_MATCH_UTF8(rx)) )
6727         {
6728             SvUTF8_on(sv);
6729         }
6730         else
6731             SvUTF8_off(sv);
6732         if (PL_tainting) {
6733             if (RXp_MATCH_TAINTED(rx)) {
6734                 if (SvTYPE(sv) >= SVt_PVMG) {
6735                     MAGIC* const mg = SvMAGIC(sv);
6736                     MAGIC* mgt;
6737                     PL_tainted = 1;
6738                     SvMAGIC_set(sv, mg->mg_moremagic);
6739                     SvTAINT(sv);
6740                     if ((mgt = SvMAGIC(sv))) {
6741                         mg->mg_moremagic = mgt;
6742                         SvMAGIC_set(sv, mg);
6743                     }
6744                 } else {
6745                     PL_tainted = 1;
6746                     SvTAINT(sv);
6747                 }
6748             } else
6749                 SvTAINTED_off(sv);
6750         }
6751     } else {
6752       ret_undef:
6753         sv_setsv(sv,&PL_sv_undef);
6754         return;
6755     }
6756 }
6757
6758 void
6759 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6760                                                          SV const * const value)
6761 {
6762     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6763
6764     PERL_UNUSED_ARG(rx);
6765     PERL_UNUSED_ARG(paren);
6766     PERL_UNUSED_ARG(value);
6767
6768     if (!PL_localizing)
6769         Perl_croak_no_modify(aTHX);
6770 }
6771
6772 I32
6773 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6774                               const I32 paren)
6775 {
6776     struct regexp *const rx = (struct regexp *)SvANY(r);
6777     I32 i;
6778     I32 s1, t1;
6779
6780     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6781
6782     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6783     switch (paren) {
6784       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6785          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6786             goto warn_undef;
6787         /*FALLTHROUGH*/
6788
6789       case RX_BUFF_IDX_PREMATCH:       /* $` */
6790         if (rx->offs[0].start != -1) {
6791                         i = rx->offs[0].start;
6792                         if (i > 0) {
6793                                 s1 = 0;
6794                                 t1 = i;
6795                                 goto getlen;
6796                         }
6797             }
6798         return 0;
6799
6800       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6801          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6802             goto warn_undef;
6803       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6804             if (rx->offs[0].end != -1) {
6805                         i = rx->sublen - rx->offs[0].end;
6806                         if (i > 0) {
6807                                 s1 = rx->offs[0].end;
6808                                 t1 = rx->sublen;
6809                                 goto getlen;
6810                         }
6811             }
6812         return 0;
6813
6814       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6815          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6816             goto warn_undef;
6817         /*FALLTHROUGH*/
6818
6819       /* $& / ${^MATCH}, $1, $2, ... */
6820       default:
6821             if (paren <= (I32)rx->nparens &&
6822             (s1 = rx->offs[paren].start) != -1 &&
6823             (t1 = rx->offs[paren].end) != -1)
6824             {
6825             i = t1 - s1;
6826             goto getlen;
6827         } else {
6828           warn_undef:
6829             if (ckWARN(WARN_UNINITIALIZED))
6830                 report_uninit((const SV *)sv);
6831             return 0;
6832         }
6833     }
6834   getlen:
6835     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6836         const char * const s = rx->subbeg - rx->suboffset + s1;
6837         const U8 *ep;
6838         STRLEN el;
6839
6840         i = t1 - s1;
6841         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6842                         i = el;
6843     }
6844     return i;
6845 }
6846
6847 SV*
6848 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6849 {
6850     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6851         PERL_UNUSED_ARG(rx);
6852         if (0)
6853             return NULL;
6854         else
6855             return newSVpvs("Regexp");
6856 }
6857
6858 /* Scans the name of a named buffer from the pattern.
6859  * If flags is REG_RSN_RETURN_NULL returns null.
6860  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6861  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6862  * to the parsed name as looked up in the RExC_paren_names hash.
6863  * If there is an error throws a vFAIL().. type exception.
6864  */
6865
6866 #define REG_RSN_RETURN_NULL    0
6867 #define REG_RSN_RETURN_NAME    1
6868 #define REG_RSN_RETURN_DATA    2
6869
6870 STATIC SV*
6871 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6872 {
6873     char *name_start = RExC_parse;
6874
6875     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6876
6877     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6878          /* skip IDFIRST by using do...while */
6879         if (UTF)
6880             do {
6881                 RExC_parse += UTF8SKIP(RExC_parse);
6882             } while (isALNUM_utf8((U8*)RExC_parse));
6883         else
6884             do {
6885                 RExC_parse++;
6886             } while (isALNUM(*RExC_parse));
6887     } else {
6888         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6889         vFAIL("Group name must start with a non-digit word character");
6890     }
6891     if ( flags ) {
6892         SV* sv_name
6893             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6894                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6895         if ( flags == REG_RSN_RETURN_NAME)
6896             return sv_name;
6897         else if (flags==REG_RSN_RETURN_DATA) {
6898             HE *he_str = NULL;
6899             SV *sv_dat = NULL;
6900             if ( ! sv_name )      /* should not happen*/
6901                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6902             if (RExC_paren_names)
6903                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6904             if ( he_str )
6905                 sv_dat = HeVAL(he_str);
6906             if ( ! sv_dat )
6907                 vFAIL("Reference to nonexistent named group");
6908             return sv_dat;
6909         }
6910         else {
6911             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6912                        (unsigned long) flags);
6913         }
6914         assert(0); /* NOT REACHED */
6915     }
6916     return NULL;
6917 }
6918
6919 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6920     int rem=(int)(RExC_end - RExC_parse);                       \
6921     int cut;                                                    \
6922     int num;                                                    \
6923     int iscut=0;                                                \
6924     if (rem>10) {                                               \
6925         rem=10;                                                 \
6926         iscut=1;                                                \
6927     }                                                           \
6928     cut=10-rem;                                                 \
6929     if (RExC_lastparse!=RExC_parse)                             \
6930         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6931             rem, RExC_parse,                                    \
6932             cut + 4,                                            \
6933             iscut ? "..." : "<"                                 \
6934         );                                                      \
6935     else                                                        \
6936         PerlIO_printf(Perl_debug_log,"%16s","");                \
6937                                                                 \
6938     if (SIZE_ONLY)                                              \
6939        num = RExC_size + 1;                                     \
6940     else                                                        \
6941        num=REG_NODE_NUM(RExC_emit);                             \
6942     if (RExC_lastnum!=num)                                      \
6943        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6944     else                                                        \
6945        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6946     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6947         (int)((depth*2)), "",                                   \
6948         (funcname)                                              \
6949     );                                                          \
6950     RExC_lastnum=num;                                           \
6951     RExC_lastparse=RExC_parse;                                  \
6952 })
6953
6954
6955
6956 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6957     DEBUG_PARSE_MSG((funcname));                            \
6958     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6959 })
6960 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6961     DEBUG_PARSE_MSG((funcname));                            \
6962     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6963 })
6964
6965 /* This section of code defines the inversion list object and its methods.  The
6966  * interfaces are highly subject to change, so as much as possible is static to
6967  * this file.  An inversion list is here implemented as a malloc'd C UV array
6968  * with some added info that is placed as UVs at the beginning in a header
6969  * portion.  An inversion list for Unicode is an array of code points, sorted
6970  * by ordinal number.  The zeroth element is the first code point in the list.
6971  * The 1th element is the first element beyond that not in the list.  In other
6972  * words, the first range is
6973  *  invlist[0]..(invlist[1]-1)
6974  * The other ranges follow.  Thus every element whose index is divisible by two
6975  * marks the beginning of a range that is in the list, and every element not
6976  * divisible by two marks the beginning of a range not in the list.  A single
6977  * element inversion list that contains the single code point N generally
6978  * consists of two elements
6979  *  invlist[0] == N
6980  *  invlist[1] == N+1
6981  * (The exception is when N is the highest representable value on the
6982  * machine, in which case the list containing just it would be a single
6983  * element, itself.  By extension, if the last range in the list extends to
6984  * infinity, then the first element of that range will be in the inversion list
6985  * at a position that is divisible by two, and is the final element in the
6986  * list.)
6987  * Taking the complement (inverting) an inversion list is quite simple, if the
6988  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6989  * This implementation reserves an element at the beginning of each inversion
6990  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
6991  * actual beginning of the list is either that element if 0, or the next one if
6992  * 1.
6993  *
6994  * More about inversion lists can be found in "Unicode Demystified"
6995  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6996  * More will be coming when functionality is added later.
6997  *
6998  * The inversion list data structure is currently implemented as an SV pointing
6999  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7000  * array of UV whose memory management is automatically handled by the existing
7001  * facilities for SV's.
7002  *
7003  * Some of the methods should always be private to the implementation, and some
7004  * should eventually be made public */
7005
7006 /* The header definitions are in F<inline_invlist.c> */
7007
7008 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7009 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7010
7011 #define INVLIST_INITIAL_LEN 10
7012
7013 PERL_STATIC_INLINE UV*
7014 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7015 {
7016     /* Returns a pointer to the first element in the inversion list's array.
7017      * This is called upon initialization of an inversion list.  Where the
7018      * array begins depends on whether the list has the code point U+0000
7019      * in it or not.  The other parameter tells it whether the code that
7020      * follows this call is about to put a 0 in the inversion list or not.
7021      * The first element is either the element with 0, if 0, or the next one,
7022      * if 1 */
7023
7024     UV* zero = get_invlist_zero_addr(invlist);
7025
7026     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7027
7028     /* Must be empty */
7029     assert(! *_get_invlist_len_addr(invlist));
7030
7031     /* 1^1 = 0; 1^0 = 1 */
7032     *zero = 1 ^ will_have_0;
7033     return zero + *zero;
7034 }
7035
7036 PERL_STATIC_INLINE UV*
7037 S_invlist_array(pTHX_ SV* const invlist)
7038 {
7039     /* Returns the pointer to the inversion list's array.  Every time the
7040      * length changes, this needs to be called in case malloc or realloc moved
7041      * it */
7042
7043     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7044
7045     /* Must not be empty.  If these fail, you probably didn't check for <len>
7046      * being non-zero before trying to get the array */
7047     assert(*_get_invlist_len_addr(invlist));
7048     assert(*get_invlist_zero_addr(invlist) == 0
7049            || *get_invlist_zero_addr(invlist) == 1);
7050
7051     /* The array begins either at the element reserved for zero if the
7052      * list contains 0 (that element will be set to 0), or otherwise the next
7053      * element (in which case the reserved element will be set to 1). */
7054     return (UV *) (get_invlist_zero_addr(invlist)
7055                    + *get_invlist_zero_addr(invlist));
7056 }
7057
7058 PERL_STATIC_INLINE void
7059 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7060 {
7061     /* Sets the current number of elements stored in the inversion list */
7062
7063     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7064
7065     *_get_invlist_len_addr(invlist) = len;
7066
7067     assert(len <= SvLEN(invlist));
7068
7069     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7070     /* If the list contains U+0000, that element is part of the header,
7071      * and should not be counted as part of the array.  It will contain
7072      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7073      * subtract:
7074      *  SvCUR_set(invlist,
7075      *            TO_INTERNAL_SIZE(len
7076      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7077      * But, this is only valid if len is not 0.  The consequences of not doing
7078      * this is that the memory allocation code may think that 1 more UV is
7079      * being used than actually is, and so might do an unnecessary grow.  That
7080      * seems worth not bothering to make this the precise amount.
7081      *
7082      * Note that when inverting, SvCUR shouldn't change */
7083 }
7084
7085 PERL_STATIC_INLINE IV*
7086 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7087 {
7088     /* Return the address of the UV that is reserved to hold the cached index
7089      * */
7090
7091     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7092
7093     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7094 }
7095
7096 PERL_STATIC_INLINE IV
7097 S_invlist_previous_index(pTHX_ SV* const invlist)
7098 {
7099     /* Returns cached index of previous search */
7100
7101     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7102
7103     return *get_invlist_previous_index_addr(invlist);
7104 }
7105
7106 PERL_STATIC_INLINE void
7107 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7108 {
7109     /* Caches <index> for later retrieval */
7110
7111     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7112
7113     assert(index == 0 || index < (int) _invlist_len(invlist));
7114
7115     *get_invlist_previous_index_addr(invlist) = index;
7116 }
7117
7118 PERL_STATIC_INLINE UV
7119 S_invlist_max(pTHX_ SV* const invlist)
7120 {
7121     /* Returns the maximum number of elements storable in the inversion list's
7122      * array, without having to realloc() */
7123
7124     PERL_ARGS_ASSERT_INVLIST_MAX;
7125
7126     return FROM_INTERNAL_SIZE(SvLEN(invlist));
7127 }
7128
7129 PERL_STATIC_INLINE UV*
7130 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7131 {
7132     /* Return the address of the UV that is reserved to hold 0 if the inversion
7133      * list contains 0.  This has to be the last element of the heading, as the
7134      * list proper starts with either it if 0, or the next element if not.
7135      * (But we force it to contain either 0 or 1) */
7136
7137     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7138
7139     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7140 }
7141
7142 #ifndef PERL_IN_XSUB_RE
7143 SV*
7144 Perl__new_invlist(pTHX_ IV initial_size)
7145 {
7146
7147     /* Return a pointer to a newly constructed inversion list, with enough
7148      * space to store 'initial_size' elements.  If that number is negative, a
7149      * system default is used instead */
7150
7151     SV* new_list;
7152
7153     if (initial_size < 0) {
7154         initial_size = INVLIST_INITIAL_LEN;
7155     }
7156
7157     /* Allocate the initial space */
7158     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7159     invlist_set_len(new_list, 0);
7160
7161     /* Force iterinit() to be used to get iteration to work */
7162     *get_invlist_iter_addr(new_list) = UV_MAX;
7163
7164     /* This should force a segfault if a method doesn't initialize this
7165      * properly */
7166     *get_invlist_zero_addr(new_list) = UV_MAX;
7167
7168     *get_invlist_previous_index_addr(new_list) = 0;
7169     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7170 #if HEADER_LENGTH != 5
7171 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7172 #endif
7173
7174     return new_list;
7175 }
7176 #endif
7177
7178 STATIC SV*
7179 S__new_invlist_C_array(pTHX_ UV* list)
7180 {
7181     /* Return a pointer to a newly constructed inversion list, initialized to
7182      * point to <list>, which has to be in the exact correct inversion list
7183      * form, including internal fields.  Thus this is a dangerous routine that
7184      * should not be used in the wrong hands */
7185
7186     SV* invlist = newSV_type(SVt_PV);
7187
7188     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7189
7190     SvPV_set(invlist, (char *) list);
7191     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7192                                shouldn't touch it */
7193     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7194
7195     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7196         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7197     }
7198
7199     return invlist;
7200 }
7201
7202 STATIC void
7203 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7204 {
7205     /* Grow the maximum size of an inversion list */
7206
7207     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7208
7209     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7210 }
7211
7212 PERL_STATIC_INLINE void
7213 S_invlist_trim(pTHX_ SV* const invlist)
7214 {
7215     PERL_ARGS_ASSERT_INVLIST_TRIM;
7216
7217     /* Change the length of the inversion list to how many entries it currently
7218      * has */
7219
7220     SvPV_shrink_to_cur((SV *) invlist);
7221 }
7222
7223 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7224
7225 STATIC void
7226 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7227 {
7228    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7229     * the end of the inversion list.  The range must be above any existing
7230     * ones. */
7231
7232     UV* array;
7233     UV max = invlist_max(invlist);
7234     UV len = _invlist_len(invlist);
7235
7236     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7237
7238     if (len == 0) { /* Empty lists must be initialized */
7239         array = _invlist_array_init(invlist, start == 0);
7240     }
7241     else {
7242         /* Here, the existing list is non-empty. The current max entry in the
7243          * list is generally the first value not in the set, except when the
7244          * set extends to the end of permissible values, in which case it is
7245          * the first entry in that final set, and so this call is an attempt to
7246          * append out-of-order */
7247
7248         UV final_element = len - 1;
7249         array = invlist_array(invlist);
7250         if (array[final_element] > start
7251             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7252         {
7253             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7254                        array[final_element], start,
7255                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7256         }
7257
7258         /* Here, it is a legal append.  If the new range begins with the first
7259          * value not in the set, it is extending the set, so the new first
7260          * value not in the set is one greater than the newly extended range.
7261          * */
7262         if (array[final_element] == start) {
7263             if (end != UV_MAX) {
7264                 array[final_element] = end + 1;
7265             }
7266             else {
7267                 /* But if the end is the maximum representable on the machine,
7268                  * just let the range that this would extend to have no end */
7269                 invlist_set_len(invlist, len - 1);
7270             }
7271             return;
7272         }
7273     }
7274
7275     /* Here the new range doesn't extend any existing set.  Add it */
7276
7277     len += 2;   /* Includes an element each for the start and end of range */
7278
7279     /* If overflows the existing space, extend, which may cause the array to be
7280      * moved */
7281     if (max < len) {
7282         invlist_extend(invlist, len);
7283         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7284                                            failure in invlist_array() */
7285         array = invlist_array(invlist);
7286     }
7287     else {
7288         invlist_set_len(invlist, len);
7289     }
7290
7291     /* The next item on the list starts the range, the one after that is
7292      * one past the new range.  */
7293     array[len - 2] = start;
7294     if (end != UV_MAX) {
7295         array[len - 1] = end + 1;
7296     }
7297     else {
7298         /* But if the end is the maximum representable on the machine, just let
7299          * the range have no end */
7300         invlist_set_len(invlist, len - 1);
7301     }
7302 }
7303
7304 #ifndef PERL_IN_XSUB_RE
7305
7306 IV
7307 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7308 {
7309     /* Searches the inversion list for the entry that contains the input code
7310      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7311      * return value is the index into the list's array of the range that
7312      * contains <cp> */
7313
7314     IV low = 0;
7315     IV mid;
7316     IV high = _invlist_len(invlist);
7317     const IV highest_element = high - 1;
7318     const UV* array;
7319
7320     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7321
7322     /* If list is empty, return failure. */
7323     if (high == 0) {
7324         return -1;
7325     }
7326
7327     /* If the code point is before the first element, return failure.  (We
7328      * can't combine this with the test above, because we can't get the array
7329      * unless we know the list is non-empty) */
7330     array = invlist_array(invlist);
7331
7332     mid = invlist_previous_index(invlist);
7333     assert(mid >=0 && mid <= highest_element);
7334
7335     /* <mid> contains the cache of the result of the previous call to this
7336      * function (0 the first time).  See if this call is for the same result,
7337      * or if it is for mid-1.  This is under the theory that calls to this
7338      * function will often be for related code points that are near each other.
7339      * And benchmarks show that caching gives better results.  We also test
7340      * here if the code point is within the bounds of the list.  These tests
7341      * replace others that would have had to be made anyway to make sure that
7342      * the array bounds were not exceeded, and give us extra information at the
7343      * same time */
7344     if (cp >= array[mid]) {
7345         if (cp >= array[highest_element]) {
7346             return highest_element;
7347         }
7348
7349         /* Here, array[mid] <= cp < array[highest_element].  This means that
7350          * the final element is not the answer, so can exclude it; it also
7351          * means that <mid> is not the final element, so can refer to 'mid + 1'
7352          * safely */
7353         if (cp < array[mid + 1]) {
7354             return mid;
7355         }
7356         high--;
7357         low = mid + 1;
7358     }
7359     else { /* cp < aray[mid] */
7360         if (cp < array[0]) { /* Fail if outside the array */
7361             return -1;
7362         }
7363         high = mid;
7364         if (cp >= array[mid - 1]) {
7365             goto found_entry;
7366         }
7367     }
7368
7369     /* Binary search.  What we are looking for is <i> such that
7370      *  array[i] <= cp < array[i+1]
7371      * The loop below converges on the i+1.  Note that there may not be an
7372      * (i+1)th element in the array, and things work nonetheless */
7373     while (low < high) {
7374         mid = (low + high) / 2;
7375         assert(mid <= highest_element);
7376         if (array[mid] <= cp) { /* cp >= array[mid] */
7377             low = mid + 1;
7378
7379             /* We could do this extra test to exit the loop early.
7380             if (cp < array[low]) {
7381                 return mid;
7382             }
7383             */
7384         }
7385         else { /* cp < array[mid] */
7386             high = mid;
7387         }
7388     }
7389
7390   found_entry:
7391     high--;
7392     invlist_set_previous_index(invlist, high);
7393     return high;
7394 }
7395
7396 void
7397 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7398 {
7399     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7400      * but is used when the swash has an inversion list.  This makes this much
7401      * faster, as it uses a binary search instead of a linear one.  This is
7402      * intimately tied to that function, and perhaps should be in utf8.c,
7403      * except it is intimately tied to inversion lists as well.  It assumes
7404      * that <swatch> is all 0's on input */
7405
7406     UV current = start;
7407     const IV len = _invlist_len(invlist);
7408     IV i;
7409     const UV * array;
7410
7411     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7412
7413     if (len == 0) { /* Empty inversion list */
7414         return;
7415     }
7416
7417     array = invlist_array(invlist);
7418
7419     /* Find which element it is */
7420     i = _invlist_search(invlist, start);
7421
7422     /* We populate from <start> to <end> */
7423     while (current < end) {
7424         UV upper;
7425
7426         /* The inversion list gives the results for every possible code point
7427          * after the first one in the list.  Only those ranges whose index is
7428          * even are ones that the inversion list matches.  For the odd ones,
7429          * and if the initial code point is not in the list, we have to skip
7430          * forward to the next element */
7431         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7432             i++;
7433             if (i >= len) { /* Finished if beyond the end of the array */
7434                 return;
7435             }
7436             current = array[i];
7437             if (current >= end) {   /* Finished if beyond the end of what we
7438                                        are populating */
7439                 if (LIKELY(end < UV_MAX)) {
7440                     return;
7441                 }
7442
7443                 /* We get here when the upper bound is the maximum
7444                  * representable on the machine, and we are looking for just
7445                  * that code point.  Have to special case it */
7446                 i = len;
7447                 goto join_end_of_list;
7448             }
7449         }
7450         assert(current >= start);
7451
7452         /* The current range ends one below the next one, except don't go past
7453          * <end> */
7454         i++;
7455         upper = (i < len && array[i] < end) ? array[i] : end;
7456
7457         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7458          * for each code point in it */
7459         for (; current < upper; current++) {
7460             const STRLEN offset = (STRLEN)(current - start);
7461             swatch[offset >> 3] |= 1 << (offset & 7);
7462         }
7463
7464     join_end_of_list:
7465
7466         /* Quit if at the end of the list */
7467         if (i >= len) {
7468
7469             /* But first, have to deal with the highest possible code point on
7470              * the platform.  The previous code assumes that <end> is one
7471              * beyond where we want to populate, but that is impossible at the
7472              * platform's infinity, so have to handle it specially */
7473             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7474             {
7475                 const STRLEN offset = (STRLEN)(end - start);
7476                 swatch[offset >> 3] |= 1 << (offset & 7);
7477             }
7478             return;
7479         }
7480
7481         /* Advance to the next range, which will be for code points not in the
7482          * inversion list */
7483         current = array[i];
7484     }
7485
7486     return;
7487 }
7488
7489 void
7490 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7491 {
7492     /* Take the union of two inversion lists and point <output> to it.  *output
7493      * should be defined upon input, and if it points to one of the two lists,
7494      * the reference count to that list will be decremented.  The first list,
7495      * <a>, may be NULL, in which case a copy of the second list is returned.
7496      * If <complement_b> is TRUE, the union is taken of the complement
7497      * (inversion) of <b> instead of b itself.
7498      *
7499      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7500      * Richard Gillam, published by Addison-Wesley, and explained at some
7501      * length there.  The preface says to incorporate its examples into your
7502      * code at your own risk.
7503      *
7504      * The algorithm is like a merge sort.
7505      *
7506      * XXX A potential performance improvement is to keep track as we go along
7507      * if only one of the inputs contributes to the result, meaning the other
7508      * is a subset of that one.  In that case, we can skip the final copy and
7509      * return the larger of the input lists, but then outside code might need
7510      * to keep track of whether to free the input list or not */
7511
7512     UV* array_a;    /* a's array */
7513     UV* array_b;
7514     UV len_a;       /* length of a's array */
7515     UV len_b;
7516
7517     SV* u;                      /* the resulting union */
7518     UV* array_u;
7519     UV len_u;
7520
7521     UV i_a = 0;             /* current index into a's array */
7522     UV i_b = 0;
7523     UV i_u = 0;
7524
7525     /* running count, as explained in the algorithm source book; items are
7526      * stopped accumulating and are output when the count changes to/from 0.
7527      * The count is incremented when we start a range that's in the set, and
7528      * decremented when we start a range that's not in the set.  So its range
7529      * is 0 to 2.  Only when the count is zero is something not in the set.
7530      */
7531     UV count = 0;
7532
7533     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7534     assert(a != b);
7535
7536     /* If either one is empty, the union is the other one */
7537     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7538         if (*output == a) {
7539             if (a != NULL) {
7540                 SvREFCNT_dec(a);
7541             }
7542         }
7543         if (*output != b) {
7544             *output = invlist_clone(b);
7545             if (complement_b) {
7546                 _invlist_invert(*output);
7547             }
7548         } /* else *output already = b; */
7549         return;
7550     }
7551     else if ((len_b = _invlist_len(b)) == 0) {
7552         if (*output == b) {
7553             SvREFCNT_dec(b);
7554         }
7555
7556         /* The complement of an empty list is a list that has everything in it,
7557          * so the union with <a> includes everything too */
7558         if (complement_b) {
7559             if (a == *output) {
7560                 SvREFCNT_dec(a);
7561             }
7562             *output = _new_invlist(1);
7563             _append_range_to_invlist(*output, 0, UV_MAX);
7564         }
7565         else if (*output != a) {
7566             *output = invlist_clone(a);
7567         }
7568         /* else *output already = a; */
7569         return;
7570     }
7571
7572     /* Here both lists exist and are non-empty */
7573     array_a = invlist_array(a);
7574     array_b = invlist_array(b);
7575
7576     /* If are to take the union of 'a' with the complement of b, set it
7577      * up so are looking at b's complement. */
7578     if (complement_b) {
7579
7580         /* To complement, we invert: if the first element is 0, remove it.  To
7581          * do this, we just pretend the array starts one later, and clear the
7582          * flag as we don't have to do anything else later */
7583         if (array_b[0] == 0) {
7584             array_b++;
7585             len_b--;
7586             complement_b = FALSE;
7587         }
7588         else {
7589
7590             /* But if the first element is not zero, we unshift a 0 before the
7591              * array.  The data structure reserves a space for that 0 (which
7592              * should be a '1' right now), so physical shifting is unneeded,
7593              * but temporarily change that element to 0.  Before exiting the
7594              * routine, we must restore the element to '1' */
7595             array_b--;
7596             len_b++;
7597             array_b[0] = 0;
7598         }
7599     }
7600
7601     /* Size the union for the worst case: that the sets are completely
7602      * disjoint */
7603     u = _new_invlist(len_a + len_b);
7604
7605     /* Will contain U+0000 if either component does */
7606     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7607                                       || (len_b > 0 && array_b[0] == 0));
7608
7609     /* Go through each list item by item, stopping when exhausted one of
7610      * them */
7611     while (i_a < len_a && i_b < len_b) {
7612         UV cp;      /* The element to potentially add to the union's array */
7613         bool cp_in_set;   /* is it in the the input list's set or not */
7614
7615         /* We need to take one or the other of the two inputs for the union.
7616          * Since we are merging two sorted lists, we take the smaller of the
7617          * next items.  In case of a tie, we take the one that is in its set
7618          * first.  If we took one not in the set first, it would decrement the
7619          * count, possibly to 0 which would cause it to be output as ending the
7620          * range, and the next time through we would take the same number, and
7621          * output it again as beginning the next range.  By doing it the
7622          * opposite way, there is no possibility that the count will be
7623          * momentarily decremented to 0, and thus the two adjoining ranges will
7624          * be seamlessly merged.  (In a tie and both are in the set or both not
7625          * in the set, it doesn't matter which we take first.) */
7626         if (array_a[i_a] < array_b[i_b]
7627             || (array_a[i_a] == array_b[i_b]
7628                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7629         {
7630             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7631             cp= array_a[i_a++];
7632         }
7633         else {
7634             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7635             cp= array_b[i_b++];
7636         }
7637
7638         /* Here, have chosen which of the two inputs to look at.  Only output
7639          * if the running count changes to/from 0, which marks the
7640          * beginning/end of a range in that's in the set */
7641         if (cp_in_set) {
7642             if (count == 0) {
7643                 array_u[i_u++] = cp;
7644             }
7645             count++;
7646         }
7647         else {
7648             count--;
7649             if (count == 0) {
7650                 array_u[i_u++] = cp;
7651             }
7652         }
7653     }
7654
7655     /* Here, we are finished going through at least one of the lists, which
7656      * means there is something remaining in at most one.  We check if the list
7657      * that hasn't been exhausted is positioned such that we are in the middle
7658      * of a range in its set or not.  (i_a and i_b point to the element beyond
7659      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7660      * is potentially more to output.
7661      * There are four cases:
7662      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7663      *     in the union is entirely from the non-exhausted set.
7664      *  2) Both were in their sets, count is 2.  Nothing further should
7665      *     be output, as everything that remains will be in the exhausted
7666      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7667      *     that
7668      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7669      *     Nothing further should be output because the union includes
7670      *     everything from the exhausted set.  Not decrementing ensures that.
7671      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7672      *     decrementing to 0 insures that we look at the remainder of the
7673      *     non-exhausted set */
7674     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7675         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7676     {
7677         count--;
7678     }
7679
7680     /* The final length is what we've output so far, plus what else is about to
7681      * be output.  (If 'count' is non-zero, then the input list we exhausted
7682      * has everything remaining up to the machine's limit in its set, and hence
7683      * in the union, so there will be no further output. */
7684     len_u = i_u;
7685     if (count == 0) {
7686         /* At most one of the subexpressions will be non-zero */
7687         len_u += (len_a - i_a) + (len_b - i_b);
7688     }
7689
7690     /* Set result to final length, which can change the pointer to array_u, so
7691      * re-find it */
7692     if (len_u != _invlist_len(u)) {
7693         invlist_set_len(u, len_u);
7694         invlist_trim(u);
7695         array_u = invlist_array(u);
7696     }
7697
7698     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7699      * the other) ended with everything above it not in its set.  That means
7700      * that the remaining part of the union is precisely the same as the
7701      * non-exhausted list, so can just copy it unchanged.  (If both list were
7702      * exhausted at the same time, then the operations below will be both 0.)
7703      */
7704     if (count == 0) {
7705         IV copy_count; /* At most one will have a non-zero copy count */
7706         if ((copy_count = len_a - i_a) > 0) {
7707             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7708         }
7709         else if ((copy_count = len_b - i_b) > 0) {
7710             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7711         }
7712     }
7713
7714     /*  We may be removing a reference to one of the inputs */
7715     if (a == *output || b == *output) {
7716         SvREFCNT_dec(*output);
7717     }
7718
7719     /* If we've changed b, restore it */
7720     if (complement_b) {
7721         array_b[0] = 1;
7722     }
7723
7724     *output = u;
7725     return;
7726 }
7727
7728 void
7729 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7730 {
7731     /* Take the intersection of two inversion lists and point <i> to it.  *i
7732      * should be defined upon input, and if it points to one of the two lists,
7733      * the reference count to that list will be decremented.
7734      * If <complement_b> is TRUE, the result will be the intersection of <a>
7735      * and the complement (or inversion) of <b> instead of <b> directly.
7736      *
7737      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7738      * Richard Gillam, published by Addison-Wesley, and explained at some
7739      * length there.  The preface says to incorporate its examples into your
7740      * code at your own risk.  In fact, it had bugs
7741      *
7742      * The algorithm is like a merge sort, and is essentially the same as the
7743      * union above
7744      */
7745
7746     UV* array_a;                /* a's array */
7747     UV* array_b;
7748     UV len_a;   /* length of a's array */
7749     UV len_b;
7750
7751     SV* r;                   /* the resulting intersection */
7752     UV* array_r;
7753     UV len_r;
7754
7755     UV i_a = 0;             /* current index into a's array */
7756     UV i_b = 0;
7757     UV i_r = 0;
7758
7759     /* running count, as explained in the algorithm source book; items are
7760      * stopped accumulating and are output when the count changes to/from 2.
7761      * The count is incremented when we start a range that's in the set, and
7762      * decremented when we start a range that's not in the set.  So its range
7763      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7764      */
7765     UV count = 0;
7766
7767     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7768     assert(a != b);
7769
7770     /* Special case if either one is empty */
7771     len_a = _invlist_len(a);
7772     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7773
7774         if (len_a != 0 && complement_b) {
7775
7776             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7777              * be empty.  Here, also we are using 'b's complement, which hence
7778              * must be every possible code point.  Thus the intersection is
7779              * simply 'a'. */
7780             if (*i != a) {
7781                 *i = invlist_clone(a);
7782
7783                 if (*i == b) {
7784                     SvREFCNT_dec(b);
7785                 }
7786             }
7787             /* else *i is already 'a' */
7788             return;
7789         }
7790
7791         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7792          * intersection must be empty */
7793         if (*i == a) {
7794             SvREFCNT_dec(a);
7795         }
7796         else if (*i == b) {
7797             SvREFCNT_dec(b);
7798         }
7799         *i = _new_invlist(0);
7800         return;
7801     }
7802
7803     /* Here both lists exist and are non-empty */
7804     array_a = invlist_array(a);
7805     array_b = invlist_array(b);
7806
7807     /* If are to take the intersection of 'a' with the complement of b, set it
7808      * up so are looking at b's complement. */
7809     if (complement_b) {
7810
7811         /* To complement, we invert: if the first element is 0, remove it.  To
7812          * do this, we just pretend the array starts one later, and clear the
7813          * flag as we don't have to do anything else later */
7814         if (array_b[0] == 0) {
7815             array_b++;
7816             len_b--;
7817             complement_b = FALSE;
7818         }
7819         else {
7820
7821             /* But if the first element is not zero, we unshift a 0 before the
7822              * array.  The data structure reserves a space for that 0 (which
7823              * should be a '1' right now), so physical shifting is unneeded,
7824              * but temporarily change that element to 0.  Before exiting the
7825              * routine, we must restore the element to '1' */
7826             array_b--;
7827             len_b++;
7828             array_b[0] = 0;
7829         }
7830     }
7831
7832     /* Size the intersection for the worst case: that the intersection ends up
7833      * fragmenting everything to be completely disjoint */
7834     r= _new_invlist(len_a + len_b);
7835
7836     /* Will contain U+0000 iff both components do */
7837     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7838                                      && len_b > 0 && array_b[0] == 0);
7839
7840     /* Go through each list item by item, stopping when exhausted one of
7841      * them */
7842     while (i_a < len_a && i_b < len_b) {
7843         UV cp;      /* The element to potentially add to the intersection's
7844                        array */
7845         bool cp_in_set; /* Is it in the input list's set or not */
7846
7847         /* We need to take one or the other of the two inputs for the
7848          * intersection.  Since we are merging two sorted lists, we take the
7849          * smaller of the next items.  In case of a tie, we take the one that
7850          * is not in its set first (a difference from the union algorithm).  If
7851          * we took one in the set first, it would increment the count, possibly
7852          * to 2 which would cause it to be output as starting a range in the
7853          * intersection, and the next time through we would take that same
7854          * number, and output it again as ending the set.  By doing it the
7855          * opposite of this, there is no possibility that the count will be
7856          * momentarily incremented to 2.  (In a tie and both are in the set or
7857          * both not in the set, it doesn't matter which we take first.) */
7858         if (array_a[i_a] < array_b[i_b]
7859             || (array_a[i_a] == array_b[i_b]
7860                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7861         {
7862             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7863             cp= array_a[i_a++];
7864         }
7865         else {
7866             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7867             cp= array_b[i_b++];
7868         }
7869
7870         /* Here, have chosen which of the two inputs to look at.  Only output
7871          * if the running count changes to/from 2, which marks the
7872          * beginning/end of a range that's in the intersection */
7873         if (cp_in_set) {
7874             count++;
7875             if (count == 2) {
7876                 array_r[i_r++] = cp;
7877             }
7878         }
7879         else {
7880             if (count == 2) {
7881                 array_r[i_r++] = cp;
7882             }
7883             count--;
7884         }
7885     }
7886
7887     /* Here, we are finished going through at least one of the lists, which
7888      * means there is something remaining in at most one.  We check if the list
7889      * that has been exhausted is positioned such that we are in the middle
7890      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7891      * the ones we care about.)  There are four cases:
7892      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7893      *     nothing left in the intersection.
7894      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7895      *     above 2.  What should be output is exactly that which is in the
7896      *     non-exhausted set, as everything it has is also in the intersection
7897      *     set, and everything it doesn't have can't be in the intersection
7898      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7899      *     gets incremented to 2.  Like the previous case, the intersection is
7900      *     everything that remains in the non-exhausted set.
7901      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7902      *     remains 1.  And the intersection has nothing more. */
7903     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7904         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7905     {
7906         count++;
7907     }
7908
7909     /* The final length is what we've output so far plus what else is in the
7910      * intersection.  At most one of the subexpressions below will be non-zero */
7911     len_r = i_r;
7912     if (count >= 2) {
7913         len_r += (len_a - i_a) + (len_b - i_b);
7914     }
7915
7916     /* Set result to final length, which can change the pointer to array_r, so
7917      * re-find it */
7918     if (len_r != _invlist_len(r)) {
7919         invlist_set_len(r, len_r);
7920         invlist_trim(r);
7921         array_r = invlist_array(r);
7922     }
7923
7924     /* Finish outputting any remaining */
7925     if (count >= 2) { /* At most one will have a non-zero copy count */
7926         IV copy_count;
7927         if ((copy_count = len_a - i_a) > 0) {
7928             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7929         }
7930         else if ((copy_count = len_b - i_b) > 0) {
7931             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7932         }
7933     }
7934
7935     /*  We may be removing a reference to one of the inputs */
7936     if (a == *i || b == *i) {
7937         SvREFCNT_dec(*i);
7938     }
7939
7940     /* If we've changed b, restore it */
7941     if (complement_b) {
7942         array_b[0] = 1;
7943     }
7944
7945     *i = r;
7946     return;
7947 }
7948
7949 SV*
7950 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7951 {
7952     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7953      * set.  A pointer to the inversion list is returned.  This may actually be
7954      * a new list, in which case the passed in one has been destroyed.  The
7955      * passed in inversion list can be NULL, in which case a new one is created
7956      * with just the one range in it */
7957
7958     SV* range_invlist;
7959     UV len;
7960
7961     if (invlist == NULL) {
7962         invlist = _new_invlist(2);
7963         len = 0;
7964     }
7965     else {
7966         len = _invlist_len(invlist);
7967     }
7968
7969     /* If comes after the final entry, can just append it to the end */
7970     if (len == 0
7971         || start >= invlist_array(invlist)
7972                                     [_invlist_len(invlist) - 1])
7973     {
7974         _append_range_to_invlist(invlist, start, end);
7975         return invlist;
7976     }
7977
7978     /* Here, can't just append things, create and return a new inversion list
7979      * which is the union of this range and the existing inversion list */
7980     range_invlist = _new_invlist(2);
7981     _append_range_to_invlist(range_invlist, start, end);
7982
7983     _invlist_union(invlist, range_invlist, &invlist);
7984
7985     /* The temporary can be freed */
7986     SvREFCNT_dec(range_invlist);
7987
7988     return invlist;
7989 }
7990
7991 #endif
7992
7993 PERL_STATIC_INLINE SV*
7994 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7995     return _add_range_to_invlist(invlist, cp, cp);
7996 }
7997
7998 #ifndef PERL_IN_XSUB_RE
7999 void
8000 Perl__invlist_invert(pTHX_ SV* const invlist)
8001 {
8002     /* Complement the input inversion list.  This adds a 0 if the list didn't
8003      * have a zero; removes it otherwise.  As described above, the data
8004      * structure is set up so that this is very efficient */
8005
8006     UV* len_pos = _get_invlist_len_addr(invlist);
8007
8008     PERL_ARGS_ASSERT__INVLIST_INVERT;
8009
8010     /* The inverse of matching nothing is matching everything */
8011     if (*len_pos == 0) {
8012         _append_range_to_invlist(invlist, 0, UV_MAX);
8013         return;
8014     }
8015
8016     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
8017      * zero element was a 0, so it is being removed, so the length decrements
8018      * by 1; and vice-versa.  SvCUR is unaffected */
8019     if (*get_invlist_zero_addr(invlist) ^= 1) {
8020         (*len_pos)--;
8021     }
8022     else {
8023         (*len_pos)++;
8024     }
8025 }
8026
8027 void
8028 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8029 {
8030     /* Complement the input inversion list (which must be a Unicode property,
8031      * all of which don't match above the Unicode maximum code point.)  And
8032      * Perl has chosen to not have the inversion match above that either.  This
8033      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8034      */
8035
8036     UV len;
8037     UV* array;
8038
8039     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8040
8041     _invlist_invert(invlist);
8042
8043     len = _invlist_len(invlist);
8044
8045     if (len != 0) { /* If empty do nothing */
8046         array = invlist_array(invlist);
8047         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8048             /* Add 0x110000.  First, grow if necessary */
8049             len++;
8050             if (invlist_max(invlist) < len) {
8051                 invlist_extend(invlist, len);
8052                 array = invlist_array(invlist);
8053             }
8054             invlist_set_len(invlist, len);
8055             array[len - 1] = PERL_UNICODE_MAX + 1;
8056         }
8057         else {  /* Remove the 0x110000 */
8058             invlist_set_len(invlist, len - 1);
8059         }
8060     }
8061
8062     return;
8063 }
8064 #endif
8065
8066 PERL_STATIC_INLINE SV*
8067 S_invlist_clone(pTHX_ SV* const invlist)
8068 {
8069
8070     /* Return a new inversion list that is a copy of the input one, which is
8071      * unchanged */
8072
8073     /* Need to allocate extra space to accommodate Perl's addition of a
8074      * trailing NUL to SvPV's, since it thinks they are always strings */
8075     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8076     STRLEN length = SvCUR(invlist);
8077
8078     PERL_ARGS_ASSERT_INVLIST_CLONE;
8079
8080     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8081     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8082
8083     return new_invlist;
8084 }
8085
8086 PERL_STATIC_INLINE UV*
8087 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8088 {
8089     /* Return the address of the UV that contains the current iteration
8090      * position */
8091
8092     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8093
8094     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8095 }
8096
8097 PERL_STATIC_INLINE UV*
8098 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8099 {
8100     /* Return the address of the UV that contains the version id. */
8101
8102     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8103
8104     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8105 }
8106
8107 PERL_STATIC_INLINE void
8108 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8109 {
8110     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8111
8112     *get_invlist_iter_addr(invlist) = 0;
8113 }
8114
8115 STATIC bool
8116 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8117 {
8118     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8119      * This call sets in <*start> and <*end>, the next range in <invlist>.
8120      * Returns <TRUE> if successful and the next call will return the next
8121      * range; <FALSE> if was already at the end of the list.  If the latter,
8122      * <*start> and <*end> are unchanged, and the next call to this function
8123      * will start over at the beginning of the list */
8124
8125     UV* pos = get_invlist_iter_addr(invlist);
8126     UV len = _invlist_len(invlist);
8127     UV *array;
8128
8129     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8130
8131     if (*pos >= len) {
8132         *pos = UV_MAX;  /* Force iternit() to be required next time */
8133         return FALSE;
8134     }
8135
8136     array = invlist_array(invlist);
8137
8138     *start = array[(*pos)++];
8139
8140     if (*pos >= len) {
8141         *end = UV_MAX;
8142     }
8143     else {
8144         *end = array[(*pos)++] - 1;
8145     }
8146
8147     return TRUE;
8148 }
8149
8150 PERL_STATIC_INLINE UV
8151 S_invlist_highest(pTHX_ SV* const invlist)
8152 {
8153     /* Returns the highest code point that matches an inversion list.  This API
8154      * has an ambiguity, as it returns 0 under either the highest is actually
8155      * 0, or if the list is empty.  If this distinction matters to you, check
8156      * for emptiness before calling this function */
8157
8158     UV len = _invlist_len(invlist);
8159     UV *array;
8160
8161     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8162
8163     if (len == 0) {
8164         return 0;
8165     }
8166
8167     array = invlist_array(invlist);
8168
8169     /* The last element in the array in the inversion list always starts a
8170      * range that goes to infinity.  That range may be for code points that are
8171      * matched in the inversion list, or it may be for ones that aren't
8172      * matched.  In the latter case, the highest code point in the set is one
8173      * less than the beginning of this range; otherwise it is the final element
8174      * of this range: infinity */
8175     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8176            ? UV_MAX
8177            : array[len - 1] - 1;
8178 }
8179
8180 #ifndef PERL_IN_XSUB_RE
8181 SV *
8182 Perl__invlist_contents(pTHX_ SV* const invlist)
8183 {
8184     /* Get the contents of an inversion list into a string SV so that they can
8185      * be printed out.  It uses the format traditionally done for debug tracing
8186      */
8187
8188     UV start, end;
8189     SV* output = newSVpvs("\n");
8190
8191     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8192
8193     invlist_iterinit(invlist);
8194     while (invlist_iternext(invlist, &start, &end)) {
8195         if (end == UV_MAX) {
8196             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8197         }
8198         else if (end != start) {
8199             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8200                     start,       end);
8201         }
8202         else {
8203             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8204         }
8205     }
8206
8207     return output;
8208 }
8209 #endif
8210
8211 #if 0
8212 void
8213 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8214 {
8215     /* Dumps out the ranges in an inversion list.  The string 'header'
8216      * if present is output on a line before the first range */
8217
8218     UV start, end;
8219
8220     if (header && strlen(header)) {
8221         PerlIO_printf(Perl_debug_log, "%s\n", header);
8222     }
8223     invlist_iterinit(invlist);
8224     while (invlist_iternext(invlist, &start, &end)) {
8225         if (end == UV_MAX) {
8226             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8227         }
8228         else {
8229             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8230         }
8231     }
8232 }
8233 #endif
8234
8235 #if 0
8236 bool
8237 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8238 {
8239     /* Return a boolean as to if the two passed in inversion lists are
8240      * identical.  The final argument, if TRUE, says to take the complement of
8241      * the second inversion list before doing the comparison */
8242
8243     UV* array_a = invlist_array(a);
8244     UV* array_b = invlist_array(b);
8245     UV len_a = _invlist_len(a);
8246     UV len_b = _invlist_len(b);
8247
8248     UV i = 0;               /* current index into the arrays */
8249     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8250
8251     PERL_ARGS_ASSERT__INVLISTEQ;
8252
8253     /* If are to compare 'a' with the complement of b, set it
8254      * up so are looking at b's complement. */
8255     if (complement_b) {
8256
8257         /* The complement of nothing is everything, so <a> would have to have
8258          * just one element, starting at zero (ending at infinity) */
8259         if (len_b == 0) {
8260             return (len_a == 1 && array_a[0] == 0);
8261         }
8262         else if (array_b[0] == 0) {
8263
8264             /* Otherwise, to complement, we invert.  Here, the first element is
8265              * 0, just remove it.  To do this, we just pretend the array starts
8266              * one later, and clear the flag as we don't have to do anything
8267              * else later */
8268
8269             array_b++;
8270             len_b--;
8271             complement_b = FALSE;
8272         }
8273         else {
8274
8275             /* But if the first element is not zero, we unshift a 0 before the
8276              * array.  The data structure reserves a space for that 0 (which
8277              * should be a '1' right now), so physical shifting is unneeded,
8278              * but temporarily change that element to 0.  Before exiting the
8279              * routine, we must restore the element to '1' */
8280             array_b--;
8281             len_b++;
8282             array_b[0] = 0;
8283         }
8284     }
8285
8286     /* Make sure that the lengths are the same, as well as the final element
8287      * before looping through the remainder.  (Thus we test the length, final,
8288      * and first elements right off the bat) */
8289     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8290         retval = FALSE;
8291     }
8292     else for (i = 0; i < len_a - 1; i++) {
8293         if (array_a[i] != array_b[i]) {
8294             retval = FALSE;
8295             break;
8296         }
8297     }
8298
8299     if (complement_b) {
8300         array_b[0] = 1;
8301     }
8302     return retval;
8303 }
8304 #endif
8305
8306 #undef HEADER_LENGTH
8307 #undef INVLIST_INITIAL_LENGTH
8308 #undef TO_INTERNAL_SIZE
8309 #undef FROM_INTERNAL_SIZE
8310 #undef INVLIST_LEN_OFFSET
8311 #undef INVLIST_ZERO_OFFSET
8312 #undef INVLIST_ITER_OFFSET
8313 #undef INVLIST_VERSION_ID
8314
8315 /* End of inversion list object */
8316
8317 /*
8318  - reg - regular expression, i.e. main body or parenthesized thing
8319  *
8320  * Caller must absorb opening parenthesis.
8321  *
8322  * Combining parenthesis handling with the base level of regular expression
8323  * is a trifle forced, but the need to tie the tails of the branches to what
8324  * follows makes it hard to avoid.
8325  */
8326 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8327 #ifdef DEBUGGING
8328 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8329 #else
8330 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8331 #endif
8332
8333 STATIC regnode *
8334 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8335     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8336 {
8337     dVAR;
8338     regnode *ret;               /* Will be the head of the group. */
8339     regnode *br;
8340     regnode *lastbr;
8341     regnode *ender = NULL;
8342     I32 parno = 0;
8343     I32 flags;
8344     U32 oregflags = RExC_flags;
8345     bool have_branch = 0;
8346     bool is_open = 0;
8347     I32 freeze_paren = 0;
8348     I32 after_freeze = 0;
8349
8350     /* for (?g), (?gc), and (?o) warnings; warning
8351        about (?c) will warn about (?g) -- japhy    */
8352
8353 #define WASTED_O  0x01
8354 #define WASTED_G  0x02
8355 #define WASTED_C  0x04
8356 #define WASTED_GC (0x02|0x04)
8357     I32 wastedflags = 0x00;
8358
8359     char * parse_start = RExC_parse; /* MJD */
8360     char * const oregcomp_parse = RExC_parse;
8361
8362     GET_RE_DEBUG_FLAGS_DECL;
8363
8364     PERL_ARGS_ASSERT_REG;
8365     DEBUG_PARSE("reg ");
8366
8367     *flagp = 0;                         /* Tentatively. */
8368
8369
8370     /* Make an OPEN node, if parenthesized. */
8371     if (paren) {
8372         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8373             char *start_verb = RExC_parse;
8374             STRLEN verb_len = 0;
8375             char *start_arg = NULL;
8376             unsigned char op = 0;
8377             int argok = 1;
8378             int internal_argval = 0; /* internal_argval is only useful if !argok */
8379             while ( *RExC_parse && *RExC_parse != ')' ) {
8380                 if ( *RExC_parse == ':' ) {
8381                     start_arg = RExC_parse + 1;
8382                     break;
8383                 }
8384                 RExC_parse++;
8385             }
8386             ++start_verb;
8387             verb_len = RExC_parse - start_verb;
8388             if ( start_arg ) {
8389                 RExC_parse++;
8390                 while ( *RExC_parse && *RExC_parse != ')' )
8391                     RExC_parse++;
8392                 if ( *RExC_parse != ')' )
8393                     vFAIL("Unterminated verb pattern argument");
8394                 if ( RExC_parse == start_arg )
8395                     start_arg = NULL;
8396             } else {
8397                 if ( *RExC_parse != ')' )
8398                     vFAIL("Unterminated verb pattern");
8399             }
8400
8401             switch ( *start_verb ) {
8402             case 'A':  /* (*ACCEPT) */
8403                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8404                     op = ACCEPT;
8405                     internal_argval = RExC_nestroot;
8406                 }
8407                 break;
8408             case 'C':  /* (*COMMIT) */
8409                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8410                     op = COMMIT;
8411                 break;
8412             case 'F':  /* (*FAIL) */
8413                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8414                     op = OPFAIL;
8415                     argok = 0;
8416                 }
8417                 break;
8418             case ':':  /* (*:NAME) */
8419             case 'M':  /* (*MARK:NAME) */
8420                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8421                     op = MARKPOINT;
8422                     argok = -1;
8423                 }
8424                 break;
8425             case 'P':  /* (*PRUNE) */
8426                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8427                     op = PRUNE;
8428                 break;
8429             case 'S':   /* (*SKIP) */
8430                 if ( memEQs(start_verb,verb_len,"SKIP") )
8431                     op = SKIP;
8432                 break;
8433             case 'T':  /* (*THEN) */
8434                 /* [19:06] <TimToady> :: is then */
8435                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8436                     op = CUTGROUP;
8437                     RExC_seen |= REG_SEEN_CUTGROUP;
8438                 }
8439                 break;
8440             }
8441             if ( ! op ) {
8442                 RExC_parse++;
8443                 vFAIL3("Unknown verb pattern '%.*s'",
8444                     verb_len, start_verb);
8445             }
8446             if ( argok ) {
8447                 if ( start_arg && internal_argval ) {
8448                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8449                         verb_len, start_verb);
8450                 } else if ( argok < 0 && !start_arg ) {
8451                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8452                         verb_len, start_verb);
8453                 } else {
8454                     ret = reganode(pRExC_state, op, internal_argval);
8455                     if ( ! internal_argval && ! SIZE_ONLY ) {
8456                         if (start_arg) {
8457                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8458                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8459                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8460                             ret->flags = 0;
8461                         } else {
8462                             ret->flags = 1;
8463                         }
8464                     }
8465                 }
8466                 if (!internal_argval)
8467                     RExC_seen |= REG_SEEN_VERBARG;
8468             } else if ( start_arg ) {
8469                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8470                         verb_len, start_verb);
8471             } else {
8472                 ret = reg_node(pRExC_state, op);
8473             }
8474             nextchar(pRExC_state);
8475             return ret;
8476         } else
8477         if (*RExC_parse == '?') { /* (?...) */
8478             bool is_logical = 0;
8479             const char * const seqstart = RExC_parse;
8480             bool has_use_defaults = FALSE;
8481
8482             RExC_parse++;
8483             paren = *RExC_parse++;
8484             ret = NULL;                 /* For look-ahead/behind. */
8485             switch (paren) {
8486
8487             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8488                 paren = *RExC_parse++;
8489                 if ( paren == '<')         /* (?P<...>) named capture */
8490                     goto named_capture;
8491                 else if (paren == '>') {   /* (?P>name) named recursion */
8492                     goto named_recursion;
8493                 }
8494                 else if (paren == '=') {   /* (?P=...)  named backref */
8495                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8496                        you change this make sure you change that */
8497                     char* name_start = RExC_parse;
8498                     U32 num = 0;
8499                     SV *sv_dat = reg_scan_name(pRExC_state,
8500                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8501                     if (RExC_parse == name_start || *RExC_parse != ')')
8502                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8503
8504                     if (!SIZE_ONLY) {
8505                         num = add_data( pRExC_state, 1, "S" );
8506                         RExC_rxi->data->data[num]=(void*)sv_dat;
8507                         SvREFCNT_inc_simple_void(sv_dat);
8508                     }
8509                     RExC_sawback = 1;
8510                     ret = reganode(pRExC_state,
8511                                    ((! FOLD)
8512                                      ? NREF
8513                                      : (ASCII_FOLD_RESTRICTED)
8514                                        ? NREFFA
8515                                        : (AT_LEAST_UNI_SEMANTICS)
8516                                          ? NREFFU
8517                                          : (LOC)
8518                                            ? NREFFL
8519                                            : NREFF),
8520                                     num);
8521                     *flagp |= HASWIDTH;
8522
8523                     Set_Node_Offset(ret, parse_start+1);
8524                     Set_Node_Cur_Length(ret); /* MJD */
8525
8526                     nextchar(pRExC_state);
8527                     return ret;
8528                 }
8529                 RExC_parse++;
8530                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8531                 /*NOTREACHED*/
8532             case '<':           /* (?<...) */
8533                 if (*RExC_parse == '!')
8534                     paren = ',';
8535                 else if (*RExC_parse != '=')
8536               named_capture:
8537                 {               /* (?<...>) */
8538                     char *name_start;
8539                     SV *svname;
8540                     paren= '>';
8541             case '\'':          /* (?'...') */
8542                     name_start= RExC_parse;
8543                     svname = reg_scan_name(pRExC_state,
8544                         SIZE_ONLY ?  /* reverse test from the others */
8545                         REG_RSN_RETURN_NAME :
8546                         REG_RSN_RETURN_NULL);
8547                     if (RExC_parse == name_start) {
8548                         RExC_parse++;
8549                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8550                         /*NOTREACHED*/
8551                     }
8552                     if (*RExC_parse != paren)
8553                         vFAIL2("Sequence (?%c... not terminated",
8554                             paren=='>' ? '<' : paren);
8555                     if (SIZE_ONLY) {
8556                         HE *he_str;
8557                         SV *sv_dat = NULL;
8558                         if (!svname) /* shouldn't happen */
8559                             Perl_croak(aTHX_
8560                                 "panic: reg_scan_name returned NULL");
8561                         if (!RExC_paren_names) {
8562                             RExC_paren_names= newHV();
8563                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8564 #ifdef DEBUGGING
8565                             RExC_paren_name_list= newAV();
8566                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8567 #endif
8568                         }
8569                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8570                         if ( he_str )
8571                             sv_dat = HeVAL(he_str);
8572                         if ( ! sv_dat ) {
8573                             /* croak baby croak */
8574                             Perl_croak(aTHX_
8575                                 "panic: paren_name hash element allocation failed");
8576                         } else if ( SvPOK(sv_dat) ) {
8577                             /* (?|...) can mean we have dupes so scan to check
8578                                its already been stored. Maybe a flag indicating
8579                                we are inside such a construct would be useful,
8580                                but the arrays are likely to be quite small, so
8581                                for now we punt -- dmq */
8582                             IV count = SvIV(sv_dat);
8583                             I32 *pv = (I32*)SvPVX(sv_dat);
8584                             IV i;
8585                             for ( i = 0 ; i < count ; i++ ) {
8586                                 if ( pv[i] == RExC_npar ) {
8587                                     count = 0;
8588                                     break;
8589                                 }
8590                             }
8591                             if ( count ) {
8592                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8593                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8594                                 pv[count] = RExC_npar;
8595                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8596                             }
8597                         } else {
8598                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8599                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8600                             SvIOK_on(sv_dat);
8601                             SvIV_set(sv_dat, 1);
8602                         }
8603 #ifdef DEBUGGING
8604                         /* Yes this does cause a memory leak in debugging Perls */
8605                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8606                             SvREFCNT_dec(svname);
8607 #endif
8608
8609                         /*sv_dump(sv_dat);*/
8610                     }
8611                     nextchar(pRExC_state);
8612                     paren = 1;
8613                     goto capturing_parens;
8614                 }
8615                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8616                 RExC_in_lookbehind++;
8617                 RExC_parse++;
8618             case '=':           /* (?=...) */
8619                 RExC_seen_zerolen++;
8620                 break;
8621             case '!':           /* (?!...) */
8622                 RExC_seen_zerolen++;
8623                 if (*RExC_parse == ')') {
8624                     ret=reg_node(pRExC_state, OPFAIL);
8625                     nextchar(pRExC_state);
8626                     return ret;
8627                 }
8628                 break;
8629             case '|':           /* (?|...) */
8630                 /* branch reset, behave like a (?:...) except that
8631                    buffers in alternations share the same numbers */
8632                 paren = ':';
8633                 after_freeze = freeze_paren = RExC_npar;
8634                 break;
8635             case ':':           /* (?:...) */
8636             case '>':           /* (?>...) */
8637                 break;
8638             case '$':           /* (?$...) */
8639             case '@':           /* (?@...) */
8640                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8641                 break;
8642             case '#':           /* (?#...) */
8643                 while (*RExC_parse && *RExC_parse != ')')
8644                     RExC_parse++;
8645                 if (*RExC_parse != ')')
8646                     FAIL("Sequence (?#... not terminated");
8647                 nextchar(pRExC_state);
8648                 *flagp = TRYAGAIN;
8649                 return NULL;
8650             case '0' :           /* (?0) */
8651             case 'R' :           /* (?R) */
8652                 if (*RExC_parse != ')')
8653                     FAIL("Sequence (?R) not terminated");
8654                 ret = reg_node(pRExC_state, GOSTART);
8655                 *flagp |= POSTPONED;
8656                 nextchar(pRExC_state);
8657                 return ret;
8658                 /*notreached*/
8659             { /* named and numeric backreferences */
8660                 I32 num;
8661             case '&':            /* (?&NAME) */
8662                 parse_start = RExC_parse - 1;
8663               named_recursion:
8664                 {
8665                     SV *sv_dat = reg_scan_name(pRExC_state,
8666                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8667                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8668                 }
8669                 goto gen_recurse_regop;
8670                 assert(0); /* NOT REACHED */
8671             case '+':
8672                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8673                     RExC_parse++;
8674                     vFAIL("Illegal pattern");
8675                 }
8676                 goto parse_recursion;
8677                 /* NOT REACHED*/
8678             case '-': /* (?-1) */
8679                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8680                     RExC_parse--; /* rewind to let it be handled later */
8681                     goto parse_flags;
8682                 }
8683                 /*FALLTHROUGH */
8684             case '1': case '2': case '3': case '4': /* (?1) */
8685             case '5': case '6': case '7': case '8': case '9':
8686                 RExC_parse--;
8687               parse_recursion:
8688                 num = atoi(RExC_parse);
8689                 parse_start = RExC_parse - 1; /* MJD */
8690                 if (*RExC_parse == '-')
8691                     RExC_parse++;
8692                 while (isDIGIT(*RExC_parse))
8693                         RExC_parse++;
8694                 if (*RExC_parse!=')')
8695                     vFAIL("Expecting close bracket");
8696
8697               gen_recurse_regop:
8698                 if ( paren == '-' ) {
8699                     /*
8700                     Diagram of capture buffer numbering.
8701                     Top line is the normal capture buffer numbers
8702                     Bottom line is the negative indexing as from
8703                     the X (the (?-2))
8704
8705                     +   1 2    3 4 5 X          6 7
8706                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8707                     -   5 4    3 2 1 X          x x
8708
8709                     */
8710                     num = RExC_npar + num;
8711                     if (num < 1)  {
8712                         RExC_parse++;
8713                         vFAIL("Reference to nonexistent group");
8714                     }
8715                 } else if ( paren == '+' ) {
8716                     num = RExC_npar + num - 1;
8717                 }
8718
8719                 ret = reganode(pRExC_state, GOSUB, num);
8720                 if (!SIZE_ONLY) {
8721                     if (num > (I32)RExC_rx->nparens) {
8722                         RExC_parse++;
8723                         vFAIL("Reference to nonexistent group");
8724                     }
8725                     ARG2L_SET( ret, RExC_recurse_count++);
8726                     RExC_emit++;
8727                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8728                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8729                 } else {
8730                     RExC_size++;
8731                 }
8732                 RExC_seen |= REG_SEEN_RECURSE;
8733                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8734                 Set_Node_Offset(ret, parse_start); /* MJD */
8735
8736                 *flagp |= POSTPONED;
8737                 nextchar(pRExC_state);
8738                 return ret;
8739             } /* named and numeric backreferences */
8740             assert(0); /* NOT REACHED */
8741
8742             case '?':           /* (??...) */
8743                 is_logical = 1;
8744                 if (*RExC_parse != '{') {
8745                     RExC_parse++;
8746                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8747                     /*NOTREACHED*/
8748                 }
8749                 *flagp |= POSTPONED;
8750                 paren = *RExC_parse++;
8751                 /* FALL THROUGH */
8752             case '{':           /* (?{...}) */
8753             {
8754                 U32 n = 0;
8755                 struct reg_code_block *cb;
8756
8757                 RExC_seen_zerolen++;
8758
8759                 if (   !pRExC_state->num_code_blocks
8760                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8761                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8762                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8763                             - RExC_start)
8764                 ) {
8765                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8766                         FAIL("panic: Sequence (?{...}): no code block found\n");
8767                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8768                 }
8769                 /* this is a pre-compiled code block (?{...}) */
8770                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8771                 RExC_parse = RExC_start + cb->end;
8772                 if (!SIZE_ONLY) {
8773                     OP *o = cb->block;
8774                     if (cb->src_regex) {
8775                         n = add_data(pRExC_state, 2, "rl");
8776                         RExC_rxi->data->data[n] =
8777                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8778                         RExC_rxi->data->data[n+1] = (void*)o;
8779                     }
8780                     else {
8781                         n = add_data(pRExC_state, 1,
8782                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8783                         RExC_rxi->data->data[n] = (void*)o;
8784                     }
8785                 }
8786                 pRExC_state->code_index++;
8787                 nextchar(pRExC_state);
8788
8789                 if (is_logical) {
8790                     regnode *eval;
8791                     ret = reg_node(pRExC_state, LOGICAL);
8792                     eval = reganode(pRExC_state, EVAL, n);
8793                     if (!SIZE_ONLY) {
8794                         ret->flags = 2;
8795                         /* for later propagation into (??{}) return value */
8796                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8797                     }
8798                     REGTAIL(pRExC_state, ret, eval);
8799                     /* deal with the length of this later - MJD */
8800                     return ret;
8801                 }
8802                 ret = reganode(pRExC_state, EVAL, n);
8803                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8804                 Set_Node_Offset(ret, parse_start);
8805                 return ret;
8806             }
8807             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8808             {
8809                 int is_define= 0;
8810                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8811                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8812                         || RExC_parse[1] == '<'
8813                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8814                         I32 flag;
8815
8816                         ret = reg_node(pRExC_state, LOGICAL);
8817                         if (!SIZE_ONLY)
8818                             ret->flags = 1;
8819                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8820                         goto insert_if;
8821                     }
8822                 }
8823                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8824                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8825                 {
8826                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8827                     char *name_start= RExC_parse++;
8828                     U32 num = 0;
8829                     SV *sv_dat=reg_scan_name(pRExC_state,
8830                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8831                     if (RExC_parse == name_start || *RExC_parse != ch)
8832                         vFAIL2("Sequence (?(%c... not terminated",
8833                             (ch == '>' ? '<' : ch));
8834                     RExC_parse++;
8835                     if (!SIZE_ONLY) {
8836                         num = add_data( pRExC_state, 1, "S" );
8837                         RExC_rxi->data->data[num]=(void*)sv_dat;
8838                         SvREFCNT_inc_simple_void(sv_dat);
8839                     }
8840                     ret = reganode(pRExC_state,NGROUPP,num);
8841                     goto insert_if_check_paren;
8842                 }
8843                 else if (RExC_parse[0] == 'D' &&
8844                          RExC_parse[1] == 'E' &&
8845                          RExC_parse[2] == 'F' &&
8846                          RExC_parse[3] == 'I' &&
8847                          RExC_parse[4] == 'N' &&
8848                          RExC_parse[5] == 'E')
8849                 {
8850                     ret = reganode(pRExC_state,DEFINEP,0);
8851                     RExC_parse +=6 ;
8852                     is_define = 1;
8853                     goto insert_if_check_paren;
8854                 }
8855                 else if (RExC_parse[0] == 'R') {
8856                     RExC_parse++;
8857                     parno = 0;
8858                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8859                         parno = atoi(RExC_parse++);
8860                         while (isDIGIT(*RExC_parse))
8861                             RExC_parse++;
8862                     } else if (RExC_parse[0] == '&') {
8863                         SV *sv_dat;
8864                         RExC_parse++;
8865                         sv_dat = reg_scan_name(pRExC_state,
8866                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8867                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8868                     }
8869                     ret = reganode(pRExC_state,INSUBP,parno);
8870                     goto insert_if_check_paren;
8871                 }
8872                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8873                     /* (?(1)...) */
8874                     char c;
8875                     parno = atoi(RExC_parse++);
8876
8877                     while (isDIGIT(*RExC_parse))
8878                         RExC_parse++;
8879                     ret = reganode(pRExC_state, GROUPP, parno);
8880
8881                  insert_if_check_paren:
8882                     if ((c = *nextchar(pRExC_state)) != ')')
8883                         vFAIL("Switch condition not recognized");
8884                   insert_if:
8885                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8886                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8887                     if (br == NULL)
8888                         br = reganode(pRExC_state, LONGJMP, 0);
8889                     else
8890                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8891                     c = *nextchar(pRExC_state);
8892                     if (flags&HASWIDTH)
8893                         *flagp |= HASWIDTH;
8894                     if (c == '|') {
8895                         if (is_define)
8896                             vFAIL("(?(DEFINE)....) does not allow branches");
8897                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8898                         regbranch(pRExC_state, &flags, 1,depth+1);
8899                         REGTAIL(pRExC_state, ret, lastbr);
8900                         if (flags&HASWIDTH)
8901                             *flagp |= HASWIDTH;
8902                         c = *nextchar(pRExC_state);
8903                     }
8904                     else
8905                         lastbr = NULL;
8906                     if (c != ')')
8907                         vFAIL("Switch (?(condition)... contains too many branches");
8908                     ender = reg_node(pRExC_state, TAIL);
8909                     REGTAIL(pRExC_state, br, ender);
8910                     if (lastbr) {
8911                         REGTAIL(pRExC_state, lastbr, ender);
8912                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8913                     }
8914                     else
8915                         REGTAIL(pRExC_state, ret, ender);
8916                     RExC_size++; /* XXX WHY do we need this?!!
8917                                     For large programs it seems to be required
8918                                     but I can't figure out why. -- dmq*/
8919                     return ret;
8920                 }
8921                 else {
8922                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8923                 }
8924             }
8925             case 0:
8926                 RExC_parse--; /* for vFAIL to print correctly */
8927                 vFAIL("Sequence (? incomplete");
8928                 break;
8929             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8930                                        that follow */
8931                 has_use_defaults = TRUE;
8932                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8933                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8934                                                 ? REGEX_UNICODE_CHARSET
8935                                                 : REGEX_DEPENDS_CHARSET);
8936                 goto parse_flags;
8937             default:
8938                 --RExC_parse;
8939                 parse_flags:      /* (?i) */
8940             {
8941                 U32 posflags = 0, negflags = 0;
8942                 U32 *flagsp = &posflags;
8943                 char has_charset_modifier = '\0';
8944                 regex_charset cs = get_regex_charset(RExC_flags);
8945                 if (cs == REGEX_DEPENDS_CHARSET
8946                     && (RExC_utf8 || RExC_uni_semantics))
8947                 {
8948                     cs = REGEX_UNICODE_CHARSET;
8949                 }
8950
8951                 while (*RExC_parse) {
8952                     /* && strchr("iogcmsx", *RExC_parse) */
8953                     /* (?g), (?gc) and (?o) are useless here
8954                        and must be globally applied -- japhy */
8955                     switch (*RExC_parse) {
8956                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8957                     case LOCALE_PAT_MOD:
8958                         if (has_charset_modifier) {
8959                             goto excess_modifier;
8960                         }
8961                         else if (flagsp == &negflags) {
8962                             goto neg_modifier;
8963                         }
8964                         cs = REGEX_LOCALE_CHARSET;
8965                         has_charset_modifier = LOCALE_PAT_MOD;
8966                         RExC_contains_locale = 1;
8967                         break;
8968                     case UNICODE_PAT_MOD:
8969                         if (has_charset_modifier) {
8970                             goto excess_modifier;
8971                         }
8972                         else if (flagsp == &negflags) {
8973                             goto neg_modifier;
8974                         }
8975                         cs = REGEX_UNICODE_CHARSET;
8976                         has_charset_modifier = UNICODE_PAT_MOD;
8977                         break;
8978                     case ASCII_RESTRICT_PAT_MOD:
8979                         if (flagsp == &negflags) {
8980                             goto neg_modifier;
8981                         }
8982                         if (has_charset_modifier) {
8983                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8984                                 goto excess_modifier;
8985                             }
8986                             /* Doubled modifier implies more restricted */
8987                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8988                         }
8989                         else {
8990                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8991                         }
8992                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8993                         break;
8994                     case DEPENDS_PAT_MOD:
8995                         if (has_use_defaults) {
8996                             goto fail_modifiers;
8997                         }
8998                         else if (flagsp == &negflags) {
8999                             goto neg_modifier;
9000                         }
9001                         else if (has_charset_modifier) {
9002                             goto excess_modifier;
9003                         }
9004
9005                         /* The dual charset means unicode semantics if the
9006                          * pattern (or target, not known until runtime) are
9007                          * utf8, or something in the pattern indicates unicode
9008                          * semantics */
9009                         cs = (RExC_utf8 || RExC_uni_semantics)
9010                              ? REGEX_UNICODE_CHARSET
9011                              : REGEX_DEPENDS_CHARSET;
9012                         has_charset_modifier = DEPENDS_PAT_MOD;
9013                         break;
9014                     excess_modifier:
9015                         RExC_parse++;
9016                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9017                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9018                         }
9019                         else if (has_charset_modifier == *(RExC_parse - 1)) {
9020                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9021                         }
9022                         else {
9023                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9024                         }
9025                         /*NOTREACHED*/
9026                     neg_modifier:
9027                         RExC_parse++;
9028                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9029                         /*NOTREACHED*/
9030                     case ONCE_PAT_MOD: /* 'o' */
9031                     case GLOBAL_PAT_MOD: /* 'g' */
9032                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9033                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9034                             if (! (wastedflags & wflagbit) ) {
9035                                 wastedflags |= wflagbit;
9036                                 vWARN5(
9037                                     RExC_parse + 1,
9038                                     "Useless (%s%c) - %suse /%c modifier",
9039                                     flagsp == &negflags ? "?-" : "?",
9040                                     *RExC_parse,
9041                                     flagsp == &negflags ? "don't " : "",
9042                                     *RExC_parse
9043                                 );
9044                             }
9045                         }
9046                         break;
9047
9048                     case CONTINUE_PAT_MOD: /* 'c' */
9049                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9050                             if (! (wastedflags & WASTED_C) ) {
9051                                 wastedflags |= WASTED_GC;
9052                                 vWARN3(
9053                                     RExC_parse + 1,
9054                                     "Useless (%sc) - %suse /gc modifier",
9055                                     flagsp == &negflags ? "?-" : "?",
9056                                     flagsp == &negflags ? "don't " : ""
9057                                 );
9058                             }
9059                         }
9060                         break;
9061                     case KEEPCOPY_PAT_MOD: /* 'p' */
9062                         if (flagsp == &negflags) {
9063                             if (SIZE_ONLY)
9064                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9065                         } else {
9066                             *flagsp |= RXf_PMf_KEEPCOPY;
9067                         }
9068                         break;
9069                     case '-':
9070                         /* A flag is a default iff it is following a minus, so
9071                          * if there is a minus, it means will be trying to
9072                          * re-specify a default which is an error */
9073                         if (has_use_defaults || flagsp == &negflags) {
9074             fail_modifiers:
9075                             RExC_parse++;
9076                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9077                             /*NOTREACHED*/
9078                         }
9079                         flagsp = &negflags;
9080                         wastedflags = 0;  /* reset so (?g-c) warns twice */
9081                         break;
9082                     case ':':
9083                         paren = ':';
9084                         /*FALLTHROUGH*/
9085                     case ')':
9086                         RExC_flags |= posflags;
9087                         RExC_flags &= ~negflags;
9088                         set_regex_charset(&RExC_flags, cs);
9089                         if (paren != ':') {
9090                             oregflags |= posflags;
9091                             oregflags &= ~negflags;
9092                             set_regex_charset(&oregflags, cs);
9093                         }
9094                         nextchar(pRExC_state);
9095                         if (paren != ':') {
9096                             *flagp = TRYAGAIN;
9097                             return NULL;
9098                         } else {
9099                             ret = NULL;
9100                             goto parse_rest;
9101                         }
9102                         /*NOTREACHED*/
9103                     default:
9104                         RExC_parse++;
9105                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9106                         /*NOTREACHED*/
9107                     }
9108                     ++RExC_parse;
9109                 }
9110             }} /* one for the default block, one for the switch */
9111         }
9112         else {                  /* (...) */
9113           capturing_parens:
9114             parno = RExC_npar;
9115             RExC_npar++;
9116
9117             ret = reganode(pRExC_state, OPEN, parno);
9118             if (!SIZE_ONLY ){
9119                 if (!RExC_nestroot)
9120                     RExC_nestroot = parno;
9121                 if (RExC_seen & REG_SEEN_RECURSE
9122                     && !RExC_open_parens[parno-1])
9123                 {
9124                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9125                         "Setting open paren #%"IVdf" to %d\n",
9126                         (IV)parno, REG_NODE_NUM(ret)));
9127                     RExC_open_parens[parno-1]= ret;
9128                 }
9129             }
9130             Set_Node_Length(ret, 1); /* MJD */
9131             Set_Node_Offset(ret, RExC_parse); /* MJD */
9132             is_open = 1;
9133         }
9134     }
9135     else                        /* ! paren */
9136         ret = NULL;
9137
9138    parse_rest:
9139     /* Pick up the branches, linking them together. */
9140     parse_start = RExC_parse;   /* MJD */
9141     br = regbranch(pRExC_state, &flags, 1,depth+1);
9142
9143     /*     branch_len = (paren != 0); */
9144
9145     if (br == NULL)
9146         return(NULL);
9147     if (*RExC_parse == '|') {
9148         if (!SIZE_ONLY && RExC_extralen) {
9149             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9150         }
9151         else {                  /* MJD */
9152             reginsert(pRExC_state, BRANCH, br, depth+1);
9153             Set_Node_Length(br, paren != 0);
9154             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9155         }
9156         have_branch = 1;
9157         if (SIZE_ONLY)
9158             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9159     }
9160     else if (paren == ':') {
9161         *flagp |= flags&SIMPLE;
9162     }
9163     if (is_open) {                              /* Starts with OPEN. */
9164         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9165     }
9166     else if (paren != '?')              /* Not Conditional */
9167         ret = br;
9168     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9169     lastbr = br;
9170     while (*RExC_parse == '|') {
9171         if (!SIZE_ONLY && RExC_extralen) {
9172             ender = reganode(pRExC_state, LONGJMP,0);
9173             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9174         }
9175         if (SIZE_ONLY)
9176             RExC_extralen += 2;         /* Account for LONGJMP. */
9177         nextchar(pRExC_state);
9178         if (freeze_paren) {
9179             if (RExC_npar > after_freeze)
9180                 after_freeze = RExC_npar;
9181             RExC_npar = freeze_paren;
9182         }
9183         br = regbranch(pRExC_state, &flags, 0, depth+1);
9184
9185         if (br == NULL)
9186             return(NULL);
9187         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9188         lastbr = br;
9189         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9190     }
9191
9192     if (have_branch || paren != ':') {
9193         /* Make a closing node, and hook it on the end. */
9194         switch (paren) {
9195         case ':':
9196             ender = reg_node(pRExC_state, TAIL);
9197             break;
9198         case 1:
9199             ender = reganode(pRExC_state, CLOSE, parno);
9200             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9201                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9202                         "Setting close paren #%"IVdf" to %d\n",
9203                         (IV)parno, REG_NODE_NUM(ender)));
9204                 RExC_close_parens[parno-1]= ender;
9205                 if (RExC_nestroot == parno)
9206                     RExC_nestroot = 0;
9207             }
9208             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9209             Set_Node_Length(ender,1); /* MJD */
9210             break;
9211         case '<':
9212         case ',':
9213         case '=':
9214         case '!':
9215             *flagp &= ~HASWIDTH;
9216             /* FALL THROUGH */
9217         case '>':
9218             ender = reg_node(pRExC_state, SUCCEED);
9219             break;
9220         case 0:
9221             ender = reg_node(pRExC_state, END);
9222             if (!SIZE_ONLY) {
9223                 assert(!RExC_opend); /* there can only be one! */
9224                 RExC_opend = ender;
9225             }
9226             break;
9227         }
9228         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9229             SV * const mysv_val1=sv_newmortal();
9230             SV * const mysv_val2=sv_newmortal();
9231             DEBUG_PARSE_MSG("lsbr");
9232             regprop(RExC_rx, mysv_val1, lastbr);
9233             regprop(RExC_rx, mysv_val2, ender);
9234             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9235                           SvPV_nolen_const(mysv_val1),
9236                           (IV)REG_NODE_NUM(lastbr),
9237                           SvPV_nolen_const(mysv_val2),
9238                           (IV)REG_NODE_NUM(ender),
9239                           (IV)(ender - lastbr)
9240             );
9241         });
9242         REGTAIL(pRExC_state, lastbr, ender);
9243
9244         if (have_branch && !SIZE_ONLY) {
9245             char is_nothing= 1;
9246             if (depth==1)
9247                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9248
9249             /* Hook the tails of the branches to the closing node. */
9250             for (br = ret; br; br = regnext(br)) {
9251                 const U8 op = PL_regkind[OP(br)];
9252                 if (op == BRANCH) {
9253                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9254                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9255                         is_nothing= 0;
9256                 }
9257                 else if (op == BRANCHJ) {
9258                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9259                     /* for now we always disable this optimisation * /
9260                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9261                     */
9262                         is_nothing= 0;
9263                 }
9264             }
9265             if (is_nothing) {
9266                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9267                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9268                     SV * const mysv_val1=sv_newmortal();
9269                     SV * const mysv_val2=sv_newmortal();
9270                     DEBUG_PARSE_MSG("NADA");
9271                     regprop(RExC_rx, mysv_val1, ret);
9272                     regprop(RExC_rx, mysv_val2, ender);
9273                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9274                                   SvPV_nolen_const(mysv_val1),
9275                                   (IV)REG_NODE_NUM(ret),
9276                                   SvPV_nolen_const(mysv_val2),
9277                                   (IV)REG_NODE_NUM(ender),
9278                                   (IV)(ender - ret)
9279                     );
9280                 });
9281                 OP(br)= NOTHING;
9282                 if (OP(ender) == TAIL) {
9283                     NEXT_OFF(br)= 0;
9284                     RExC_emit= br + 1;
9285                 } else {
9286                     regnode *opt;
9287                     for ( opt= br + 1; opt < ender ; opt++ )
9288                         OP(opt)= OPTIMIZED;
9289                     NEXT_OFF(br)= ender - br;
9290                 }
9291             }
9292         }
9293     }
9294
9295     {
9296         const char *p;
9297         static const char parens[] = "=!<,>";
9298
9299         if (paren && (p = strchr(parens, paren))) {
9300             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9301             int flag = (p - parens) > 1;
9302
9303             if (paren == '>')
9304                 node = SUSPEND, flag = 0;
9305             reginsert(pRExC_state, node,ret, depth+1);
9306             Set_Node_Cur_Length(ret);
9307             Set_Node_Offset(ret, parse_start + 1);
9308             ret->flags = flag;
9309             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9310         }
9311     }
9312
9313     /* Check for proper termination. */
9314     if (paren) {
9315         RExC_flags = oregflags;
9316         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9317             RExC_parse = oregcomp_parse;
9318             vFAIL("Unmatched (");
9319         }
9320     }
9321     else if (!paren && RExC_parse < RExC_end) {
9322         if (*RExC_parse == ')') {
9323             RExC_parse++;
9324             vFAIL("Unmatched )");
9325         }
9326         else
9327             FAIL("Junk on end of regexp");      /* "Can't happen". */
9328         assert(0); /* NOTREACHED */
9329     }
9330
9331     if (RExC_in_lookbehind) {
9332         RExC_in_lookbehind--;
9333     }
9334     if (after_freeze > RExC_npar)
9335         RExC_npar = after_freeze;
9336     return(ret);
9337 }
9338
9339 /*
9340  - regbranch - one alternative of an | operator
9341  *
9342  * Implements the concatenation operator.
9343  */
9344 STATIC regnode *
9345 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9346 {
9347     dVAR;
9348     regnode *ret;
9349     regnode *chain = NULL;
9350     regnode *latest;
9351     I32 flags = 0, c = 0;
9352     GET_RE_DEBUG_FLAGS_DECL;
9353
9354     PERL_ARGS_ASSERT_REGBRANCH;
9355
9356     DEBUG_PARSE("brnc");
9357
9358     if (first)
9359         ret = NULL;
9360     else {
9361         if (!SIZE_ONLY && RExC_extralen)
9362             ret = reganode(pRExC_state, BRANCHJ,0);
9363         else {
9364             ret = reg_node(pRExC_state, BRANCH);
9365             Set_Node_Length(ret, 1);
9366         }
9367     }
9368
9369     if (!first && SIZE_ONLY)
9370         RExC_extralen += 1;                     /* BRANCHJ */
9371
9372     *flagp = WORST;                     /* Tentatively. */
9373
9374     RExC_parse--;
9375     nextchar(pRExC_state);
9376     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9377         flags &= ~TRYAGAIN;
9378         latest = regpiece(pRExC_state, &flags,depth+1);
9379         if (latest == NULL) {
9380             if (flags & TRYAGAIN)
9381                 continue;
9382             return(NULL);
9383         }
9384         else if (ret == NULL)
9385             ret = latest;
9386         *flagp |= flags&(HASWIDTH|POSTPONED);
9387         if (chain == NULL)      /* First piece. */
9388             *flagp |= flags&SPSTART;
9389         else {
9390             RExC_naughty++;
9391             REGTAIL(pRExC_state, chain, latest);
9392         }
9393         chain = latest;
9394         c++;
9395     }
9396     if (chain == NULL) {        /* Loop ran zero times. */
9397         chain = reg_node(pRExC_state, NOTHING);
9398         if (ret == NULL)
9399             ret = chain;
9400     }
9401     if (c == 1) {
9402         *flagp |= flags&SIMPLE;
9403     }
9404
9405     return ret;
9406 }
9407
9408 /*
9409  - regpiece - something followed by possible [*+?]
9410  *
9411  * Note that the branching code sequences used for ? and the general cases
9412  * of * and + are somewhat optimized:  they use the same NOTHING node as
9413  * both the endmarker for their branch list and the body of the last branch.
9414  * It might seem that this node could be dispensed with entirely, but the
9415  * endmarker role is not redundant.
9416  */
9417 STATIC regnode *
9418 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9419 {
9420     dVAR;
9421     regnode *ret;
9422     char op;
9423     char *next;
9424     I32 flags;
9425     const char * const origparse = RExC_parse;
9426     I32 min;
9427     I32 max = REG_INFTY;
9428 #ifdef RE_TRACK_PATTERN_OFFSETS
9429     char *parse_start;
9430 #endif
9431     const char *maxpos = NULL;
9432
9433     /* Save the original in case we change the emitted regop to a FAIL. */
9434     regnode * const orig_emit = RExC_emit;
9435
9436     GET_RE_DEBUG_FLAGS_DECL;
9437
9438     PERL_ARGS_ASSERT_REGPIECE;
9439
9440     DEBUG_PARSE("piec");
9441
9442     ret = regatom(pRExC_state, &flags,depth+1);
9443     if (ret == NULL) {
9444         if (flags & TRYAGAIN)
9445             *flagp |= TRYAGAIN;
9446         return(NULL);
9447     }
9448
9449     op = *RExC_parse;
9450
9451     if (op == '{' && regcurly(RExC_parse)) {
9452         maxpos = NULL;
9453 #ifdef RE_TRACK_PATTERN_OFFSETS
9454         parse_start = RExC_parse; /* MJD */
9455 #endif
9456         next = RExC_parse + 1;
9457         while (isDIGIT(*next) || *next == ',') {
9458             if (*next == ',') {
9459                 if (maxpos)
9460                     break;
9461                 else
9462                     maxpos = next;
9463             }
9464             next++;
9465         }
9466         if (*next == '}') {             /* got one */
9467             if (!maxpos)
9468                 maxpos = next;
9469             RExC_parse++;
9470             min = atoi(RExC_parse);
9471             if (*maxpos == ',')
9472                 maxpos++;
9473             else
9474                 maxpos = RExC_parse;
9475             max = atoi(maxpos);
9476             if (!max && *maxpos != '0')
9477                 max = REG_INFTY;                /* meaning "infinity" */
9478             else if (max >= REG_INFTY)
9479                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9480             RExC_parse = next;
9481             nextchar(pRExC_state);
9482             if (max < min) {    /* If can't match, warn and optimize to fail
9483                                    unconditionally */
9484                 if (SIZE_ONLY) {
9485                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9486
9487                     /* We can't back off the size because we have to reserve
9488                      * enough space for all the things we are about to throw
9489                      * away, but we can shrink it by the ammount we are about
9490                      * to re-use here */
9491                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9492                 }
9493                 else {
9494                     RExC_emit = orig_emit;
9495                 }
9496                 ret = reg_node(pRExC_state, OPFAIL);
9497                 return ret;
9498             }
9499
9500         do_curly:
9501             if ((flags&SIMPLE)) {
9502                 RExC_naughty += 2 + RExC_naughty / 2;
9503                 reginsert(pRExC_state, CURLY, ret, depth+1);
9504                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9505                 Set_Node_Cur_Length(ret);
9506             }
9507             else {
9508                 regnode * const w = reg_node(pRExC_state, WHILEM);
9509
9510                 w->flags = 0;
9511                 REGTAIL(pRExC_state, ret, w);
9512                 if (!SIZE_ONLY && RExC_extralen) {
9513                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9514                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9515                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9516                 }
9517                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9518                                 /* MJD hk */
9519                 Set_Node_Offset(ret, parse_start+1);
9520                 Set_Node_Length(ret,
9521                                 op == '{' ? (RExC_parse - parse_start) : 1);
9522
9523                 if (!SIZE_ONLY && RExC_extralen)
9524                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9525                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9526                 if (SIZE_ONLY)
9527                     RExC_whilem_seen++, RExC_extralen += 3;
9528                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9529             }
9530             ret->flags = 0;
9531
9532             if (min > 0)
9533                 *flagp = WORST;
9534             if (max > 0)
9535                 *flagp |= HASWIDTH;
9536             if (!SIZE_ONLY) {
9537                 ARG1_SET(ret, (U16)min);
9538                 ARG2_SET(ret, (U16)max);
9539             }
9540
9541             goto nest_check;
9542         }
9543     }
9544
9545     if (!ISMULT1(op)) {
9546         *flagp = flags;
9547         return(ret);
9548     }
9549
9550 #if 0                           /* Now runtime fix should be reliable. */
9551
9552     /* if this is reinstated, don't forget to put this back into perldiag:
9553
9554             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9555
9556            (F) The part of the regexp subject to either the * or + quantifier
9557            could match an empty string. The {#} shows in the regular
9558            expression about where the problem was discovered.
9559
9560     */
9561
9562     if (!(flags&HASWIDTH) && op != '?')
9563       vFAIL("Regexp *+ operand could be empty");
9564 #endif
9565
9566 #ifdef RE_TRACK_PATTERN_OFFSETS
9567     parse_start = RExC_parse;
9568 #endif
9569     nextchar(pRExC_state);
9570
9571     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9572
9573     if (op == '*' && (flags&SIMPLE)) {
9574         reginsert(pRExC_state, STAR, ret, depth+1);
9575         ret->flags = 0;
9576         RExC_naughty += 4;
9577     }
9578     else if (op == '*') {
9579         min = 0;
9580         goto do_curly;
9581     }
9582     else if (op == '+' && (flags&SIMPLE)) {
9583         reginsert(pRExC_state, PLUS, ret, depth+1);
9584         ret->flags = 0;
9585         RExC_naughty += 3;
9586     }
9587     else if (op == '+') {
9588         min = 1;
9589         goto do_curly;
9590     }
9591     else if (op == '?') {
9592         min = 0; max = 1;
9593         goto do_curly;
9594     }
9595   nest_check:
9596     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9597         ckWARN3reg(RExC_parse,
9598                    "%.*s matches null string many times",
9599                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9600                    origparse);
9601     }
9602
9603     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9604         nextchar(pRExC_state);
9605         reginsert(pRExC_state, MINMOD, ret, depth+1);
9606         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9607     }
9608 #ifndef REG_ALLOW_MINMOD_SUSPEND
9609     else
9610 #endif
9611     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9612         regnode *ender;
9613         nextchar(pRExC_state);
9614         ender = reg_node(pRExC_state, SUCCEED);
9615         REGTAIL(pRExC_state, ret, ender);
9616         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9617         ret->flags = 0;
9618         ender = reg_node(pRExC_state, TAIL);
9619         REGTAIL(pRExC_state, ret, ender);
9620         /*ret= ender;*/
9621     }
9622
9623     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9624         RExC_parse++;
9625         vFAIL("Nested quantifiers");
9626     }
9627
9628     return(ret);
9629 }
9630
9631 STATIC bool
9632 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9633 {
9634
9635  /* This is expected to be called by a parser routine that has recognized '\N'
9636    and needs to handle the rest. RExC_parse is expected to point at the first
9637    char following the N at the time of the call.  On successful return,
9638    RExC_parse has been updated to point to just after the sequence identified
9639    by this routine, and <*flagp> has been updated.
9640
9641    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9642    character class.
9643
9644    \N may begin either a named sequence, or if outside a character class, mean
9645    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9646    attempted to decide which, and in the case of a named sequence, converted it
9647    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9648    where c1... are the characters in the sequence.  For single-quoted regexes,
9649    the tokenizer passes the \N sequence through unchanged; this code will not
9650    attempt to determine this nor expand those, instead raising a syntax error.
9651    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9652    or there is no '}', it signals that this \N occurrence means to match a
9653    non-newline.
9654
9655    Only the \N{U+...} form should occur in a character class, for the same
9656    reason that '.' inside a character class means to just match a period: it
9657    just doesn't make sense.
9658
9659    The function raises an error (via vFAIL), and doesn't return for various
9660    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9661    success; it returns FALSE otherwise.
9662
9663    If <valuep> is non-null, it means the caller can accept an input sequence
9664    consisting of a just a single code point; <*valuep> is set to that value
9665    if the input is such.
9666
9667    If <node_p> is non-null it signifies that the caller can accept any other
9668    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9669    is set as follows:
9670     1) \N means not-a-NL: points to a newly created REG_ANY node;
9671     2) \N{}:              points to a new NOTHING node;
9672     3) otherwise:         points to a new EXACT node containing the resolved
9673                           string.
9674    Note that FALSE is returned for single code point sequences if <valuep> is
9675    null.
9676  */
9677
9678     char * endbrace;    /* '}' following the name */
9679     char* p;
9680     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9681                            stream */
9682     bool has_multiple_chars; /* true if the input stream contains a sequence of
9683                                 more than one character */
9684
9685     GET_RE_DEBUG_FLAGS_DECL;
9686
9687     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9688
9689     GET_RE_DEBUG_FLAGS;
9690
9691     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9692
9693     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9694      * modifier.  The other meaning does not */
9695     p = (RExC_flags & RXf_PMf_EXTENDED)
9696         ? regwhite( pRExC_state, RExC_parse )
9697         : RExC_parse;
9698
9699     /* Disambiguate between \N meaning a named character versus \N meaning
9700      * [^\n].  The former is assumed when it can't be the latter. */
9701     if (*p != '{' || regcurly(p)) {
9702         RExC_parse = p;
9703         if (! node_p) {
9704             /* no bare \N in a charclass */
9705             if (in_char_class) {
9706                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9707             }
9708             return FALSE;
9709         }
9710         nextchar(pRExC_state);
9711         *node_p = reg_node(pRExC_state, REG_ANY);
9712         *flagp |= HASWIDTH|SIMPLE;
9713         RExC_naughty++;
9714         RExC_parse--;
9715         Set_Node_Length(*node_p, 1); /* MJD */
9716         return TRUE;
9717     }
9718
9719     /* Here, we have decided it should be a named character or sequence */
9720
9721     /* The test above made sure that the next real character is a '{', but
9722      * under the /x modifier, it could be separated by space (or a comment and
9723      * \n) and this is not allowed (for consistency with \x{...} and the
9724      * tokenizer handling of \N{NAME}). */
9725     if (*RExC_parse != '{') {
9726         vFAIL("Missing braces on \\N{}");
9727     }
9728
9729     RExC_parse++;       /* Skip past the '{' */
9730
9731     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9732         || ! (endbrace == RExC_parse            /* nothing between the {} */
9733               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9734                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9735     {
9736         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9737         vFAIL("\\N{NAME} must be resolved by the lexer");
9738     }
9739
9740     if (endbrace == RExC_parse) {   /* empty: \N{} */
9741         bool ret = TRUE;
9742         if (node_p) {
9743             *node_p = reg_node(pRExC_state,NOTHING);
9744         }
9745         else if (in_char_class) {
9746             if (SIZE_ONLY && in_char_class) {
9747                 ckWARNreg(RExC_parse,
9748                         "Ignoring zero length \\N{} in character class"
9749                 );
9750             }
9751             ret = FALSE;
9752         }
9753         else {
9754             return FALSE;
9755         }
9756         nextchar(pRExC_state);
9757         return ret;
9758     }
9759
9760     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9761     RExC_parse += 2;    /* Skip past the 'U+' */
9762
9763     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9764
9765     /* Code points are separated by dots.  If none, there is only one code
9766      * point, and is terminated by the brace */
9767     has_multiple_chars = (endchar < endbrace);
9768
9769     if (valuep && (! has_multiple_chars || in_char_class)) {
9770         /* We only pay attention to the first char of
9771         multichar strings being returned in char classes. I kinda wonder
9772         if this makes sense as it does change the behaviour
9773         from earlier versions, OTOH that behaviour was broken
9774         as well. XXX Solution is to recharacterize as
9775         [rest-of-class]|multi1|multi2... */
9776
9777         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9778         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9779             | PERL_SCAN_DISALLOW_PREFIX
9780             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9781
9782         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9783
9784         /* The tokenizer should have guaranteed validity, but it's possible to
9785          * bypass it by using single quoting, so check */
9786         if (length_of_hex == 0
9787             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9788         {
9789             RExC_parse += length_of_hex;        /* Includes all the valid */
9790             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9791                             ? UTF8SKIP(RExC_parse)
9792                             : 1;
9793             /* Guard against malformed utf8 */
9794             if (RExC_parse >= endchar) {
9795                 RExC_parse = endchar;
9796             }
9797             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9798         }
9799
9800         if (in_char_class && has_multiple_chars) {
9801             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9802         }
9803
9804         RExC_parse = endbrace + 1;
9805     }
9806     else if (! node_p || ! has_multiple_chars) {
9807
9808         /* Here, the input is legal, but not according to the caller's
9809          * options.  We fail without advancing the parse, so that the
9810          * caller can try again */
9811         RExC_parse = p;
9812         return FALSE;
9813     }
9814     else {
9815
9816         /* What is done here is to convert this to a sub-pattern of the form
9817          * (?:\x{char1}\x{char2}...)
9818          * and then call reg recursively.  That way, it retains its atomicness,
9819          * while not having to worry about special handling that some code
9820          * points may have.  toke.c has converted the original Unicode values
9821          * to native, so that we can just pass on the hex values unchanged.  We
9822          * do have to set a flag to keep recoding from happening in the
9823          * recursion */
9824
9825         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9826         STRLEN len;
9827         char *orig_end = RExC_end;
9828         I32 flags;
9829
9830         while (RExC_parse < endbrace) {
9831
9832             /* Convert to notation the rest of the code understands */
9833             sv_catpv(substitute_parse, "\\x{");
9834             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9835             sv_catpv(substitute_parse, "}");
9836
9837             /* Point to the beginning of the next character in the sequence. */
9838             RExC_parse = endchar + 1;
9839             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9840         }
9841         sv_catpv(substitute_parse, ")");
9842
9843         RExC_parse = SvPV(substitute_parse, len);
9844
9845         /* Don't allow empty number */
9846         if (len < 8) {
9847             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9848         }
9849         RExC_end = RExC_parse + len;
9850
9851         /* The values are Unicode, and therefore not subject to recoding */
9852         RExC_override_recoding = 1;
9853
9854         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9855         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9856
9857         RExC_parse = endbrace;
9858         RExC_end = orig_end;
9859         RExC_override_recoding = 0;
9860
9861         nextchar(pRExC_state);
9862     }
9863
9864     return TRUE;
9865 }
9866
9867
9868 /*
9869  * reg_recode
9870  *
9871  * It returns the code point in utf8 for the value in *encp.
9872  *    value: a code value in the source encoding
9873  *    encp:  a pointer to an Encode object
9874  *
9875  * If the result from Encode is not a single character,
9876  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9877  */
9878 STATIC UV
9879 S_reg_recode(pTHX_ const char value, SV **encp)
9880 {
9881     STRLEN numlen = 1;
9882     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9883     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9884     const STRLEN newlen = SvCUR(sv);
9885     UV uv = UNICODE_REPLACEMENT;
9886
9887     PERL_ARGS_ASSERT_REG_RECODE;
9888
9889     if (newlen)
9890         uv = SvUTF8(sv)
9891              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9892              : *(U8*)s;
9893
9894     if (!newlen || numlen != newlen) {
9895         uv = UNICODE_REPLACEMENT;
9896         *encp = NULL;
9897     }
9898     return uv;
9899 }
9900
9901 PERL_STATIC_INLINE U8
9902 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9903 {
9904     U8 op;
9905
9906     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9907
9908     if (! FOLD) {
9909         return EXACT;
9910     }
9911
9912     op = get_regex_charset(RExC_flags);
9913     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9914         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9915                  been, so there is no hole */
9916     }
9917
9918     return op + EXACTF;
9919 }
9920
9921 PERL_STATIC_INLINE void
9922 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9923 {
9924     /* This knows the details about sizing an EXACTish node, setting flags for
9925      * it (by setting <*flagp>, and potentially populating it with a single
9926      * character.
9927      *
9928      * If <len> (the length in bytes) is non-zero, this function assumes that
9929      * the node has already been populated, and just does the sizing.  In this
9930      * case <code_point> should be the final code point that has already been
9931      * placed into the node.  This value will be ignored except that under some
9932      * circumstances <*flagp> is set based on it.
9933      *
9934      * If <len> is zero, the function assumes that the node is to contain only
9935      * the single character given by <code_point> and calculates what <len>
9936      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9937      * additionally will populate the node's STRING with <code_point>, if <len>
9938      * is 0.  In both cases <*flagp> is appropriately set
9939      *
9940      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9941      * folded (the latter only when the rules indicate it can match 'ss') */
9942
9943     bool len_passed_in = cBOOL(len != 0);
9944     U8 character[UTF8_MAXBYTES_CASE+1];
9945
9946     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9947
9948     if (! len_passed_in) {
9949         if (UTF) {
9950             if (FOLD) {
9951                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9952             }
9953             else {
9954                 uvchr_to_utf8( character, code_point);
9955                 len = UTF8SKIP(character);
9956             }
9957         }
9958         else if (! FOLD
9959                  || code_point != LATIN_SMALL_LETTER_SHARP_S
9960                  || ASCII_FOLD_RESTRICTED
9961                  || ! AT_LEAST_UNI_SEMANTICS)
9962         {
9963             *character = (U8) code_point;
9964             len = 1;
9965         }
9966         else {
9967             *character = 's';
9968             *(character + 1) = 's';
9969             len = 2;
9970         }
9971     }
9972
9973     if (SIZE_ONLY) {
9974         RExC_size += STR_SZ(len);
9975     }
9976     else {
9977         RExC_emit += STR_SZ(len);
9978         STR_LEN(node) = len;
9979         if (! len_passed_in) {
9980             Copy((char *) character, STRING(node), len, char);
9981         }
9982     }
9983
9984     *flagp |= HASWIDTH;
9985
9986     /* A single character node is SIMPLE, except for the special-cased SHARP S
9987      * under /di. */
9988     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
9989         && (code_point != LATIN_SMALL_LETTER_SHARP_S
9990             || ! FOLD || ! DEPENDS_SEMANTICS))
9991     {
9992         *flagp |= SIMPLE;
9993     }
9994 }
9995
9996 /*
9997  - regatom - the lowest level
9998
9999    Try to identify anything special at the start of the pattern. If there
10000    is, then handle it as required. This may involve generating a single regop,
10001    such as for an assertion; or it may involve recursing, such as to
10002    handle a () structure.
10003
10004    If the string doesn't start with something special then we gobble up
10005    as much literal text as we can.
10006
10007    Once we have been able to handle whatever type of thing started the
10008    sequence, we return.
10009
10010    Note: we have to be careful with escapes, as they can be both literal
10011    and special, and in the case of \10 and friends, context determines which.
10012
10013    A summary of the code structure is:
10014
10015    switch (first_byte) {
10016         cases for each special:
10017             handle this special;
10018             break;
10019         case '\\':
10020             switch (2nd byte) {
10021                 cases for each unambiguous special:
10022                     handle this special;
10023                     break;
10024                 cases for each ambigous special/literal:
10025                     disambiguate;
10026                     if (special)  handle here
10027                     else goto defchar;
10028                 default: // unambiguously literal:
10029                     goto defchar;
10030             }
10031         default:  // is a literal char
10032             // FALL THROUGH
10033         defchar:
10034             create EXACTish node for literal;
10035             while (more input and node isn't full) {
10036                 switch (input_byte) {
10037                    cases for each special;
10038                        make sure parse pointer is set so that the next call to
10039                            regatom will see this special first
10040                        goto loopdone; // EXACTish node terminated by prev. char
10041                    default:
10042                        append char to EXACTISH node;
10043                 }
10044                 get next input byte;
10045             }
10046         loopdone:
10047    }
10048    return the generated node;
10049
10050    Specifically there are two separate switches for handling
10051    escape sequences, with the one for handling literal escapes requiring
10052    a dummy entry for all of the special escapes that are actually handled
10053    by the other.
10054 */
10055
10056 STATIC regnode *
10057 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10058 {
10059     dVAR;
10060     regnode *ret = NULL;
10061     I32 flags;
10062     char *parse_start = RExC_parse;
10063     U8 op;
10064     GET_RE_DEBUG_FLAGS_DECL;
10065     DEBUG_PARSE("atom");
10066     *flagp = WORST;             /* Tentatively. */
10067
10068     PERL_ARGS_ASSERT_REGATOM;
10069
10070 tryagain:
10071     switch ((U8)*RExC_parse) {
10072     case '^':
10073         RExC_seen_zerolen++;
10074         nextchar(pRExC_state);
10075         if (RExC_flags & RXf_PMf_MULTILINE)
10076             ret = reg_node(pRExC_state, MBOL);
10077         else if (RExC_flags & RXf_PMf_SINGLELINE)
10078             ret = reg_node(pRExC_state, SBOL);
10079         else
10080             ret = reg_node(pRExC_state, BOL);
10081         Set_Node_Length(ret, 1); /* MJD */
10082         break;
10083     case '$':
10084         nextchar(pRExC_state);
10085         if (*RExC_parse)
10086             RExC_seen_zerolen++;
10087         if (RExC_flags & RXf_PMf_MULTILINE)
10088             ret = reg_node(pRExC_state, MEOL);
10089         else if (RExC_flags & RXf_PMf_SINGLELINE)
10090             ret = reg_node(pRExC_state, SEOL);
10091         else
10092             ret = reg_node(pRExC_state, EOL);
10093         Set_Node_Length(ret, 1); /* MJD */
10094         break;
10095     case '.':
10096         nextchar(pRExC_state);
10097         if (RExC_flags & RXf_PMf_SINGLELINE)
10098             ret = reg_node(pRExC_state, SANY);
10099         else
10100             ret = reg_node(pRExC_state, REG_ANY);
10101         *flagp |= HASWIDTH|SIMPLE;
10102         RExC_naughty++;
10103         Set_Node_Length(ret, 1); /* MJD */
10104         break;
10105     case '[':
10106     {
10107         char * const oregcomp_parse = ++RExC_parse;
10108         ret = regclass(pRExC_state, flagp,depth+1);
10109         if (*RExC_parse != ']') {
10110             RExC_parse = oregcomp_parse;
10111             vFAIL("Unmatched [");
10112         }
10113         nextchar(pRExC_state);
10114         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10115         break;
10116     }
10117     case '(':
10118         nextchar(pRExC_state);
10119         ret = reg(pRExC_state, 1, &flags,depth+1);
10120         if (ret == NULL) {
10121                 if (flags & TRYAGAIN) {
10122                     if (RExC_parse == RExC_end) {
10123                          /* Make parent create an empty node if needed. */
10124                         *flagp |= TRYAGAIN;
10125                         return(NULL);
10126                     }
10127                     goto tryagain;
10128                 }
10129                 return(NULL);
10130         }
10131         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10132         break;
10133     case '|':
10134     case ')':
10135         if (flags & TRYAGAIN) {
10136             *flagp |= TRYAGAIN;
10137             return NULL;
10138         }
10139         vFAIL("Internal urp");
10140                                 /* Supposed to be caught earlier. */
10141         break;
10142     case '?':
10143     case '+':
10144     case '*':
10145         RExC_parse++;
10146         vFAIL("Quantifier follows nothing");
10147         break;
10148     case '\\':
10149         /* Special Escapes
10150
10151            This switch handles escape sequences that resolve to some kind
10152            of special regop and not to literal text. Escape sequnces that
10153            resolve to literal text are handled below in the switch marked
10154            "Literal Escapes".
10155
10156            Every entry in this switch *must* have a corresponding entry
10157            in the literal escape switch. However, the opposite is not
10158            required, as the default for this switch is to jump to the
10159            literal text handling code.
10160         */
10161         switch ((U8)*++RExC_parse) {
10162         /* Special Escapes */
10163         case 'A':
10164             RExC_seen_zerolen++;
10165             ret = reg_node(pRExC_state, SBOL);
10166             *flagp |= SIMPLE;
10167             goto finish_meta_pat;
10168         case 'G':
10169             ret = reg_node(pRExC_state, GPOS);
10170             RExC_seen |= REG_SEEN_GPOS;
10171             *flagp |= SIMPLE;
10172             goto finish_meta_pat;
10173         case 'K':
10174             RExC_seen_zerolen++;
10175             ret = reg_node(pRExC_state, KEEPS);
10176             *flagp |= SIMPLE;
10177             /* XXX:dmq : disabling in-place substitution seems to
10178              * be necessary here to avoid cases of memory corruption, as
10179              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10180              */
10181             RExC_seen |= REG_SEEN_LOOKBEHIND;
10182             goto finish_meta_pat;
10183         case 'Z':
10184             ret = reg_node(pRExC_state, SEOL);
10185             *flagp |= SIMPLE;
10186             RExC_seen_zerolen++;                /* Do not optimize RE away */
10187             goto finish_meta_pat;
10188         case 'z':
10189             ret = reg_node(pRExC_state, EOS);
10190             *flagp |= SIMPLE;
10191             RExC_seen_zerolen++;                /* Do not optimize RE away */
10192             goto finish_meta_pat;
10193         case 'C':
10194             ret = reg_node(pRExC_state, CANY);
10195             RExC_seen |= REG_SEEN_CANY;
10196             *flagp |= HASWIDTH|SIMPLE;
10197             goto finish_meta_pat;
10198         case 'X':
10199             ret = reg_node(pRExC_state, CLUMP);
10200             *flagp |= HASWIDTH;
10201             goto finish_meta_pat;
10202         case 'w':
10203             op = ALNUM + get_regex_charset(RExC_flags);
10204             if (op > ALNUMA) {  /* /aa is same as /a */
10205                 op = ALNUMA;
10206             }
10207             ret = reg_node(pRExC_state, op);
10208             *flagp |= HASWIDTH|SIMPLE;
10209             goto finish_meta_pat;
10210         case 'W':
10211             op = NALNUM + get_regex_charset(RExC_flags);
10212             if (op > NALNUMA) { /* /aa is same as /a */
10213                 op = NALNUMA;
10214             }
10215             ret = reg_node(pRExC_state, op);
10216             *flagp |= HASWIDTH|SIMPLE;
10217             goto finish_meta_pat;
10218         case 'b':
10219             RExC_seen_zerolen++;
10220             RExC_seen |= REG_SEEN_LOOKBEHIND;
10221             op = BOUND + get_regex_charset(RExC_flags);
10222             if (op > BOUNDA) {  /* /aa is same as /a */
10223                 op = BOUNDA;
10224             }
10225             ret = reg_node(pRExC_state, op);
10226             FLAGS(ret) = get_regex_charset(RExC_flags);
10227             *flagp |= SIMPLE;
10228             goto finish_meta_pat;
10229         case 'B':
10230             RExC_seen_zerolen++;
10231             RExC_seen |= REG_SEEN_LOOKBEHIND;
10232             op = NBOUND + get_regex_charset(RExC_flags);
10233             if (op > NBOUNDA) { /* /aa is same as /a */
10234                 op = NBOUNDA;
10235             }
10236             ret = reg_node(pRExC_state, op);
10237             FLAGS(ret) = get_regex_charset(RExC_flags);
10238             *flagp |= SIMPLE;
10239             goto finish_meta_pat;
10240         case 's':
10241             op = SPACE + get_regex_charset(RExC_flags);
10242             if (op > SPACEA) {  /* /aa is same as /a */
10243                 op = SPACEA;
10244             }
10245             ret = reg_node(pRExC_state, op);
10246             *flagp |= HASWIDTH|SIMPLE;
10247             goto finish_meta_pat;
10248         case 'S':
10249             op = NSPACE + get_regex_charset(RExC_flags);
10250             if (op > NSPACEA) { /* /aa is same as /a */
10251                 op = NSPACEA;
10252             }
10253             ret = reg_node(pRExC_state, op);
10254             *flagp |= HASWIDTH|SIMPLE;
10255             goto finish_meta_pat;
10256         case 'D':
10257             op = NDIGIT;
10258             goto join_D_and_d;
10259         case 'd':
10260             op = DIGIT;
10261         join_D_and_d:
10262             {
10263                 U8 offset = get_regex_charset(RExC_flags);
10264                 if (offset == REGEX_UNICODE_CHARSET) {
10265                     offset = REGEX_DEPENDS_CHARSET;
10266                 }
10267                 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10268                     offset = REGEX_ASCII_RESTRICTED_CHARSET;
10269                 }
10270                 op += offset;
10271             }
10272             ret = reg_node(pRExC_state, op);
10273             *flagp |= HASWIDTH|SIMPLE;
10274             goto finish_meta_pat;
10275         case 'R':
10276             ret = reg_node(pRExC_state, LNBREAK);
10277             *flagp |= HASWIDTH;
10278             goto finish_meta_pat;
10279         case 'h':
10280             ret = reg_node(pRExC_state, HORIZWS);
10281             *flagp |= HASWIDTH|SIMPLE;
10282             goto finish_meta_pat;
10283         case 'H':
10284             ret = reg_node(pRExC_state, NHORIZWS);
10285             *flagp |= HASWIDTH|SIMPLE;
10286             goto finish_meta_pat;
10287         case 'v':
10288             ret = reg_node(pRExC_state, VERTWS);
10289             *flagp |= HASWIDTH|SIMPLE;
10290             goto finish_meta_pat;
10291         case 'V':
10292             ret = reg_node(pRExC_state, NVERTWS);
10293             *flagp |= HASWIDTH|SIMPLE;
10294          finish_meta_pat:
10295             nextchar(pRExC_state);
10296             Set_Node_Length(ret, 2); /* MJD */
10297             break;
10298         case 'p':
10299         case 'P':
10300             {
10301                 char* const oldregxend = RExC_end;
10302 #ifdef DEBUGGING
10303                 char* parse_start = RExC_parse - 2;
10304 #endif
10305
10306                 if (RExC_parse[1] == '{') {
10307                   /* a lovely hack--pretend we saw [\pX] instead */
10308                     RExC_end = strchr(RExC_parse, '}');
10309                     if (!RExC_end) {
10310                         const U8 c = (U8)*RExC_parse;
10311                         RExC_parse += 2;
10312                         RExC_end = oldregxend;
10313                         vFAIL2("Missing right brace on \\%c{}", c);
10314                     }
10315                     RExC_end++;
10316                 }
10317                 else {
10318                     RExC_end = RExC_parse + 2;
10319                     if (RExC_end > oldregxend)
10320                         RExC_end = oldregxend;
10321                 }
10322                 RExC_parse--;
10323
10324                 ret = regclass(pRExC_state, flagp,depth+1);
10325
10326                 RExC_end = oldregxend;
10327                 RExC_parse--;
10328
10329                 Set_Node_Offset(ret, parse_start + 2);
10330                 Set_Node_Cur_Length(ret);
10331                 nextchar(pRExC_state);
10332             }
10333             break;
10334         case 'N':
10335             /* Handle \N and \N{NAME} with multiple code points here and not
10336              * below because it can be multicharacter. join_exact() will join
10337              * them up later on.  Also this makes sure that things like
10338              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10339              * The options to the grok function call causes it to fail if the
10340              * sequence is just a single code point.  We then go treat it as
10341              * just another character in the current EXACT node, and hence it
10342              * gets uniform treatment with all the other characters.  The
10343              * special treatment for quantifiers is not needed for such single
10344              * character sequences */
10345             ++RExC_parse;
10346             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10347                 RExC_parse--;
10348                 goto defchar;
10349             }
10350             break;
10351         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10352         parse_named_seq:
10353         {
10354             char ch= RExC_parse[1];
10355             if (ch != '<' && ch != '\'' && ch != '{') {
10356                 RExC_parse++;
10357                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10358             } else {
10359                 /* this pretty much dupes the code for (?P=...) in reg(), if
10360                    you change this make sure you change that */
10361                 char* name_start = (RExC_parse += 2);
10362                 U32 num = 0;
10363                 SV *sv_dat = reg_scan_name(pRExC_state,
10364                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10365                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10366                 if (RExC_parse == name_start || *RExC_parse != ch)
10367                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10368
10369                 if (!SIZE_ONLY) {
10370                     num = add_data( pRExC_state, 1, "S" );
10371                     RExC_rxi->data->data[num]=(void*)sv_dat;
10372                     SvREFCNT_inc_simple_void(sv_dat);
10373                 }
10374
10375                 RExC_sawback = 1;
10376                 ret = reganode(pRExC_state,
10377                                ((! FOLD)
10378                                  ? NREF
10379                                  : (ASCII_FOLD_RESTRICTED)
10380                                    ? NREFFA
10381                                    : (AT_LEAST_UNI_SEMANTICS)
10382                                      ? NREFFU
10383                                      : (LOC)
10384                                        ? NREFFL
10385                                        : NREFF),
10386                                 num);
10387                 *flagp |= HASWIDTH;
10388
10389                 /* override incorrect value set in reganode MJD */
10390                 Set_Node_Offset(ret, parse_start+1);
10391                 Set_Node_Cur_Length(ret); /* MJD */
10392                 nextchar(pRExC_state);
10393
10394             }
10395             break;
10396         }
10397         case 'g':
10398         case '1': case '2': case '3': case '4':
10399         case '5': case '6': case '7': case '8': case '9':
10400             {
10401                 I32 num;
10402                 bool isg = *RExC_parse == 'g';
10403                 bool isrel = 0;
10404                 bool hasbrace = 0;
10405                 if (isg) {
10406                     RExC_parse++;
10407                     if (*RExC_parse == '{') {
10408                         RExC_parse++;
10409                         hasbrace = 1;
10410                     }
10411                     if (*RExC_parse == '-') {
10412                         RExC_parse++;
10413                         isrel = 1;
10414                     }
10415                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10416                         if (isrel) RExC_parse--;
10417                         RExC_parse -= 2;
10418                         goto parse_named_seq;
10419                 }   }
10420                 num = atoi(RExC_parse);
10421                 if (isg && num == 0)
10422                     vFAIL("Reference to invalid group 0");
10423                 if (isrel) {
10424                     num = RExC_npar - num;
10425                     if (num < 1)
10426                         vFAIL("Reference to nonexistent or unclosed group");
10427                 }
10428                 if (!isg && num > 9 && num >= RExC_npar)
10429                     /* Probably a character specified in octal, e.g. \35 */
10430                     goto defchar;
10431                 else {
10432                     char * const parse_start = RExC_parse - 1; /* MJD */
10433                     while (isDIGIT(*RExC_parse))
10434                         RExC_parse++;
10435                     if (parse_start == RExC_parse - 1)
10436                         vFAIL("Unterminated \\g... pattern");
10437                     if (hasbrace) {
10438                         if (*RExC_parse != '}')
10439                             vFAIL("Unterminated \\g{...} pattern");
10440                         RExC_parse++;
10441                     }
10442                     if (!SIZE_ONLY) {
10443                         if (num > (I32)RExC_rx->nparens)
10444                             vFAIL("Reference to nonexistent group");
10445                     }
10446                     RExC_sawback = 1;
10447                     ret = reganode(pRExC_state,
10448                                    ((! FOLD)
10449                                      ? REF
10450                                      : (ASCII_FOLD_RESTRICTED)
10451                                        ? REFFA
10452                                        : (AT_LEAST_UNI_SEMANTICS)
10453                                          ? REFFU
10454                                          : (LOC)
10455                                            ? REFFL
10456                                            : REFF),
10457                                     num);
10458                     *flagp |= HASWIDTH;
10459
10460                     /* override incorrect value set in reganode MJD */
10461                     Set_Node_Offset(ret, parse_start+1);
10462                     Set_Node_Cur_Length(ret); /* MJD */
10463                     RExC_parse--;
10464                     nextchar(pRExC_state);
10465                 }
10466             }
10467             break;
10468         case '\0':
10469             if (RExC_parse >= RExC_end)
10470                 FAIL("Trailing \\");
10471             /* FALL THROUGH */
10472         default:
10473             /* Do not generate "unrecognized" warnings here, we fall
10474                back into the quick-grab loop below */
10475             parse_start--;
10476             goto defchar;
10477         }
10478         break;
10479
10480     case '#':
10481         if (RExC_flags & RXf_PMf_EXTENDED) {
10482             if ( reg_skipcomment( pRExC_state ) )
10483                 goto tryagain;
10484         }
10485         /* FALL THROUGH */
10486
10487     default:
10488
10489             parse_start = RExC_parse - 1;
10490
10491             RExC_parse++;
10492
10493         defchar: {
10494             STRLEN len = 0;
10495             UV ender;
10496             char *p;
10497             char *s;
10498 #define MAX_NODE_STRING_SIZE 127
10499             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10500             char *s0;
10501             U8 upper_parse = MAX_NODE_STRING_SIZE;
10502             STRLEN foldlen;
10503             U8 node_type;
10504             bool next_is_quantifier;
10505             char * oldp = NULL;
10506
10507             /* If a folding node contains only code points that don't
10508              * participate in folds, it can be changed into an EXACT node,
10509              * which allows the optimizer more things to look for */
10510             bool maybe_exact;
10511
10512             ender = 0;
10513             node_type = compute_EXACTish(pRExC_state);
10514             ret = reg_node(pRExC_state, node_type);
10515
10516             /* In pass1, folded, we use a temporary buffer instead of the
10517              * actual node, as the node doesn't exist yet */
10518             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10519
10520             s0 = s;
10521
10522         reparse:
10523
10524             /* We do the EXACTFish to EXACT node only if folding, and not if in
10525              * locale, as whether a character folds or not isn't known until
10526              * runtime */
10527             maybe_exact = FOLD && ! LOC;
10528
10529             /* XXX The node can hold up to 255 bytes, yet this only goes to
10530              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10531              * 255 allows us to not have to worry about overflow due to
10532              * converting to utf8 and fold expansion, but that value is
10533              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10534              * split up by this limit into a single one using the real max of
10535              * 255.  Even at 127, this breaks under rare circumstances.  If
10536              * folding, we do not want to split a node at a character that is a
10537              * non-final in a multi-char fold, as an input string could just
10538              * happen to want to match across the node boundary.  The join
10539              * would solve that problem if the join actually happens.  But a
10540              * series of more than two nodes in a row each of 127 would cause
10541              * the first join to succeed to get to 254, but then there wouldn't
10542              * be room for the next one, which could at be one of those split
10543              * multi-char folds.  I don't know of any fool-proof solution.  One
10544              * could back off to end with only a code point that isn't such a
10545              * non-final, but it is possible for there not to be any in the
10546              * entire node. */
10547             for (p = RExC_parse - 1;
10548                  len < upper_parse && p < RExC_end;
10549                  len++)
10550             {
10551                 oldp = p;
10552
10553                 if (RExC_flags & RXf_PMf_EXTENDED)
10554                     p = regwhite( pRExC_state, p );
10555                 switch ((U8)*p) {
10556                 case '^':
10557                 case '$':
10558                 case '.':
10559                 case '[':
10560                 case '(':
10561                 case ')':
10562                 case '|':
10563                     goto loopdone;
10564                 case '\\':
10565                     /* Literal Escapes Switch
10566
10567                        This switch is meant to handle escape sequences that
10568                        resolve to a literal character.
10569
10570                        Every escape sequence that represents something
10571                        else, like an assertion or a char class, is handled
10572                        in the switch marked 'Special Escapes' above in this
10573                        routine, but also has an entry here as anything that
10574                        isn't explicitly mentioned here will be treated as
10575                        an unescaped equivalent literal.
10576                     */
10577
10578                     switch ((U8)*++p) {
10579                     /* These are all the special escapes. */
10580                     case 'A':             /* Start assertion */
10581                     case 'b': case 'B':   /* Word-boundary assertion*/
10582                     case 'C':             /* Single char !DANGEROUS! */
10583                     case 'd': case 'D':   /* digit class */
10584                     case 'g': case 'G':   /* generic-backref, pos assertion */
10585                     case 'h': case 'H':   /* HORIZWS */
10586                     case 'k': case 'K':   /* named backref, keep marker */
10587                     case 'p': case 'P':   /* Unicode property */
10588                               case 'R':   /* LNBREAK */
10589                     case 's': case 'S':   /* space class */
10590                     case 'v': case 'V':   /* VERTWS */
10591                     case 'w': case 'W':   /* word class */
10592                     case 'X':             /* eXtended Unicode "combining character sequence" */
10593                     case 'z': case 'Z':   /* End of line/string assertion */
10594                         --p;
10595                         goto loopdone;
10596
10597                     /* Anything after here is an escape that resolves to a
10598                        literal. (Except digits, which may or may not)
10599                      */
10600                     case 'n':
10601                         ender = '\n';
10602                         p++;
10603                         break;
10604                     case 'N': /* Handle a single-code point named character. */
10605                         /* The options cause it to fail if a multiple code
10606                          * point sequence.  Handle those in the switch() above
10607                          * */
10608                         RExC_parse = p + 1;
10609                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10610                                             flagp, depth, FALSE))
10611                         {
10612                             RExC_parse = p = oldp;
10613                             goto loopdone;
10614                         }
10615                         p = RExC_parse;
10616                         if (ender > 0xff) {
10617                             REQUIRE_UTF8;
10618                         }
10619                         break;
10620                     case 'r':
10621                         ender = '\r';
10622                         p++;
10623                         break;
10624                     case 't':
10625                         ender = '\t';
10626                         p++;
10627                         break;
10628                     case 'f':
10629                         ender = '\f';
10630                         p++;
10631                         break;
10632                     case 'e':
10633                           ender = ASCII_TO_NATIVE('\033');
10634                         p++;
10635                         break;
10636                     case 'a':
10637                           ender = ASCII_TO_NATIVE('\007');
10638                         p++;
10639                         break;
10640                     case 'o':
10641                         {
10642                             STRLEN brace_len = len;
10643                             UV result;
10644                             const char* error_msg;
10645
10646                             bool valid = grok_bslash_o(p,
10647                                                        &result,
10648                                                        &brace_len,
10649                                                        &error_msg,
10650                                                        1);
10651                             p += brace_len;
10652                             if (! valid) {
10653                                 RExC_parse = p; /* going to die anyway; point
10654                                                    to exact spot of failure */
10655                                 vFAIL(error_msg);
10656                             }
10657                             else
10658                             {
10659                                 ender = result;
10660                             }
10661                             if (PL_encoding && ender < 0x100) {
10662                                 goto recode_encoding;
10663                             }
10664                             if (ender > 0xff) {
10665                                 REQUIRE_UTF8;
10666                             }
10667                             break;
10668                         }
10669                     case 'x':
10670                         {
10671                             STRLEN brace_len = len;
10672                             UV result;
10673                             const char* error_msg;
10674
10675                             bool valid = grok_bslash_x(p,
10676                                                        &result,
10677                                                        &brace_len,
10678                                                        &error_msg,
10679                                                        1);
10680                             p += brace_len;
10681                             if (! valid) {
10682                                 RExC_parse = p; /* going to die anyway; point
10683                                                    to exact spot of failure */
10684                                 vFAIL(error_msg);
10685                             }
10686                             else {
10687                                 ender = result;
10688                             }
10689                             if (PL_encoding && ender < 0x100) {
10690                                 goto recode_encoding;
10691                             }
10692                             if (ender > 0xff) {
10693                                 REQUIRE_UTF8;
10694                             }
10695                             break;
10696                         }
10697                     case 'c':
10698                         p++;
10699                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10700                         break;
10701                     case '0': case '1': case '2': case '3':case '4':
10702                     case '5': case '6': case '7':
10703                         if (*p == '0' ||
10704                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10705                         {
10706                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10707                             STRLEN numlen = 3;
10708                             ender = grok_oct(p, &numlen, &flags, NULL);
10709                             if (ender > 0xff) {
10710                                 REQUIRE_UTF8;
10711                             }
10712                             p += numlen;
10713                         }
10714                         else {
10715                             --p;
10716                             goto loopdone;
10717                         }
10718                         if (PL_encoding && ender < 0x100)
10719                             goto recode_encoding;
10720                         break;
10721                     recode_encoding:
10722                         if (! RExC_override_recoding) {
10723                             SV* enc = PL_encoding;
10724                             ender = reg_recode((const char)(U8)ender, &enc);
10725                             if (!enc && SIZE_ONLY)
10726                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10727                             REQUIRE_UTF8;
10728                         }
10729                         break;
10730                     case '\0':
10731                         if (p >= RExC_end)
10732                             FAIL("Trailing \\");
10733                         /* FALL THROUGH */
10734                     default:
10735                         if (!SIZE_ONLY&& isALNUMC(*p)) {
10736                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10737                         }
10738                         goto normal_default;
10739                     }
10740                     break;
10741                 case '{':
10742                     /* Currently we don't warn when the lbrace is at the start
10743                      * of a construct.  This catches it in the middle of a
10744                      * literal string, or when its the first thing after
10745                      * something like "\b" */
10746                     if (! SIZE_ONLY
10747                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10748                     {
10749                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10750                     }
10751                     /*FALLTHROUGH*/
10752                 default:
10753                   normal_default:
10754                     if (UTF8_IS_START(*p) && UTF) {
10755                         STRLEN numlen;
10756                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10757                                                &numlen, UTF8_ALLOW_DEFAULT);
10758                         p += numlen;
10759                     }
10760                     else
10761                         ender = (U8) *p++;
10762                     break;
10763                 } /* End of switch on the literal */
10764
10765                 /* Here, have looked at the literal character and <ender>
10766                  * contains its ordinal, <p> points to the character after it
10767                  */
10768
10769                 if ( RExC_flags & RXf_PMf_EXTENDED)
10770                     p = regwhite( pRExC_state, p );
10771
10772                 /* If the next thing is a quantifier, it applies to this
10773                  * character only, which means that this character has to be in
10774                  * its own node and can't just be appended to the string in an
10775                  * existing node, so if there are already other characters in
10776                  * the node, close the node with just them, and set up to do
10777                  * this character again next time through, when it will be the
10778                  * only thing in its new node */
10779                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10780                 {
10781                     p = oldp;
10782                     goto loopdone;
10783                 }
10784
10785                 if (FOLD) {
10786                     if (UTF
10787                             /* See comments for join_exact() as to why we fold
10788                              * this non-UTF at compile time */
10789                         || (node_type == EXACTFU
10790                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10791                     {
10792
10793
10794                         /* Prime the casefolded buffer.  Locale rules, which
10795                          * apply only to code points < 256, aren't known until
10796                          * execution, so for them, just output the original
10797                          * character using utf8.  If we start to fold non-UTF
10798                          * patterns, be sure to update join_exact() */
10799                         if (LOC && ender < 256) {
10800                             if (UNI_IS_INVARIANT(ender)) {
10801                                 *s = (U8) ender;
10802                                 foldlen = 1;
10803                             } else {
10804                                 *s = UTF8_TWO_BYTE_HI(ender);
10805                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10806                                 foldlen = 2;
10807                             }
10808                         }
10809                         else {
10810                             UV folded = _to_uni_fold_flags(
10811                                            ender,
10812                                            (U8 *) s,
10813                                            &foldlen,
10814                                            FOLD_FLAGS_FULL
10815                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10816                                                     : (ASCII_FOLD_RESTRICTED)
10817                                                       ? FOLD_FLAGS_NOMIX_ASCII
10818                                                       : 0)
10819                                             );
10820
10821                             /* If this node only contains non-folding code
10822                              * points so far, see if this new one is also
10823                              * non-folding */
10824                             if (maybe_exact) {
10825                                 if (folded != ender) {
10826                                     maybe_exact = FALSE;
10827                                 }
10828                                 else {
10829                                     /* Here the fold is the original; we have
10830                                      * to check further to see if anything
10831                                      * folds to it */
10832                                     if (! PL_utf8_foldable) {
10833                                         SV* swash = swash_init("utf8",
10834                                                            "_Perl_Any_Folds",
10835                                                            &PL_sv_undef, 1, 0);
10836                                         PL_utf8_foldable =
10837                                                     _get_swash_invlist(swash);
10838                                         SvREFCNT_dec(swash);
10839                                     }
10840                                     if (_invlist_contains_cp(PL_utf8_foldable,
10841                                                              ender))
10842                                     {
10843                                         maybe_exact = FALSE;
10844                                     }
10845                                 }
10846                             }
10847                             ender = folded;
10848                         }
10849                         s += foldlen;
10850
10851                         /* The loop increments <len> each time, as all but this
10852                          * path (and the one just below for UTF) through it add
10853                          * a single byte to the EXACTish node.  But this one
10854                          * has changed len to be the correct final value, so
10855                          * subtract one to cancel out the increment that
10856                          * follows */
10857                         len += foldlen - 1;
10858                     }
10859                     else {
10860                         *(s++) = ender;
10861                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10862                     }
10863                 }
10864                 else if (UTF) {
10865                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10866                     if (unilen > 0) {
10867                        s   += unilen;
10868                        len += unilen;
10869                     }
10870
10871                     /* See comment just above for - 1 */
10872                     len--;
10873                 }
10874                 else {
10875                     REGC((char)ender, s++);
10876                 }
10877
10878                 if (next_is_quantifier) {
10879
10880                     /* Here, the next input is a quantifier, and to get here,
10881                      * the current character is the only one in the node.
10882                      * Also, here <len> doesn't include the final byte for this
10883                      * character */
10884                     len++;
10885                     goto loopdone;
10886                 }
10887
10888             } /* End of loop through literal characters */
10889
10890             /* Here we have either exhausted the input or ran out of room in
10891              * the node.  (If we encountered a character that can't be in the
10892              * node, transfer is made directly to <loopdone>, and so we
10893              * wouldn't have fallen off the end of the loop.)  In the latter
10894              * case, we artificially have to split the node into two, because
10895              * we just don't have enough space to hold everything.  This
10896              * creates a problem if the final character participates in a
10897              * multi-character fold in the non-final position, as a match that
10898              * should have occurred won't, due to the way nodes are matched,
10899              * and our artificial boundary.  So back off until we find a non-
10900              * problematic character -- one that isn't at the beginning or
10901              * middle of such a fold.  (Either it doesn't participate in any
10902              * folds, or appears only in the final position of all the folds it
10903              * does participate in.)  A better solution with far fewer false
10904              * positives, and that would fill the nodes more completely, would
10905              * be to actually have available all the multi-character folds to
10906              * test against, and to back-off only far enough to be sure that
10907              * this node isn't ending with a partial one.  <upper_parse> is set
10908              * further below (if we need to reparse the node) to include just
10909              * up through that final non-problematic character that this code
10910              * identifies, so when it is set to less than the full node, we can
10911              * skip the rest of this */
10912             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10913
10914                 const STRLEN full_len = len;
10915
10916                 assert(len >= MAX_NODE_STRING_SIZE);
10917
10918                 /* Here, <s> points to the final byte of the final character.
10919                  * Look backwards through the string until find a non-
10920                  * problematic character */
10921
10922                 if (! UTF) {
10923
10924                     /* These two have no multi-char folds to non-UTF characters
10925                      */
10926                     if (ASCII_FOLD_RESTRICTED || LOC) {
10927                         goto loopdone;
10928                     }
10929
10930                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10931                     len = s - s0 + 1;
10932                 }
10933                 else {
10934                     if (!  PL_NonL1NonFinalFold) {
10935                         PL_NonL1NonFinalFold = _new_invlist_C_array(
10936                                         NonL1_Perl_Non_Final_Folds_invlist);
10937                     }
10938
10939                     /* Point to the first byte of the final character */
10940                     s = (char *) utf8_hop((U8 *) s, -1);
10941
10942                     while (s >= s0) {   /* Search backwards until find
10943                                            non-problematic char */
10944                         if (UTF8_IS_INVARIANT(*s)) {
10945
10946                             /* There are no ascii characters that participate
10947                              * in multi-char folds under /aa.  In EBCDIC, the
10948                              * non-ascii invariants are all control characters,
10949                              * so don't ever participate in any folds. */
10950                             if (ASCII_FOLD_RESTRICTED
10951                                 || ! IS_NON_FINAL_FOLD(*s))
10952                             {
10953                                 break;
10954                             }
10955                         }
10956                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10957
10958                             /* No Latin1 characters participate in multi-char
10959                              * folds under /l */
10960                             if (LOC
10961                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10962                                                                 *s, *(s+1))))
10963                             {
10964                                 break;
10965                             }
10966                         }
10967                         else if (! _invlist_contains_cp(
10968                                         PL_NonL1NonFinalFold,
10969                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
10970                         {
10971                             break;
10972                         }
10973
10974                         /* Here, the current character is problematic in that
10975                          * it does occur in the non-final position of some
10976                          * fold, so try the character before it, but have to
10977                          * special case the very first byte in the string, so
10978                          * we don't read outside the string */
10979                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10980                     } /* End of loop backwards through the string */
10981
10982                     /* If there were only problematic characters in the string,
10983                      * <s> will point to before s0, in which case the length
10984                      * should be 0, otherwise include the length of the
10985                      * non-problematic character just found */
10986                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10987                 }
10988
10989                 /* Here, have found the final character, if any, that is
10990                  * non-problematic as far as ending the node without splitting
10991                  * it across a potential multi-char fold.  <len> contains the
10992                  * number of bytes in the node up-to and including that
10993                  * character, or is 0 if there is no such character, meaning
10994                  * the whole node contains only problematic characters.  In
10995                  * this case, give up and just take the node as-is.  We can't
10996                  * do any better */
10997                 if (len == 0) {
10998                     len = full_len;
10999                 } else {
11000
11001                     /* Here, the node does contain some characters that aren't
11002                      * problematic.  If one such is the final character in the
11003                      * node, we are done */
11004                     if (len == full_len) {
11005                         goto loopdone;
11006                     }
11007                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11008
11009                         /* If the final character is problematic, but the
11010                          * penultimate is not, back-off that last character to
11011                          * later start a new node with it */
11012                         p = oldp;
11013                         goto loopdone;
11014                     }
11015
11016                     /* Here, the final non-problematic character is earlier
11017                      * in the input than the penultimate character.  What we do
11018                      * is reparse from the beginning, going up only as far as
11019                      * this final ok one, thus guaranteeing that the node ends
11020                      * in an acceptable character.  The reason we reparse is
11021                      * that we know how far in the character is, but we don't
11022                      * know how to correlate its position with the input parse.
11023                      * An alternate implementation would be to build that
11024                      * correlation as we go along during the original parse,
11025                      * but that would entail extra work for every node, whereas
11026                      * this code gets executed only when the string is too
11027                      * large for the node, and the final two characters are
11028                      * problematic, an infrequent occurrence.  Yet another
11029                      * possible strategy would be to save the tail of the
11030                      * string, and the next time regatom is called, initialize
11031                      * with that.  The problem with this is that unless you
11032                      * back off one more character, you won't be guaranteed
11033                      * regatom will get called again, unless regbranch,
11034                      * regpiece ... are also changed.  If you do back off that
11035                      * extra character, so that there is input guaranteed to
11036                      * force calling regatom, you can't handle the case where
11037                      * just the first character in the node is acceptable.  I
11038                      * (khw) decided to try this method which doesn't have that
11039                      * pitfall; if performance issues are found, we can do a
11040                      * combination of the current approach plus that one */
11041                     upper_parse = len;
11042                     len = 0;
11043                     s = s0;
11044                     goto reparse;
11045                 }
11046             }   /* End of verifying node ends with an appropriate char */
11047
11048         loopdone:   /* Jumped to when encounters something that shouldn't be in
11049                        the node */
11050
11051             /* If 'maybe_exact' is still set here, means there are no
11052              * code points in the node that participate in folds */
11053             if (FOLD && maybe_exact) {
11054                 OP(ret) = EXACT;
11055             }
11056
11057             /* I (khw) don't know if you can get here with zero length, but the
11058              * old code handled this situation by creating a zero-length EXACT
11059              * node.  Might as well be NOTHING instead */
11060             if (len == 0) {
11061                 OP(ret) = NOTHING;
11062             }
11063             else{
11064                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11065             }
11066
11067             RExC_parse = p - 1;
11068             Set_Node_Cur_Length(ret); /* MJD */
11069             nextchar(pRExC_state);
11070             {
11071                 /* len is STRLEN which is unsigned, need to copy to signed */
11072                 IV iv = len;
11073                 if (iv < 0)
11074                     vFAIL("Internal disaster");
11075             }
11076
11077         } /* End of label 'defchar:' */
11078         break;
11079     } /* End of giant switch on input character */
11080
11081     return(ret);
11082 }
11083
11084 STATIC char *
11085 S_regwhite( RExC_state_t *pRExC_state, char *p )
11086 {
11087     const char *e = RExC_end;
11088
11089     PERL_ARGS_ASSERT_REGWHITE;
11090
11091     while (p < e) {
11092         if (isSPACE(*p))
11093             ++p;
11094         else if (*p == '#') {
11095             bool ended = 0;
11096             do {
11097                 if (*p++ == '\n') {
11098                     ended = 1;
11099                     break;
11100                 }
11101             } while (p < e);
11102             if (!ended)
11103                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11104         }
11105         else
11106             break;
11107     }
11108     return p;
11109 }
11110
11111 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11112    Character classes ([:foo:]) can also be negated ([:^foo:]).
11113    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11114    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11115    but trigger failures because they are currently unimplemented. */
11116
11117 #define POSIXCC_DONE(c)   ((c) == ':')
11118 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11119 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11120
11121 STATIC I32
11122 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
11123 {
11124     dVAR;
11125     I32 namedclass = OOB_NAMEDCLASS;
11126
11127     PERL_ARGS_ASSERT_REGPPOSIXCC;
11128
11129     if (value == '[' && RExC_parse + 1 < RExC_end &&
11130         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11131         POSIXCC(UCHARAT(RExC_parse))) {
11132         const char c = UCHARAT(RExC_parse);
11133         char* const s = RExC_parse++;
11134
11135         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11136             RExC_parse++;
11137         if (RExC_parse == RExC_end)
11138             /* Grandfather lone [:, [=, [. */
11139             RExC_parse = s;
11140         else {
11141             const char* const t = RExC_parse++; /* skip over the c */
11142             assert(*t == c);
11143
11144             if (UCHARAT(RExC_parse) == ']') {
11145                 const char *posixcc = s + 1;
11146                 RExC_parse++; /* skip over the ending ] */
11147
11148                 if (*s == ':') {
11149                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11150                     const I32 skip = t - posixcc;
11151
11152                     /* Initially switch on the length of the name.  */
11153                     switch (skip) {
11154                     case 4:
11155                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11156                             namedclass = ANYOF_WORDCHAR;
11157                         break;
11158                     case 5:
11159                         /* Names all of length 5.  */
11160                         /* alnum alpha ascii blank cntrl digit graph lower
11161                            print punct space upper  */
11162                         /* Offset 4 gives the best switch position.  */
11163                         switch (posixcc[4]) {
11164                         case 'a':
11165                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11166                                 namedclass = ANYOF_ALPHA;
11167                             break;
11168                         case 'e':
11169                             if (memEQ(posixcc, "spac", 4)) /* space */
11170                                 namedclass = ANYOF_PSXSPC;
11171                             break;
11172                         case 'h':
11173                             if (memEQ(posixcc, "grap", 4)) /* graph */
11174                                 namedclass = ANYOF_GRAPH;
11175                             break;
11176                         case 'i':
11177                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11178                                 namedclass = ANYOF_ASCII;
11179                             break;
11180                         case 'k':
11181                             if (memEQ(posixcc, "blan", 4)) /* blank */
11182                                 namedclass = ANYOF_BLANK;
11183                             break;
11184                         case 'l':
11185                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11186                                 namedclass = ANYOF_CNTRL;
11187                             break;
11188                         case 'm':
11189                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11190                                 namedclass = ANYOF_ALNUMC;
11191                             break;
11192                         case 'r':
11193                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11194                                 namedclass = ANYOF_LOWER;
11195                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11196                                 namedclass = ANYOF_UPPER;
11197                             break;
11198                         case 't':
11199                             if (memEQ(posixcc, "digi", 4)) /* digit */
11200                                 namedclass = ANYOF_DIGIT;
11201                             else if (memEQ(posixcc, "prin", 4)) /* print */
11202                                 namedclass = ANYOF_PRINT;
11203                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11204                                 namedclass = ANYOF_PUNCT;
11205                             break;
11206                         }
11207                         break;
11208                     case 6:
11209                         if (memEQ(posixcc, "xdigit", 6))
11210                             namedclass = ANYOF_XDIGIT;
11211                         break;
11212                     }
11213
11214                     if (namedclass == OOB_NAMEDCLASS)
11215                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11216                                       t - s - 1, s + 1);
11217
11218                     /* The #defines are structured so each complement is +1 to
11219                      * the normal one */
11220                     if (complement) {
11221                         namedclass++;
11222                     }
11223                     assert (posixcc[skip] == ':');
11224                     assert (posixcc[skip+1] == ']');
11225                 } else if (!SIZE_ONLY) {
11226                     /* [[=foo=]] and [[.foo.]] are still future. */
11227
11228                     /* adjust RExC_parse so the warning shows after
11229                        the class closes */
11230                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11231                         RExC_parse++;
11232                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11233                 }
11234             } else {
11235                 /* Maternal grandfather:
11236                  * "[:" ending in ":" but not in ":]" */
11237                 RExC_parse = s;
11238             }
11239         }
11240     }
11241
11242     return namedclass;
11243 }
11244
11245 STATIC void
11246 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
11247 {
11248     dVAR;
11249
11250     PERL_ARGS_ASSERT_CHECKPOSIXCC;
11251
11252     if (POSIXCC(UCHARAT(RExC_parse))) {
11253         const char *s = RExC_parse;
11254         const char  c = *s++;
11255
11256         while (isALNUM(*s))
11257             s++;
11258         if (*s && c == *s && s[1] == ']') {
11259             ckWARN3reg(s+2,
11260                        "POSIX syntax [%c %c] belongs inside character classes",
11261                        c, c);
11262
11263             /* [[=foo=]] and [[.foo.]] are still future. */
11264             if (POSIXCC_NOTYET(c)) {
11265                 /* adjust RExC_parse so the error shows after
11266                    the class closes */
11267                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11268                     NOOP;
11269                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11270             }
11271         }
11272     }
11273 }
11274
11275 /* Generate the code to add a full posix character <class> to the bracketed
11276  * character class given by <node>.  (<node> is needed only under locale rules)
11277  * destlist     is the inversion list for non-locale rules that this class is
11278  *              to be added to
11279  * sourcelist   is the ASCII-range inversion list to add under /a rules
11280  * Xsourcelist  is the full Unicode range list to use otherwise. */
11281 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
11282     if (LOC) {                                                             \
11283         SV* scratch_list = NULL;                                           \
11284                                                                            \
11285         /* Set this class in the node for runtime matching */              \
11286         ANYOF_CLASS_SET(node, class);                                      \
11287                                                                            \
11288         /* For above Latin1 code points, we use the full Unicode range */  \
11289         _invlist_intersection(PL_AboveLatin1,                              \
11290                               Xsourcelist,                                 \
11291                               &scratch_list);                              \
11292         /* And set the output to it, adding instead if there already is an \
11293          * output.  Checking if <destlist> is NULL first saves an extra    \
11294          * clone.  Its reference count will be decremented at the next     \
11295          * union, etc, or if this is the only instance, at the end of the  \
11296          * routine */                                                      \
11297         if (! destlist) {                                                  \
11298             destlist = scratch_list;                                       \
11299         }                                                                  \
11300         else {                                                             \
11301             _invlist_union(destlist, scratch_list, &destlist);             \
11302             SvREFCNT_dec(scratch_list);                                    \
11303         }                                                                  \
11304     }                                                                      \
11305     else {                                                                 \
11306         /* For non-locale, just add it to any existing list */             \
11307         _invlist_union(destlist,                                           \
11308                        (AT_LEAST_ASCII_RESTRICTED)                         \
11309                            ? sourcelist                                    \
11310                            : Xsourcelist,                                  \
11311                        &destlist);                                         \
11312     }
11313
11314 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11315  */
11316 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
11317     if (LOC) {                                                             \
11318         SV* scratch_list = NULL;                                           \
11319         ANYOF_CLASS_SET(node, class);                                      \
11320         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
11321         if (! destlist) {                                                  \
11322             destlist = scratch_list;                                       \
11323         }                                                                  \
11324         else {                                                             \
11325             _invlist_union(destlist, scratch_list, &destlist);             \
11326             SvREFCNT_dec(scratch_list);                                    \
11327         }                                                                  \
11328     }                                                                      \
11329     else {                                                                 \
11330         _invlist_union_complement_2nd(destlist,                            \
11331                                     (AT_LEAST_ASCII_RESTRICTED)            \
11332                                         ? sourcelist                       \
11333                                         : Xsourcelist,                     \
11334                                     &destlist);                            \
11335         /* Under /d, everything in the upper half of the Latin1 range      \
11336          * matches this complement */                                      \
11337         if (DEPENDS_SEMANTICS) {                                           \
11338             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
11339         }                                                                  \
11340     }
11341
11342 /* Generate the code to add a posix character <class> to the bracketed
11343  * character class given by <node>.  (<node> is needed only under locale rules)
11344  * destlist       is the inversion list for non-locale rules that this class is
11345  *                to be added to
11346  * sourcelist     is the ASCII-range inversion list to add under /a rules
11347  * l1_sourcelist  is the Latin1 range list to use otherwise.
11348  * Xpropertyname  is the name to add to <run_time_list> of the property to
11349  *                specify the code points above Latin1 that will have to be
11350  *                determined at run-time
11351  * run_time_list  is a SV* that contains text names of properties that are to
11352  *                be computed at run time.  This concatenates <Xpropertyname>
11353  *                to it, appropriately
11354  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11355  * time */
11356 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
11357                               l1_sourcelist, Xpropertyname, run_time_list) \
11358         /* First, resolve whether to use the ASCII-only list or the L1     \
11359          * list */                                                         \
11360         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
11361                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11362                 Xpropertyname, run_time_list)
11363
11364 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11365                 Xpropertyname, run_time_list)                              \
11366     /* If not /a matching, there are going to be code points we will have  \
11367      * to defer to runtime to look-up */                                   \
11368     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
11369         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11370     }                                                                      \
11371     if (LOC) {                                                             \
11372         ANYOF_CLASS_SET(node, class);                                      \
11373     }                                                                      \
11374     else {                                                                 \
11375         _invlist_union(destlist, sourcelist, &destlist);                   \
11376     }
11377
11378 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
11379  * this and DO_N_POSIX.  Sets <matches_above_unicode> only if it can; unchanged
11380  * otherwise */
11381 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
11382        l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11383     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
11384         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
11385     }                                                                      \
11386     else {                                                                 \
11387         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11388         matches_above_unicode = TRUE;                                      \
11389         if (LOC) {                                                         \
11390             ANYOF_CLASS_SET(node, namedclass);                             \
11391         }                                                                  \
11392         else {                                                             \
11393             SV* scratch_list = NULL;                                       \
11394             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
11395             if (! destlist) {                                              \
11396                 destlist = scratch_list;                                   \
11397             }                                                              \
11398             else {                                                         \
11399                 _invlist_union(destlist, scratch_list, &destlist);         \
11400                 SvREFCNT_dec(scratch_list);                                \
11401             }                                                              \
11402             if (DEPENDS_SEMANTICS) {                                       \
11403                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
11404             }                                                              \
11405         }                                                                  \
11406     }
11407
11408 /* The names of properties whose definitions are not known at compile time are
11409  * stored in this SV, after a constant heading.  So if the length has been
11410  * changed since initialization, then there is a run-time definition. */
11411 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11412
11413 /* This converts the named class defined in regcomp.h to its equivalent class
11414  * number defined in handy.h. */
11415 #define namedclass_to_classnum(class)  ((class) / 2)
11416
11417 STATIC regnode *
11418 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11419 {
11420     /* parse a bracketed class specification.  Most of these will produce an ANYOF node;
11421      * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11422      * node; [[:ascii:]], a POSIXA node; etc.  It is more complex under /i with
11423      * multi-character folds: it will be rewritten following the paradigm of
11424      * this example, where the <multi-fold>s are characters which fold to
11425      * multiple character sequences:
11426      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11427      * gets effectively rewritten as:
11428      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11429      * reg() gets called (recursively) on the rewritten version, and this
11430      * function will return what it constructs.  (Actually the <multi-fold>s
11431      * aren't physically removed from the [abcdefghi], it's just that they are
11432      * ignored in the recursion by means of a a flag:
11433      * <RExC_in_multi_char_class>.)
11434      *
11435      * ANYOF nodes contain a bit map for the first 256 characters, with the
11436      * corresponding bit set if that character is in the list.  For characters
11437      * above 255, a range list or swash is used.  There are extra bits for \w,
11438      * etc. in locale ANYOFs, as what these match is not determinable at
11439      * compile time */
11440
11441     dVAR;
11442     UV nextvalue;
11443     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11444     IV range = 0;
11445     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11446     regnode *ret;
11447     STRLEN numlen;
11448     IV namedclass = OOB_NAMEDCLASS;
11449     char *rangebegin = NULL;
11450     bool need_class = 0;
11451     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
11452     SV *listsv = NULL;
11453     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11454                                       than just initialized.  */
11455     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11456     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11457                                extended beyond the Latin1 range */
11458     UV element_count = 0;   /* Number of distinct elements in the class.
11459                                Optimizations may be possible if this is tiny */
11460     AV * multi_char_matches = NULL; /* Code points that fold to more than one
11461                                        character; used under /i */
11462     UV n;
11463
11464     /* Unicode properties are stored in a swash; this holds the current one
11465      * being parsed.  If this swash is the only above-latin1 component of the
11466      * character class, an optimization is to pass it directly on to the
11467      * execution engine.  Otherwise, it is set to NULL to indicate that there
11468      * are other things in the class that have to be dealt with at execution
11469      * time */
11470     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11471
11472     /* Set if a component of this character class is user-defined; just passed
11473      * on to the engine */
11474     bool has_user_defined_property = FALSE;
11475
11476     /* inversion list of code points this node matches only when the target
11477      * string is in UTF-8.  (Because is under /d) */
11478     SV* depends_list = NULL;
11479
11480     /* inversion list of code points this node matches.  For much of the
11481      * function, it includes only those that match regardless of the utf8ness
11482      * of the target string */
11483     SV* cp_list = NULL;
11484
11485 #ifdef EBCDIC
11486     /* In a range, counts how many 0-2 of the ends of it came from literals,
11487      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11488     UV literal_endpoint = 0;
11489 #endif
11490     bool invert = FALSE;    /* Is this class to be complemented */
11491
11492     /* Is there any thing like \W or [:^digit:] that matches above the legal
11493      * Unicode range? */
11494     bool runtime_posix_matches_above_Unicode = FALSE;
11495
11496     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11497         case we need to change the emitted regop to an EXACT. */
11498     const char * orig_parse = RExC_parse;
11499     const I32 orig_size = RExC_size;
11500     GET_RE_DEBUG_FLAGS_DECL;
11501
11502     PERL_ARGS_ASSERT_REGCLASS;
11503 #ifndef DEBUGGING
11504     PERL_UNUSED_ARG(depth);
11505 #endif
11506
11507     DEBUG_PARSE("clas");
11508
11509     /* Assume we are going to generate an ANYOF node. */
11510     ret = reganode(pRExC_state, ANYOF, 0);
11511
11512     if (!SIZE_ONLY) {
11513         ANYOF_FLAGS(ret) = 0;
11514     }
11515
11516     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11517         RExC_parse++;
11518         if (! RExC_in_multi_char_class) {
11519             invert = TRUE;
11520             RExC_naughty++;
11521
11522             /* We have decided to not allow multi-char folds in inverted
11523              * character classes, due to the confusion that can happen,
11524              * especially with classes that are designed for a non-Unicode
11525              * world:  You have the peculiar case that:
11526                 "s s" =~ /^[^\xDF]+$/i => Y
11527                 "ss"  =~ /^[^\xDF]+$/i => N
11528             *
11529             * See [perl #89750] */
11530             allow_full_fold = FALSE;
11531         }
11532     }
11533
11534     if (SIZE_ONLY) {
11535         RExC_size += ANYOF_SKIP;
11536         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11537     }
11538     else {
11539         RExC_emit += ANYOF_SKIP;
11540         if (LOC) {
11541             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11542         }
11543         listsv = newSVpvs("# comment\n");
11544         initial_listsv_len = SvCUR(listsv);
11545     }
11546
11547     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11548
11549     if (!SIZE_ONLY && POSIXCC(nextvalue))
11550         checkposixcc(pRExC_state);
11551
11552     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11553     if (UCHARAT(RExC_parse) == ']')
11554         goto charclassloop;
11555
11556 parseit:
11557     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11558
11559     charclassloop:
11560
11561         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11562         save_value = value;
11563         save_prevvalue = prevvalue;
11564
11565         if (!range) {
11566             rangebegin = RExC_parse;
11567             element_count++;
11568         }
11569         if (UTF) {
11570             value = utf8n_to_uvchr((U8*)RExC_parse,
11571                                    RExC_end - RExC_parse,
11572                                    &numlen, UTF8_ALLOW_DEFAULT);
11573             RExC_parse += numlen;
11574         }
11575         else
11576             value = UCHARAT(RExC_parse++);
11577
11578         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11579         if (value == '[' && POSIXCC(nextvalue))
11580             namedclass = regpposixcc(pRExC_state, value);
11581         else if (value == '\\') {
11582             if (UTF) {
11583                 value = utf8n_to_uvchr((U8*)RExC_parse,
11584                                    RExC_end - RExC_parse,
11585                                    &numlen, UTF8_ALLOW_DEFAULT);
11586                 RExC_parse += numlen;
11587             }
11588             else
11589                 value = UCHARAT(RExC_parse++);
11590             /* Some compilers cannot handle switching on 64-bit integer
11591              * values, therefore value cannot be an UV.  Yes, this will
11592              * be a problem later if we want switch on Unicode.
11593              * A similar issue a little bit later when switching on
11594              * namedclass. --jhi */
11595             switch ((I32)value) {
11596             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
11597             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
11598             case 's':   namedclass = ANYOF_SPACE;       break;
11599             case 'S':   namedclass = ANYOF_NSPACE;      break;
11600             case 'd':   namedclass = ANYOF_DIGIT;       break;
11601             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11602             case 'v':   namedclass = ANYOF_VERTWS;      break;
11603             case 'V':   namedclass = ANYOF_NVERTWS;     break;
11604             case 'h':   namedclass = ANYOF_HORIZWS;     break;
11605             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
11606             case 'N':  /* Handle \N{NAME} in class */
11607                 {
11608                     /* We only pay attention to the first char of
11609                     multichar strings being returned. I kinda wonder
11610                     if this makes sense as it does change the behaviour
11611                     from earlier versions, OTOH that behaviour was broken
11612                     as well. */
11613                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11614                                       TRUE /* => charclass */))
11615                     {
11616                         goto parseit;
11617                     }
11618                 }
11619                 break;
11620             case 'p':
11621             case 'P':
11622                 {
11623                 char *e;
11624
11625                 /* This routine will handle any undefined properties */
11626                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11627
11628                 if (RExC_parse >= RExC_end)
11629                     vFAIL2("Empty \\%c{}", (U8)value);
11630                 if (*RExC_parse == '{') {
11631                     const U8 c = (U8)value;
11632                     e = strchr(RExC_parse++, '}');
11633                     if (!e)
11634                         vFAIL2("Missing right brace on \\%c{}", c);
11635                     while (isSPACE(UCHARAT(RExC_parse)))
11636                         RExC_parse++;
11637                     if (e == RExC_parse)
11638                         vFAIL2("Empty \\%c{}", c);
11639                     n = e - RExC_parse;
11640                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11641                         n--;
11642                 }
11643                 else {
11644                     e = RExC_parse;
11645                     n = 1;
11646                 }
11647                 if (!SIZE_ONLY) {
11648                     SV* invlist;
11649                     char* name;
11650
11651                     if (UCHARAT(RExC_parse) == '^') {
11652                          RExC_parse++;
11653                          n--;
11654                          value = value == 'p' ? 'P' : 'p'; /* toggle */
11655                          while (isSPACE(UCHARAT(RExC_parse))) {
11656                               RExC_parse++;
11657                               n--;
11658                          }
11659                     }
11660                     /* Try to get the definition of the property into
11661                      * <invlist>.  If /i is in effect, the effective property
11662                      * will have its name be <__NAME_i>.  The design is
11663                      * discussed in commit
11664                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11665                     Newx(name, n + sizeof("_i__\n"), char);
11666
11667                     sprintf(name, "%s%.*s%s\n",
11668                                     (FOLD) ? "__" : "",
11669                                     (int)n,
11670                                     RExC_parse,
11671                                     (FOLD) ? "_i" : ""
11672                     );
11673
11674                     /* Look up the property name, and get its swash and
11675                      * inversion list, if the property is found  */
11676                     if (swash) {
11677                         SvREFCNT_dec(swash);
11678                     }
11679                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
11680                                              1, /* binary */
11681                                              0, /* not tr/// */
11682                                              NULL, /* No inversion list */
11683                                              &swash_init_flags
11684                                             );
11685                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11686                         if (swash) {
11687                             SvREFCNT_dec(swash);
11688                             swash = NULL;
11689                         }
11690
11691                         /* Here didn't find it.  It could be a user-defined
11692                          * property that will be available at run-time.  Add it
11693                          * to the list to look up then */
11694                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11695                                         (value == 'p' ? '+' : '!'),
11696                                         name);
11697                         has_user_defined_property = TRUE;
11698
11699                         /* We don't know yet, so have to assume that the
11700                          * property could match something in the Latin1 range,
11701                          * hence something that isn't utf8.  Note that this
11702                          * would cause things in <depends_list> to match
11703                          * inappropriately, except that any \p{}, including
11704                          * this one forces Unicode semantics, which means there
11705                          * is <no depends_list> */
11706                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11707                     }
11708                     else {
11709
11710                         /* Here, did get the swash and its inversion list.  If
11711                          * the swash is from a user-defined property, then this
11712                          * whole character class should be regarded as such */
11713                         has_user_defined_property =
11714                                     (swash_init_flags
11715                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11716
11717                         /* Invert if asking for the complement */
11718                         if (value == 'P') {
11719                             _invlist_union_complement_2nd(properties,
11720                                                           invlist,
11721                                                           &properties);
11722
11723                             /* The swash can't be used as-is, because we've
11724                              * inverted things; delay removing it to here after
11725                              * have copied its invlist above */
11726                             SvREFCNT_dec(swash);
11727                             swash = NULL;
11728                         }
11729                         else {
11730                             _invlist_union(properties, invlist, &properties);
11731                         }
11732                     }
11733                     Safefree(name);
11734                 }
11735                 RExC_parse = e + 1;
11736                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
11737
11738                 /* \p means they want Unicode semantics */
11739                 RExC_uni_semantics = 1;
11740                 }
11741                 break;
11742             case 'n':   value = '\n';                   break;
11743             case 'r':   value = '\r';                   break;
11744             case 't':   value = '\t';                   break;
11745             case 'f':   value = '\f';                   break;
11746             case 'b':   value = '\b';                   break;
11747             case 'e':   value = ASCII_TO_NATIVE('\033');break;
11748             case 'a':   value = ASCII_TO_NATIVE('\007');break;
11749             case 'o':
11750                 RExC_parse--;   /* function expects to be pointed at the 'o' */
11751                 {
11752                     const char* error_msg;
11753                     bool valid = grok_bslash_o(RExC_parse,
11754                                                &value,
11755                                                &numlen,
11756                                                &error_msg,
11757                                                SIZE_ONLY);
11758                     RExC_parse += numlen;
11759                     if (! valid) {
11760                         vFAIL(error_msg);
11761                     }
11762                 }
11763                 if (PL_encoding && value < 0x100) {
11764                     goto recode_encoding;
11765                 }
11766                 break;
11767             case 'x':
11768                 RExC_parse--;   /* function expects to be pointed at the 'x' */
11769                 {
11770                     const char* error_msg;
11771                     bool valid = grok_bslash_x(RExC_parse,
11772                                                &value,
11773                                                &numlen,
11774                                                &error_msg,
11775                                                1);
11776                     RExC_parse += numlen;
11777                     if (! valid) {
11778                         vFAIL(error_msg);
11779                     }
11780                 }
11781                 if (PL_encoding && value < 0x100)
11782                     goto recode_encoding;
11783                 break;
11784             case 'c':
11785                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11786                 break;
11787             case '0': case '1': case '2': case '3': case '4':
11788             case '5': case '6': case '7':
11789                 {
11790                     /* Take 1-3 octal digits */
11791                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11792                     numlen = 3;
11793                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11794                     RExC_parse += numlen;
11795                     if (PL_encoding && value < 0x100)
11796                         goto recode_encoding;
11797                     break;
11798                 }
11799             recode_encoding:
11800                 if (! RExC_override_recoding) {
11801                     SV* enc = PL_encoding;
11802                     value = reg_recode((const char)(U8)value, &enc);
11803                     if (!enc && SIZE_ONLY)
11804                         ckWARNreg(RExC_parse,
11805                                   "Invalid escape in the specified encoding");
11806                     break;
11807                 }
11808             default:
11809                 /* Allow \_ to not give an error */
11810                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11811                     ckWARN2reg(RExC_parse,
11812                                "Unrecognized escape \\%c in character class passed through",
11813                                (int)value);
11814                 }
11815                 break;
11816             }
11817         } /* end of \blah */
11818 #ifdef EBCDIC
11819         else
11820             literal_endpoint++;
11821 #endif
11822
11823             /* What matches in a locale is not known until runtime.  This
11824              * includes what the Posix classes (like \w, [:space:]) match.
11825              * Room must be reserved (one time per class) to store such
11826              * classes, either if Perl is compiled so that locale nodes always
11827              * should have this space, or if there is such class info to be
11828              * stored.  The space will contain a bit for each named class that
11829              * is to be matched against.  This isn't needed for \p{} and
11830              * pseudo-classes, as they are not affected by locale, and hence
11831              * are dealt with separately */
11832             if (LOC
11833                 && ! need_class
11834                 && (ANYOF_LOCALE == ANYOF_CLASS
11835                     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11836             {
11837                 need_class = 1;
11838                 if (SIZE_ONLY) {
11839                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11840                 }
11841                 else {
11842                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11843                     ANYOF_CLASS_ZERO(ret);
11844                 }
11845                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11846             }
11847
11848         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11849
11850             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
11851              * literal, as is the character that began the false range, i.e.
11852              * the 'a' in the examples */
11853             if (range) {
11854                 if (!SIZE_ONLY) {
11855                     const int w =
11856                         RExC_parse >= rangebegin ?
11857                         RExC_parse - rangebegin : 0;
11858                     ckWARN4reg(RExC_parse,
11859                                "False [] range \"%*.*s\"",
11860                                w, w, rangebegin);
11861                     cp_list = add_cp_to_invlist(cp_list, '-');
11862                     cp_list = add_cp_to_invlist(cp_list, prevvalue);
11863                 }
11864
11865                 range = 0; /* this was not a true range */
11866                 element_count += 2; /* So counts for three values */
11867             }
11868
11869             if (! SIZE_ONLY) {
11870                 switch ((I32)namedclass) {
11871
11872                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11873                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11874                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11875                     break;
11876                 case ANYOF_NALNUMC:
11877                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11878                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11879                         runtime_posix_matches_above_Unicode);
11880                     break;
11881                 case ANYOF_ALPHA:
11882                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11883                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11884                     break;
11885                 case ANYOF_NALPHA:
11886                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11887                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11888                         runtime_posix_matches_above_Unicode);
11889                     break;
11890                 case ANYOF_ASCII:
11891 #ifdef HAS_ISASCII
11892                     if (LOC) {
11893                         ANYOF_CLASS_SET(ret, namedclass);
11894                     }
11895                     else
11896 #endif  /* Not isascii(); just use the hard-coded definition for it */
11897                         _invlist_union(posixes, PL_ASCII, &posixes);
11898                     break;
11899                 case ANYOF_NASCII:
11900 #ifdef HAS_ISASCII
11901                     if (LOC) {
11902                         ANYOF_CLASS_SET(ret, namedclass);
11903                     }
11904                     else {
11905 #endif
11906                         _invlist_union_complement_2nd(posixes,
11907                                                     PL_ASCII, &posixes);
11908                         if (DEPENDS_SEMANTICS) {
11909                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11910                         }
11911 #ifdef HAS_ISASCII
11912                     }
11913 #endif
11914                     break;
11915                 case ANYOF_BLANK:
11916                     if (hasISBLANK || ! LOC) {
11917                         DO_POSIX(ret, namedclass, posixes,
11918                                             PL_PosixBlank, PL_XPosixBlank);
11919                     }
11920                     else { /* There is no isblank() and we are in locale:  We
11921                               use the ASCII range and the above-Latin1 range
11922                               code points */
11923                         SV* scratch_list = NULL;
11924
11925                         /* Include all above-Latin1 blanks */
11926                         _invlist_intersection(PL_AboveLatin1,
11927                                               PL_XPosixBlank,
11928                                               &scratch_list);
11929                         /* Add it to the running total of posix classes */
11930                         if (! posixes) {
11931                             posixes = scratch_list;
11932                         }
11933                         else {
11934                             _invlist_union(posixes, scratch_list, &posixes);
11935                             SvREFCNT_dec(scratch_list);
11936                         }
11937                         /* Add the ASCII-range blanks to the running total. */
11938                         _invlist_union(posixes, PL_PosixBlank, &posixes);
11939                     }
11940                     break;
11941                 case ANYOF_NBLANK:
11942                     if (hasISBLANK || ! LOC) {
11943                         DO_N_POSIX(ret, namedclass, posixes,
11944                                                 PL_PosixBlank, PL_XPosixBlank);
11945                     }
11946                     else { /* There is no isblank() and we are in locale */
11947                         SV* scratch_list = NULL;
11948
11949                         /* Include all above-Latin1 non-blanks */
11950                         _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11951                                           &scratch_list);
11952
11953                         /* Add them to the running total of posix classes */
11954                         _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11955                                           &scratch_list);
11956                         if (! posixes) {
11957                             posixes = scratch_list;
11958                         }
11959                         else {
11960                             _invlist_union(posixes, scratch_list, &posixes);
11961                             SvREFCNT_dec(scratch_list);
11962                         }
11963
11964                         /* Get the list of all non-ASCII-blanks in Latin 1, and
11965                          * add them to the running total */
11966                         _invlist_subtract(PL_Latin1, PL_PosixBlank,
11967                                           &scratch_list);
11968                         _invlist_union(posixes, scratch_list, &posixes);
11969                         SvREFCNT_dec(scratch_list);
11970                     }
11971                     break;
11972                 case ANYOF_CNTRL:
11973                     DO_POSIX(ret, namedclass, posixes,
11974                                             PL_PosixCntrl, PL_XPosixCntrl);
11975                     break;
11976                 case ANYOF_NCNTRL:
11977                     DO_N_POSIX(ret, namedclass, posixes,
11978                                             PL_PosixCntrl, PL_XPosixCntrl);
11979                     break;
11980                 case ANYOF_DIGIT:
11981                     /* There are no digits in the Latin1 range outside of
11982                      * ASCII, so call the macro that doesn't have to resolve
11983                      * them */
11984                     DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11985                         PL_PosixDigit, "XPosixDigit", listsv);
11986                     break;
11987                 case ANYOF_NDIGIT:
11988                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11989                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11990                         runtime_posix_matches_above_Unicode);
11991                     break;
11992                 case ANYOF_GRAPH:
11993                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11994                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11995                     break;
11996                 case ANYOF_NGRAPH:
11997                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11998                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
11999                         runtime_posix_matches_above_Unicode);
12000                     break;
12001                 case ANYOF_HORIZWS:
12002                     /* For these, we use the cp_list, as /d doesn't make a
12003                      * difference in what these match.  There would be problems
12004                      * if these characters had folds other than themselves, as
12005                      * cp_list is subject to folding.  It turns out that \h
12006                      * is just a synonym for XPosixBlank */
12007                     _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
12008                     break;
12009                 case ANYOF_NHORIZWS:
12010                     _invlist_union_complement_2nd(cp_list,
12011                                                  PL_XPosixBlank, &cp_list);
12012                     break;
12013                 case ANYOF_LOWER:
12014                 case ANYOF_NLOWER:
12015                 {   /* These require special handling, as they differ under
12016                        folding, matching Cased there (which in the ASCII range
12017                        is the same as Alpha */
12018
12019                     SV* ascii_source;
12020                     SV* l1_source;
12021                     const char *Xname;
12022
12023                     if (FOLD && ! LOC) {
12024                         ascii_source = PL_PosixAlpha;
12025                         l1_source = PL_L1Cased;
12026                         Xname = "Cased";
12027                     }
12028                     else {
12029                         ascii_source = PL_PosixLower;
12030                         l1_source = PL_L1PosixLower;
12031                         Xname = "XPosixLower";
12032                     }
12033                     if (namedclass == ANYOF_LOWER) {
12034                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12035                                     ascii_source, l1_source, Xname, listsv);
12036                     }
12037                     else {
12038                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12039                             posixes, ascii_source, l1_source, Xname, listsv,
12040                             runtime_posix_matches_above_Unicode);
12041                     }
12042                     break;
12043                 }
12044                 case ANYOF_PRINT:
12045                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12046                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
12047                     break;
12048                 case ANYOF_NPRINT:
12049                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12050                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
12051                         runtime_posix_matches_above_Unicode);
12052                     break;
12053                 case ANYOF_PUNCT:
12054                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12055                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
12056                     break;
12057                 case ANYOF_NPUNCT:
12058                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12059                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
12060                         runtime_posix_matches_above_Unicode);
12061                     break;
12062                 case ANYOF_PSXSPC:
12063                     DO_POSIX(ret, namedclass, posixes,
12064                                             PL_PosixSpace, PL_XPosixSpace);
12065                     break;
12066                 case ANYOF_NPSXSPC:
12067                     DO_N_POSIX(ret, namedclass, posixes,
12068                                             PL_PosixSpace, PL_XPosixSpace);
12069                     break;
12070                 case ANYOF_SPACE:
12071                     DO_POSIX(ret, namedclass, posixes,
12072                                             PL_PerlSpace, PL_XPerlSpace);
12073                     break;
12074                 case ANYOF_NSPACE:
12075                     DO_N_POSIX(ret, namedclass, posixes,
12076                                             PL_PerlSpace, PL_XPerlSpace);
12077                     break;
12078                 case ANYOF_UPPER:   /* Same as LOWER, above */
12079                 case ANYOF_NUPPER:
12080                 {
12081                     SV* ascii_source;
12082                     SV* l1_source;
12083                     const char *Xname;
12084
12085                     if (FOLD && ! LOC) {
12086                         ascii_source = PL_PosixAlpha;
12087                         l1_source = PL_L1Cased;
12088                         Xname = "Cased";
12089                     }
12090                     else {
12091                         ascii_source = PL_PosixUpper;
12092                         l1_source = PL_L1PosixUpper;
12093                         Xname = "XPosixUpper";
12094                     }
12095                     if (namedclass == ANYOF_UPPER) {
12096                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12097                                     ascii_source, l1_source, Xname, listsv);
12098                     }
12099                     else {
12100                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12101                         posixes, ascii_source, l1_source, Xname, listsv,
12102                         runtime_posix_matches_above_Unicode);
12103                     }
12104                     break;
12105                 }
12106                 case ANYOF_WORDCHAR:
12107                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12108                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12109                     break;
12110                 case ANYOF_NWORDCHAR:
12111                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12112                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12113                             runtime_posix_matches_above_Unicode);
12114                     break;
12115                 case ANYOF_VERTWS:
12116                     /* For these, we use the cp_list, as /d doesn't make a
12117                      * difference in what these match.  There would be problems
12118                      * if these characters had folds other than themselves, as
12119                      * cp_list is subject to folding */
12120                     _invlist_union(cp_list, PL_VertSpace, &cp_list);
12121                     break;
12122                 case ANYOF_NVERTWS:
12123                     _invlist_union_complement_2nd(cp_list,
12124                                                     PL_VertSpace, &cp_list);
12125                     break;
12126                 case ANYOF_XDIGIT:
12127                     DO_POSIX(ret, namedclass, posixes,
12128                                             PL_PosixXDigit, PL_XPosixXDigit);
12129                     break;
12130                 case ANYOF_NXDIGIT:
12131                     DO_N_POSIX(ret, namedclass, posixes,
12132                                             PL_PosixXDigit, PL_XPosixXDigit);
12133                     break;
12134                 case ANYOF_MAX:
12135                     /* this is to handle \p and \P */
12136                     break;
12137                 default:
12138                     vFAIL("Invalid [::] class");
12139                     break;
12140                 }
12141
12142                 continue;   /* Go get next character */
12143             }
12144         } /* end of namedclass \blah */
12145
12146         if (range) {
12147             if (prevvalue > value) /* b-a */ {
12148                 const int w = RExC_parse - rangebegin;
12149                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12150                 range = 0; /* not a valid range */
12151             }
12152         }
12153         else {
12154             prevvalue = value; /* save the beginning of the potential range */
12155             if (RExC_parse+1 < RExC_end
12156                 && *RExC_parse == '-'
12157                 && RExC_parse[1] != ']')
12158             {
12159                 RExC_parse++;
12160
12161                 /* a bad range like \w-, [:word:]- ? */
12162                 if (namedclass > OOB_NAMEDCLASS) {
12163                     if (ckWARN(WARN_REGEXP)) {
12164                         const int w =
12165                             RExC_parse >= rangebegin ?
12166                             RExC_parse - rangebegin : 0;
12167                         vWARN4(RExC_parse,
12168                                "False [] range \"%*.*s\"",
12169                                w, w, rangebegin);
12170                     }
12171                     if (!SIZE_ONLY) {
12172                         cp_list = add_cp_to_invlist(cp_list, '-');
12173                     }
12174                     element_count++;
12175                 } else
12176                     range = 1;  /* yeah, it's a range! */
12177                 continue;       /* but do it the next time */
12178             }
12179         }
12180
12181         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12182          * if not */
12183
12184         /* non-Latin1 code point implies unicode semantics.  Must be set in
12185          * pass1 so is there for the whole of pass 2 */
12186         if (value > 255) {
12187             RExC_uni_semantics = 1;
12188         }
12189
12190         /* Ready to process either the single value, or the completed range.
12191          * For single-valued non-inverted ranges, we consider the possibility
12192          * of multi-char folds.  (We made a conscious decision to not do this
12193          * for the other cases because it can often lead to non-intuitive
12194          * results) */
12195         if (FOLD && ! invert && value == prevvalue) {
12196             if (value == LATIN_SMALL_LETTER_SHARP_S
12197                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12198                                                         value)))
12199             {
12200                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12201
12202                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12203                 STRLEN foldlen;
12204
12205                 UV folded = _to_uni_fold_flags(
12206                                 value,
12207                                 foldbuf,
12208                                 &foldlen,
12209                                 FOLD_FLAGS_FULL
12210                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12211                                             : (ASCII_FOLD_RESTRICTED)
12212                                               ? FOLD_FLAGS_NOMIX_ASCII
12213                                               : 0)
12214                                 );
12215
12216                 /* Here, <folded> should be the first character of the
12217                  * multi-char fold of <value>, with <foldbuf> containing the
12218                  * whole thing.  But, if this fold is not allowed (because of
12219                  * the flags), <fold> will be the same as <value>, and should
12220                  * be processed like any other character, so skip the special
12221                  * handling */
12222                 if (folded != value) {
12223
12224                     /* Skip if we are recursed, currently parsing the class
12225                      * again.  Otherwise add this character to the list of
12226                      * multi-char folds. */
12227                     if (! RExC_in_multi_char_class) {
12228                         AV** this_array_ptr;
12229                         AV* this_array;
12230                         STRLEN cp_count = utf8_length(foldbuf,
12231                                                       foldbuf + foldlen);
12232                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12233
12234                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12235
12236
12237                         if (! multi_char_matches) {
12238                             multi_char_matches = newAV();
12239                         }
12240
12241                         /* <multi_char_matches> is actually an array of arrays.
12242                          * There will be one or two top-level elements: [2],
12243                          * and/or [3].  The [2] element is an array, each
12244                          * element thereof is a character which folds to two
12245                          * characters; likewise for [3].  (Unicode guarantees a
12246                          * maximum of 3 characters in any fold.)  When we
12247                          * rewrite the character class below, we will do so
12248                          * such that the longest folds are written first, so
12249                          * that it prefers the longest matching strings first.
12250                          * This is done even if it turns out that any
12251                          * quantifier is non-greedy, out of programmer
12252                          * laziness.  Tom Christiansen has agreed that this is
12253                          * ok.  This makes the test for the ligature 'ffi' come
12254                          * before the test for 'ff' */
12255                         if (av_exists(multi_char_matches, cp_count)) {
12256                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12257                                                              cp_count, FALSE);
12258                             this_array = *this_array_ptr;
12259                         }
12260                         else {
12261                             this_array = newAV();
12262                             av_store(multi_char_matches, cp_count,
12263                                      (SV*) this_array);
12264                         }
12265                         av_push(this_array, multi_fold);
12266                     }
12267
12268                     /* This element should not be processed further in this
12269                      * class */
12270                     element_count--;
12271                     value = save_value;
12272                     prevvalue = save_prevvalue;
12273                     continue;
12274                 }
12275             }
12276         }
12277
12278         /* Deal with this element of the class */
12279         if (! SIZE_ONLY) {
12280 #ifndef EBCDIC
12281             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12282 #else
12283             UV* this_range = _new_invlist(1);
12284             _append_range_to_invlist(this_range, prevvalue, value);
12285
12286             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12287              * If this range was specified using something like 'i-j', we want
12288              * to include only the 'i' and the 'j', and not anything in
12289              * between, so exclude non-ASCII, non-alphabetics from it.
12290              * However, if the range was specified with something like
12291              * [\x89-\x91] or [\x89-j], all code points within it should be
12292              * included.  literal_endpoint==2 means both ends of the range used
12293              * a literal character, not \x{foo} */
12294             if (literal_endpoint == 2
12295                 && (prevvalue >= 'a' && value <= 'z')
12296                     || (prevvalue >= 'A' && value <= 'Z'))
12297             {
12298                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12299                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12300             }
12301             _invlist_union(cp_list, this_range, &cp_list);
12302             literal_endpoint = 0;
12303 #endif
12304         }
12305
12306         range = 0; /* this range (if it was one) is done now */
12307     } /* End of loop through all the text within the brackets */
12308
12309     /* If anything in the class expands to more than one character, we have to
12310      * deal with them by building up a substitute parse string, and recursively
12311      * calling reg() on it, instead of proceeding */
12312     if (multi_char_matches) {
12313         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12314         I32 cp_count;
12315         STRLEN len;
12316         char *save_end = RExC_end;
12317         char *save_parse = RExC_parse;
12318         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
12319                                        a "|" */
12320         I32 reg_flags;
12321
12322         assert(! invert);
12323 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
12324            because too confusing */
12325         if (invert) {
12326             sv_catpv(substitute_parse, "(?:");
12327         }
12328 #endif
12329
12330         /* Look at the longest folds first */
12331         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12332
12333             if (av_exists(multi_char_matches, cp_count)) {
12334                 AV** this_array_ptr;
12335                 SV* this_sequence;
12336
12337                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12338                                                  cp_count, FALSE);
12339                 while ((this_sequence = av_pop(*this_array_ptr)) !=
12340                                                                 &PL_sv_undef)
12341                 {
12342                     if (! first_time) {
12343                         sv_catpv(substitute_parse, "|");
12344                     }
12345                     first_time = FALSE;
12346
12347                     sv_catpv(substitute_parse, SvPVX(this_sequence));
12348                 }
12349             }
12350         }
12351
12352         /* If the character class contains anything else besides these
12353          * multi-character folds, have to include it in recursive parsing */
12354         if (element_count) {
12355             sv_catpv(substitute_parse, "|[");
12356             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12357             sv_catpv(substitute_parse, "]");
12358         }
12359
12360         sv_catpv(substitute_parse, ")");
12361 #if 0
12362         if (invert) {
12363             /* This is a way to get the parse to skip forward a whole named
12364              * sequence instead of matching the 2nd character when it fails the
12365              * first */
12366             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12367         }
12368 #endif
12369
12370         RExC_parse = SvPV(substitute_parse, len);
12371         RExC_end = RExC_parse + len;
12372         RExC_in_multi_char_class = 1;
12373         RExC_emit = (regnode *)orig_emit;
12374
12375         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
12376
12377         *flagp |= reg_flags&(HASWIDTH|SPSTART|POSTPONED);
12378
12379         RExC_parse = save_parse;
12380         RExC_end = save_end;
12381         RExC_in_multi_char_class = 0;
12382         SvREFCNT_dec(multi_char_matches);
12383         return ret;
12384     }
12385
12386     /* If the character class contains only a single element, it may be
12387      * optimizable into another node type which is smaller and runs faster.
12388      * Check if this is the case for this class */
12389     if (element_count == 1) {
12390         U8 op = END;
12391         U8 arg = 0;
12392
12393         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12394                                               [:digit:] or \p{foo} */
12395
12396             /* Certain named classes have equivalents that can appear outside a
12397              * character class, e.g. \w, \H.  We use these instead of a
12398              * character class. */
12399             switch ((I32)namedclass) {
12400                 U8 offset;
12401
12402                 /* The first group is for node types that depend on the charset
12403                  * modifier to the regex.  We first calculate the base node
12404                  * type, and if it should be inverted */
12405
12406                 case ANYOF_NWORDCHAR:
12407                     invert = ! invert;
12408                     /* FALLTHROUGH */
12409                 case ANYOF_WORDCHAR:
12410                     op = ALNUM;
12411                     goto join_charset_classes;
12412
12413                 case ANYOF_NSPACE:
12414                     invert = ! invert;
12415                     /* FALLTHROUGH */
12416                 case ANYOF_SPACE:
12417                     op = SPACE;
12418                     goto join_charset_classes;
12419
12420                 case ANYOF_NDIGIT:
12421                     invert = ! invert;
12422                     /* FALLTHROUGH */
12423                 case ANYOF_DIGIT:
12424                     op = DIGIT;
12425
12426                   join_charset_classes:
12427
12428                     /* Now that we have the base node type, we take advantage
12429                      * of the enum ordering of the charset modifiers to get the
12430                      * exact node type,  For example the base SPACE also has
12431                      * SPACEL, SPACEU, and SPACEA */
12432
12433                     offset = get_regex_charset(RExC_flags);
12434
12435                     /* /aa is the same as /a for these */
12436                     if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12437                         offset = REGEX_ASCII_RESTRICTED_CHARSET;
12438                     }
12439                     else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12440                         offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12441                     }
12442
12443                     op += offset;
12444
12445                     /* The number of varieties of each of these is the same,
12446                      * hence, so is the delta between the normal and
12447                      * complemented nodes */
12448                     if (invert) {
12449                         op += NALNUM - ALNUM;
12450                     }
12451                     *flagp |= HASWIDTH|SIMPLE;
12452                     break;
12453
12454                 /* The second group doesn't depend of the charset modifiers.
12455                  * We just have normal and complemented */
12456                 case ANYOF_NHORIZWS:
12457                     invert = ! invert;
12458                     /* FALLTHROUGH */
12459                 case ANYOF_HORIZWS:
12460                   is_horizws:
12461                     op = (invert) ? NHORIZWS : HORIZWS;
12462                     *flagp |= HASWIDTH|SIMPLE;
12463                     break;
12464
12465                 case ANYOF_NVERTWS:
12466                     invert = ! invert;
12467                     /* FALLTHROUGH */
12468                 case ANYOF_VERTWS:
12469                     op = (invert) ? NVERTWS : VERTWS;
12470                     *flagp |= HASWIDTH|SIMPLE;
12471                     break;
12472
12473                 case ANYOF_MAX:
12474                     break;
12475
12476                 case ANYOF_NBLANK:
12477                     invert = ! invert;
12478                     /* FALLTHROUGH */
12479                 case ANYOF_BLANK:
12480                     if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12481                         goto is_horizws;
12482                     }
12483                     /* FALLTHROUGH */
12484                 default:
12485                     /* A generic posix class.  All the /a ones can be handled
12486                      * by the POSIXA opcode.  And all are closed under folding
12487                      * in the ASCII range, so FOLD doesn't matter */
12488                     if (AT_LEAST_ASCII_RESTRICTED
12489                         || (! LOC && namedclass == ANYOF_ASCII))
12490                     {
12491                         /* The odd numbered ones are the complements of the
12492                          * next-lower even number one */
12493                         if (namedclass % 2 == 1) {
12494                             invert = ! invert;
12495                             namedclass--;
12496                         }
12497                         arg = namedclass_to_classnum(namedclass);
12498                         op = (invert) ? NPOSIXA : POSIXA;
12499                     }
12500                     break;
12501             }
12502         }
12503         else if (value == prevvalue) {
12504
12505             /* Here, the class consists of just a single code point */
12506
12507             if (invert) {
12508                 if (! LOC && value == '\n') {
12509                     op = REG_ANY; /* Optimize [^\n] */
12510                     *flagp |= HASWIDTH|SIMPLE;
12511                     RExC_naughty++;
12512                 }
12513             }
12514             else if (value < 256 || UTF) {
12515
12516                 /* Optimize a single value into an EXACTish node, but not if it
12517                  * would require converting the pattern to UTF-8. */
12518                 op = compute_EXACTish(pRExC_state);
12519             }
12520         } /* Otherwise is a range */
12521         else if (! LOC) {   /* locale could vary these */
12522             if (prevvalue == '0') {
12523                 if (value == '9') {
12524                     op = (invert) ? NDIGITA : DIGITA;
12525                     *flagp |= HASWIDTH|SIMPLE;
12526                 }
12527             }
12528         }
12529
12530         /* Here, we have changed <op> away from its initial value iff we found
12531          * an optimization */
12532         if (op != END) {
12533
12534             /* Throw away this ANYOF regnode, and emit the calculated one,
12535              * which should correspond to the beginning, not current, state of
12536              * the parse */
12537             const char * cur_parse = RExC_parse;
12538             RExC_parse = (char *)orig_parse;
12539             if ( SIZE_ONLY) {
12540                 if (! LOC) {
12541
12542                     /* To get locale nodes to not use the full ANYOF size would
12543                      * require moving the code above that writes the portions
12544                      * of it that aren't in other nodes to after this point.
12545                      * e.g.  ANYOF_CLASS_SET */
12546                     RExC_size = orig_size;
12547                 }
12548             }
12549             else {
12550                 RExC_emit = (regnode *)orig_emit;
12551             }
12552
12553             ret = reg_node(pRExC_state, op);
12554
12555             if (PL_regkind[op] == POSIXD) {
12556                 if (! SIZE_ONLY) {
12557                     FLAGS(ret) = arg;
12558                 }
12559                 *flagp |= HASWIDTH|SIMPLE;
12560             }
12561             else if (PL_regkind[op] == EXACT) {
12562                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12563             }
12564
12565             RExC_parse = (char *) cur_parse;
12566
12567             SvREFCNT_dec(listsv);
12568             return ret;
12569         }
12570     }
12571
12572     if (SIZE_ONLY)
12573         return ret;
12574     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12575
12576     /* If folding, we calculate all characters that could fold to or from the
12577      * ones already on the list */
12578     if (FOLD && cp_list) {
12579         UV start, end;  /* End points of code point ranges */
12580
12581         SV* fold_intersection = NULL;
12582
12583         /* If the highest code point is within Latin1, we can use the
12584          * compiled-in Alphas list, and not have to go out to disk.  This
12585          * yields two false positives, the masculine and feminine oridinal
12586          * indicators, which are weeded out below using the
12587          * IS_IN_SOME_FOLD_L1() macro */
12588         if (invlist_highest(cp_list) < 256) {
12589             _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12590         }
12591         else {
12592
12593             /* Here, there are non-Latin1 code points, so we will have to go
12594              * fetch the list of all the characters that participate in folds
12595              */
12596             if (! PL_utf8_foldable) {
12597                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12598                                        &PL_sv_undef, 1, 0);
12599                 PL_utf8_foldable = _get_swash_invlist(swash);
12600                 SvREFCNT_dec(swash);
12601             }
12602
12603             /* This is a hash that for a particular fold gives all characters
12604              * that are involved in it */
12605             if (! PL_utf8_foldclosures) {
12606
12607                 /* If we were unable to find any folds, then we likely won't be
12608                  * able to find the closures.  So just create an empty list.
12609                  * Folding will effectively be restricted to the non-Unicode
12610                  * rules hard-coded into Perl.  (This case happens legitimately
12611                  * during compilation of Perl itself before the Unicode tables
12612                  * are generated) */
12613                 if (_invlist_len(PL_utf8_foldable) == 0) {
12614                     PL_utf8_foldclosures = newHV();
12615                 }
12616                 else {
12617                     /* If the folds haven't been read in, call a fold function
12618                      * to force that */
12619                     if (! PL_utf8_tofold) {
12620                         U8 dummy[UTF8_MAXBYTES+1];
12621
12622                         /* This string is just a short named one above \xff */
12623                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12624                         assert(PL_utf8_tofold); /* Verify that worked */
12625                     }
12626                     PL_utf8_foldclosures =
12627                                         _swash_inversion_hash(PL_utf8_tofold);
12628                 }
12629             }
12630
12631             /* Only the characters in this class that participate in folds need
12632              * be checked.  Get the intersection of this class and all the
12633              * possible characters that are foldable.  This can quickly narrow
12634              * down a large class */
12635             _invlist_intersection(PL_utf8_foldable, cp_list,
12636                                   &fold_intersection);
12637         }
12638
12639         /* Now look at the foldable characters in this class individually */
12640         invlist_iterinit(fold_intersection);
12641         while (invlist_iternext(fold_intersection, &start, &end)) {
12642             UV j;
12643
12644             /* Locale folding for Latin1 characters is deferred until runtime */
12645             if (LOC && start < 256) {
12646                 start = 256;
12647             }
12648
12649             /* Look at every character in the range */
12650             for (j = start; j <= end; j++) {
12651
12652                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12653                 STRLEN foldlen;
12654                 UV f;
12655                 SV** listp;
12656
12657                 if (j < 256) {
12658
12659                     /* We have the latin1 folding rules hard-coded here so that
12660                      * an innocent-looking character class, like /[ks]/i won't
12661                      * have to go out to disk to find the possible matches.
12662                      * XXX It would be better to generate these via regen, in
12663                      * case a new version of the Unicode standard adds new
12664                      * mappings, though that is not really likely, and may be
12665                      * caught by the default: case of the switch below. */
12666
12667                     if (IS_IN_SOME_FOLD_L1(j)) {
12668
12669                         /* ASCII is always matched; non-ASCII is matched only
12670                          * under Unicode rules */
12671                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12672                             cp_list =
12673                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12674                         }
12675                         else {
12676                             depends_list =
12677                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12678                         }
12679                     }
12680
12681                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12682                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12683                     {
12684                         /* Certain Latin1 characters have matches outside
12685                          * Latin1.  To get here, <j> is one of those
12686                          * characters.   None of these matches is valid for
12687                          * ASCII characters under /aa, which is why the 'if'
12688                          * just above excludes those.  These matches only
12689                          * happen when the target string is utf8.  The code
12690                          * below adds the single fold closures for <j> to the
12691                          * inversion list. */
12692                         switch (j) {
12693                             case 'k':
12694                             case 'K':
12695                                 cp_list =
12696                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
12697                                 break;
12698                             case 's':
12699                             case 'S':
12700                                 cp_list = add_cp_to_invlist(cp_list,
12701                                                     LATIN_SMALL_LETTER_LONG_S);
12702                                 break;
12703                             case MICRO_SIGN:
12704                                 cp_list = add_cp_to_invlist(cp_list,
12705                                                     GREEK_CAPITAL_LETTER_MU);
12706                                 cp_list = add_cp_to_invlist(cp_list,
12707                                                     GREEK_SMALL_LETTER_MU);
12708                                 break;
12709                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12710                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12711                                 cp_list =
12712                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12713                                 break;
12714                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12715                                 cp_list = add_cp_to_invlist(cp_list,
12716                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12717                                 break;
12718                             case LATIN_SMALL_LETTER_SHARP_S:
12719                                 cp_list = add_cp_to_invlist(cp_list,
12720                                                 LATIN_CAPITAL_LETTER_SHARP_S);
12721                                 break;
12722                             case 'F': case 'f':
12723                             case 'I': case 'i':
12724                             case 'L': case 'l':
12725                             case 'T': case 't':
12726                             case 'A': case 'a':
12727                             case 'H': case 'h':
12728                             case 'J': case 'j':
12729                             case 'N': case 'n':
12730                             case 'W': case 'w':
12731                             case 'Y': case 'y':
12732                                 /* These all are targets of multi-character
12733                                  * folds from code points that require UTF8 to
12734                                  * express, so they can't match unless the
12735                                  * target string is in UTF-8, so no action here
12736                                  * is necessary, as regexec.c properly handles
12737                                  * the general case for UTF-8 matching and
12738                                  * multi-char folds */
12739                                 break;
12740                             default:
12741                                 /* Use deprecated warning to increase the
12742                                  * chances of this being output */
12743                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12744                                 break;
12745                         }
12746                     }
12747                     continue;
12748                 }
12749
12750                 /* Here is an above Latin1 character.  We don't have the rules
12751                  * hard-coded for it.  First, get its fold.  This is the simple
12752                  * fold, as the multi-character folds have been handled earlier
12753                  * and separated out */
12754                 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12755                                         ((LOC)
12756                                         ? FOLD_FLAGS_LOCALE
12757                                         : (ASCII_FOLD_RESTRICTED)
12758                                             ? FOLD_FLAGS_NOMIX_ASCII
12759                                             : 0));
12760
12761                 /* Single character fold of above Latin1.  Add everything in
12762                  * its fold closure to the list that this node should match.
12763                  * The fold closures data structure is a hash with the keys
12764                  * being the UTF-8 of every character that is folded to, like
12765                  * 'k', and the values each an array of all code points that
12766                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
12767                  * Multi-character folds are not included */
12768                 if ((listp = hv_fetch(PL_utf8_foldclosures,
12769                                       (char *) foldbuf, foldlen, FALSE)))
12770                 {
12771                     AV* list = (AV*) *listp;
12772                     IV k;
12773                     for (k = 0; k <= av_len(list); k++) {
12774                         SV** c_p = av_fetch(list, k, FALSE);
12775                         UV c;
12776                         if (c_p == NULL) {
12777                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12778                         }
12779                         c = SvUV(*c_p);
12780
12781                         /* /aa doesn't allow folds between ASCII and non-; /l
12782                          * doesn't allow them between above and below 256 */
12783                         if ((ASCII_FOLD_RESTRICTED
12784                                   && (isASCII(c) != isASCII(j)))
12785                             || (LOC && ((c < 256) != (j < 256))))
12786                         {
12787                             continue;
12788                         }
12789
12790                         /* Folds involving non-ascii Latin1 characters
12791                          * under /d are added to a separate list */
12792                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12793                         {
12794                             cp_list = add_cp_to_invlist(cp_list, c);
12795                         }
12796                         else {
12797                           depends_list = add_cp_to_invlist(depends_list, c);
12798                         }
12799                     }
12800                 }
12801             }
12802         }
12803         SvREFCNT_dec(fold_intersection);
12804     }
12805
12806     /* And combine the result (if any) with any inversion list from posix
12807      * classes.  The lists are kept separate up to now because we don't want to
12808      * fold the classes (folding of those is automatically handled by the swash
12809      * fetching code) */
12810     if (posixes) {
12811         if (! DEPENDS_SEMANTICS) {
12812             if (cp_list) {
12813                 _invlist_union(cp_list, posixes, &cp_list);
12814                 SvREFCNT_dec(posixes);
12815             }
12816             else {
12817                 cp_list = posixes;
12818             }
12819         }
12820         else {
12821             /* Under /d, we put into a separate list the Latin1 things that
12822              * match only when the target string is utf8 */
12823             SV* nonascii_but_latin1_properties = NULL;
12824             _invlist_intersection(posixes, PL_Latin1,
12825                                   &nonascii_but_latin1_properties);
12826             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12827                               &nonascii_but_latin1_properties);
12828             _invlist_subtract(posixes, nonascii_but_latin1_properties,
12829                               &posixes);
12830             if (cp_list) {
12831                 _invlist_union(cp_list, posixes, &cp_list);
12832                 SvREFCNT_dec(posixes);
12833             }
12834             else {
12835                 cp_list = posixes;
12836             }
12837
12838             if (depends_list) {
12839                 _invlist_union(depends_list, nonascii_but_latin1_properties,
12840                                &depends_list);
12841                 SvREFCNT_dec(nonascii_but_latin1_properties);
12842             }
12843             else {
12844                 depends_list = nonascii_but_latin1_properties;
12845             }
12846         }
12847     }
12848
12849     /* And combine the result (if any) with any inversion list from properties.
12850      * The lists are kept separate up to now so that we can distinguish the two
12851      * in regards to matching above-Unicode.  A run-time warning is generated
12852      * if a Unicode property is matched against a non-Unicode code point. But,
12853      * we allow user-defined properties to match anything, without any warning,
12854      * and we also suppress the warning if there is a portion of the character
12855      * class that isn't a Unicode property, and which matches above Unicode, \W
12856      * or [\x{110000}] for example.
12857      * (Note that in this case, unlike the Posix one above, there is no
12858      * <depends_list>, because having a Unicode property forces Unicode
12859      * semantics */
12860     if (properties) {
12861         bool warn_super = ! has_user_defined_property;
12862         if (cp_list) {
12863
12864             /* If it matters to the final outcome, see if a non-property
12865              * component of the class matches above Unicode.  If so, the
12866              * warning gets suppressed.  This is true even if just a single
12867              * such code point is specified, as though not strictly correct if
12868              * another such code point is matched against, the fact that they
12869              * are using above-Unicode code points indicates they should know
12870              * the issues involved */
12871             if (warn_super) {
12872                 bool non_prop_matches_above_Unicode =
12873                             runtime_posix_matches_above_Unicode
12874                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12875                 if (invert) {
12876                     non_prop_matches_above_Unicode =
12877                                             !  non_prop_matches_above_Unicode;
12878                 }
12879                 warn_super = ! non_prop_matches_above_Unicode;
12880             }
12881
12882             _invlist_union(properties, cp_list, &cp_list);
12883             SvREFCNT_dec(properties);
12884         }
12885         else {
12886             cp_list = properties;
12887         }
12888
12889         if (warn_super) {
12890             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12891         }
12892     }
12893
12894     /* Here, we have calculated what code points should be in the character
12895      * class.
12896      *
12897      * Now we can see about various optimizations.  Fold calculation (which we
12898      * did above) needs to take place before inversion.  Otherwise /[^k]/i
12899      * would invert to include K, which under /i would match k, which it
12900      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
12901      * folded until runtime */
12902
12903     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12904      * at compile time.  Besides not inverting folded locale now, we can't
12905      * invert if there are things such as \w, which aren't known until runtime
12906      * */
12907     if (invert
12908         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12909         && ! depends_list
12910         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12911     {
12912         _invlist_invert(cp_list);
12913
12914         /* Any swash can't be used as-is, because we've inverted things */
12915         if (swash) {
12916             SvREFCNT_dec(swash);
12917             swash = NULL;
12918         }
12919
12920         /* Clear the invert flag since have just done it here */
12921         invert = FALSE;
12922     }
12923
12924     /* If we didn't do folding, it's because some information isn't available
12925      * until runtime; set the run-time fold flag for these.  (We don't have to
12926      * worry about properties folding, as that is taken care of by the swash
12927      * fetching) */
12928     if (FOLD && LOC)
12929     {
12930        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12931     }
12932
12933     /* Some character classes are equivalent to other nodes.  Such nodes take
12934      * up less room and generally fewer operations to execute than ANYOF nodes.
12935      * Above, we checked for and optimized into some such equivalents for
12936      * certain common classes that are easy to test.  Getting to this point in
12937      * the code means that the class didn't get optimized there.  Since this
12938      * code is only executed in Pass 2, it is too late to save space--it has
12939      * been allocated in Pass 1, and currently isn't given back.  But turning
12940      * things into an EXACTish node can allow the optimizer to join it to any
12941      * adjacent such nodes.  And if the class is equivalent to things like /./,
12942      * expensive run-time swashes can be avoided.  Now that we have more
12943      * complete information, we can find things necessarily missed by the
12944      * earlier code.  I (khw) am not sure how much to look for here.  It would
12945      * be easy, but perhaps too slow, to check any candidates against all the
12946      * node types they could possibly match using _invlistEQ(). */
12947
12948     if (cp_list
12949         && ! invert
12950         && ! depends_list
12951         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12952         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12953     {
12954        UV start, end;
12955        U8 op = END;  /* The optimzation node-type */
12956         const char * cur_parse= RExC_parse;
12957
12958        invlist_iterinit(cp_list);
12959        if (! invlist_iternext(cp_list, &start, &end)) {
12960
12961             /* Here, the list is empty.  This happens, for example, when a
12962              * Unicode property is the only thing in the character class, and
12963              * it doesn't match anything.  (perluniprops.pod notes such
12964              * properties) */
12965             op = OPFAIL;
12966             *flagp |= HASWIDTH|SIMPLE;
12967         }
12968         else if (start == end) {    /* The range is a single code point */
12969             if (! invlist_iternext(cp_list, &start, &end)
12970
12971                     /* Don't do this optimization if it would require changing
12972                      * the pattern to UTF-8 */
12973                 && (start < 256 || UTF))
12974             {
12975                 /* Here, the list contains a single code point.  Can optimize
12976                  * into an EXACT node */
12977
12978                 value = start;
12979
12980                 if (! FOLD) {
12981                     op = EXACT;
12982                 }
12983                 else if (LOC) {
12984
12985                     /* A locale node under folding with one code point can be
12986                      * an EXACTFL, as its fold won't be calculated until
12987                      * runtime */
12988                     op = EXACTFL;
12989                 }
12990                 else {
12991
12992                     /* Here, we are generally folding, but there is only one
12993                      * code point to match.  If we have to, we use an EXACT
12994                      * node, but it would be better for joining with adjacent
12995                      * nodes in the optimization pass if we used the same
12996                      * EXACTFish node that any such are likely to be.  We can
12997                      * do this iff the code point doesn't participate in any
12998                      * folds.  For example, an EXACTF of a colon is the same as
12999                      * an EXACT one, since nothing folds to or from a colon. */
13000                     if (value < 256) {
13001                         if (IS_IN_SOME_FOLD_L1(value)) {
13002                             op = EXACT;
13003                         }
13004                     }
13005                     else {
13006                         if (! PL_utf8_foldable) {
13007                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13008                                                 &PL_sv_undef, 1, 0);
13009                             PL_utf8_foldable = _get_swash_invlist(swash);
13010                             SvREFCNT_dec(swash);
13011                         }
13012                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13013                             op = EXACT;
13014                         }
13015                     }
13016
13017                     /* If we haven't found the node type, above, it means we
13018                      * can use the prevailing one */
13019                     if (op == END) {
13020                         op = compute_EXACTish(pRExC_state);
13021                     }
13022                 }
13023             }
13024         }
13025         else if (start == 0) {
13026             if (end == UV_MAX) {
13027                 op = SANY;
13028                 *flagp |= HASWIDTH|SIMPLE;
13029                 RExC_naughty++;
13030             }
13031             else if (end == '\n' - 1
13032                     && invlist_iternext(cp_list, &start, &end)
13033                     && start == '\n' + 1 && end == UV_MAX)
13034             {
13035                 op = REG_ANY;
13036                 *flagp |= HASWIDTH|SIMPLE;
13037                 RExC_naughty++;
13038             }
13039         }
13040
13041         if (op != END) {
13042             RExC_parse = (char *)orig_parse;
13043             RExC_emit = (regnode *)orig_emit;
13044
13045             ret = reg_node(pRExC_state, op);
13046
13047             RExC_parse = (char *)cur_parse;
13048
13049             if (PL_regkind[op] == EXACT) {
13050                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13051             }
13052
13053             SvREFCNT_dec(listsv);
13054             return ret;
13055         }
13056     }
13057
13058     /* Here, <cp_list> contains all the code points we can determine at
13059      * compile time that match under all conditions.  Go through it, and
13060      * for things that belong in the bitmap, put them there, and delete from
13061      * <cp_list>.  While we are at it, see if everything above 255 is in the
13062      * list, and if so, set a flag to speed up execution */
13063     ANYOF_BITMAP_ZERO(ret);
13064     if (cp_list) {
13065
13066         /* This gets set if we actually need to modify things */
13067         bool change_invlist = FALSE;
13068
13069         UV start, end;
13070
13071         /* Start looking through <cp_list> */
13072         invlist_iterinit(cp_list);
13073         while (invlist_iternext(cp_list, &start, &end)) {
13074             UV high;
13075             int i;
13076
13077             if (end == UV_MAX && start <= 256) {
13078                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13079             }
13080
13081             /* Quit if are above what we should change */
13082             if (start > 255) {
13083                 break;
13084             }
13085
13086             change_invlist = TRUE;
13087
13088             /* Set all the bits in the range, up to the max that we are doing */
13089             high = (end < 255) ? end : 255;
13090             for (i = start; i <= (int) high; i++) {
13091                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13092                     ANYOF_BITMAP_SET(ret, i);
13093                     prevvalue = value;
13094                     value = i;
13095                 }
13096             }
13097         }
13098
13099         /* Done with loop; remove any code points that are in the bitmap from
13100          * <cp_list> */
13101         if (change_invlist) {
13102             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13103         }
13104
13105         /* If have completely emptied it, remove it completely */
13106         if (_invlist_len(cp_list) == 0) {
13107             SvREFCNT_dec(cp_list);
13108             cp_list = NULL;
13109         }
13110     }
13111
13112     if (invert) {
13113         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13114     }
13115
13116     /* Here, the bitmap has been populated with all the Latin1 code points that
13117      * always match.  Can now add to the overall list those that match only
13118      * when the target string is UTF-8 (<depends_list>). */
13119     if (depends_list) {
13120         if (cp_list) {
13121             _invlist_union(cp_list, depends_list, &cp_list);
13122             SvREFCNT_dec(depends_list);
13123         }
13124         else {
13125             cp_list = depends_list;
13126         }
13127     }
13128
13129     /* If there is a swash and more than one element, we can't use the swash in
13130      * the optimization below. */
13131     if (swash && element_count > 1) {
13132         SvREFCNT_dec(swash);
13133         swash = NULL;
13134     }
13135
13136     if (! cp_list
13137         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13138     {
13139         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13140         SvREFCNT_dec(listsv);
13141     }
13142     else {
13143         /* av[0] stores the character class description in its textual form:
13144          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13145          *       appropriate swash, and is also useful for dumping the regnode.
13146          * av[1] if NULL, is a placeholder to later contain the swash computed
13147          *       from av[0].  But if no further computation need be done, the
13148          *       swash is stored there now.
13149          * av[2] stores the cp_list inversion list for use in addition or
13150          *       instead of av[0]; used only if av[1] is NULL
13151          * av[3] is set if any component of the class is from a user-defined
13152          *       property; used only if av[1] is NULL */
13153         AV * const av = newAV();
13154         SV *rv;
13155
13156         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13157                         ? listsv
13158                         : &PL_sv_undef);
13159         if (swash) {
13160             av_store(av, 1, swash);
13161             SvREFCNT_dec(cp_list);
13162         }
13163         else {
13164             av_store(av, 1, NULL);
13165             if (cp_list) {
13166                 av_store(av, 2, cp_list);
13167                 av_store(av, 3, newSVuv(has_user_defined_property));
13168             }
13169         }
13170
13171         rv = newRV_noinc(MUTABLE_SV(av));
13172         n = add_data(pRExC_state, 1, "s");
13173         RExC_rxi->data->data[n] = (void*)rv;
13174         ARG_SET(ret, n);
13175     }
13176
13177     *flagp |= HASWIDTH|SIMPLE;
13178     return ret;
13179 }
13180 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13181
13182
13183 /* reg_skipcomment()
13184
13185    Absorbs an /x style # comments from the input stream.
13186    Returns true if there is more text remaining in the stream.
13187    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13188    terminates the pattern without including a newline.
13189
13190    Note its the callers responsibility to ensure that we are
13191    actually in /x mode
13192
13193 */
13194
13195 STATIC bool
13196 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13197 {
13198     bool ended = 0;
13199
13200     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13201
13202     while (RExC_parse < RExC_end)
13203         if (*RExC_parse++ == '\n') {
13204             ended = 1;
13205             break;
13206         }
13207     if (!ended) {
13208         /* we ran off the end of the pattern without ending
13209            the comment, so we have to add an \n when wrapping */
13210         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13211         return 0;
13212     } else
13213         return 1;
13214 }
13215
13216 /* nextchar()
13217
13218    Advances the parse position, and optionally absorbs
13219    "whitespace" from the inputstream.
13220
13221    Without /x "whitespace" means (?#...) style comments only,
13222    with /x this means (?#...) and # comments and whitespace proper.
13223
13224    Returns the RExC_parse point from BEFORE the scan occurs.
13225
13226    This is the /x friendly way of saying RExC_parse++.
13227 */
13228
13229 STATIC char*
13230 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13231 {
13232     char* const retval = RExC_parse++;
13233
13234     PERL_ARGS_ASSERT_NEXTCHAR;
13235
13236     for (;;) {
13237         if (RExC_end - RExC_parse >= 3
13238             && *RExC_parse == '('
13239             && RExC_parse[1] == '?'
13240             && RExC_parse[2] == '#')
13241         {
13242             while (*RExC_parse != ')') {
13243                 if (RExC_parse == RExC_end)
13244                     FAIL("Sequence (?#... not terminated");
13245                 RExC_parse++;
13246             }
13247             RExC_parse++;
13248             continue;
13249         }
13250         if (RExC_flags & RXf_PMf_EXTENDED) {
13251             if (isSPACE(*RExC_parse)) {
13252                 RExC_parse++;
13253                 continue;
13254             }
13255             else if (*RExC_parse == '#') {
13256                 if ( reg_skipcomment( pRExC_state ) )
13257                     continue;
13258             }
13259         }
13260         return retval;
13261     }
13262 }
13263
13264 /*
13265 - reg_node - emit a node
13266 */
13267 STATIC regnode *                        /* Location. */
13268 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13269 {
13270     dVAR;
13271     regnode *ptr;
13272     regnode * const ret = RExC_emit;
13273     GET_RE_DEBUG_FLAGS_DECL;
13274
13275     PERL_ARGS_ASSERT_REG_NODE;
13276
13277     if (SIZE_ONLY) {
13278         SIZE_ALIGN(RExC_size);
13279         RExC_size += 1;
13280         return(ret);
13281     }
13282     if (RExC_emit >= RExC_emit_bound)
13283         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13284                    op, RExC_emit, RExC_emit_bound);
13285
13286     NODE_ALIGN_FILL(ret);
13287     ptr = ret;
13288     FILL_ADVANCE_NODE(ptr, op);
13289 #ifdef RE_TRACK_PATTERN_OFFSETS
13290     if (RExC_offsets) {         /* MJD */
13291         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13292               "reg_node", __LINE__,
13293               PL_reg_name[op],
13294               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13295                 ? "Overwriting end of array!\n" : "OK",
13296               (UV)(RExC_emit - RExC_emit_start),
13297               (UV)(RExC_parse - RExC_start),
13298               (UV)RExC_offsets[0]));
13299         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13300     }
13301 #endif
13302     RExC_emit = ptr;
13303     return(ret);
13304 }
13305
13306 /*
13307 - reganode - emit a node with an argument
13308 */
13309 STATIC regnode *                        /* Location. */
13310 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13311 {
13312     dVAR;
13313     regnode *ptr;
13314     regnode * const ret = RExC_emit;
13315     GET_RE_DEBUG_FLAGS_DECL;
13316
13317     PERL_ARGS_ASSERT_REGANODE;
13318
13319     if (SIZE_ONLY) {
13320         SIZE_ALIGN(RExC_size);
13321         RExC_size += 2;
13322         /*
13323            We can't do this:
13324
13325            assert(2==regarglen[op]+1);
13326
13327            Anything larger than this has to allocate the extra amount.
13328            If we changed this to be:
13329
13330            RExC_size += (1 + regarglen[op]);
13331
13332            then it wouldn't matter. Its not clear what side effect
13333            might come from that so its not done so far.
13334            -- dmq
13335         */
13336         return(ret);
13337     }
13338     if (RExC_emit >= RExC_emit_bound)
13339         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13340                    op, RExC_emit, RExC_emit_bound);
13341
13342     NODE_ALIGN_FILL(ret);
13343     ptr = ret;
13344     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13345 #ifdef RE_TRACK_PATTERN_OFFSETS
13346     if (RExC_offsets) {         /* MJD */
13347         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13348               "reganode",
13349               __LINE__,
13350               PL_reg_name[op],
13351               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13352               "Overwriting end of array!\n" : "OK",
13353               (UV)(RExC_emit - RExC_emit_start),
13354               (UV)(RExC_parse - RExC_start),
13355               (UV)RExC_offsets[0]));
13356         Set_Cur_Node_Offset;
13357     }
13358 #endif
13359     RExC_emit = ptr;
13360     return(ret);
13361 }
13362
13363 /*
13364 - reguni - emit (if appropriate) a Unicode character
13365 */
13366 STATIC STRLEN
13367 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13368 {
13369     dVAR;
13370
13371     PERL_ARGS_ASSERT_REGUNI;
13372
13373     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13374 }
13375
13376 /*
13377 - reginsert - insert an operator in front of already-emitted operand
13378 *
13379 * Means relocating the operand.
13380 */
13381 STATIC void
13382 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13383 {
13384     dVAR;
13385     regnode *src;
13386     regnode *dst;
13387     regnode *place;
13388     const int offset = regarglen[(U8)op];
13389     const int size = NODE_STEP_REGNODE + offset;
13390     GET_RE_DEBUG_FLAGS_DECL;
13391
13392     PERL_ARGS_ASSERT_REGINSERT;
13393     PERL_UNUSED_ARG(depth);
13394 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13395     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13396     if (SIZE_ONLY) {
13397         RExC_size += size;
13398         return;
13399     }
13400
13401     src = RExC_emit;
13402     RExC_emit += size;
13403     dst = RExC_emit;
13404     if (RExC_open_parens) {
13405         int paren;
13406         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13407         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13408             if ( RExC_open_parens[paren] >= opnd ) {
13409                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13410                 RExC_open_parens[paren] += size;
13411             } else {
13412                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13413             }
13414             if ( RExC_close_parens[paren] >= opnd ) {
13415                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13416                 RExC_close_parens[paren] += size;
13417             } else {
13418                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13419             }
13420         }
13421     }
13422
13423     while (src > opnd) {
13424         StructCopy(--src, --dst, regnode);
13425 #ifdef RE_TRACK_PATTERN_OFFSETS
13426         if (RExC_offsets) {     /* MJD 20010112 */
13427             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13428                   "reg_insert",
13429                   __LINE__,
13430                   PL_reg_name[op],
13431                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13432                     ? "Overwriting end of array!\n" : "OK",
13433                   (UV)(src - RExC_emit_start),
13434                   (UV)(dst - RExC_emit_start),
13435                   (UV)RExC_offsets[0]));
13436             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13437             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13438         }
13439 #endif
13440     }
13441
13442
13443     place = opnd;               /* Op node, where operand used to be. */
13444 #ifdef RE_TRACK_PATTERN_OFFSETS
13445     if (RExC_offsets) {         /* MJD */
13446         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13447               "reginsert",
13448               __LINE__,
13449               PL_reg_name[op],
13450               (UV)(place - RExC_emit_start) > RExC_offsets[0]
13451               ? "Overwriting end of array!\n" : "OK",
13452               (UV)(place - RExC_emit_start),
13453               (UV)(RExC_parse - RExC_start),
13454               (UV)RExC_offsets[0]));
13455         Set_Node_Offset(place, RExC_parse);
13456         Set_Node_Length(place, 1);
13457     }
13458 #endif
13459     src = NEXTOPER(place);
13460     FILL_ADVANCE_NODE(place, op);
13461     Zero(src, offset, regnode);
13462 }
13463
13464 /*
13465 - regtail - set the next-pointer at the end of a node chain of p to val.
13466 - SEE ALSO: regtail_study
13467 */
13468 /* TODO: All three parms should be const */
13469 STATIC void
13470 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13471 {
13472     dVAR;
13473     regnode *scan;
13474     GET_RE_DEBUG_FLAGS_DECL;
13475
13476     PERL_ARGS_ASSERT_REGTAIL;
13477 #ifndef DEBUGGING
13478     PERL_UNUSED_ARG(depth);
13479 #endif
13480
13481     if (SIZE_ONLY)
13482         return;
13483
13484     /* Find last node. */
13485     scan = p;
13486     for (;;) {
13487         regnode * const temp = regnext(scan);
13488         DEBUG_PARSE_r({
13489             SV * const mysv=sv_newmortal();
13490             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13491             regprop(RExC_rx, mysv, scan);
13492             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13493                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13494                     (temp == NULL ? "->" : ""),
13495                     (temp == NULL ? PL_reg_name[OP(val)] : "")
13496             );
13497         });
13498         if (temp == NULL)
13499             break;
13500         scan = temp;
13501     }
13502
13503     if (reg_off_by_arg[OP(scan)]) {
13504         ARG_SET(scan, val - scan);
13505     }
13506     else {
13507         NEXT_OFF(scan) = val - scan;
13508     }
13509 }
13510
13511 #ifdef DEBUGGING
13512 /*
13513 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13514 - Look for optimizable sequences at the same time.
13515 - currently only looks for EXACT chains.
13516
13517 This is experimental code. The idea is to use this routine to perform
13518 in place optimizations on branches and groups as they are constructed,
13519 with the long term intention of removing optimization from study_chunk so
13520 that it is purely analytical.
13521
13522 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13523 to control which is which.
13524
13525 */
13526 /* TODO: All four parms should be const */
13527
13528 STATIC U8
13529 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13530 {
13531     dVAR;
13532     regnode *scan;
13533     U8 exact = PSEUDO;
13534 #ifdef EXPERIMENTAL_INPLACESCAN
13535     I32 min = 0;
13536 #endif
13537     GET_RE_DEBUG_FLAGS_DECL;
13538
13539     PERL_ARGS_ASSERT_REGTAIL_STUDY;
13540
13541
13542     if (SIZE_ONLY)
13543         return exact;
13544
13545     /* Find last node. */
13546
13547     scan = p;
13548     for (;;) {
13549         regnode * const temp = regnext(scan);
13550 #ifdef EXPERIMENTAL_INPLACESCAN
13551         if (PL_regkind[OP(scan)] == EXACT) {
13552             bool has_exactf_sharp_s;    /* Unexamined in this routine */
13553             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13554                 return EXACT;
13555         }
13556 #endif
13557         if ( exact ) {
13558             switch (OP(scan)) {
13559                 case EXACT:
13560                 case EXACTF:
13561                 case EXACTFA:
13562                 case EXACTFU:
13563                 case EXACTFU_SS:
13564                 case EXACTFU_TRICKYFOLD:
13565                 case EXACTFL:
13566                         if( exact == PSEUDO )
13567                             exact= OP(scan);
13568                         else if ( exact != OP(scan) )
13569                             exact= 0;
13570                 case NOTHING:
13571                     break;
13572                 default:
13573                     exact= 0;
13574             }
13575         }
13576         DEBUG_PARSE_r({
13577             SV * const mysv=sv_newmortal();
13578             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13579             regprop(RExC_rx, mysv, scan);
13580             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13581                 SvPV_nolen_const(mysv),
13582                 REG_NODE_NUM(scan),
13583                 PL_reg_name[exact]);
13584         });
13585         if (temp == NULL)
13586             break;
13587         scan = temp;
13588     }
13589     DEBUG_PARSE_r({
13590         SV * const mysv_val=sv_newmortal();
13591         DEBUG_PARSE_MSG("");
13592         regprop(RExC_rx, mysv_val, val);
13593         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13594                       SvPV_nolen_const(mysv_val),
13595                       (IV)REG_NODE_NUM(val),
13596                       (IV)(val - scan)
13597         );
13598     });
13599     if (reg_off_by_arg[OP(scan)]) {
13600         ARG_SET(scan, val - scan);
13601     }
13602     else {
13603         NEXT_OFF(scan) = val - scan;
13604     }
13605
13606     return exact;
13607 }
13608 #endif
13609
13610 /*
13611  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13612  */
13613 #ifdef DEBUGGING
13614 static void
13615 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13616 {
13617     int bit;
13618     int set=0;
13619     regex_charset cs;
13620
13621     for (bit=0; bit<32; bit++) {
13622         if (flags & (1<<bit)) {
13623             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
13624                 continue;
13625             }
13626             if (!set++ && lead)
13627                 PerlIO_printf(Perl_debug_log, "%s",lead);
13628             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13629         }
13630     }
13631     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13632             if (!set++ && lead) {
13633                 PerlIO_printf(Perl_debug_log, "%s",lead);
13634             }
13635             switch (cs) {
13636                 case REGEX_UNICODE_CHARSET:
13637                     PerlIO_printf(Perl_debug_log, "UNICODE");
13638                     break;
13639                 case REGEX_LOCALE_CHARSET:
13640                     PerlIO_printf(Perl_debug_log, "LOCALE");
13641                     break;
13642                 case REGEX_ASCII_RESTRICTED_CHARSET:
13643                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13644                     break;
13645                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13646                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13647                     break;
13648                 default:
13649                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13650                     break;
13651             }
13652     }
13653     if (lead)  {
13654         if (set)
13655             PerlIO_printf(Perl_debug_log, "\n");
13656         else
13657             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13658     }
13659 }
13660 #endif
13661
13662 void
13663 Perl_regdump(pTHX_ const regexp *r)
13664 {
13665 #ifdef DEBUGGING
13666     dVAR;
13667     SV * const sv = sv_newmortal();
13668     SV *dsv= sv_newmortal();
13669     RXi_GET_DECL(r,ri);
13670     GET_RE_DEBUG_FLAGS_DECL;
13671
13672     PERL_ARGS_ASSERT_REGDUMP;
13673
13674     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13675
13676     /* Header fields of interest. */
13677     if (r->anchored_substr) {
13678         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13679             RE_SV_DUMPLEN(r->anchored_substr), 30);
13680         PerlIO_printf(Perl_debug_log,
13681                       "anchored %s%s at %"IVdf" ",
13682                       s, RE_SV_TAIL(r->anchored_substr),
13683                       (IV)r->anchored_offset);
13684     } else if (r->anchored_utf8) {
13685         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13686             RE_SV_DUMPLEN(r->anchored_utf8), 30);
13687         PerlIO_printf(Perl_debug_log,
13688                       "anchored utf8 %s%s at %"IVdf" ",
13689                       s, RE_SV_TAIL(r->anchored_utf8),
13690                       (IV)r->anchored_offset);
13691     }
13692     if (r->float_substr) {
13693         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13694             RE_SV_DUMPLEN(r->float_substr), 30);
13695         PerlIO_printf(Perl_debug_log,
13696                       "floating %s%s at %"IVdf"..%"UVuf" ",
13697                       s, RE_SV_TAIL(r->float_substr),
13698                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13699     } else if (r->float_utf8) {
13700         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13701             RE_SV_DUMPLEN(r->float_utf8), 30);
13702         PerlIO_printf(Perl_debug_log,
13703                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13704                       s, RE_SV_TAIL(r->float_utf8),
13705                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13706     }
13707     if (r->check_substr || r->check_utf8)
13708         PerlIO_printf(Perl_debug_log,
13709                       (const char *)
13710                       (r->check_substr == r->float_substr
13711                        && r->check_utf8 == r->float_utf8
13712                        ? "(checking floating" : "(checking anchored"));
13713     if (r->extflags & RXf_NOSCAN)
13714         PerlIO_printf(Perl_debug_log, " noscan");
13715     if (r->extflags & RXf_CHECK_ALL)
13716         PerlIO_printf(Perl_debug_log, " isall");
13717     if (r->check_substr || r->check_utf8)
13718         PerlIO_printf(Perl_debug_log, ") ");
13719
13720     if (ri->regstclass) {
13721         regprop(r, sv, ri->regstclass);
13722         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13723     }
13724     if (r->extflags & RXf_ANCH) {
13725         PerlIO_printf(Perl_debug_log, "anchored");
13726         if (r->extflags & RXf_ANCH_BOL)
13727             PerlIO_printf(Perl_debug_log, "(BOL)");
13728         if (r->extflags & RXf_ANCH_MBOL)
13729             PerlIO_printf(Perl_debug_log, "(MBOL)");
13730         if (r->extflags & RXf_ANCH_SBOL)
13731             PerlIO_printf(Perl_debug_log, "(SBOL)");
13732         if (r->extflags & RXf_ANCH_GPOS)
13733             PerlIO_printf(Perl_debug_log, "(GPOS)");
13734         PerlIO_putc(Perl_debug_log, ' ');
13735     }
13736     if (r->extflags & RXf_GPOS_SEEN)
13737         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13738     if (r->intflags & PREGf_SKIP)
13739         PerlIO_printf(Perl_debug_log, "plus ");
13740     if (r->intflags & PREGf_IMPLICIT)
13741         PerlIO_printf(Perl_debug_log, "implicit ");
13742     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13743     if (r->extflags & RXf_EVAL_SEEN)
13744         PerlIO_printf(Perl_debug_log, "with eval ");
13745     PerlIO_printf(Perl_debug_log, "\n");
13746     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13747 #else
13748     PERL_ARGS_ASSERT_REGDUMP;
13749     PERL_UNUSED_CONTEXT;
13750     PERL_UNUSED_ARG(r);
13751 #endif  /* DEBUGGING */
13752 }
13753
13754 /*
13755 - regprop - printable representation of opcode
13756 */
13757 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13758 STMT_START { \
13759         if (do_sep) {                           \
13760             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13761             if (flags & ANYOF_INVERT)           \
13762                 /*make sure the invert info is in each */ \
13763                 sv_catpvs(sv, "^");             \
13764             do_sep = 0;                         \
13765         }                                       \
13766 } STMT_END
13767
13768 void
13769 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13770 {
13771 #ifdef DEBUGGING
13772     dVAR;
13773     int k;
13774
13775     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13776     static const char * const anyofs[] = {
13777         "\\w",
13778         "\\W",
13779         "\\s",
13780         "\\S",
13781         "\\d",
13782         "\\D",
13783         "[:alnum:]",
13784         "[:^alnum:]",
13785         "[:alpha:]",
13786         "[:^alpha:]",
13787         "[:ascii:]",
13788         "[:^ascii:]",
13789         "[:cntrl:]",
13790         "[:^cntrl:]",
13791         "[:graph:]",
13792         "[:^graph:]",
13793         "[:lower:]",
13794         "[:^lower:]",
13795         "[:print:]",
13796         "[:^print:]",
13797         "[:punct:]",
13798         "[:^punct:]",
13799         "[:upper:]",
13800         "[:^upper:]",
13801         "[:xdigit:]",
13802         "[:^xdigit:]",
13803         "[:space:]",
13804         "[:^space:]",
13805         "[:blank:]",
13806         "[:^blank:]"
13807     };
13808     RXi_GET_DECL(prog,progi);
13809     GET_RE_DEBUG_FLAGS_DECL;
13810
13811     PERL_ARGS_ASSERT_REGPROP;
13812
13813     sv_setpvs(sv, "");
13814
13815     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
13816         /* It would be nice to FAIL() here, but this may be called from
13817            regexec.c, and it would be hard to supply pRExC_state. */
13818         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13819     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13820
13821     k = PL_regkind[OP(o)];
13822
13823     if (k == EXACT) {
13824         sv_catpvs(sv, " ");
13825         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13826          * is a crude hack but it may be the best for now since
13827          * we have no flag "this EXACTish node was UTF-8"
13828          * --jhi */
13829         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13830                   PERL_PV_ESCAPE_UNI_DETECT |
13831                   PERL_PV_ESCAPE_NONASCII   |
13832                   PERL_PV_PRETTY_ELLIPSES   |
13833                   PERL_PV_PRETTY_LTGT       |
13834                   PERL_PV_PRETTY_NOCLEAR
13835                   );
13836     } else if (k == TRIE) {
13837         /* print the details of the trie in dumpuntil instead, as
13838          * progi->data isn't available here */
13839         const char op = OP(o);
13840         const U32 n = ARG(o);
13841         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13842                (reg_ac_data *)progi->data->data[n] :
13843                NULL;
13844         const reg_trie_data * const trie
13845             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13846
13847         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13848         DEBUG_TRIE_COMPILE_r(
13849             Perl_sv_catpvf(aTHX_ sv,
13850                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13851                 (UV)trie->startstate,
13852                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13853                 (UV)trie->wordcount,
13854                 (UV)trie->minlen,
13855                 (UV)trie->maxlen,
13856                 (UV)TRIE_CHARCOUNT(trie),
13857                 (UV)trie->uniquecharcount
13858             )
13859         );
13860         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13861             int i;
13862             int rangestart = -1;
13863             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13864             sv_catpvs(sv, "[");
13865             for (i = 0; i <= 256; i++) {
13866                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13867                     if (rangestart == -1)
13868                         rangestart = i;
13869                 } else if (rangestart != -1) {
13870                     if (i <= rangestart + 3)
13871                         for (; rangestart < i; rangestart++)
13872                             put_byte(sv, rangestart);
13873                     else {
13874                         put_byte(sv, rangestart);
13875                         sv_catpvs(sv, "-");
13876                         put_byte(sv, i - 1);
13877                     }
13878                     rangestart = -1;
13879                 }
13880             }
13881             sv_catpvs(sv, "]");
13882         }
13883
13884     } else if (k == CURLY) {
13885         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13886             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13887         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13888     }
13889     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
13890         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13891     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13892         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
13893         if ( RXp_PAREN_NAMES(prog) ) {
13894             if ( k != REF || (OP(o) < NREF)) {
13895                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13896                 SV **name= av_fetch(list, ARG(o), 0 );
13897                 if (name)
13898                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13899             }
13900             else {
13901                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13902                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13903                 I32 *nums=(I32*)SvPVX(sv_dat);
13904                 SV **name= av_fetch(list, nums[0], 0 );
13905                 I32 n;
13906                 if (name) {
13907                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
13908                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13909                                     (n ? "," : ""), (IV)nums[n]);
13910                     }
13911                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13912                 }
13913             }
13914         }
13915     } else if (k == GOSUB)
13916         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13917     else if (k == VERB) {
13918         if (!o->flags)
13919             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13920                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13921     } else if (k == LOGICAL)
13922         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
13923     else if (k == ANYOF) {
13924         int i, rangestart = -1;
13925         const U8 flags = ANYOF_FLAGS(o);
13926         int do_sep = 0;
13927
13928
13929         if (flags & ANYOF_LOCALE)
13930             sv_catpvs(sv, "{loc}");
13931         if (flags & ANYOF_LOC_FOLD)
13932             sv_catpvs(sv, "{i}");
13933         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13934         if (flags & ANYOF_INVERT)
13935             sv_catpvs(sv, "^");
13936
13937         /* output what the standard cp 0-255 bitmap matches */
13938         for (i = 0; i <= 256; i++) {
13939             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13940                 if (rangestart == -1)
13941                     rangestart = i;
13942             } else if (rangestart != -1) {
13943                 if (i <= rangestart + 3)
13944                     for (; rangestart < i; rangestart++)
13945                         put_byte(sv, rangestart);
13946                 else {
13947                     put_byte(sv, rangestart);
13948                     sv_catpvs(sv, "-");
13949                     put_byte(sv, i - 1);
13950                 }
13951                 do_sep = 1;
13952                 rangestart = -1;
13953             }
13954         }
13955
13956         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13957         /* output any special charclass tests (used entirely under use locale) */
13958         if (ANYOF_CLASS_TEST_ANY_SET(o))
13959             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13960                 if (ANYOF_CLASS_TEST(o,i)) {
13961                     sv_catpv(sv, anyofs[i]);
13962                     do_sep = 1;
13963                 }
13964
13965         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13966
13967         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13968             sv_catpvs(sv, "{non-utf8-latin1-all}");
13969         }
13970
13971         /* output information about the unicode matching */
13972         if (flags & ANYOF_UNICODE_ALL)
13973             sv_catpvs(sv, "{unicode_all}");
13974         else if (ANYOF_NONBITMAP(o))
13975             sv_catpvs(sv, "{unicode}");
13976         if (flags & ANYOF_NONBITMAP_NON_UTF8)
13977             sv_catpvs(sv, "{outside bitmap}");
13978
13979         if (ANYOF_NONBITMAP(o)) {
13980             SV *lv; /* Set if there is something outside the bit map */
13981             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
13982             bool byte_output = FALSE;   /* If something in the bitmap has been
13983                                            output */
13984
13985             if (lv && lv != &PL_sv_undef) {
13986                 if (sw) {
13987                     U8 s[UTF8_MAXBYTES_CASE+1];
13988
13989                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13990                         uvchr_to_utf8(s, i);
13991
13992                         if (i < 256
13993                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
13994                                                                things already
13995                                                                output as part
13996                                                                of the bitmap */
13997                             && swash_fetch(sw, s, TRUE))
13998                         {
13999                             if (rangestart == -1)
14000                                 rangestart = i;
14001                         } else if (rangestart != -1) {
14002                             byte_output = TRUE;
14003                             if (i <= rangestart + 3)
14004                                 for (; rangestart < i; rangestart++) {
14005                                     put_byte(sv, rangestart);
14006                                 }
14007                             else {
14008                                 put_byte(sv, rangestart);
14009                                 sv_catpvs(sv, "-");
14010                                 put_byte(sv, i-1);
14011                             }
14012                             rangestart = -1;
14013                         }
14014                     }
14015                 }
14016
14017                 {
14018                     char *s = savesvpv(lv);
14019                     char * const origs = s;
14020
14021                     while (*s && *s != '\n')
14022                         s++;
14023
14024                     if (*s == '\n') {
14025                         const char * const t = ++s;
14026
14027                         if (byte_output) {
14028                             sv_catpvs(sv, " ");
14029                         }
14030
14031                         while (*s) {
14032                             if (*s == '\n') {
14033
14034                                 /* Truncate very long output */
14035                                 if (s - origs > 256) {
14036                                     Perl_sv_catpvf(aTHX_ sv,
14037                                                    "%.*s...",
14038                                                    (int) (s - origs - 1),
14039                                                    t);
14040                                     goto out_dump;
14041                                 }
14042                                 *s = ' ';
14043                             }
14044                             else if (*s == '\t') {
14045                                 *s = '-';
14046                             }
14047                             s++;
14048                         }
14049                         if (s[-1] == ' ')
14050                             s[-1] = 0;
14051
14052                         sv_catpv(sv, t);
14053                     }
14054
14055                 out_dump:
14056
14057                     Safefree(origs);
14058                 }
14059                 SvREFCNT_dec(lv);
14060             }
14061         }
14062
14063         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14064     }
14065     else if (k == POSIXD) {
14066         U8 index = FLAGS(o) * 2;
14067         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14068             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14069         }
14070         else {
14071             sv_catpv(sv, anyofs[index]);
14072         }
14073     }
14074     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14075         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14076 #else
14077     PERL_UNUSED_CONTEXT;
14078     PERL_UNUSED_ARG(sv);
14079     PERL_UNUSED_ARG(o);
14080     PERL_UNUSED_ARG(prog);
14081 #endif  /* DEBUGGING */
14082 }
14083
14084 SV *
14085 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14086 {                               /* Assume that RE_INTUIT is set */
14087     dVAR;
14088     struct regexp *const prog = (struct regexp *)SvANY(r);
14089     GET_RE_DEBUG_FLAGS_DECL;
14090
14091     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14092     PERL_UNUSED_CONTEXT;
14093
14094     DEBUG_COMPILE_r(
14095         {
14096             const char * const s = SvPV_nolen_const(prog->check_substr
14097                       ? prog->check_substr : prog->check_utf8);
14098
14099             if (!PL_colorset) reginitcolors();
14100             PerlIO_printf(Perl_debug_log,
14101                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14102                       PL_colors[4],
14103                       prog->check_substr ? "" : "utf8 ",
14104                       PL_colors[5],PL_colors[0],
14105                       s,
14106                       PL_colors[1],
14107                       (strlen(s) > 60 ? "..." : ""));
14108         } );
14109
14110     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14111 }
14112
14113 /*
14114    pregfree()
14115
14116    handles refcounting and freeing the perl core regexp structure. When
14117    it is necessary to actually free the structure the first thing it
14118    does is call the 'free' method of the regexp_engine associated to
14119    the regexp, allowing the handling of the void *pprivate; member
14120    first. (This routine is not overridable by extensions, which is why
14121    the extensions free is called first.)
14122
14123    See regdupe and regdupe_internal if you change anything here.
14124 */
14125 #ifndef PERL_IN_XSUB_RE
14126 void
14127 Perl_pregfree(pTHX_ REGEXP *r)
14128 {
14129     SvREFCNT_dec(r);
14130 }
14131
14132 void
14133 Perl_pregfree2(pTHX_ REGEXP *rx)
14134 {
14135     dVAR;
14136     struct regexp *const r = (struct regexp *)SvANY(rx);
14137     GET_RE_DEBUG_FLAGS_DECL;
14138
14139     PERL_ARGS_ASSERT_PREGFREE2;
14140
14141     if (r->mother_re) {
14142         ReREFCNT_dec(r->mother_re);
14143     } else {
14144         CALLREGFREE_PVT(rx); /* free the private data */
14145         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14146     }
14147     if (r->substrs) {
14148         SvREFCNT_dec(r->anchored_substr);
14149         SvREFCNT_dec(r->anchored_utf8);
14150         SvREFCNT_dec(r->float_substr);
14151         SvREFCNT_dec(r->float_utf8);
14152         Safefree(r->substrs);
14153     }
14154     RX_MATCH_COPY_FREE(rx);
14155 #ifdef PERL_OLD_COPY_ON_WRITE
14156     SvREFCNT_dec(r->saved_copy);
14157 #endif
14158     Safefree(r->offs);
14159     SvREFCNT_dec(r->qr_anoncv);
14160 }
14161
14162 /*  reg_temp_copy()
14163
14164     This is a hacky workaround to the structural issue of match results
14165     being stored in the regexp structure which is in turn stored in
14166     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14167     could be PL_curpm in multiple contexts, and could require multiple
14168     result sets being associated with the pattern simultaneously, such
14169     as when doing a recursive match with (??{$qr})
14170
14171     The solution is to make a lightweight copy of the regexp structure
14172     when a qr// is returned from the code executed by (??{$qr}) this
14173     lightweight copy doesn't actually own any of its data except for
14174     the starp/end and the actual regexp structure itself.
14175
14176 */
14177
14178
14179 REGEXP *
14180 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14181 {
14182     struct regexp *ret;
14183     struct regexp *const r = (struct regexp *)SvANY(rx);
14184
14185     PERL_ARGS_ASSERT_REG_TEMP_COPY;
14186
14187     if (!ret_x)
14188         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14189     ret = (struct regexp *)SvANY(ret_x);
14190
14191     (void)ReREFCNT_inc(rx);
14192     /* We can take advantage of the existing "copied buffer" mechanism in SVs
14193        by pointing directly at the buffer, but flagging that the allocated
14194        space in the copy is zero. As we've just done a struct copy, it's now
14195        a case of zero-ing that, rather than copying the current length.  */
14196     SvPV_set(ret_x, RX_WRAPPED(rx));
14197     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
14198     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14199            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14200     SvLEN_set(ret_x, 0);
14201     SvSTASH_set(ret_x, NULL);
14202     SvMAGIC_set(ret_x, NULL);
14203     if (r->offs) {
14204         const I32 npar = r->nparens+1;
14205         Newx(ret->offs, npar, regexp_paren_pair);
14206         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14207     }
14208     if (r->substrs) {
14209         Newx(ret->substrs, 1, struct reg_substr_data);
14210         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14211
14212         SvREFCNT_inc_void(ret->anchored_substr);
14213         SvREFCNT_inc_void(ret->anchored_utf8);
14214         SvREFCNT_inc_void(ret->float_substr);
14215         SvREFCNT_inc_void(ret->float_utf8);
14216
14217         /* check_substr and check_utf8, if non-NULL, point to either their
14218            anchored or float namesakes, and don't hold a second reference.  */
14219     }
14220     RX_MATCH_COPIED_off(ret_x);
14221 #ifdef PERL_OLD_COPY_ON_WRITE
14222     ret->saved_copy = NULL;
14223 #endif
14224     ret->mother_re = rx;
14225     SvREFCNT_inc_void(ret->qr_anoncv);
14226
14227     return ret_x;
14228 }
14229 #endif
14230
14231 /* regfree_internal()
14232
14233    Free the private data in a regexp. This is overloadable by
14234    extensions. Perl takes care of the regexp structure in pregfree(),
14235    this covers the *pprivate pointer which technically perl doesn't
14236    know about, however of course we have to handle the
14237    regexp_internal structure when no extension is in use.
14238
14239    Note this is called before freeing anything in the regexp
14240    structure.
14241  */
14242
14243 void
14244 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14245 {
14246     dVAR;
14247     struct regexp *const r = (struct regexp *)SvANY(rx);
14248     RXi_GET_DECL(r,ri);
14249     GET_RE_DEBUG_FLAGS_DECL;
14250
14251     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14252
14253     DEBUG_COMPILE_r({
14254         if (!PL_colorset)
14255             reginitcolors();
14256         {
14257             SV *dsv= sv_newmortal();
14258             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14259                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14260             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14261                 PL_colors[4],PL_colors[5],s);
14262         }
14263     });
14264 #ifdef RE_TRACK_PATTERN_OFFSETS
14265     if (ri->u.offsets)
14266         Safefree(ri->u.offsets);             /* 20010421 MJD */
14267 #endif
14268     if (ri->code_blocks) {
14269         int n;
14270         for (n = 0; n < ri->num_code_blocks; n++)
14271             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14272         Safefree(ri->code_blocks);
14273     }
14274
14275     if (ri->data) {
14276         int n = ri->data->count;
14277
14278         while (--n >= 0) {
14279           /* If you add a ->what type here, update the comment in regcomp.h */
14280             switch (ri->data->what[n]) {
14281             case 'a':
14282             case 'r':
14283             case 's':
14284             case 'S':
14285             case 'u':
14286                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14287                 break;
14288             case 'f':
14289                 Safefree(ri->data->data[n]);
14290                 break;
14291             case 'l':
14292             case 'L':
14293                 break;
14294             case 'T':
14295                 { /* Aho Corasick add-on structure for a trie node.
14296                      Used in stclass optimization only */
14297                     U32 refcount;
14298                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14299                     OP_REFCNT_LOCK;
14300                     refcount = --aho->refcount;
14301                     OP_REFCNT_UNLOCK;
14302                     if ( !refcount ) {
14303                         PerlMemShared_free(aho->states);
14304                         PerlMemShared_free(aho->fail);
14305                          /* do this last!!!! */
14306                         PerlMemShared_free(ri->data->data[n]);
14307                         PerlMemShared_free(ri->regstclass);
14308                     }
14309                 }
14310                 break;
14311             case 't':
14312                 {
14313                     /* trie structure. */
14314                     U32 refcount;
14315                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14316                     OP_REFCNT_LOCK;
14317                     refcount = --trie->refcount;
14318                     OP_REFCNT_UNLOCK;
14319                     if ( !refcount ) {
14320                         PerlMemShared_free(trie->charmap);
14321                         PerlMemShared_free(trie->states);
14322                         PerlMemShared_free(trie->trans);
14323                         if (trie->bitmap)
14324                             PerlMemShared_free(trie->bitmap);
14325                         if (trie->jump)
14326                             PerlMemShared_free(trie->jump);
14327                         PerlMemShared_free(trie->wordinfo);
14328                         /* do this last!!!! */
14329                         PerlMemShared_free(ri->data->data[n]);
14330                     }
14331                 }
14332                 break;
14333             default:
14334                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14335             }
14336         }
14337         Safefree(ri->data->what);
14338         Safefree(ri->data);
14339     }
14340
14341     Safefree(ri);
14342 }
14343
14344 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14345 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14346 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14347
14348 /*
14349    re_dup - duplicate a regexp.
14350
14351    This routine is expected to clone a given regexp structure. It is only
14352    compiled under USE_ITHREADS.
14353
14354    After all of the core data stored in struct regexp is duplicated
14355    the regexp_engine.dupe method is used to copy any private data
14356    stored in the *pprivate pointer. This allows extensions to handle
14357    any duplication it needs to do.
14358
14359    See pregfree() and regfree_internal() if you change anything here.
14360 */
14361 #if defined(USE_ITHREADS)
14362 #ifndef PERL_IN_XSUB_RE
14363 void
14364 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14365 {
14366     dVAR;
14367     I32 npar;
14368     const struct regexp *r = (const struct regexp *)SvANY(sstr);
14369     struct regexp *ret = (struct regexp *)SvANY(dstr);
14370
14371     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14372
14373     npar = r->nparens+1;
14374     Newx(ret->offs, npar, regexp_paren_pair);
14375     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14376     if(ret->swap) {
14377         /* no need to copy these */
14378         Newx(ret->swap, npar, regexp_paren_pair);
14379     }
14380
14381     if (ret->substrs) {
14382         /* Do it this way to avoid reading from *r after the StructCopy().
14383            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14384            cache, it doesn't matter.  */
14385         const bool anchored = r->check_substr
14386             ? r->check_substr == r->anchored_substr
14387             : r->check_utf8 == r->anchored_utf8;
14388         Newx(ret->substrs, 1, struct reg_substr_data);
14389         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14390
14391         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14392         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14393         ret->float_substr = sv_dup_inc(ret->float_substr, param);
14394         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14395
14396         /* check_substr and check_utf8, if non-NULL, point to either their
14397            anchored or float namesakes, and don't hold a second reference.  */
14398
14399         if (ret->check_substr) {
14400             if (anchored) {
14401                 assert(r->check_utf8 == r->anchored_utf8);
14402                 ret->check_substr = ret->anchored_substr;
14403                 ret->check_utf8 = ret->anchored_utf8;
14404             } else {
14405                 assert(r->check_substr == r->float_substr);
14406                 assert(r->check_utf8 == r->float_utf8);
14407                 ret->check_substr = ret->float_substr;
14408                 ret->check_utf8 = ret->float_utf8;
14409             }
14410         } else if (ret->check_utf8) {
14411             if (anchored) {
14412                 ret->check_utf8 = ret->anchored_utf8;
14413             } else {
14414                 ret->check_utf8 = ret->float_utf8;
14415             }
14416         }
14417     }
14418
14419     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14420     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14421
14422     if (ret->pprivate)
14423         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14424
14425     if (RX_MATCH_COPIED(dstr))
14426         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
14427     else
14428         ret->subbeg = NULL;
14429 #ifdef PERL_OLD_COPY_ON_WRITE
14430     ret->saved_copy = NULL;
14431 #endif
14432
14433     if (ret->mother_re) {
14434         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
14435             /* Our storage points directly to our mother regexp, but that's
14436                1: a buffer in a different thread
14437                2: something we no longer hold a reference on
14438                so we need to copy it locally.  */
14439             /* Note we need to use SvCUR(), rather than
14440                SvLEN(), on our mother_re, because it, in
14441                turn, may well be pointing to its own mother_re.  */
14442             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
14443                                    SvCUR(ret->mother_re)+1));
14444             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
14445         }
14446         ret->mother_re      = NULL;
14447     }
14448     ret->gofs = 0;
14449 }
14450 #endif /* PERL_IN_XSUB_RE */
14451
14452 /*
14453    regdupe_internal()
14454
14455    This is the internal complement to regdupe() which is used to copy
14456    the structure pointed to by the *pprivate pointer in the regexp.
14457    This is the core version of the extension overridable cloning hook.
14458    The regexp structure being duplicated will be copied by perl prior
14459    to this and will be provided as the regexp *r argument, however
14460    with the /old/ structures pprivate pointer value. Thus this routine
14461    may override any copying normally done by perl.
14462
14463    It returns a pointer to the new regexp_internal structure.
14464 */
14465
14466 void *
14467 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14468 {
14469     dVAR;
14470     struct regexp *const r = (struct regexp *)SvANY(rx);
14471     regexp_internal *reti;
14472     int len;
14473     RXi_GET_DECL(r,ri);
14474
14475     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14476
14477     len = ProgLen(ri);
14478
14479     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14480     Copy(ri->program, reti->program, len+1, regnode);
14481
14482     reti->num_code_blocks = ri->num_code_blocks;
14483     if (ri->code_blocks) {
14484         int n;
14485         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14486                 struct reg_code_block);
14487         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14488                 struct reg_code_block);
14489         for (n = 0; n < ri->num_code_blocks; n++)
14490              reti->code_blocks[n].src_regex = (REGEXP*)
14491                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14492     }
14493     else
14494         reti->code_blocks = NULL;
14495
14496     reti->regstclass = NULL;
14497
14498     if (ri->data) {
14499         struct reg_data *d;
14500         const int count = ri->data->count;
14501         int i;
14502
14503         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14504                 char, struct reg_data);
14505         Newx(d->what, count, U8);
14506
14507         d->count = count;
14508         for (i = 0; i < count; i++) {
14509             d->what[i] = ri->data->what[i];
14510             switch (d->what[i]) {
14511                 /* see also regcomp.h and regfree_internal() */
14512             case 'a': /* actually an AV, but the dup function is identical.  */
14513             case 'r':
14514             case 's':
14515             case 'S':
14516             case 'u': /* actually an HV, but the dup function is identical.  */
14517                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14518                 break;
14519             case 'f':
14520                 /* This is cheating. */
14521                 Newx(d->data[i], 1, struct regnode_charclass_class);
14522                 StructCopy(ri->data->data[i], d->data[i],
14523                             struct regnode_charclass_class);
14524                 reti->regstclass = (regnode*)d->data[i];
14525                 break;
14526             case 'T':
14527                 /* Trie stclasses are readonly and can thus be shared
14528                  * without duplication. We free the stclass in pregfree
14529                  * when the corresponding reg_ac_data struct is freed.
14530                  */
14531                 reti->regstclass= ri->regstclass;
14532                 /* Fall through */
14533             case 't':
14534                 OP_REFCNT_LOCK;
14535                 ((reg_trie_data*)ri->data->data[i])->refcount++;
14536                 OP_REFCNT_UNLOCK;
14537                 /* Fall through */
14538             case 'l':
14539             case 'L':
14540                 d->data[i] = ri->data->data[i];
14541                 break;
14542             default:
14543                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14544             }
14545         }
14546
14547         reti->data = d;
14548     }
14549     else
14550         reti->data = NULL;
14551
14552     reti->name_list_idx = ri->name_list_idx;
14553
14554 #ifdef RE_TRACK_PATTERN_OFFSETS
14555     if (ri->u.offsets) {
14556         Newx(reti->u.offsets, 2*len+1, U32);
14557         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14558     }
14559 #else
14560     SetProgLen(reti,len);
14561 #endif
14562
14563     return (void*)reti;
14564 }
14565
14566 #endif    /* USE_ITHREADS */
14567
14568 #ifndef PERL_IN_XSUB_RE
14569
14570 /*
14571  - regnext - dig the "next" pointer out of a node
14572  */
14573 regnode *
14574 Perl_regnext(pTHX_ register regnode *p)
14575 {
14576     dVAR;
14577     I32 offset;
14578
14579     if (!p)
14580         return(NULL);
14581
14582     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
14583         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14584     }
14585
14586     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14587     if (offset == 0)
14588         return(NULL);
14589
14590     return(p+offset);
14591 }
14592 #endif
14593
14594 STATIC void
14595 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14596 {
14597     va_list args;
14598     STRLEN l1 = strlen(pat1);
14599     STRLEN l2 = strlen(pat2);
14600     char buf[512];
14601     SV *msv;
14602     const char *message;
14603
14604     PERL_ARGS_ASSERT_RE_CROAK2;
14605
14606     if (l1 > 510)
14607         l1 = 510;
14608     if (l1 + l2 > 510)
14609         l2 = 510 - l1;
14610     Copy(pat1, buf, l1 , char);
14611     Copy(pat2, buf + l1, l2 , char);
14612     buf[l1 + l2] = '\n';
14613     buf[l1 + l2 + 1] = '\0';
14614 #ifdef I_STDARG
14615     /* ANSI variant takes additional second argument */
14616     va_start(args, pat2);
14617 #else
14618     va_start(args);
14619 #endif
14620     msv = vmess(buf, &args);
14621     va_end(args);
14622     message = SvPV_const(msv,l1);
14623     if (l1 > 512)
14624         l1 = 512;
14625     Copy(message, buf, l1 , char);
14626     buf[l1-1] = '\0';                   /* Overwrite \n */
14627     Perl_croak(aTHX_ "%s", buf);
14628 }
14629
14630 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
14631
14632 #ifndef PERL_IN_XSUB_RE
14633 void
14634 Perl_save_re_context(pTHX)
14635 {
14636     dVAR;
14637
14638     struct re_save_state *state;
14639
14640     SAVEVPTR(PL_curcop);
14641     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14642
14643     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14644     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14645     SSPUSHUV(SAVEt_RE_STATE);
14646
14647     Copy(&PL_reg_state, state, 1, struct re_save_state);
14648
14649     PL_reg_oldsaved = NULL;
14650     PL_reg_oldsavedlen = 0;
14651     PL_reg_oldsavedoffset = 0;
14652     PL_reg_oldsavedcoffset = 0;
14653     PL_reg_maxiter = 0;
14654     PL_reg_leftiter = 0;
14655     PL_reg_poscache = NULL;
14656     PL_reg_poscache_size = 0;
14657 #ifdef PERL_OLD_COPY_ON_WRITE
14658     PL_nrs = NULL;
14659 #endif
14660
14661     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14662     if (PL_curpm) {
14663         const REGEXP * const rx = PM_GETRE(PL_curpm);
14664         if (rx) {
14665             U32 i;
14666             for (i = 1; i <= RX_NPARENS(rx); i++) {
14667                 char digits[TYPE_CHARS(long)];
14668                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14669                 GV *const *const gvp
14670                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14671
14672                 if (gvp) {
14673                     GV * const gv = *gvp;
14674                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14675                         save_scalar(gv);
14676                 }
14677             }
14678         }
14679     }
14680 }
14681 #endif
14682
14683 static void
14684 clear_re(pTHX_ void *r)
14685 {
14686     dVAR;
14687     ReREFCNT_dec((REGEXP *)r);
14688 }
14689
14690 #ifdef DEBUGGING
14691
14692 STATIC void
14693 S_put_byte(pTHX_ SV *sv, int c)
14694 {
14695     PERL_ARGS_ASSERT_PUT_BYTE;
14696
14697     /* Our definition of isPRINT() ignores locales, so only bytes that are
14698        not part of UTF-8 are considered printable. I assume that the same
14699        holds for UTF-EBCDIC.
14700        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14701        which Wikipedia says:
14702
14703        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14704        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14705        identical, to the ASCII delete (DEL) or rubout control character.
14706        ) So the old condition can be simplified to !isPRINT(c)  */
14707     if (!isPRINT(c)) {
14708         if (c < 256) {
14709             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14710         }
14711         else {
14712             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14713         }
14714     }
14715     else {
14716         const char string = c;
14717         if (c == '-' || c == ']' || c == '\\' || c == '^')
14718             sv_catpvs(sv, "\\");
14719         sv_catpvn(sv, &string, 1);
14720     }
14721 }
14722
14723
14724 #define CLEAR_OPTSTART \
14725     if (optstart) STMT_START { \
14726             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14727             optstart=NULL; \
14728     } STMT_END
14729
14730 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14731
14732 STATIC const regnode *
14733 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14734             const regnode *last, const regnode *plast,
14735             SV* sv, I32 indent, U32 depth)
14736 {
14737     dVAR;
14738     U8 op = PSEUDO;     /* Arbitrary non-END op. */
14739     const regnode *next;
14740     const regnode *optstart= NULL;
14741
14742     RXi_GET_DECL(r,ri);
14743     GET_RE_DEBUG_FLAGS_DECL;
14744
14745     PERL_ARGS_ASSERT_DUMPUNTIL;
14746
14747 #ifdef DEBUG_DUMPUNTIL
14748     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14749         last ? last-start : 0,plast ? plast-start : 0);
14750 #endif
14751
14752     if (plast && plast < last)
14753         last= plast;
14754
14755     while (PL_regkind[op] != END && (!last || node < last)) {
14756         /* While that wasn't END last time... */
14757         NODE_ALIGN(node);
14758         op = OP(node);
14759         if (op == CLOSE || op == WHILEM)
14760             indent--;
14761         next = regnext((regnode *)node);
14762
14763         /* Where, what. */
14764         if (OP(node) == OPTIMIZED) {
14765             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14766                 optstart = node;
14767             else
14768                 goto after_print;
14769         } else
14770             CLEAR_OPTSTART;
14771
14772         regprop(r, sv, node);
14773         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14774                       (int)(2*indent + 1), "", SvPVX_const(sv));
14775
14776         if (OP(node) != OPTIMIZED) {
14777             if (next == NULL)           /* Next ptr. */
14778                 PerlIO_printf(Perl_debug_log, " (0)");
14779             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14780                 PerlIO_printf(Perl_debug_log, " (FAIL)");
14781             else
14782                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14783             (void)PerlIO_putc(Perl_debug_log, '\n');
14784         }
14785
14786       after_print:
14787         if (PL_regkind[(U8)op] == BRANCHJ) {
14788             assert(next);
14789             {
14790                 const regnode *nnode = (OP(next) == LONGJMP
14791                                        ? regnext((regnode *)next)
14792                                        : next);
14793                 if (last && nnode > last)
14794                     nnode = last;
14795                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14796             }
14797         }
14798         else if (PL_regkind[(U8)op] == BRANCH) {
14799             assert(next);
14800             DUMPUNTIL(NEXTOPER(node), next);
14801         }
14802         else if ( PL_regkind[(U8)op]  == TRIE ) {
14803             const regnode *this_trie = node;
14804             const char op = OP(node);
14805             const U32 n = ARG(node);
14806             const reg_ac_data * const ac = op>=AHOCORASICK ?
14807                (reg_ac_data *)ri->data->data[n] :
14808                NULL;
14809             const reg_trie_data * const trie =
14810                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14811 #ifdef DEBUGGING
14812             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14813 #endif
14814             const regnode *nextbranch= NULL;
14815             I32 word_idx;
14816             sv_setpvs(sv, "");
14817             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14818                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14819
14820                 PerlIO_printf(Perl_debug_log, "%*s%s ",
14821                    (int)(2*(indent+3)), "",
14822                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14823                             PL_colors[0], PL_colors[1],
14824                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14825                             PERL_PV_PRETTY_ELLIPSES    |
14826                             PERL_PV_PRETTY_LTGT
14827                             )
14828                             : "???"
14829                 );
14830                 if (trie->jump) {
14831                     U16 dist= trie->jump[word_idx+1];
14832                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14833                                   (UV)((dist ? this_trie + dist : next) - start));
14834                     if (dist) {
14835                         if (!nextbranch)
14836                             nextbranch= this_trie + trie->jump[0];
14837                         DUMPUNTIL(this_trie + dist, nextbranch);
14838                     }
14839                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14840                         nextbranch= regnext((regnode *)nextbranch);
14841                 } else {
14842                     PerlIO_printf(Perl_debug_log, "\n");
14843                 }
14844             }
14845             if (last && next > last)
14846                 node= last;
14847             else
14848                 node= next;
14849         }
14850         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
14851             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14852                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14853         }
14854         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14855             assert(next);
14856             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14857         }
14858         else if ( op == PLUS || op == STAR) {
14859             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14860         }
14861         else if (PL_regkind[(U8)op] == ANYOF) {
14862             /* arglen 1 + class block */
14863             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14864                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14865             node = NEXTOPER(node);
14866         }
14867         else if (PL_regkind[(U8)op] == EXACT) {
14868             /* Literal string, where present. */
14869             node += NODE_SZ_STR(node) - 1;
14870             node = NEXTOPER(node);
14871         }
14872         else {
14873             node = NEXTOPER(node);
14874             node += regarglen[(U8)op];
14875         }
14876         if (op == CURLYX || op == OPEN)
14877             indent++;
14878     }
14879     CLEAR_OPTSTART;
14880 #ifdef DEBUG_DUMPUNTIL
14881     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14882 #endif
14883     return node;
14884 }
14885
14886 #endif  /* DEBUGGING */
14887
14888 /*
14889  * Local variables:
14890  * c-indentation-style: bsd
14891  * c-basic-offset: 4
14892  * indent-tabs-mode: nil
14893  * End:
14894  *
14895  * ex: set ts=8 sts=4 sw=4 et:
14896  */