src/5017005/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #ifdef HAS_ISBLANK
  95 #   define hasISBLANK 1
  96 #else
  97 #   define hasISBLANK 0
  98 #endif
  99
 100 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 101 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 102 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 103
 104 #ifdef op
 105 #undef op
 106 #endif /* op */
 107
 108 #ifdef MSDOS
 109 #  if defined(BUGGY_MSC6)
 110  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 111 #    pragma optimize("a",off)
 112  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 113 #    pragma optimize("w",on )
 114 #  endif /* BUGGY_MSC6 */
 115 #endif /* MSDOS */
 116
 117 #ifndef STATIC
 118 #define STATIC  static
 119 #endif
 120
 121
 122 typedef struct RExC_state_t {
 123     U32         flags;                  /* RXf_* are we folding, multilining? */
 124     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 125     char        *precomp;               /* uncompiled string. */
 126     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 127     regexp      *rx;                    /* perl core regexp structure */
 128     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 129     char        *start;                 /* Start of input for compile */
 130     char        *end;                   /* End of input for compile */
 131     char        *parse;                 /* Input-scan pointer. */
 132     I32         whilem_seen;            /* number of WHILEM in this expr */
 133     regnode     *emit_start;            /* Start of emitted-code area */
 134     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 135     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 136     I32         naughty;                /* How bad is this pattern? */
 137     I32         sawback;                /* Did we see \1, ...? */
 138     U32         seen;
 139     I32         size;                   /* Code size. */
 140     I32         npar;                   /* Capture buffer count, (OPEN). */
 141     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 142     I32         nestroot;               /* root parens we are in - used by accept */
 143     I32         extralen;
 144     I32         seen_zerolen;
 145     regnode     **open_parens;          /* pointers to open parens */
 146     regnode     **close_parens;         /* pointers to close parens */
 147     regnode     *opend;                 /* END node in program */
 148     I32         utf8;           /* whether the pattern is utf8 or not */
 149     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 150                                 /* XXX use this for future optimisation of case
 151                                  * where pattern must be upgraded to utf8. */
 152     I32         uni_semantics;  /* If a d charset modifier should use unicode
 153                                    rules, even if the pattern is not in
 154                                    utf8 */
 155     HV          *paren_names;           /* Paren names */
 156
 157     regnode     **recurse;              /* Recurse regops */
 158     I32         recurse_count;          /* Number of recurse regops */
 159     I32         in_lookbehind;
 160     I32         contains_locale;
 161     I32         override_recoding;
 162     I32         in_multi_char_class;
 163     struct reg_code_block *code_blocks; /* positions of literal (?{})
 164                                             within pattern */
 165     int         num_code_blocks;        /* size of code_blocks[] */
 166     int         code_index;             /* next code_blocks[] slot */
 167 #if ADD_TO_REGEXEC
 168     char        *starttry;              /* -Dr: where regtry was called. */
 169 #define RExC_starttry   (pRExC_state->starttry)
 170 #endif
 171     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 172 #ifdef DEBUGGING
 173     const char  *lastparse;
 174     I32         lastnum;
 175     AV          *paren_name_list;       /* idx -> name */
 176 #define RExC_lastparse  (pRExC_state->lastparse)
 177 #define RExC_lastnum    (pRExC_state->lastnum)
 178 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 179 #endif
 180 } RExC_state_t;
 181
 182 #define RExC_flags      (pRExC_state->flags)
 183 #define RExC_pm_flags   (pRExC_state->pm_flags)
 184 #define RExC_precomp    (pRExC_state->precomp)
 185 #define RExC_rx_sv      (pRExC_state->rx_sv)
 186 #define RExC_rx         (pRExC_state->rx)
 187 #define RExC_rxi        (pRExC_state->rxi)
 188 #define RExC_start      (pRExC_state->start)
 189 #define RExC_end        (pRExC_state->end)
 190 #define RExC_parse      (pRExC_state->parse)
 191 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 192 #ifdef RE_TRACK_PATTERN_OFFSETS
 193 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 194 #endif
 195 #define RExC_emit       (pRExC_state->emit)
 196 #define RExC_emit_start (pRExC_state->emit_start)
 197 #define RExC_emit_bound (pRExC_state->emit_bound)
 198 #define RExC_naughty    (pRExC_state->naughty)
 199 #define RExC_sawback    (pRExC_state->sawback)
 200 #define RExC_seen       (pRExC_state->seen)
 201 #define RExC_size       (pRExC_state->size)
 202 #define RExC_npar       (pRExC_state->npar)
 203 #define RExC_nestroot   (pRExC_state->nestroot)
 204 #define RExC_extralen   (pRExC_state->extralen)
 205 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 206 #define RExC_utf8       (pRExC_state->utf8)
 207 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 208 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 209 #define RExC_open_parens        (pRExC_state->open_parens)
 210 #define RExC_close_parens       (pRExC_state->close_parens)
 211 #define RExC_opend      (pRExC_state->opend)
 212 #define RExC_paren_names        (pRExC_state->paren_names)
 213 #define RExC_recurse    (pRExC_state->recurse)
 214 #define RExC_recurse_count      (pRExC_state->recurse_count)
 215 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 216 #define RExC_contains_locale    (pRExC_state->contains_locale)
 217 #define RExC_override_recoding (pRExC_state->override_recoding)
 218 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 219
 220
 221 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 222 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 223         ((*s) == '{' && regcurly(s)))
 224
 225 #ifdef SPSTART
 226 #undef SPSTART          /* dratted cpp namespace... */
 227 #endif
 228 /*
 229  * Flags to be passed up and down.
 230  */
 231 #define WORST           0       /* Worst case. */
 232 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 233
 234 /* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
 235  * character.  Note that this is not the same thing as REGNODE_SIMPLE */
 236 #define SIMPLE          0x02
 237 #define SPSTART         0x04    /* Starts with * or +. */
 238 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 239 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 240
 241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 242
 243 /* whether trie related optimizations are enabled */
 244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 245 #define TRIE_STUDY_OPT
 246 #define FULL_TRIE_STUDY
 247 #define TRIE_STCLASS
 248 #endif
 249
 250
 251
 252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 253 #define PBITVAL(paren) (1 << ((paren) & 7))
 254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 257
 258 /* If not already in utf8, do a longjmp back to the beginning */
 259 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 260 #define REQUIRE_UTF8    STMT_START {                                       \
 261                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 262                         } STMT_END
 263
 264 /* About scan_data_t.
 265
 266   During optimisation we recurse through the regexp program performing
 267   various inplace (keyhole style) optimisations. In addition study_chunk
 268   and scan_commit populate this data structure with information about
 269   what strings MUST appear in the pattern. We look for the longest
 270   string that must appear at a fixed location, and we look for the
 271   longest string that may appear at a floating location. So for instance
 272   in the pattern:
 273
 274     /FOO[xX]A.*B[xX]BAR/
 275
 276   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 277   strings (because they follow a .* construct). study_chunk will identify
 278   both FOO and BAR as being the longest fixed and floating strings respectively.
 279
 280   The strings can be composites, for instance
 281
 282      /(f)(o)(o)/
 283
 284   will result in a composite fixed substring 'foo'.
 285
 286   For each string some basic information is maintained:
 287
 288   - offset or min_offset
 289     This is the position the string must appear at, or not before.
 290     It also implicitly (when combined with minlenp) tells us how many
 291     characters must match before the string we are searching for.
 292     Likewise when combined with minlenp and the length of the string it
 293     tells us how many characters must appear after the string we have
 294     found.
 295
 296   - max_offset
 297     Only used for floating strings. This is the rightmost point that
 298     the string can appear at. If set to I32 max it indicates that the
 299     string can occur infinitely far to the right.
 300
 301   - minlenp
 302     A pointer to the minimum number of characters of the pattern that the
 303     string was found inside. This is important as in the case of positive
 304     lookahead or positive lookbehind we can have multiple patterns
 305     involved. Consider
 306
 307     /(?=FOO).*F/
 308
 309     The minimum length of the pattern overall is 3, the minimum length
 310     of the lookahead part is 3, but the minimum length of the part that
 311     will actually match is 1. So 'FOO's minimum length is 3, but the
 312     minimum length for the F is 1. This is important as the minimum length
 313     is used to determine offsets in front of and behind the string being
 314     looked for.  Since strings can be composites this is the length of the
 315     pattern at the time it was committed with a scan_commit. Note that
 316     the length is calculated by study_chunk, so that the minimum lengths
 317     are not known until the full pattern has been compiled, thus the
 318     pointer to the value.
 319
 320   - lookbehind
 321
 322     In the case of lookbehind the string being searched for can be
 323     offset past the start point of the final matching string.
 324     If this value was just blithely removed from the min_offset it would
 325     invalidate some of the calculations for how many chars must match
 326     before or after (as they are derived from min_offset and minlen and
 327     the length of the string being searched for).
 328     When the final pattern is compiled and the data is moved from the
 329     scan_data_t structure into the regexp structure the information
 330     about lookbehind is factored in, with the information that would
 331     have been lost precalculated in the end_shift field for the
 332     associated string.
 333
 334   The fields pos_min and pos_delta are used to store the minimum offset
 335   and the delta to the maximum offset at the current point in the pattern.
 336
 337 */
 338
 339 typedef struct scan_data_t {
 340     /*I32 len_min;      unused */
 341     /*I32 len_delta;    unused */
 342     I32 pos_min;
 343     I32 pos_delta;
 344     SV *last_found;
 345     I32 last_end;           /* min value, <0 unless valid. */
 346     I32 last_start_min;
 347     I32 last_start_max;
 348     SV **longest;           /* Either &l_fixed, or &l_float. */
 349     SV *longest_fixed;      /* longest fixed string found in pattern */
 350     I32 offset_fixed;       /* offset where it starts */
 351     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 352     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 353     SV *longest_float;      /* longest floating string found in pattern */
 354     I32 offset_float_min;   /* earliest point in string it can appear */
 355     I32 offset_float_max;   /* latest point in string it can appear */
 356     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 357     I32 lookbehind_float;   /* is the position of the string modified by LB */
 358     I32 flags;
 359     I32 whilem_c;
 360     I32 *last_closep;
 361     struct regnode_charclass_class *start_class;
 362 } scan_data_t;
 363
 364 /*
 365  * Forward declarations for pregcomp()'s friends.
 366  */
 367
 368 static const scan_data_t zero_scan_data =
 369   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 370
 371 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 372 #define SF_BEFORE_SEOL          0x0001
 373 #define SF_BEFORE_MEOL          0x0002
 374 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 375 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 376
 377 #ifdef NO_UNARY_PLUS
 378 #  define SF_FIX_SHIFT_EOL      (0+2)
 379 #  define SF_FL_SHIFT_EOL               (0+4)
 380 #else
 381 #  define SF_FIX_SHIFT_EOL      (+2)
 382 #  define SF_FL_SHIFT_EOL               (+4)
 383 #endif
 384
 385 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 386 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 387
 388 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 389 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 390 #define SF_IS_INF               0x0040
 391 #define SF_HAS_PAR              0x0080
 392 #define SF_IN_PAR               0x0100
 393 #define SF_HAS_EVAL             0x0200
 394 #define SCF_DO_SUBSTR           0x0400
 395 #define SCF_DO_STCLASS_AND      0x0800
 396 #define SCF_DO_STCLASS_OR       0x1000
 397 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 398 #define SCF_WHILEM_VISITED_POS  0x2000
 399
 400 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 401 #define SCF_SEEN_ACCEPT         0x8000
 402
 403 #define UTF cBOOL(RExC_utf8)
 404
 405 /* The enums for all these are ordered so things work out correctly */
 406 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 407 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 408 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 409 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 410 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 411 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 412 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 413
 414 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 415
 416 #define OOB_NAMEDCLASS          -1
 417
 418 /* There is no code point that is out-of-bounds, so this is problematic.  But
 419  * its only current use is to initialize a variable that is always set before
 420  * looked at. */
 421 #define OOB_UNICODE             0xDEADBEEF
 422
 423 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 424 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 425
 426
 427 /* length of regex to show in messages that don't mark a position within */
 428 #define RegexLengthToShowInErrorMessages 127
 429
 430 /*
 431  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 432  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 433  * op/pragma/warn/regcomp.
 434  */
 435 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 436 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 437
 438 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 439
 440 /*
 441  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 442  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 443  * "...".
 444  */
 445 #define _FAIL(code) STMT_START {                                        \
 446     const char *ellipses = "";                                          \
 447     IV len = RExC_end - RExC_precomp;                                   \
 448                                                                         \
 449     if (!SIZE_ONLY)                                                     \
 450         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 451     if (len > RegexLengthToShowInErrorMessages) {                       \
 452         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 453         len = RegexLengthToShowInErrorMessages - 10;                    \
 454         ellipses = "...";                                               \
 455     }                                                                   \
 456     code;                                                               \
 457 } STMT_END
 458
 459 #define FAIL(msg) _FAIL(                            \
 460     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 461             msg, (int)len, RExC_precomp, ellipses))
 462
 463 #define FAIL2(msg,arg) _FAIL(                       \
 464     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 465             arg, (int)len, RExC_precomp, ellipses))
 466
 467 /*
 468  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 469  */
 470 #define Simple_vFAIL(m) STMT_START {                                    \
 471     const IV offset = RExC_parse - RExC_precomp;                        \
 472     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 473             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 474 } STMT_END
 475
 476 /*
 477  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 478  */
 479 #define vFAIL(m) STMT_START {                           \
 480     if (!SIZE_ONLY)                                     \
 481         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 482     Simple_vFAIL(m);                                    \
 483 } STMT_END
 484
 485 /*
 486  * Like Simple_vFAIL(), but accepts two arguments.
 487  */
 488 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 489     const IV offset = RExC_parse - RExC_precomp;                        \
 490     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 491             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 492 } STMT_END
 493
 494 /*
 495  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 496  */
 497 #define vFAIL2(m,a1) STMT_START {                       \
 498     if (!SIZE_ONLY)                                     \
 499         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 500     Simple_vFAIL2(m, a1);                               \
 501 } STMT_END
 502
 503
 504 /*
 505  * Like Simple_vFAIL(), but accepts three arguments.
 506  */
 507 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 508     const IV offset = RExC_parse - RExC_precomp;                \
 509     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 510             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 511 } STMT_END
 512
 513 /*
 514  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 515  */
 516 #define vFAIL3(m,a1,a2) STMT_START {                    \
 517     if (!SIZE_ONLY)                                     \
 518         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 519     Simple_vFAIL3(m, a1, a2);                           \
 520 } STMT_END
 521
 522 /*
 523  * Like Simple_vFAIL(), but accepts four arguments.
 524  */
 525 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 526     const IV offset = RExC_parse - RExC_precomp;                \
 527     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 528             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 529 } STMT_END
 530
 531 #define ckWARNreg(loc,m) STMT_START {                                   \
 532     const IV offset = loc - RExC_precomp;                               \
 533     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 534             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 535 } STMT_END
 536
 537 #define ckWARNregdep(loc,m) STMT_START {                                \
 538     const IV offset = loc - RExC_precomp;                               \
 539     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 540             m REPORT_LOCATION,                                          \
 541             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 542 } STMT_END
 543
 544 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 545     const IV offset = loc - RExC_precomp;                               \
 546     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 547             m REPORT_LOCATION,                                          \
 548             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 549 } STMT_END
 550
 551 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 552     const IV offset = loc - RExC_precomp;                               \
 553     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 554             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 555 } STMT_END
 556
 557 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 558     const IV offset = loc - RExC_precomp;                               \
 559     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 560             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 561 } STMT_END
 562
 563 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 564     const IV offset = loc - RExC_precomp;                               \
 565     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 566             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 567 } STMT_END
 568
 569 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 570     const IV offset = loc - RExC_precomp;                               \
 571     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 572             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 573 } STMT_END
 574
 575 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 576     const IV offset = loc - RExC_precomp;                               \
 577     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 578             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 579 } STMT_END
 580
 581 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 582     const IV offset = loc - RExC_precomp;                               \
 583     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 584             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 585 } STMT_END
 586
 587
 588 /* Allow for side effects in s */
 589 #define REGC(c,s) STMT_START {                  \
 590     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 591 } STMT_END
 592
 593 /* Macros for recording node offsets.   20001227 mjd@plover.com
 594  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 595  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 596  * Element 0 holds the number n.
 597  * Position is 1 indexed.
 598  */
 599 #ifndef RE_TRACK_PATTERN_OFFSETS
 600 #define Set_Node_Offset_To_R(node,byte)
 601 #define Set_Node_Offset(node,byte)
 602 #define Set_Cur_Node_Offset
 603 #define Set_Node_Length_To_R(node,len)
 604 #define Set_Node_Length(node,len)
 605 #define Set_Node_Cur_Length(node)
 606 #define Node_Offset(n)
 607 #define Node_Length(n)
 608 #define Set_Node_Offset_Length(node,offset,len)
 609 #define ProgLen(ri) ri->u.proglen
 610 #define SetProgLen(ri,x) ri->u.proglen = x
 611 #else
 612 #define ProgLen(ri) ri->u.offsets[0]
 613 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 614 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 615     if (! SIZE_ONLY) {                                                  \
 616         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 617                     __LINE__, (int)(node), (int)(byte)));               \
 618         if((node) < 0) {                                                \
 619             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 620         } else {                                                        \
 621             RExC_offsets[2*(node)-1] = (byte);                          \
 622         }                                                               \
 623     }                                                                   \
 624 } STMT_END
 625
 626 #define Set_Node_Offset(node,byte) \
 627     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 628 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 629
 630 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 631     if (! SIZE_ONLY) {                                                  \
 632         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 633                 __LINE__, (int)(node), (int)(len)));                    \
 634         if((node) < 0) {                                                \
 635             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 636         } else {                                                        \
 637             RExC_offsets[2*(node)] = (len);                             \
 638         }                                                               \
 639     }                                                                   \
 640 } STMT_END
 641
 642 #define Set_Node_Length(node,len) \
 643     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 644 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 645 #define Set_Node_Cur_Length(node) \
 646     Set_Node_Length(node, RExC_parse - parse_start)
 647
 648 /* Get offsets and lengths */
 649 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 650 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 651
 652 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 653     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 654     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 655 } STMT_END
 656 #endif
 657
 658 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 659 #define EXPERIMENTAL_INPLACESCAN
 660 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 661
 662 #define DEBUG_STUDYDATA(str,data,depth)                              \
 663 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 664     PerlIO_printf(Perl_debug_log,                                    \
 665         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 666         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 667         (int)(depth)*2, "",                                          \
 668         (IV)((data)->pos_min),                                       \
 669         (IV)((data)->pos_delta),                                     \
 670         (UV)((data)->flags),                                         \
 671         (IV)((data)->whilem_c),                                      \
 672         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 673         is_inf ? "INF " : ""                                         \
 674     );                                                               \
 675     if ((data)->last_found)                                          \
 676         PerlIO_printf(Perl_debug_log,                                \
 677             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 678             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 679             SvPVX_const((data)->last_found),                         \
 680             (IV)((data)->last_end),                                  \
 681             (IV)((data)->last_start_min),                            \
 682             (IV)((data)->last_start_max),                            \
 683             ((data)->longest &&                                      \
 684              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 685             SvPVX_const((data)->longest_fixed),                      \
 686             (IV)((data)->offset_fixed),                              \
 687             ((data)->longest &&                                      \
 688              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 689             SvPVX_const((data)->longest_float),                      \
 690             (IV)((data)->offset_float_min),                          \
 691             (IV)((data)->offset_float_max)                           \
 692         );                                                           \
 693     PerlIO_printf(Perl_debug_log,"\n");                              \
 694 });
 695
 696 static void clear_re(pTHX_ void *r);
 697
 698 /* Mark that we cannot extend a found fixed substring at this point.
 699    Update the longest found anchored substring and the longest found
 700    floating substrings if needed. */
 701
 702 STATIC void
 703 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 704 {
 705     const STRLEN l = CHR_SVLEN(data->last_found);
 706     const STRLEN old_l = CHR_SVLEN(*data->longest);
 707     GET_RE_DEBUG_FLAGS_DECL;
 708
 709     PERL_ARGS_ASSERT_SCAN_COMMIT;
 710
 711     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 712         SvSetMagicSV(*data->longest, data->last_found);
 713         if (*data->longest == data->longest_fixed) {
 714             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 715             if (data->flags & SF_BEFORE_EOL)
 716                 data->flags
 717                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 718             else
 719                 data->flags &= ~SF_FIX_BEFORE_EOL;
 720             data->minlen_fixed=minlenp;
 721             data->lookbehind_fixed=0;
 722         }
 723         else { /* *data->longest == data->longest_float */
 724             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 725             data->offset_float_max = (l
 726                                       ? data->last_start_max
 727                                       : data->pos_min + data->pos_delta);
 728             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 729                 data->offset_float_max = I32_MAX;
 730             if (data->flags & SF_BEFORE_EOL)
 731                 data->flags
 732                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 733             else
 734                 data->flags &= ~SF_FL_BEFORE_EOL;
 735             data->minlen_float=minlenp;
 736             data->lookbehind_float=0;
 737         }
 738     }
 739     SvCUR_set(data->last_found, 0);
 740     {
 741         SV * const sv = data->last_found;
 742         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 743             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 744             if (mg)
 745                 mg->mg_len = 0;
 746         }
 747     }
 748     data->last_end = -1;
 749     data->flags &= ~SF_BEFORE_EOL;
 750     DEBUG_STUDYDATA("commit: ",data,0);
 751 }
 752
 753 /* Can match anything (initialization) */
 754 STATIC void
 755 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 756 {
 757     PERL_ARGS_ASSERT_CL_ANYTHING;
 758
 759     ANYOF_BITMAP_SETALL(cl);
 760     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 761                 |ANYOF_NON_UTF8_LATIN1_ALL;
 762
 763     /* If any portion of the regex is to operate under locale rules,
 764      * initialization includes it.  The reason this isn't done for all regexes
 765      * is that the optimizer was written under the assumption that locale was
 766      * all-or-nothing.  Given the complexity and lack of documentation in the
 767      * optimizer, and that there are inadequate test cases for locale, so many
 768      * parts of it may not work properly, it is safest to avoid locale unless
 769      * necessary. */
 770     if (RExC_contains_locale) {
 771         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 772         cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
 773     }
 774     else {
 775         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 776     }
 777 }
 778
 779 /* Can match anything (initialization) */
 780 STATIC int
 781 S_cl_is_anything(const struct regnode_charclass_class *cl)
 782 {
 783     int value;
 784
 785     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 786
 787     for (value = 0; value <= ANYOF_MAX; value += 2)
 788         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 789             return 1;
 790     if (!(cl->flags & ANYOF_UNICODE_ALL))
 791         return 0;
 792     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 793         return 0;
 794     return 1;
 795 }
 796
 797 /* Can match anything (initialization) */
 798 STATIC void
 799 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 800 {
 801     PERL_ARGS_ASSERT_CL_INIT;
 802
 803     Zero(cl, 1, struct regnode_charclass_class);
 804     cl->type = ANYOF;
 805     cl_anything(pRExC_state, cl);
 806     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 807 }
 808
 809 /* These two functions currently do the exact same thing */
 810 #define cl_init_zero            S_cl_init
 811
 812 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 813  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 814  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 815 STATIC void
 816 S_cl_and(struct regnode_charclass_class *cl,
 817         const struct regnode_charclass_class *and_with)
 818 {
 819     PERL_ARGS_ASSERT_CL_AND;
 820
 821     assert(and_with->type == ANYOF);
 822
 823     /* I (khw) am not sure all these restrictions are necessary XXX */
 824     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 825         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 826         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 827         && !(and_with->flags & ANYOF_LOC_FOLD)
 828         && !(cl->flags & ANYOF_LOC_FOLD)) {
 829         int i;
 830
 831         if (and_with->flags & ANYOF_INVERT)
 832             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 833                 cl->bitmap[i] &= ~and_with->bitmap[i];
 834         else
 835             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 836                 cl->bitmap[i] &= and_with->bitmap[i];
 837     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 838
 839     if (and_with->flags & ANYOF_INVERT) {
 840
 841         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 842          * aren't affected by the inversion.  Those that are affected are
 843          * handled individually below */
 844         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 845         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 846         cl->flags |= affected_flags;
 847
 848         /* We currently don't know how to deal with things that aren't in the
 849          * bitmap, but we know that the intersection is no greater than what
 850          * is already in cl, so let there be false positives that get sorted
 851          * out after the synthetic start class succeeds, and the node is
 852          * matched for real. */
 853
 854         /* The inversion of these two flags indicate that the resulting
 855          * intersection doesn't have them */
 856         if (and_with->flags & ANYOF_UNICODE_ALL) {
 857             cl->flags &= ~ANYOF_UNICODE_ALL;
 858         }
 859         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 860             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 861         }
 862     }
 863     else {   /* and'd node is not inverted */
 864         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 865
 866         if (! ANYOF_NONBITMAP(and_with)) {
 867
 868             /* Here 'and_with' doesn't match anything outside the bitmap
 869              * (except possibly ANYOF_UNICODE_ALL), which means the
 870              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 871              * which case we don't know what the intersection is, but it's no
 872              * greater than what cl already has, so can just leave it alone,
 873              * with possible false positives */
 874             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 875                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 876                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 877             }
 878         }
 879         else if (! ANYOF_NONBITMAP(cl)) {
 880
 881             /* Here, 'and_with' does match something outside the bitmap, and cl
 882              * doesn't have a list of things to match outside the bitmap.  If
 883              * cl can match all code points above 255, the intersection will
 884              * be those above-255 code points that 'and_with' matches.  If cl
 885              * can't match all Unicode code points, it means that it can't
 886              * match anything outside the bitmap (since the 'if' that got us
 887              * into this block tested for that), so we leave the bitmap empty.
 888              */
 889             if (cl->flags & ANYOF_UNICODE_ALL) {
 890                 ARG_SET(cl, ARG(and_with));
 891
 892                 /* and_with's ARG may match things that don't require UTF8.
 893                  * And now cl's will too, in spite of this being an 'and'.  See
 894                  * the comments below about the kludge */
 895                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 896             }
 897         }
 898         else {
 899             /* Here, both 'and_with' and cl match something outside the
 900              * bitmap.  Currently we do not do the intersection, so just match
 901              * whatever cl had at the beginning.  */
 902         }
 903
 904
 905         /* Take the intersection of the two sets of flags.  However, the
 906          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 907          * kludge around the fact that this flag is not treated like the others
 908          * which are initialized in cl_anything().  The way the optimizer works
 909          * is that the synthetic start class (SSC) is initialized to match
 910          * anything, and then the first time a real node is encountered, its
 911          * values are AND'd with the SSC's with the result being the values of
 912          * the real node.  However, there are paths through the optimizer where
 913          * the AND never gets called, so those initialized bits are set
 914          * inappropriately, which is not usually a big deal, as they just cause
 915          * false positives in the SSC, which will just mean a probably
 916          * imperceptible slow down in execution.  However this bit has a
 917          * higher false positive consequence in that it can cause utf8.pm,
 918          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 919          * bigger slowdown and also causes significant extra memory to be used.
 920          * In order to prevent this, the code now takes a different tack.  The
 921          * bit isn't set unless some part of the regular expression needs it,
 922          * but once set it won't get cleared.  This means that these extra
 923          * modules won't get loaded unless there was some path through the
 924          * pattern that would have required them anyway, and  so any false
 925          * positives that occur by not ANDing them out when they could be
 926          * aren't as severe as they would be if we treated this bit like all
 927          * the others */
 928         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 929                                       & ANYOF_NONBITMAP_NON_UTF8;
 930         cl->flags &= and_with->flags;
 931         cl->flags |= outside_bitmap_but_not_utf8;
 932     }
 933 }
 934
 935 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 936  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 937  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 938 STATIC void
 939 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 940 {
 941     PERL_ARGS_ASSERT_CL_OR;
 942
 943     if (or_with->flags & ANYOF_INVERT) {
 944
 945         /* Here, the or'd node is to be inverted.  This means we take the
 946          * complement of everything not in the bitmap, but currently we don't
 947          * know what that is, so give up and match anything */
 948         if (ANYOF_NONBITMAP(or_with)) {
 949             cl_anything(pRExC_state, cl);
 950         }
 951         /* We do not use
 952          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 953          *   <= (B1 | !B2) | (CL1 | !CL2)
 954          * which is wasteful if CL2 is small, but we ignore CL2:
 955          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 956          * XXXX Can we handle case-fold?  Unclear:
 957          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 958          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 959          */
 960         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 961              && !(or_with->flags & ANYOF_LOC_FOLD)
 962              && !(cl->flags & ANYOF_LOC_FOLD) ) {
 963             int i;
 964
 965             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 966                 cl->bitmap[i] |= ~or_with->bitmap[i];
 967         } /* XXXX: logic is complicated otherwise */
 968         else {
 969             cl_anything(pRExC_state, cl);
 970         }
 971
 972         /* And, we can just take the union of the flags that aren't affected
 973          * by the inversion */
 974         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 975
 976         /* For the remaining flags:
 977             ANYOF_UNICODE_ALL and inverted means to not match anything above
 978                     255, which means that the union with cl should just be
 979                     what cl has in it, so can ignore this flag
 980             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 981                     is 127-255 to match them, but then invert that, so the
 982                     union with cl should just be what cl has in it, so can
 983                     ignore this flag
 984          */
 985     } else {    /* 'or_with' is not inverted */
 986         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 987         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 988              && (!(or_with->flags & ANYOF_LOC_FOLD)
 989                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
 990             int i;
 991
 992             /* OR char bitmap and class bitmap separately */
 993             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 994                 cl->bitmap[i] |= or_with->bitmap[i];
 995             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 996                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 997                     cl->classflags[i] |= or_with->classflags[i];
 998                 cl->flags |= ANYOF_CLASS;
 999             }
1000         }
1001         else { /* XXXX: logic is complicated, leave it along for a moment. */
1002             cl_anything(pRExC_state, cl);
1003         }
1004
1005         if (ANYOF_NONBITMAP(or_with)) {
1006
1007             /* Use the added node's outside-the-bit-map match if there isn't a
1008              * conflict.  If there is a conflict (both nodes match something
1009              * outside the bitmap, but what they match outside is not the same
1010              * pointer, and hence not easily compared until XXX we extend
1011              * inversion lists this far), give up and allow the start class to
1012              * match everything outside the bitmap.  If that stuff is all above
1013              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1014             if (! ANYOF_NONBITMAP(cl)) {
1015                 ARG_SET(cl, ARG(or_with));
1016             }
1017             else if (ARG(cl) != ARG(or_with)) {
1018
1019                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1020                     cl_anything(pRExC_state, cl);
1021                 }
1022                 else {
1023                     cl->flags |= ANYOF_UNICODE_ALL;
1024                 }
1025             }
1026         }
1027
1028         /* Take the union */
1029         cl->flags |= or_with->flags;
1030     }
1031 }
1032
1033 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1034 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1035 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1036 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1037
1038
1039 #ifdef DEBUGGING
1040 /*
1041    dump_trie(trie,widecharmap,revcharmap)
1042    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1043    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1044
1045    These routines dump out a trie in a somewhat readable format.
1046    The _interim_ variants are used for debugging the interim
1047    tables that are used to generate the final compressed
1048    representation which is what dump_trie expects.
1049
1050    Part of the reason for their existence is to provide a form
1051    of documentation as to how the different representations function.
1052
1053 */
1054
1055 /*
1056   Dumps the final compressed table form of the trie to Perl_debug_log.
1057   Used for debugging make_trie().
1058 */
1059
1060 STATIC void
1061 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1062             AV *revcharmap, U32 depth)
1063 {
1064     U32 state;
1065     SV *sv=sv_newmortal();
1066     int colwidth= widecharmap ? 6 : 4;
1067     U16 word;
1068     GET_RE_DEBUG_FLAGS_DECL;
1069
1070     PERL_ARGS_ASSERT_DUMP_TRIE;
1071
1072     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1073         (int)depth * 2 + 2,"",
1074         "Match","Base","Ofs" );
1075
1076     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1077         SV ** const tmp = av_fetch( revcharmap, state, 0);
1078         if ( tmp ) {
1079             PerlIO_printf( Perl_debug_log, "%*s",
1080                 colwidth,
1081                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1082                             PL_colors[0], PL_colors[1],
1083                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1084                             PERL_PV_ESCAPE_FIRSTCHAR
1085                 )
1086             );
1087         }
1088     }
1089     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1090         (int)depth * 2 + 2,"");
1091
1092     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1093         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1094     PerlIO_printf( Perl_debug_log, "\n");
1095
1096     for( state = 1 ; state < trie->statecount ; state++ ) {
1097         const U32 base = trie->states[ state ].trans.base;
1098
1099         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1100
1101         if ( trie->states[ state ].wordnum ) {
1102             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1103         } else {
1104             PerlIO_printf( Perl_debug_log, "%6s", "" );
1105         }
1106
1107         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1108
1109         if ( base ) {
1110             U32 ofs = 0;
1111
1112             while( ( base + ofs  < trie->uniquecharcount ) ||
1113                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1114                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1115                     ofs++;
1116
1117             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1118
1119             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1120                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1121                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1122                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1123                 {
1124                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1125                     colwidth,
1126                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1127                 } else {
1128                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1129                 }
1130             }
1131
1132             PerlIO_printf( Perl_debug_log, "]");
1133
1134         }
1135         PerlIO_printf( Perl_debug_log, "\n" );
1136     }
1137     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1138     for (word=1; word <= trie->wordcount; word++) {
1139         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1140             (int)word, (int)(trie->wordinfo[word].prev),
1141             (int)(trie->wordinfo[word].len));
1142     }
1143     PerlIO_printf(Perl_debug_log, "\n" );
1144 }
1145 /*
1146   Dumps a fully constructed but uncompressed trie in list form.
1147   List tries normally only are used for construction when the number of
1148   possible chars (trie->uniquecharcount) is very high.
1149   Used for debugging make_trie().
1150 */
1151 STATIC void
1152 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1153                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1154                          U32 depth)
1155 {
1156     U32 state;
1157     SV *sv=sv_newmortal();
1158     int colwidth= widecharmap ? 6 : 4;
1159     GET_RE_DEBUG_FLAGS_DECL;
1160
1161     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1162
1163     /* print out the table precompression.  */
1164     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1165         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1166         "------:-----+-----------------\n" );
1167
1168     for( state=1 ; state < next_alloc ; state ++ ) {
1169         U16 charid;
1170
1171         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1172             (int)depth * 2 + 2,"", (UV)state  );
1173         if ( ! trie->states[ state ].wordnum ) {
1174             PerlIO_printf( Perl_debug_log, "%5s| ","");
1175         } else {
1176             PerlIO_printf( Perl_debug_log, "W%4x| ",
1177                 trie->states[ state ].wordnum
1178             );
1179         }
1180         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1181             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1182             if ( tmp ) {
1183                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1184                     colwidth,
1185                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1186                             PL_colors[0], PL_colors[1],
1187                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1188                             PERL_PV_ESCAPE_FIRSTCHAR
1189                     ) ,
1190                     TRIE_LIST_ITEM(state,charid).forid,
1191                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1192                 );
1193                 if (!(charid % 10))
1194                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1195                         (int)((depth * 2) + 14), "");
1196             }
1197         }
1198         PerlIO_printf( Perl_debug_log, "\n");
1199     }
1200 }
1201
1202 /*
1203   Dumps a fully constructed but uncompressed trie in table form.
1204   This is the normal DFA style state transition table, with a few
1205   twists to facilitate compression later.
1206   Used for debugging make_trie().
1207 */
1208 STATIC void
1209 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1210                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1211                           U32 depth)
1212 {
1213     U32 state;
1214     U16 charid;
1215     SV *sv=sv_newmortal();
1216     int colwidth= widecharmap ? 6 : 4;
1217     GET_RE_DEBUG_FLAGS_DECL;
1218
1219     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1220
1221     /*
1222        print out the table precompression so that we can do a visual check
1223        that they are identical.
1224      */
1225
1226     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1227
1228     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1229         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1230         if ( tmp ) {
1231             PerlIO_printf( Perl_debug_log, "%*s",
1232                 colwidth,
1233                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1234                             PL_colors[0], PL_colors[1],
1235                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1236                             PERL_PV_ESCAPE_FIRSTCHAR
1237                 )
1238             );
1239         }
1240     }
1241
1242     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1243
1244     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1245         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1246     }
1247
1248     PerlIO_printf( Perl_debug_log, "\n" );
1249
1250     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1251
1252         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1253             (int)depth * 2 + 2,"",
1254             (UV)TRIE_NODENUM( state ) );
1255
1256         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1257             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1258             if (v)
1259                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1260             else
1261                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1262         }
1263         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1264             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1265         } else {
1266             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1267             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1268         }
1269     }
1270 }
1271
1272 #endif
1273
1274
1275 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1276   startbranch: the first branch in the whole branch sequence
1277   first      : start branch of sequence of branch-exact nodes.
1278                May be the same as startbranch
1279   last       : Thing following the last branch.
1280                May be the same as tail.
1281   tail       : item following the branch sequence
1282   count      : words in the sequence
1283   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1284   depth      : indent depth
1285
1286 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1287
1288 A trie is an N'ary tree where the branches are determined by digital
1289 decomposition of the key. IE, at the root node you look up the 1st character and
1290 follow that branch repeat until you find the end of the branches. Nodes can be
1291 marked as "accepting" meaning they represent a complete word. Eg:
1292
1293   /he|she|his|hers/
1294
1295 would convert into the following structure. Numbers represent states, letters
1296 following numbers represent valid transitions on the letter from that state, if
1297 the number is in square brackets it represents an accepting state, otherwise it
1298 will be in parenthesis.
1299
1300       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1301       |    |
1302       |   (2)
1303       |    |
1304      (1)   +-i->(6)-+-s->[7]
1305       |
1306       +-s->(3)-+-h->(4)-+-e->[5]
1307
1308       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1309
1310 This shows that when matching against the string 'hers' we will begin at state 1
1311 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1312 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1313 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1314 single traverse. We store a mapping from accepting to state to which word was
1315 matched, and then when we have multiple possibilities we try to complete the
1316 rest of the regex in the order in which they occured in the alternation.
1317
1318 The only prior NFA like behaviour that would be changed by the TRIE support is
1319 the silent ignoring of duplicate alternations which are of the form:
1320
1321  / (DUPE|DUPE) X? (?{ ... }) Y /x
1322
1323 Thus EVAL blocks following a trie may be called a different number of times with
1324 and without the optimisation. With the optimisations dupes will be silently
1325 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1326 the following demonstrates:
1327
1328  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1329
1330 which prints out 'word' three times, but
1331
1332  'words'=~/(word|word|word)(?{ print $1 })S/
1333
1334 which doesnt print it out at all. This is due to other optimisations kicking in.
1335
1336 Example of what happens on a structural level:
1337
1338 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1339
1340    1: CURLYM[1] {1,32767}(18)
1341    5:   BRANCH(8)
1342    6:     EXACT <ac>(16)
1343    8:   BRANCH(11)
1344    9:     EXACT <ad>(16)
1345   11:   BRANCH(14)
1346   12:     EXACT <ab>(16)
1347   16:   SUCCEED(0)
1348   17:   NOTHING(18)
1349   18: END(0)
1350
1351 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1352 and should turn into:
1353
1354    1: CURLYM[1] {1,32767}(18)
1355    5:   TRIE(16)
1356         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1357           <ac>
1358           <ad>
1359           <ab>
1360   16:   SUCCEED(0)
1361   17:   NOTHING(18)
1362   18: END(0)
1363
1364 Cases where tail != last would be like /(?foo|bar)baz/:
1365
1366    1: BRANCH(4)
1367    2:   EXACT <foo>(8)
1368    4: BRANCH(7)
1369    5:   EXACT <bar>(8)
1370    7: TAIL(8)
1371    8: EXACT <baz>(10)
1372   10: END(0)
1373
1374 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1375 and would end up looking like:
1376
1377     1: TRIE(8)
1378       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1379         <foo>
1380         <bar>
1381    7: TAIL(8)
1382    8: EXACT <baz>(10)
1383   10: END(0)
1384
1385     d = uvuni_to_utf8_flags(d, uv, 0);
1386
1387 is the recommended Unicode-aware way of saying
1388
1389     *(d++) = uv;
1390 */
1391
1392 #define TRIE_STORE_REVCHAR(val)                                            \
1393     STMT_START {                                                           \
1394         if (UTF) {                                                         \
1395             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1396             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1397             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1398             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1399             SvPOK_on(zlopp);                                               \
1400             SvUTF8_on(zlopp);                                              \
1401             av_push(revcharmap, zlopp);                                    \
1402         } else {                                                           \
1403             char ooooff = (char)val;                                           \
1404             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1405         }                                                                  \
1406         } STMT_END
1407
1408 #define TRIE_READ_CHAR STMT_START {                                                     \
1409     wordlen++;                                                                          \
1410     if ( UTF ) {                                                                        \
1411         /* if it is UTF then it is either already folded, or does not need folding */   \
1412         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1413     }                                                                                   \
1414     else if (folder == PL_fold_latin1) {                                                \
1415         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1416         if ( foldlen > 0 ) {                                                            \
1417            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1418            foldlen -= len;                                                              \
1419            scan += len;                                                                 \
1420            len = 0;                                                                     \
1421         } else {                                                                        \
1422             len = 1;                                                                    \
1423             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1424             skiplen = UNISKIP(uvc);                                                     \
1425             foldlen -= skiplen;                                                         \
1426             scan = foldbuf + skiplen;                                                   \
1427         }                                                                               \
1428     } else {                                                                            \
1429         /* raw data, will be folded later if needed */                                  \
1430         uvc = (U32)*uc;                                                                 \
1431         len = 1;                                                                        \
1432     }                                                                                   \
1433 } STMT_END
1434
1435
1436
1437 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1438     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1439         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1440         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1441     }                                                           \
1442     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1443     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1444     TRIE_LIST_CUR( state )++;                                   \
1445 } STMT_END
1446
1447 #define TRIE_LIST_NEW(state) STMT_START {                       \
1448     Newxz( trie->states[ state ].trans.list,               \
1449         4, reg_trie_trans_le );                                 \
1450      TRIE_LIST_CUR( state ) = 1;                                \
1451      TRIE_LIST_LEN( state ) = 4;                                \
1452 } STMT_END
1453
1454 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1455     U16 dupe= trie->states[ state ].wordnum;                    \
1456     regnode * const noper_next = regnext( noper );              \
1457                                                                 \
1458     DEBUG_r({                                                   \
1459         /* store the word for dumping */                        \
1460         SV* tmp;                                                \
1461         if (OP(noper) != NOTHING)                               \
1462             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1463         else                                                    \
1464             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1465         av_push( trie_words, tmp );                             \
1466     });                                                         \
1467                                                                 \
1468     curword++;                                                  \
1469     trie->wordinfo[curword].prev   = 0;                         \
1470     trie->wordinfo[curword].len    = wordlen;                   \
1471     trie->wordinfo[curword].accept = state;                     \
1472                                                                 \
1473     if ( noper_next < tail ) {                                  \
1474         if (!trie->jump)                                        \
1475             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1476         trie->jump[curword] = (U16)(noper_next - convert);      \
1477         if (!jumper)                                            \
1478             jumper = noper_next;                                \
1479         if (!nextbranch)                                        \
1480             nextbranch= regnext(cur);                           \
1481     }                                                           \
1482                                                                 \
1483     if ( dupe ) {                                               \
1484         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1485         /* chain, so that when the bits of chain are later    */\
1486         /* linked together, the dups appear in the chain      */\
1487         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1488         trie->wordinfo[dupe].prev = curword;                    \
1489     } else {                                                    \
1490         /* we haven't inserted this word yet.                */ \
1491         trie->states[ state ].wordnum = curword;                \
1492     }                                                           \
1493 } STMT_END
1494
1495
1496 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1497      ( ( base + charid >=  ucharcount                                   \
1498          && base + charid < ubound                                      \
1499          && state == trie->trans[ base - ucharcount + charid ].check    \
1500          && trie->trans[ base - ucharcount + charid ].next )            \
1501            ? trie->trans[ base - ucharcount + charid ].next             \
1502            : ( state==1 ? special : 0 )                                 \
1503       )
1504
1505 #define MADE_TRIE       1
1506 #define MADE_JUMP_TRIE  2
1507 #define MADE_EXACT_TRIE 4
1508
1509 STATIC I32
1510 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1511 {
1512     dVAR;
1513     /* first pass, loop through and scan words */
1514     reg_trie_data *trie;
1515     HV *widecharmap = NULL;
1516     AV *revcharmap = newAV();
1517     regnode *cur;
1518     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1519     STRLEN len = 0;
1520     UV uvc = 0;
1521     U16 curword = 0;
1522     U32 next_alloc = 0;
1523     regnode *jumper = NULL;
1524     regnode *nextbranch = NULL;
1525     regnode *convert = NULL;
1526     U32 *prev_states; /* temp array mapping each state to previous one */
1527     /* we just use folder as a flag in utf8 */
1528     const U8 * folder = NULL;
1529
1530 #ifdef DEBUGGING
1531     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1532     AV *trie_words = NULL;
1533     /* along with revcharmap, this only used during construction but both are
1534      * useful during debugging so we store them in the struct when debugging.
1535      */
1536 #else
1537     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1538     STRLEN trie_charcount=0;
1539 #endif
1540     SV *re_trie_maxbuff;
1541     GET_RE_DEBUG_FLAGS_DECL;
1542
1543     PERL_ARGS_ASSERT_MAKE_TRIE;
1544 #ifndef DEBUGGING
1545     PERL_UNUSED_ARG(depth);
1546 #endif
1547
1548     switch (flags) {
1549         case EXACT: break;
1550         case EXACTFA:
1551         case EXACTFU_SS:
1552         case EXACTFU_TRICKYFOLD:
1553         case EXACTFU: folder = PL_fold_latin1; break;
1554         case EXACTF:  folder = PL_fold; break;
1555         case EXACTFL: folder = PL_fold_locale; break;
1556         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1557     }
1558
1559     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1560     trie->refcount = 1;
1561     trie->startstate = 1;
1562     trie->wordcount = word_count;
1563     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1564     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1565     if (flags == EXACT)
1566         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1567     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1568                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1569
1570     DEBUG_r({
1571         trie_words = newAV();
1572     });
1573
1574     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1575     if (!SvIOK(re_trie_maxbuff)) {
1576         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1577     }
1578     DEBUG_TRIE_COMPILE_r({
1579                 PerlIO_printf( Perl_debug_log,
1580                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1581                   (int)depth * 2 + 2, "",
1582                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1583                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1584                   (int)depth);
1585     });
1586
1587    /* Find the node we are going to overwrite */
1588     if ( first == startbranch && OP( last ) != BRANCH ) {
1589         /* whole branch chain */
1590         convert = first;
1591     } else {
1592         /* branch sub-chain */
1593         convert = NEXTOPER( first );
1594     }
1595
1596     /*  -- First loop and Setup --
1597
1598        We first traverse the branches and scan each word to determine if it
1599        contains widechars, and how many unique chars there are, this is
1600        important as we have to build a table with at least as many columns as we
1601        have unique chars.
1602
1603        We use an array of integers to represent the character codes 0..255
1604        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1605        native representation of the character value as the key and IV's for the
1606        coded index.
1607
1608        *TODO* If we keep track of how many times each character is used we can
1609        remap the columns so that the table compression later on is more
1610        efficient in terms of memory by ensuring the most common value is in the
1611        middle and the least common are on the outside.  IMO this would be better
1612        than a most to least common mapping as theres a decent chance the most
1613        common letter will share a node with the least common, meaning the node
1614        will not be compressible. With a middle is most common approach the worst
1615        case is when we have the least common nodes twice.
1616
1617      */
1618
1619     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1620         regnode *noper = NEXTOPER( cur );
1621         const U8 *uc = (U8*)STRING( noper );
1622         const U8 *e  = uc + STR_LEN( noper );
1623         STRLEN foldlen = 0;
1624         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1625         STRLEN skiplen = 0;
1626         const U8 *scan = (U8*)NULL;
1627         U32 wordlen      = 0;         /* required init */
1628         STRLEN chars = 0;
1629         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1630
1631         if (OP(noper) == NOTHING) {
1632             regnode *noper_next= regnext(noper);
1633             if (noper_next != tail && OP(noper_next) == flags) {
1634                 noper = noper_next;
1635                 uc= (U8*)STRING(noper);
1636                 e= uc + STR_LEN(noper);
1637                 trie->minlen= STR_LEN(noper);
1638             } else {
1639                 trie->minlen= 0;
1640                 continue;
1641             }
1642         }
1643
1644         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1645             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1646                                           regardless of encoding */
1647             if (OP( noper ) == EXACTFU_SS) {
1648                 /* false positives are ok, so just set this */
1649                 TRIE_BITMAP_SET(trie,0xDF);
1650             }
1651         }
1652         for ( ; uc < e ; uc += len ) {
1653             TRIE_CHARCOUNT(trie)++;
1654             TRIE_READ_CHAR;
1655             chars++;
1656             if ( uvc < 256 ) {
1657                 if ( folder ) {
1658                     U8 folded= folder[ (U8) uvc ];
1659                     if ( !trie->charmap[ folded ] ) {
1660                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1661                         TRIE_STORE_REVCHAR( folded );
1662                     }
1663                 }
1664                 if ( !trie->charmap[ uvc ] ) {
1665                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1666                     TRIE_STORE_REVCHAR( uvc );
1667                 }
1668                 if ( set_bit ) {
1669                     /* store the codepoint in the bitmap, and its folded
1670                      * equivalent. */
1671                     TRIE_BITMAP_SET(trie, uvc);
1672
1673                     /* store the folded codepoint */
1674                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1675
1676                     if ( !UTF ) {
1677                         /* store first byte of utf8 representation of
1678                            variant codepoints */
1679                         if (! UNI_IS_INVARIANT(uvc)) {
1680                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1681                         }
1682                     }
1683                     set_bit = 0; /* We've done our bit :-) */
1684                 }
1685             } else {
1686                 SV** svpp;
1687                 if ( !widecharmap )
1688                     widecharmap = newHV();
1689
1690                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1691
1692                 if ( !svpp )
1693                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1694
1695                 if ( !SvTRUE( *svpp ) ) {
1696                     sv_setiv( *svpp, ++trie->uniquecharcount );
1697                     TRIE_STORE_REVCHAR(uvc);
1698                 }
1699             }
1700         }
1701         if( cur == first ) {
1702             trie->minlen = chars;
1703             trie->maxlen = chars;
1704         } else if (chars < trie->minlen) {
1705             trie->minlen = chars;
1706         } else if (chars > trie->maxlen) {
1707             trie->maxlen = chars;
1708         }
1709         if (OP( noper ) == EXACTFU_SS) {
1710             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1711             if (trie->minlen > 1)
1712                 trie->minlen= 1;
1713         }
1714         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1715             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1716              *                - We assume that any such sequence might match a 2 byte string */
1717             if (trie->minlen > 2 )
1718                 trie->minlen= 2;
1719         }
1720
1721     } /* end first pass */
1722     DEBUG_TRIE_COMPILE_r(
1723         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1724                 (int)depth * 2 + 2,"",
1725                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1726                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1727                 (int)trie->minlen, (int)trie->maxlen )
1728     );
1729
1730     /*
1731         We now know what we are dealing with in terms of unique chars and
1732         string sizes so we can calculate how much memory a naive
1733         representation using a flat table  will take. If it's over a reasonable
1734         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1735         conservative but potentially much slower representation using an array
1736         of lists.
1737
1738         At the end we convert both representations into the same compressed
1739         form that will be used in regexec.c for matching with. The latter
1740         is a form that cannot be used to construct with but has memory
1741         properties similar to the list form and access properties similar
1742         to the table form making it both suitable for fast searches and
1743         small enough that its feasable to store for the duration of a program.
1744
1745         See the comment in the code where the compressed table is produced
1746         inplace from the flat tabe representation for an explanation of how
1747         the compression works.
1748
1749     */
1750
1751
1752     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1753     prev_states[1] = 0;
1754
1755     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1756         /*
1757             Second Pass -- Array Of Lists Representation
1758
1759             Each state will be represented by a list of charid:state records
1760             (reg_trie_trans_le) the first such element holds the CUR and LEN
1761             points of the allocated array. (See defines above).
1762
1763             We build the initial structure using the lists, and then convert
1764             it into the compressed table form which allows faster lookups
1765             (but cant be modified once converted).
1766         */
1767
1768         STRLEN transcount = 1;
1769
1770         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1771             "%*sCompiling trie using list compiler\n",
1772             (int)depth * 2 + 2, ""));
1773
1774         trie->states = (reg_trie_state *)
1775             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1776                                   sizeof(reg_trie_state) );
1777         TRIE_LIST_NEW(1);
1778         next_alloc = 2;
1779
1780         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1781
1782             regnode *noper   = NEXTOPER( cur );
1783             U8 *uc           = (U8*)STRING( noper );
1784             const U8 *e      = uc + STR_LEN( noper );
1785             U32 state        = 1;         /* required init */
1786             U16 charid       = 0;         /* sanity init */
1787             U8 *scan         = (U8*)NULL; /* sanity init */
1788             STRLEN foldlen   = 0;         /* required init */
1789             U32 wordlen      = 0;         /* required init */
1790             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1791             STRLEN skiplen   = 0;
1792
1793             if (OP(noper) == NOTHING) {
1794                 regnode *noper_next= regnext(noper);
1795                 if (noper_next != tail && OP(noper_next) == flags) {
1796                     noper = noper_next;
1797                     uc= (U8*)STRING(noper);
1798                     e= uc + STR_LEN(noper);
1799                 }
1800             }
1801
1802             if (OP(noper) != NOTHING) {
1803                 for ( ; uc < e ; uc += len ) {
1804
1805                     TRIE_READ_CHAR;
1806
1807                     if ( uvc < 256 ) {
1808                         charid = trie->charmap[ uvc ];
1809                     } else {
1810                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1811                         if ( !svpp ) {
1812                             charid = 0;
1813                         } else {
1814                             charid=(U16)SvIV( *svpp );
1815                         }
1816                     }
1817                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1818                     if ( charid ) {
1819
1820                         U16 check;
1821                         U32 newstate = 0;
1822
1823                         charid--;
1824                         if ( !trie->states[ state ].trans.list ) {
1825                             TRIE_LIST_NEW( state );
1826                         }
1827                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1828                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1829                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1830                                 break;
1831                             }
1832                         }
1833                         if ( ! newstate ) {
1834                             newstate = next_alloc++;
1835                             prev_states[newstate] = state;
1836                             TRIE_LIST_PUSH( state, charid, newstate );
1837                             transcount++;
1838                         }
1839                         state = newstate;
1840                     } else {
1841                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1842                     }
1843                 }
1844             }
1845             TRIE_HANDLE_WORD(state);
1846
1847         } /* end second pass */
1848
1849         /* next alloc is the NEXT state to be allocated */
1850         trie->statecount = next_alloc;
1851         trie->states = (reg_trie_state *)
1852             PerlMemShared_realloc( trie->states,
1853                                    next_alloc
1854                                    * sizeof(reg_trie_state) );
1855
1856         /* and now dump it out before we compress it */
1857         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1858                                                          revcharmap, next_alloc,
1859                                                          depth+1)
1860         );
1861
1862         trie->trans = (reg_trie_trans *)
1863             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1864         {
1865             U32 state;
1866             U32 tp = 0;
1867             U32 zp = 0;
1868
1869
1870             for( state=1 ; state < next_alloc ; state ++ ) {
1871                 U32 base=0;
1872
1873                 /*
1874                 DEBUG_TRIE_COMPILE_MORE_r(
1875                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1876                 );
1877                 */
1878
1879                 if (trie->states[state].trans.list) {
1880                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1881                     U16 maxid=minid;
1882                     U16 idx;
1883
1884                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1885                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1886                         if ( forid < minid ) {
1887                             minid=forid;
1888                         } else if ( forid > maxid ) {
1889                             maxid=forid;
1890                         }
1891                     }
1892                     if ( transcount < tp + maxid - minid + 1) {
1893                         transcount *= 2;
1894                         trie->trans = (reg_trie_trans *)
1895                             PerlMemShared_realloc( trie->trans,
1896                                                      transcount
1897                                                      * sizeof(reg_trie_trans) );
1898                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1899                     }
1900                     base = trie->uniquecharcount + tp - minid;
1901                     if ( maxid == minid ) {
1902                         U32 set = 0;
1903                         for ( ; zp < tp ; zp++ ) {
1904                             if ( ! trie->trans[ zp ].next ) {
1905                                 base = trie->uniquecharcount + zp - minid;
1906                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1907                                 trie->trans[ zp ].check = state;
1908                                 set = 1;
1909                                 break;
1910                             }
1911                         }
1912                         if ( !set ) {
1913                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1914                             trie->trans[ tp ].check = state;
1915                             tp++;
1916                             zp = tp;
1917                         }
1918                     } else {
1919                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1920                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1921                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1922                             trie->trans[ tid ].check = state;
1923                         }
1924                         tp += ( maxid - minid + 1 );
1925                     }
1926                     Safefree(trie->states[ state ].trans.list);
1927                 }
1928                 /*
1929                 DEBUG_TRIE_COMPILE_MORE_r(
1930                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1931                 );
1932                 */
1933                 trie->states[ state ].trans.base=base;
1934             }
1935             trie->lasttrans = tp + 1;
1936         }
1937     } else {
1938         /*
1939            Second Pass -- Flat Table Representation.
1940
1941            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1942            We know that we will need Charcount+1 trans at most to store the data
1943            (one row per char at worst case) So we preallocate both structures
1944            assuming worst case.
1945
1946            We then construct the trie using only the .next slots of the entry
1947            structs.
1948
1949            We use the .check field of the first entry of the node temporarily to
1950            make compression both faster and easier by keeping track of how many non
1951            zero fields are in the node.
1952
1953            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1954            transition.
1955
1956            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1957            number representing the first entry of the node, and state as a
1958            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1959            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1960            are 2 entrys per node. eg:
1961
1962              A B       A B
1963           1. 2 4    1. 3 7
1964           2. 0 3    3. 0 5
1965           3. 0 0    5. 0 0
1966           4. 0 0    7. 0 0
1967
1968            The table is internally in the right hand, idx form. However as we also
1969            have to deal with the states array which is indexed by nodenum we have to
1970            use TRIE_NODENUM() to convert.
1971
1972         */
1973         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1974             "%*sCompiling trie using table compiler\n",
1975             (int)depth * 2 + 2, ""));
1976
1977         trie->trans = (reg_trie_trans *)
1978             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1979                                   * trie->uniquecharcount + 1,
1980                                   sizeof(reg_trie_trans) );
1981         trie->states = (reg_trie_state *)
1982             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1983                                   sizeof(reg_trie_state) );
1984         next_alloc = trie->uniquecharcount + 1;
1985
1986
1987         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1988
1989             regnode *noper   = NEXTOPER( cur );
1990             const U8 *uc     = (U8*)STRING( noper );
1991             const U8 *e      = uc + STR_LEN( noper );
1992
1993             U32 state        = 1;         /* required init */
1994
1995             U16 charid       = 0;         /* sanity init */
1996             U32 accept_state = 0;         /* sanity init */
1997             U8 *scan         = (U8*)NULL; /* sanity init */
1998
1999             STRLEN foldlen   = 0;         /* required init */
2000             U32 wordlen      = 0;         /* required init */
2001             STRLEN skiplen   = 0;
2002             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2003
2004             if (OP(noper) == NOTHING) {
2005                 regnode *noper_next= regnext(noper);
2006                 if (noper_next != tail && OP(noper_next) == flags) {
2007                     noper = noper_next;
2008                     uc= (U8*)STRING(noper);
2009                     e= uc + STR_LEN(noper);
2010                 }
2011             }
2012
2013             if ( OP(noper) != NOTHING ) {
2014                 for ( ; uc < e ; uc += len ) {
2015
2016                     TRIE_READ_CHAR;
2017
2018                     if ( uvc < 256 ) {
2019                         charid = trie->charmap[ uvc ];
2020                     } else {
2021                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2022                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2023                     }
2024                     if ( charid ) {
2025                         charid--;
2026                         if ( !trie->trans[ state + charid ].next ) {
2027                             trie->trans[ state + charid ].next = next_alloc;
2028                             trie->trans[ state ].check++;
2029                             prev_states[TRIE_NODENUM(next_alloc)]
2030                                     = TRIE_NODENUM(state);
2031                             next_alloc += trie->uniquecharcount;
2032                         }
2033                         state = trie->trans[ state + charid ].next;
2034                     } else {
2035                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2036                     }
2037                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2038                 }
2039             }
2040             accept_state = TRIE_NODENUM( state );
2041             TRIE_HANDLE_WORD(accept_state);
2042
2043         } /* end second pass */
2044
2045         /* and now dump it out before we compress it */
2046         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2047                                                           revcharmap,
2048                                                           next_alloc, depth+1));
2049
2050         {
2051         /*
2052            * Inplace compress the table.*
2053
2054            For sparse data sets the table constructed by the trie algorithm will
2055            be mostly 0/FAIL transitions or to put it another way mostly empty.
2056            (Note that leaf nodes will not contain any transitions.)
2057
2058            This algorithm compresses the tables by eliminating most such
2059            transitions, at the cost of a modest bit of extra work during lookup:
2060
2061            - Each states[] entry contains a .base field which indicates the
2062            index in the state[] array wheres its transition data is stored.
2063
2064            - If .base is 0 there are no valid transitions from that node.
2065
2066            - If .base is nonzero then charid is added to it to find an entry in
2067            the trans array.
2068
2069            -If trans[states[state].base+charid].check!=state then the
2070            transition is taken to be a 0/Fail transition. Thus if there are fail
2071            transitions at the front of the node then the .base offset will point
2072            somewhere inside the previous nodes data (or maybe even into a node
2073            even earlier), but the .check field determines if the transition is
2074            valid.
2075
2076            XXX - wrong maybe?
2077            The following process inplace converts the table to the compressed
2078            table: We first do not compress the root node 1,and mark all its
2079            .check pointers as 1 and set its .base pointer as 1 as well. This
2080            allows us to do a DFA construction from the compressed table later,
2081            and ensures that any .base pointers we calculate later are greater
2082            than 0.
2083
2084            - We set 'pos' to indicate the first entry of the second node.
2085
2086            - We then iterate over the columns of the node, finding the first and
2087            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2088            and set the .check pointers accordingly, and advance pos
2089            appropriately and repreat for the next node. Note that when we copy
2090            the next pointers we have to convert them from the original
2091            NODEIDX form to NODENUM form as the former is not valid post
2092            compression.
2093
2094            - If a node has no transitions used we mark its base as 0 and do not
2095            advance the pos pointer.
2096
2097            - If a node only has one transition we use a second pointer into the
2098            structure to fill in allocated fail transitions from other states.
2099            This pointer is independent of the main pointer and scans forward
2100            looking for null transitions that are allocated to a state. When it
2101            finds one it writes the single transition into the "hole".  If the
2102            pointer doesnt find one the single transition is appended as normal.
2103
2104            - Once compressed we can Renew/realloc the structures to release the
2105            excess space.
2106
2107            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2108            specifically Fig 3.47 and the associated pseudocode.
2109
2110            demq
2111         */
2112         const U32 laststate = TRIE_NODENUM( next_alloc );
2113         U32 state, charid;
2114         U32 pos = 0, zp=0;
2115         trie->statecount = laststate;
2116
2117         for ( state = 1 ; state < laststate ; state++ ) {
2118             U8 flag = 0;
2119             const U32 stateidx = TRIE_NODEIDX( state );
2120             const U32 o_used = trie->trans[ stateidx ].check;
2121             U32 used = trie->trans[ stateidx ].check;
2122             trie->trans[ stateidx ].check = 0;
2123
2124             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2125                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2126                     if ( trie->trans[ stateidx + charid ].next ) {
2127                         if (o_used == 1) {
2128                             for ( ; zp < pos ; zp++ ) {
2129                                 if ( ! trie->trans[ zp ].next ) {
2130                                     break;
2131                                 }
2132                             }
2133                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2134                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2135                             trie->trans[ zp ].check = state;
2136                             if ( ++zp > pos ) pos = zp;
2137                             break;
2138                         }
2139                         used--;
2140                     }
2141                     if ( !flag ) {
2142                         flag = 1;
2143                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2144                     }
2145                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2146                     trie->trans[ pos ].check = state;
2147                     pos++;
2148                 }
2149             }
2150         }
2151         trie->lasttrans = pos + 1;
2152         trie->states = (reg_trie_state *)
2153             PerlMemShared_realloc( trie->states, laststate
2154                                    * sizeof(reg_trie_state) );
2155         DEBUG_TRIE_COMPILE_MORE_r(
2156                 PerlIO_printf( Perl_debug_log,
2157                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2158                     (int)depth * 2 + 2,"",
2159                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2160                     (IV)next_alloc,
2161                     (IV)pos,
2162                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2163             );
2164
2165         } /* end table compress */
2166     }
2167     DEBUG_TRIE_COMPILE_MORE_r(
2168             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2169                 (int)depth * 2 + 2, "",
2170                 (UV)trie->statecount,
2171                 (UV)trie->lasttrans)
2172     );
2173     /* resize the trans array to remove unused space */
2174     trie->trans = (reg_trie_trans *)
2175         PerlMemShared_realloc( trie->trans, trie->lasttrans
2176                                * sizeof(reg_trie_trans) );
2177
2178     {   /* Modify the program and insert the new TRIE node */
2179         U8 nodetype =(U8)(flags & 0xFF);
2180         char *str=NULL;
2181
2182 #ifdef DEBUGGING
2183         regnode *optimize = NULL;
2184 #ifdef RE_TRACK_PATTERN_OFFSETS
2185
2186         U32 mjd_offset = 0;
2187         U32 mjd_nodelen = 0;
2188 #endif /* RE_TRACK_PATTERN_OFFSETS */
2189 #endif /* DEBUGGING */
2190         /*
2191            This means we convert either the first branch or the first Exact,
2192            depending on whether the thing following (in 'last') is a branch
2193            or not and whther first is the startbranch (ie is it a sub part of
2194            the alternation or is it the whole thing.)
2195            Assuming its a sub part we convert the EXACT otherwise we convert
2196            the whole branch sequence, including the first.
2197          */
2198         /* Find the node we are going to overwrite */
2199         if ( first != startbranch || OP( last ) == BRANCH ) {
2200             /* branch sub-chain */
2201             NEXT_OFF( first ) = (U16)(last - first);
2202 #ifdef RE_TRACK_PATTERN_OFFSETS
2203             DEBUG_r({
2204                 mjd_offset= Node_Offset((convert));
2205                 mjd_nodelen= Node_Length((convert));
2206             });
2207 #endif
2208             /* whole branch chain */
2209         }
2210 #ifdef RE_TRACK_PATTERN_OFFSETS
2211         else {
2212             DEBUG_r({
2213                 const  regnode *nop = NEXTOPER( convert );
2214                 mjd_offset= Node_Offset((nop));
2215                 mjd_nodelen= Node_Length((nop));
2216             });
2217         }
2218         DEBUG_OPTIMISE_r(
2219             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2220                 (int)depth * 2 + 2, "",
2221                 (UV)mjd_offset, (UV)mjd_nodelen)
2222         );
2223 #endif
2224         /* But first we check to see if there is a common prefix we can
2225            split out as an EXACT and put in front of the TRIE node.  */
2226         trie->startstate= 1;
2227         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2228             U32 state;
2229             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2230                 U32 ofs = 0;
2231                 I32 idx = -1;
2232                 U32 count = 0;
2233                 const U32 base = trie->states[ state ].trans.base;
2234
2235                 if ( trie->states[state].wordnum )
2236                         count = 1;
2237
2238                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2239                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2240                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2241                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2242                     {
2243                         if ( ++count > 1 ) {
2244                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2245                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2246                             if ( state == 1 ) break;
2247                             if ( count == 2 ) {
2248                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2249                                 DEBUG_OPTIMISE_r(
2250                                     PerlIO_printf(Perl_debug_log,
2251                                         "%*sNew Start State=%"UVuf" Class: [",
2252                                         (int)depth * 2 + 2, "",
2253                                         (UV)state));
2254                                 if (idx >= 0) {
2255                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2256                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2257
2258                                     TRIE_BITMAP_SET(trie,*ch);
2259                                     if ( folder )
2260                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2261                                     DEBUG_OPTIMISE_r(
2262                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2263                                     );
2264                                 }
2265                             }
2266                             TRIE_BITMAP_SET(trie,*ch);
2267                             if ( folder )
2268                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2269                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2270                         }
2271                         idx = ofs;
2272                     }
2273                 }
2274                 if ( count == 1 ) {
2275                     SV **tmp = av_fetch( revcharmap, idx, 0);
2276                     STRLEN len;
2277                     char *ch = SvPV( *tmp, len );
2278                     DEBUG_OPTIMISE_r({
2279                         SV *sv=sv_newmortal();
2280                         PerlIO_printf( Perl_debug_log,
2281                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2282                             (int)depth * 2 + 2, "",
2283                             (UV)state, (UV)idx,
2284                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2285                                 PL_colors[0], PL_colors[1],
2286                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2287                                 PERL_PV_ESCAPE_FIRSTCHAR
2288                             )
2289                         );
2290                     });
2291                     if ( state==1 ) {
2292                         OP( convert ) = nodetype;
2293                         str=STRING(convert);
2294                         STR_LEN(convert)=0;
2295                     }
2296                     STR_LEN(convert) += len;
2297                     while (len--)
2298                         *str++ = *ch++;
2299                 } else {
2300 #ifdef DEBUGGING
2301                     if (state>1)
2302                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2303 #endif
2304                     break;
2305                 }
2306             }
2307             trie->prefixlen = (state-1);
2308             if (str) {
2309                 regnode *n = convert+NODE_SZ_STR(convert);
2310                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2311                 trie->startstate = state;
2312                 trie->minlen -= (state - 1);
2313                 trie->maxlen -= (state - 1);
2314 #ifdef DEBUGGING
2315                /* At least the UNICOS C compiler choked on this
2316                 * being argument to DEBUG_r(), so let's just have
2317                 * it right here. */
2318                if (
2319 #ifdef PERL_EXT_RE_BUILD
2320                    1
2321 #else
2322                    DEBUG_r_TEST
2323 #endif
2324                    ) {
2325                    regnode *fix = convert;
2326                    U32 word = trie->wordcount;
2327                    mjd_nodelen++;
2328                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2329                    while( ++fix < n ) {
2330                        Set_Node_Offset_Length(fix, 0, 0);
2331                    }
2332                    while (word--) {
2333                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2334                        if (tmp) {
2335                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2336                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2337                            else
2338                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2339                        }
2340                    }
2341                }
2342 #endif
2343                 if (trie->maxlen) {
2344                     convert = n;
2345                 } else {
2346                     NEXT_OFF(convert) = (U16)(tail - convert);
2347                     DEBUG_r(optimize= n);
2348                 }
2349             }
2350         }
2351         if (!jumper)
2352             jumper = last;
2353         if ( trie->maxlen ) {
2354             NEXT_OFF( convert ) = (U16)(tail - convert);
2355             ARG_SET( convert, data_slot );
2356             /* Store the offset to the first unabsorbed branch in
2357                jump[0], which is otherwise unused by the jump logic.
2358                We use this when dumping a trie and during optimisation. */
2359             if (trie->jump)
2360                 trie->jump[0] = (U16)(nextbranch - convert);
2361
2362             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2363              *   and there is a bitmap
2364              *   and the first "jump target" node we found leaves enough room
2365              * then convert the TRIE node into a TRIEC node, with the bitmap
2366              * embedded inline in the opcode - this is hypothetically faster.
2367              */
2368             if ( !trie->states[trie->startstate].wordnum
2369                  && trie->bitmap
2370                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2371             {
2372                 OP( convert ) = TRIEC;
2373                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2374                 PerlMemShared_free(trie->bitmap);
2375                 trie->bitmap= NULL;
2376             } else
2377                 OP( convert ) = TRIE;
2378
2379             /* store the type in the flags */
2380             convert->flags = nodetype;
2381             DEBUG_r({
2382             optimize = convert
2383                       + NODE_STEP_REGNODE
2384                       + regarglen[ OP( convert ) ];
2385             });
2386             /* XXX We really should free up the resource in trie now,
2387                    as we won't use them - (which resources?) dmq */
2388         }
2389         /* needed for dumping*/
2390         DEBUG_r(if (optimize) {
2391             regnode *opt = convert;
2392
2393             while ( ++opt < optimize) {
2394                 Set_Node_Offset_Length(opt,0,0);
2395             }
2396             /*
2397                 Try to clean up some of the debris left after the
2398                 optimisation.
2399              */
2400             while( optimize < jumper ) {
2401                 mjd_nodelen += Node_Length((optimize));
2402                 OP( optimize ) = OPTIMIZED;
2403                 Set_Node_Offset_Length(optimize,0,0);
2404                 optimize++;
2405             }
2406             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2407         });
2408     } /* end node insert */
2409     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2410
2411     /*  Finish populating the prev field of the wordinfo array.  Walk back
2412      *  from each accept state until we find another accept state, and if
2413      *  so, point the first word's .prev field at the second word. If the
2414      *  second already has a .prev field set, stop now. This will be the
2415      *  case either if we've already processed that word's accept state,
2416      *  or that state had multiple words, and the overspill words were
2417      *  already linked up earlier.
2418      */
2419     {
2420         U16 word;
2421         U32 state;
2422         U16 prev;
2423
2424         for (word=1; word <= trie->wordcount; word++) {
2425             prev = 0;
2426             if (trie->wordinfo[word].prev)
2427                 continue;
2428             state = trie->wordinfo[word].accept;
2429             while (state) {
2430                 state = prev_states[state];
2431                 if (!state)
2432                     break;
2433                 prev = trie->states[state].wordnum;
2434                 if (prev)
2435                     break;
2436             }
2437             trie->wordinfo[word].prev = prev;
2438         }
2439         Safefree(prev_states);
2440     }
2441
2442
2443     /* and now dump out the compressed format */
2444     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2445
2446     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2447 #ifdef DEBUGGING
2448     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2449     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2450 #else
2451     SvREFCNT_dec(revcharmap);
2452 #endif
2453     return trie->jump
2454            ? MADE_JUMP_TRIE
2455            : trie->startstate>1
2456              ? MADE_EXACT_TRIE
2457              : MADE_TRIE;
2458 }
2459
2460 STATIC void
2461 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2462 {
2463 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2464
2465    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2466    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2467    ISBN 0-201-10088-6
2468
2469    We find the fail state for each state in the trie, this state is the longest proper
2470    suffix of the current state's 'word' that is also a proper prefix of another word in our
2471    trie. State 1 represents the word '' and is thus the default fail state. This allows
2472    the DFA not to have to restart after its tried and failed a word at a given point, it
2473    simply continues as though it had been matching the other word in the first place.
2474    Consider
2475       'abcdgu'=~/abcdefg|cdgu/
2476    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2477    fail, which would bring us to the state representing 'd' in the second word where we would
2478    try 'g' and succeed, proceeding to match 'cdgu'.
2479  */
2480  /* add a fail transition */
2481     const U32 trie_offset = ARG(source);
2482     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2483     U32 *q;
2484     const U32 ucharcount = trie->uniquecharcount;
2485     const U32 numstates = trie->statecount;
2486     const U32 ubound = trie->lasttrans + ucharcount;
2487     U32 q_read = 0;
2488     U32 q_write = 0;
2489     U32 charid;
2490     U32 base = trie->states[ 1 ].trans.base;
2491     U32 *fail;
2492     reg_ac_data *aho;
2493     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2494     GET_RE_DEBUG_FLAGS_DECL;
2495
2496     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2497 #ifndef DEBUGGING
2498     PERL_UNUSED_ARG(depth);
2499 #endif
2500
2501
2502     ARG_SET( stclass, data_slot );
2503     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2504     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2505     aho->trie=trie_offset;
2506     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2507     Copy( trie->states, aho->states, numstates, reg_trie_state );
2508     Newxz( q, numstates, U32);
2509     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2510     aho->refcount = 1;
2511     fail = aho->fail;
2512     /* initialize fail[0..1] to be 1 so that we always have
2513        a valid final fail state */
2514     fail[ 0 ] = fail[ 1 ] = 1;
2515
2516     for ( charid = 0; charid < ucharcount ; charid++ ) {
2517         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2518         if ( newstate ) {
2519             q[ q_write ] = newstate;
2520             /* set to point at the root */
2521             fail[ q[ q_write++ ] ]=1;
2522         }
2523     }
2524     while ( q_read < q_write) {
2525         const U32 cur = q[ q_read++ % numstates ];
2526         base = trie->states[ cur ].trans.base;
2527
2528         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2529             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2530             if (ch_state) {
2531                 U32 fail_state = cur;
2532                 U32 fail_base;
2533                 do {
2534                     fail_state = fail[ fail_state ];
2535                     fail_base = aho->states[ fail_state ].trans.base;
2536                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2537
2538                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2539                 fail[ ch_state ] = fail_state;
2540                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2541                 {
2542                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2543                 }
2544                 q[ q_write++ % numstates] = ch_state;
2545             }
2546         }
2547     }
2548     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2549        when we fail in state 1, this allows us to use the
2550        charclass scan to find a valid start char. This is based on the principle
2551        that theres a good chance the string being searched contains lots of stuff
2552        that cant be a start char.
2553      */
2554     fail[ 0 ] = fail[ 1 ] = 0;
2555     DEBUG_TRIE_COMPILE_r({
2556         PerlIO_printf(Perl_debug_log,
2557                       "%*sStclass Failtable (%"UVuf" states): 0",
2558                       (int)(depth * 2), "", (UV)numstates
2559         );
2560         for( q_read=1; q_read<numstates; q_read++ ) {
2561             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2562         }
2563         PerlIO_printf(Perl_debug_log, "\n");
2564     });
2565     Safefree(q);
2566     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2567 }
2568
2569
2570 /*
2571  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2572  * These need to be revisited when a newer toolchain becomes available.
2573  */
2574 #if defined(__sparc64__) && defined(__GNUC__)
2575 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2576 #       undef  SPARC64_GCC_WORKAROUND
2577 #       define SPARC64_GCC_WORKAROUND 1
2578 #   endif
2579 #endif
2580
2581 #define DEBUG_PEEP(str,scan,depth) \
2582     DEBUG_OPTIMISE_r({if (scan){ \
2583        SV * const mysv=sv_newmortal(); \
2584        regnode *Next = regnext(scan); \
2585        regprop(RExC_rx, mysv, scan); \
2586        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2587        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2588        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2589    }});
2590
2591
2592 /* The below joins as many adjacent EXACTish nodes as possible into a single
2593  * one.  The regop may be changed if the node(s) contain certain sequences that
2594  * require special handling.  The joining is only done if:
2595  * 1) there is room in the current conglomerated node to entirely contain the
2596  *    next one.
2597  * 2) they are the exact same node type
2598  *
2599  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2600  * these get optimized out
2601  *
2602  * If a node is to match under /i (folded), the number of characters it matches
2603  * can be different than its character length if it contains a multi-character
2604  * fold.  *min_subtract is set to the total delta of the input nodes.
2605  *
2606  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2607  * and contains LATIN SMALL LETTER SHARP S
2608  *
2609  * This is as good a place as any to discuss the design of handling these
2610  * multi-character fold sequences.  It's been wrong in Perl for a very long
2611  * time.  There are three code points in Unicode whose multi-character folds
2612  * were long ago discovered to mess things up.  The previous designs for
2613  * dealing with these involved assigning a special node for them.  This
2614  * approach doesn't work, as evidenced by this example:
2615  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2616  * Both these fold to "sss", but if the pattern is parsed to create a node that
2617  * would match just the \xDF, it won't be able to handle the case where a
2618  * successful match would have to cross the node's boundary.  The new approach
2619  * that hopefully generally solves the problem generates an EXACTFU_SS node
2620  * that is "sss".
2621  *
2622  * It turns out that there are problems with all multi-character folds, and not
2623  * just these three.  Now the code is general, for all such cases, but the
2624  * three still have some special handling.  The approach taken is:
2625  * 1)   This routine examines each EXACTFish node that could contain multi-
2626  *      character fold sequences.  It returns in *min_subtract how much to
2627  *      subtract from the the actual length of the string to get a real minimum
2628  *      match length; it is 0 if there are no multi-char folds.  This delta is
2629  *      used by the caller to adjust the min length of the match, and the delta
2630  *      between min and max, so that the optimizer doesn't reject these
2631  *      possibilities based on size constraints.
2632  * 2)   Certain of these sequences require special handling by the trie code,
2633  *      so, if found, this code changes the joined node type to special ops:
2634  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2635  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2636  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2637  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2638  *      there is a possible fold length change.  That means that a regular
2639  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2640  *      with length changes, and so can be processed faster.  regexec.c takes
2641  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2642  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2643  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2644  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2645  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2646  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2647  *      possibilities for the non-UTF8 patterns are quite simple, except for
2648  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2649  *      members of a fold-pair, and arrays are set up for all of them so that
2650  *      the other member of the pair can be found quickly.  Code elsewhere in
2651  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2652  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2653  *      described in the next item.
2654  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2655  *      'ss' or not is not knowable at compile time.  It will match iff the
2656  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2657  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2658  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2659  *      described in item 3).  An assumption that the optimizer part of
2660  *      regexec.c (probably unwittingly) makes is that a character in the
2661  *      pattern corresponds to at most a single character in the target string.
2662  *      (And I do mean character, and not byte here, unlike other parts of the
2663  *      documentation that have never been updated to account for multibyte
2664  *      Unicode.)  This assumption is wrong only in this case, as all other
2665  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2666  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2667  *      reluctant to try to change this assumption, so instead the code punts.
2668  *      This routine examines EXACTF nodes for the sharp s, and returns a
2669  *      boolean indicating whether or not the node is an EXACTF node that
2670  *      contains a sharp s.  When it is true, the caller sets a flag that later
2671  *      causes the optimizer in this file to not set values for the floating
2672  *      and fixed string lengths, and thus avoids the optimizer code in
2673  *      regexec.c that makes the invalid assumption.  Thus, there is no
2674  *      optimization based on string lengths for EXACTF nodes that contain the
2675  *      sharp s.  This only happens for /id rules (which means the pattern
2676  *      isn't in UTF-8).
2677  */
2678
2679 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2680     if (PL_regkind[OP(scan)] == EXACT) \
2681         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2682
2683 STATIC U32
2684 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2685     /* Merge several consecutive EXACTish nodes into one. */
2686     regnode *n = regnext(scan);
2687     U32 stringok = 1;
2688     regnode *next = scan + NODE_SZ_STR(scan);
2689     U32 merged = 0;
2690     U32 stopnow = 0;
2691 #ifdef DEBUGGING
2692     regnode *stop = scan;
2693     GET_RE_DEBUG_FLAGS_DECL;
2694 #else
2695     PERL_UNUSED_ARG(depth);
2696 #endif
2697
2698     PERL_ARGS_ASSERT_JOIN_EXACT;
2699 #ifndef EXPERIMENTAL_INPLACESCAN
2700     PERL_UNUSED_ARG(flags);
2701     PERL_UNUSED_ARG(val);
2702 #endif
2703     DEBUG_PEEP("join",scan,depth);
2704
2705     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2706      * EXACT ones that are mergeable to the current one. */
2707     while (n
2708            && (PL_regkind[OP(n)] == NOTHING
2709                || (stringok && OP(n) == OP(scan)))
2710            && NEXT_OFF(n)
2711            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2712     {
2713
2714         if (OP(n) == TAIL || n > next)
2715             stringok = 0;
2716         if (PL_regkind[OP(n)] == NOTHING) {
2717             DEBUG_PEEP("skip:",n,depth);
2718             NEXT_OFF(scan) += NEXT_OFF(n);
2719             next = n + NODE_STEP_REGNODE;
2720 #ifdef DEBUGGING
2721             if (stringok)
2722                 stop = n;
2723 #endif
2724             n = regnext(n);
2725         }
2726         else if (stringok) {
2727             const unsigned int oldl = STR_LEN(scan);
2728             regnode * const nnext = regnext(n);
2729
2730             /* XXX I (khw) kind of doubt that this works on platforms where
2731              * U8_MAX is above 255 because of lots of other assumptions */
2732             if (oldl + STR_LEN(n) > U8_MAX)
2733                 break;
2734
2735             DEBUG_PEEP("merg",n,depth);
2736             merged++;
2737
2738             NEXT_OFF(scan) += NEXT_OFF(n);
2739             STR_LEN(scan) += STR_LEN(n);
2740             next = n + NODE_SZ_STR(n);
2741             /* Now we can overwrite *n : */
2742             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2743 #ifdef DEBUGGING
2744             stop = next - 1;
2745 #endif
2746             n = nnext;
2747             if (stopnow) break;
2748         }
2749
2750 #ifdef EXPERIMENTAL_INPLACESCAN
2751         if (flags && !NEXT_OFF(n)) {
2752             DEBUG_PEEP("atch", val, depth);
2753             if (reg_off_by_arg[OP(n)]) {
2754                 ARG_SET(n, val - n);
2755             }
2756             else {
2757                 NEXT_OFF(n) = val - n;
2758             }
2759             stopnow = 1;
2760         }
2761 #endif
2762     }
2763
2764     *min_subtract = 0;
2765     *has_exactf_sharp_s = FALSE;
2766
2767     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2768      * can now analyze for sequences of problematic code points.  (Prior to
2769      * this final joining, sequences could have been split over boundaries, and
2770      * hence missed).  The sequences only happen in folding, hence for any
2771      * non-EXACT EXACTish node */
2772     if (OP(scan) != EXACT) {
2773         const U8 * const s0 = (U8*) STRING(scan);
2774         const U8 * s = s0;
2775         const U8 * const s_end = s0 + STR_LEN(scan);
2776
2777         /* One pass is made over the node's string looking for all the
2778          * possibilities.  to avoid some tests in the loop, there are two main
2779          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2780          * non-UTF-8 */
2781         if (UTF) {
2782
2783             /* Examine the string for a multi-character fold sequence.  UTF-8
2784              * patterns have all characters pre-folded by the time this code is
2785              * executed */
2786             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2787                                      length sequence we are looking for is 2 */
2788             {
2789                 int count = 0;
2790                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2791                 if (! len) {    /* Not a multi-char fold: get next char */
2792                     s += UTF8SKIP(s);
2793                     continue;
2794                 }
2795
2796                 /* Nodes with 'ss' require special handling, except for EXACTFL
2797                  * and EXACTFA for which there is no multi-char fold to this */
2798                 if (len == 2 && *s == 's' && *(s+1) == 's'
2799                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2800                 {
2801                     count = 2;
2802                     OP(scan) = EXACTFU_SS;
2803                     s += 2;
2804                 }
2805                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2806                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2807                                       COMBINING_DIAERESIS_UTF8
2808                                       COMBINING_ACUTE_ACCENT_UTF8,
2809                                    6)
2810                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2811                                          COMBINING_DIAERESIS_UTF8
2812                                          COMBINING_ACUTE_ACCENT_UTF8,
2813                                      6)))
2814                 {
2815                     count = 3;
2816
2817                     /* These two folds require special handling by trie's, so
2818                      * change the node type to indicate this.  If EXACTFA and
2819                      * EXACTFL were ever to be handled by trie's, this would
2820                      * have to be changed.  If this node has already been
2821                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2822                      * (khw) think it doesn't matter in regexec.c for UTF
2823                      * patterns, but no need to change it */
2824                     if (OP(scan) == EXACTFU) {
2825                         OP(scan) = EXACTFU_TRICKYFOLD;
2826                     }
2827                     s += 6;
2828                 }
2829                 else { /* Here is a generic multi-char fold. */
2830                     const U8* multi_end  = s + len;
2831
2832                     /* Count how many characters in it.  In the case of /l and
2833                      * /aa, no folds which contain ASCII code points are
2834                      * allowed, so check for those, and skip if found.  (In
2835                      * EXACTFL, no folds are allowed to any Latin1 code point,
2836                      * not just ASCII.  But there aren't any of these
2837                      * currently, nor ever likely, so don't take the time to
2838                      * test for them.  The code that generates the
2839                      * is_MULTI_foo() macros croaks should one actually get put
2840                      * into Unicode .) */
2841                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2842                         count = utf8_length(s, multi_end);
2843                         s = multi_end;
2844                     }
2845                     else {
2846                         while (s < multi_end) {
2847                             if (isASCII(*s)) {
2848                                 s++;
2849                                 goto next_iteration;
2850                             }
2851                             else {
2852                                 s += UTF8SKIP(s);
2853                             }
2854                             count++;
2855                         }
2856                     }
2857                 }
2858
2859                 /* The delta is how long the sequence is minus 1 (1 is how long
2860                  * the character that folds to the sequence is) */
2861                 *min_subtract += count - 1;
2862             next_iteration: ;
2863             }
2864         }
2865         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2866
2867             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2868              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2869              * nodes can't have multi-char folds to this range (and there are
2870              * no existing ones in the upper latin1 range).  In the EXACTF
2871              * case we look also for the sharp s, which can be in the final
2872              * position.  Otherwise we can stop looking 1 byte earlier because
2873              * have to find at least two characters for a multi-fold */
2874             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2875
2876             /* The below is perhaps overboard, but this allows us to save a
2877              * test each time through the loop at the expense of a mask.  This
2878              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2879              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2880              * are 64.  This uses an exclusive 'or' to find that bit and then
2881              * inverts it to form a mask, with just a single 0, in the bit
2882              * position where 'S' and 's' differ. */
2883             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2884             const U8 s_masked = 's' & S_or_s_mask;
2885
2886             while (s < upper) {
2887                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2888                 if (! len) {    /* Not a multi-char fold. */
2889                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2890                     {
2891                         *has_exactf_sharp_s = TRUE;
2892                     }
2893                     s++;
2894                     continue;
2895                 }
2896
2897                 if (len == 2
2898                     && ((*s & S_or_s_mask) == s_masked)
2899                     && ((*(s+1) & S_or_s_mask) == s_masked))
2900                 {
2901
2902                     /* EXACTF nodes need to know that the minimum length
2903                      * changed so that a sharp s in the string can match this
2904                      * ss in the pattern, but they remain EXACTF nodes, as they
2905                      * won't match this unless the target string is is UTF-8,
2906                      * which we don't know until runtime */
2907                     if (OP(scan) != EXACTF) {
2908                         OP(scan) = EXACTFU_SS;
2909                     }
2910                 }
2911
2912                 *min_subtract += len - 1;
2913                 s += len;
2914             }
2915         }
2916     }
2917
2918 #ifdef DEBUGGING
2919     /* Allow dumping but overwriting the collection of skipped
2920      * ops and/or strings with fake optimized ops */
2921     n = scan + NODE_SZ_STR(scan);
2922     while (n <= stop) {
2923         OP(n) = OPTIMIZED;
2924         FLAGS(n) = 0;
2925         NEXT_OFF(n) = 0;
2926         n++;
2927     }
2928 #endif
2929     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2930     return stopnow;
2931 }
2932
2933 /* REx optimizer.  Converts nodes into quicker variants "in place".
2934    Finds fixed substrings.  */
2935
2936 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2937    to the position after last scanned or to NULL. */
2938
2939 #define INIT_AND_WITHP \
2940     assert(!and_withp); \
2941     Newx(and_withp,1,struct regnode_charclass_class); \
2942     SAVEFREEPV(and_withp)
2943
2944 /* this is a chain of data about sub patterns we are processing that
2945    need to be handled separately/specially in study_chunk. Its so
2946    we can simulate recursion without losing state.  */
2947 struct scan_frame;
2948 typedef struct scan_frame {
2949     regnode *last;  /* last node to process in this frame */
2950     regnode *next;  /* next node to process when last is reached */
2951     struct scan_frame *prev; /*previous frame*/
2952     I32 stop; /* what stopparen do we use */
2953 } scan_frame;
2954
2955
2956 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2957
2958 #define CASE_SYNST_FNC(nAmE)                                       \
2959 case nAmE:                                                         \
2960     if (flags & SCF_DO_STCLASS_AND) {                              \
2961             for (value = 0; value < 256; value++)                  \
2962                 if (!is_ ## nAmE ## _cp(value))                       \
2963                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2964     }                                                              \
2965     else {                                                         \
2966             for (value = 0; value < 256; value++)                  \
2967                 if (is_ ## nAmE ## _cp(value))                        \
2968                     ANYOF_BITMAP_SET(data->start_class, value);    \
2969     }                                                              \
2970     break;                                                         \
2971 case N ## nAmE:                                                    \
2972     if (flags & SCF_DO_STCLASS_AND) {                              \
2973             for (value = 0; value < 256; value++)                   \
2974                 if (is_ ## nAmE ## _cp(value))                         \
2975                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2976     }                                                               \
2977     else {                                                          \
2978             for (value = 0; value < 256; value++)                   \
2979                 if (!is_ ## nAmE ## _cp(value))                        \
2980                     ANYOF_BITMAP_SET(data->start_class, value);     \
2981     }                                                               \
2982     break
2983
2984
2985
2986 STATIC I32
2987 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2988                         I32 *minlenp, I32 *deltap,
2989                         regnode *last,
2990                         scan_data_t *data,
2991                         I32 stopparen,
2992                         U8* recursed,
2993                         struct regnode_charclass_class *and_withp,
2994                         U32 flags, U32 depth)
2995                         /* scanp: Start here (read-write). */
2996                         /* deltap: Write maxlen-minlen here. */
2997                         /* last: Stop before this one. */
2998                         /* data: string data about the pattern */
2999                         /* stopparen: treat close N as END */
3000                         /* recursed: which subroutines have we recursed into */
3001                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3002 {
3003     dVAR;
3004     I32 min = 0;    /* There must be at least this number of characters to match */
3005     I32 pars = 0, code;
3006     regnode *scan = *scanp, *next;
3007     I32 delta = 0;
3008     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3009     int is_inf_internal = 0;            /* The studied chunk is infinite */
3010     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3011     scan_data_t data_fake;
3012     SV *re_trie_maxbuff = NULL;
3013     regnode *first_non_open = scan;
3014     I32 stopmin = I32_MAX;
3015     scan_frame *frame = NULL;
3016     GET_RE_DEBUG_FLAGS_DECL;
3017
3018     PERL_ARGS_ASSERT_STUDY_CHUNK;
3019
3020 #ifdef DEBUGGING
3021     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3022 #endif
3023
3024     if ( depth == 0 ) {
3025         while (first_non_open && OP(first_non_open) == OPEN)
3026             first_non_open=regnext(first_non_open);
3027     }
3028
3029
3030   fake_study_recurse:
3031     while ( scan && OP(scan) != END && scan < last ){
3032         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3033                                    node length to get a real minimum (because
3034                                    the folded version may be shorter) */
3035         bool has_exactf_sharp_s = FALSE;
3036         /* Peephole optimizer: */
3037         DEBUG_STUDYDATA("Peep:", data,depth);
3038         DEBUG_PEEP("Peep",scan,depth);
3039
3040         /* Its not clear to khw or hv why this is done here, and not in the
3041          * clauses that deal with EXACT nodes.  khw's guess is that it's
3042          * because of a previous design */
3043         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3044
3045         /* Follow the next-chain of the current node and optimize
3046            away all the NOTHINGs from it.  */
3047         if (OP(scan) != CURLYX) {
3048             const int max = (reg_off_by_arg[OP(scan)]
3049                        ? I32_MAX
3050                        /* I32 may be smaller than U16 on CRAYs! */
3051                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3052             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3053             int noff;
3054             regnode *n = scan;
3055
3056             /* Skip NOTHING and LONGJMP. */
3057             while ((n = regnext(n))
3058                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3059                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3060                    && off + noff < max)
3061                 off += noff;
3062             if (reg_off_by_arg[OP(scan)])
3063                 ARG(scan) = off;
3064             else
3065                 NEXT_OFF(scan) = off;
3066         }
3067
3068
3069
3070         /* The principal pseudo-switch.  Cannot be a switch, since we
3071            look into several different things.  */
3072         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3073                    || OP(scan) == IFTHEN) {
3074             next = regnext(scan);
3075             code = OP(scan);
3076             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3077
3078             if (OP(next) == code || code == IFTHEN) {
3079                 /* NOTE - There is similar code to this block below for handling
3080                    TRIE nodes on a re-study.  If you change stuff here check there
3081                    too. */
3082                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3083                 struct regnode_charclass_class accum;
3084                 regnode * const startbranch=scan;
3085
3086                 if (flags & SCF_DO_SUBSTR)
3087                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3088                 if (flags & SCF_DO_STCLASS)
3089                     cl_init_zero(pRExC_state, &accum);
3090
3091                 while (OP(scan) == code) {
3092                     I32 deltanext, minnext, f = 0, fake;
3093                     struct regnode_charclass_class this_class;
3094
3095                     num++;
3096                     data_fake.flags = 0;
3097                     if (data) {
3098                         data_fake.whilem_c = data->whilem_c;
3099                         data_fake.last_closep = data->last_closep;
3100                     }
3101                     else
3102                         data_fake.last_closep = &fake;
3103
3104                     data_fake.pos_delta = delta;
3105                     next = regnext(scan);
3106                     scan = NEXTOPER(scan);
3107                     if (code != BRANCH)
3108                         scan = NEXTOPER(scan);
3109                     if (flags & SCF_DO_STCLASS) {
3110                         cl_init(pRExC_state, &this_class);
3111                         data_fake.start_class = &this_class;
3112                         f = SCF_DO_STCLASS_AND;
3113                     }
3114                     if (flags & SCF_WHILEM_VISITED_POS)
3115                         f |= SCF_WHILEM_VISITED_POS;
3116
3117                     /* we suppose the run is continuous, last=next...*/
3118                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3119                                           next, &data_fake,
3120                                           stopparen, recursed, NULL, f,depth+1);
3121                     if (min1 > minnext)
3122                         min1 = minnext;
3123                     if (max1 < minnext + deltanext)
3124                         max1 = minnext + deltanext;
3125                     if (deltanext == I32_MAX)
3126                         is_inf = is_inf_internal = 1;
3127                     scan = next;
3128                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3129                         pars++;
3130                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3131                         if ( stopmin > minnext)
3132                             stopmin = min + min1;
3133                         flags &= ~SCF_DO_SUBSTR;
3134                         if (data)
3135                             data->flags |= SCF_SEEN_ACCEPT;
3136                     }
3137                     if (data) {
3138                         if (data_fake.flags & SF_HAS_EVAL)
3139                             data->flags |= SF_HAS_EVAL;
3140                         data->whilem_c = data_fake.whilem_c;
3141                     }
3142                     if (flags & SCF_DO_STCLASS)
3143                         cl_or(pRExC_state, &accum, &this_class);
3144                 }
3145                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3146                     min1 = 0;
3147                 if (flags & SCF_DO_SUBSTR) {
3148                     data->pos_min += min1;
3149                     data->pos_delta += max1 - min1;
3150                     if (max1 != min1 || is_inf)
3151                         data->longest = &(data->longest_float);
3152                 }
3153                 min += min1;
3154                 delta += max1 - min1;
3155                 if (flags & SCF_DO_STCLASS_OR) {
3156                     cl_or(pRExC_state, data->start_class, &accum);
3157                     if (min1) {
3158                         cl_and(data->start_class, and_withp);
3159                         flags &= ~SCF_DO_STCLASS;
3160                     }
3161                 }
3162                 else if (flags & SCF_DO_STCLASS_AND) {
3163                     if (min1) {
3164                         cl_and(data->start_class, &accum);
3165                         flags &= ~SCF_DO_STCLASS;
3166                     }
3167                     else {
3168                         /* Switch to OR mode: cache the old value of
3169                          * data->start_class */
3170                         INIT_AND_WITHP;
3171                         StructCopy(data->start_class, and_withp,
3172                                    struct regnode_charclass_class);
3173                         flags &= ~SCF_DO_STCLASS_AND;
3174                         StructCopy(&accum, data->start_class,
3175                                    struct regnode_charclass_class);
3176                         flags |= SCF_DO_STCLASS_OR;
3177                         data->start_class->flags |= ANYOF_EOS;
3178                     }
3179                 }
3180
3181                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3182                 /* demq.
3183
3184                    Assuming this was/is a branch we are dealing with: 'scan' now
3185                    points at the item that follows the branch sequence, whatever
3186                    it is. We now start at the beginning of the sequence and look
3187                    for subsequences of
3188
3189                    BRANCH->EXACT=>x1
3190                    BRANCH->EXACT=>x2
3191                    tail
3192
3193                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3194
3195                    If we can find such a subsequence we need to turn the first
3196                    element into a trie and then add the subsequent branch exact
3197                    strings to the trie.
3198
3199                    We have two cases
3200
3201                      1. patterns where the whole set of branches can be converted.
3202
3203                      2. patterns where only a subset can be converted.
3204
3205                    In case 1 we can replace the whole set with a single regop
3206                    for the trie. In case 2 we need to keep the start and end
3207                    branches so
3208
3209                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3210                      becomes BRANCH TRIE; BRANCH X;
3211
3212                   There is an additional case, that being where there is a
3213                   common prefix, which gets split out into an EXACT like node
3214                   preceding the TRIE node.
3215
3216                   If x(1..n)==tail then we can do a simple trie, if not we make
3217                   a "jump" trie, such that when we match the appropriate word
3218                   we "jump" to the appropriate tail node. Essentially we turn
3219                   a nested if into a case structure of sorts.
3220
3221                 */
3222
3223                     int made=0;
3224                     if (!re_trie_maxbuff) {
3225                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3226                         if (!SvIOK(re_trie_maxbuff))
3227                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3228                     }
3229                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3230                         regnode *cur;
3231                         regnode *first = (regnode *)NULL;
3232                         regnode *last = (regnode *)NULL;
3233                         regnode *tail = scan;
3234                         U8 trietype = 0;
3235                         U32 count=0;
3236
3237 #ifdef DEBUGGING
3238                         SV * const mysv = sv_newmortal();       /* for dumping */
3239 #endif
3240                         /* var tail is used because there may be a TAIL
3241                            regop in the way. Ie, the exacts will point to the
3242                            thing following the TAIL, but the last branch will
3243                            point at the TAIL. So we advance tail. If we
3244                            have nested (?:) we may have to move through several
3245                            tails.
3246                          */
3247
3248                         while ( OP( tail ) == TAIL ) {
3249                             /* this is the TAIL generated by (?:) */
3250                             tail = regnext( tail );
3251                         }
3252
3253
3254                         DEBUG_TRIE_COMPILE_r({
3255                             regprop(RExC_rx, mysv, tail );
3256                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3257                                 (int)depth * 2 + 2, "",
3258                                 "Looking for TRIE'able sequences. Tail node is: ",
3259                                 SvPV_nolen_const( mysv )
3260                             );
3261                         });
3262
3263                         /*
3264
3265                             Step through the branches
3266                                 cur represents each branch,
3267                                 noper is the first thing to be matched as part of that branch
3268                                 noper_next is the regnext() of that node.
3269
3270                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3271                             via a "jump trie" but we also support building with NOJUMPTRIE,
3272                             which restricts the trie logic to structures like /FOO|BAR/.
3273
3274                             If noper is a trieable nodetype then the branch is a possible optimization
3275                             target. If we are building under NOJUMPTRIE then we require that noper_next
3276                             is the same as scan (our current position in the regex program).
3277
3278                             Once we have two or more consecutive such branches we can create a
3279                             trie of the EXACT's contents and stitch it in place into the program.
3280
3281                             If the sequence represents all of the branches in the alternation we
3282                             replace the entire thing with a single TRIE node.
3283
3284                             Otherwise when it is a subsequence we need to stitch it in place and
3285                             replace only the relevant branches. This means the first branch has
3286                             to remain as it is used by the alternation logic, and its next pointer,
3287                             and needs to be repointed at the item on the branch chain following
3288                             the last branch we have optimized away.
3289
3290                             This could be either a BRANCH, in which case the subsequence is internal,
3291                             or it could be the item following the branch sequence in which case the
3292                             subsequence is at the end (which does not necessarily mean the first node
3293                             is the start of the alternation).
3294
3295                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3296
3297                                 optype          |  trietype
3298                                 ----------------+-----------
3299                                 NOTHING         | NOTHING
3300                                 EXACT           | EXACT
3301                                 EXACTFU         | EXACTFU
3302                                 EXACTFU_SS      | EXACTFU
3303                                 EXACTFU_TRICKYFOLD | EXACTFU
3304                                 EXACTFA         | 0
3305
3306
3307                         */
3308 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3309                        ( EXACT == (X) )   ? EXACT :        \
3310                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3311                        0 )
3312
3313                         /* dont use tail as the end marker for this traverse */
3314                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3315                             regnode * const noper = NEXTOPER( cur );
3316                             U8 noper_type = OP( noper );
3317                             U8 noper_trietype = TRIE_TYPE( noper_type );
3318 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3319                             regnode * const noper_next = regnext( noper );
3320                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3321                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3322 #endif
3323
3324                             DEBUG_TRIE_COMPILE_r({
3325                                 regprop(RExC_rx, mysv, cur);
3326                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3327                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3328
3329                                 regprop(RExC_rx, mysv, noper);
3330                                 PerlIO_printf( Perl_debug_log, " -> %s",
3331                                     SvPV_nolen_const(mysv));
3332
3333                                 if ( noper_next ) {
3334                                   regprop(RExC_rx, mysv, noper_next );
3335                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3336                                     SvPV_nolen_const(mysv));
3337                                 }
3338                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3339                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3340                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3341                                 );
3342                             });
3343
3344                             /* Is noper a trieable nodetype that can be merged with the
3345                              * current trie (if there is one)? */
3346                             if ( noper_trietype
3347                                   &&
3348                                   (
3349                                         ( noper_trietype == NOTHING)
3350                                         || ( trietype == NOTHING )
3351                                         || ( trietype == noper_trietype )
3352                                   )
3353 #ifdef NOJUMPTRIE
3354                                   && noper_next == tail
3355 #endif
3356                                   && count < U16_MAX)
3357                             {
3358                                 /* Handle mergable triable node
3359                                  * Either we are the first node in a new trieable sequence,
3360                                  * in which case we do some bookkeeping, otherwise we update
3361                                  * the end pointer. */
3362                                 if ( !first ) {
3363                                     first = cur;
3364                                     if ( noper_trietype == NOTHING ) {
3365 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3366                                         regnode * const noper_next = regnext( noper );
3367                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3368                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3369 #endif
3370
3371                                         if ( noper_next_trietype ) {
3372                                             trietype = noper_next_trietype;
3373                                         } else if (noper_next_type)  {
3374                                             /* a NOTHING regop is 1 regop wide. We need at least two
3375                                              * for a trie so we can't merge this in */
3376                                             first = NULL;
3377                                         }
3378                                     } else {
3379                                         trietype = noper_trietype;
3380                                     }
3381                                 } else {
3382                                     if ( trietype == NOTHING )
3383                                         trietype = noper_trietype;
3384                                     last = cur;
3385                                 }
3386                                 if (first)
3387                                     count++;
3388                             } /* end handle mergable triable node */
3389                             else {
3390                                 /* handle unmergable node -
3391                                  * noper may either be a triable node which can not be tried
3392                                  * together with the current trie, or a non triable node */
3393                                 if ( last ) {
3394                                     /* If last is set and trietype is not NOTHING then we have found
3395                                      * at least two triable branch sequences in a row of a similar
3396                                      * trietype so we can turn them into a trie. If/when we
3397                                      * allow NOTHING to start a trie sequence this condition will be
3398                                      * required, and it isn't expensive so we leave it in for now. */
3399                                     if ( trietype && trietype != NOTHING )
3400                                         make_trie( pRExC_state,
3401                                                 startbranch, first, cur, tail, count,
3402                                                 trietype, depth+1 );
3403                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3404                                 }
3405                                 if ( noper_trietype
3406 #ifdef NOJUMPTRIE
3407                                      && noper_next == tail
3408 #endif
3409                                 ){
3410                                     /* noper is triable, so we can start a new trie sequence */
3411                                     count = 1;
3412                                     first = cur;
3413                                     trietype = noper_trietype;
3414                                 } else if (first) {
3415                                     /* if we already saw a first but the current node is not triable then we have
3416                                      * to reset the first information. */
3417                                     count = 0;
3418                                     first = NULL;
3419                                     trietype = 0;
3420                                 }
3421                             } /* end handle unmergable node */
3422                         } /* loop over branches */
3423                         DEBUG_TRIE_COMPILE_r({
3424                             regprop(RExC_rx, mysv, cur);
3425                             PerlIO_printf( Perl_debug_log,
3426                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3427                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3428
3429                         });
3430                         if ( last && trietype ) {
3431                             if ( trietype != NOTHING ) {
3432                                 /* the last branch of the sequence was part of a trie,
3433                                  * so we have to construct it here outside of the loop
3434                                  */
3435                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3436 #ifdef TRIE_STUDY_OPT
3437                                 if ( ((made == MADE_EXACT_TRIE &&
3438                                      startbranch == first)
3439                                      || ( first_non_open == first )) &&
3440                                      depth==0 ) {
3441                                     flags |= SCF_TRIE_RESTUDY;
3442                                     if ( startbranch == first
3443                                          && scan == tail )
3444                                     {
3445                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3446                                     }
3447                                 }
3448 #endif
3449                             } else {
3450                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3451                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3452                                  */
3453                                 if ( startbranch == first ) {
3454                                     regnode *opt;
3455                                     /* the entire thing is a NOTHING sequence, something like this:
3456                                      * (?:|) So we can turn it into a plain NOTHING op. */
3457                                     DEBUG_TRIE_COMPILE_r({
3458                                         regprop(RExC_rx, mysv, cur);
3459                                         PerlIO_printf( Perl_debug_log,
3460                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3461                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3462
3463                                     });
3464                                     OP(startbranch)= NOTHING;
3465                                     NEXT_OFF(startbranch)= tail - startbranch;
3466                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3467                                         OP(opt)= OPTIMIZED;
3468                                 }
3469                             }
3470                         } /* end if ( last) */
3471                     } /* TRIE_MAXBUF is non zero */
3472
3473                 } /* do trie */
3474
3475             }
3476             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3477                 scan = NEXTOPER(NEXTOPER(scan));
3478             } else                      /* single branch is optimized. */
3479                 scan = NEXTOPER(scan);
3480             continue;
3481         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3482             scan_frame *newframe = NULL;
3483             I32 paren;
3484             regnode *start;
3485             regnode *end;
3486
3487             if (OP(scan) != SUSPEND) {
3488             /* set the pointer */
3489                 if (OP(scan) == GOSUB) {
3490                     paren = ARG(scan);
3491                     RExC_recurse[ARG2L(scan)] = scan;
3492                     start = RExC_open_parens[paren-1];
3493                     end   = RExC_close_parens[paren-1];
3494                 } else {
3495                     paren = 0;
3496                     start = RExC_rxi->program + 1;
3497                     end   = RExC_opend;
3498                 }
3499                 if (!recursed) {
3500                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3501                     SAVEFREEPV(recursed);
3502                 }
3503                 if (!PAREN_TEST(recursed,paren+1)) {
3504                     PAREN_SET(recursed,paren+1);
3505                     Newx(newframe,1,scan_frame);
3506                 } else {
3507                     if (flags & SCF_DO_SUBSTR) {
3508                         SCAN_COMMIT(pRExC_state,data,minlenp);
3509                         data->longest = &(data->longest_float);
3510                     }
3511                     is_inf = is_inf_internal = 1;
3512                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3513                         cl_anything(pRExC_state, data->start_class);
3514                     flags &= ~SCF_DO_STCLASS;
3515                 }
3516             } else {
3517                 Newx(newframe,1,scan_frame);
3518                 paren = stopparen;
3519                 start = scan+2;
3520                 end = regnext(scan);
3521             }
3522             if (newframe) {
3523                 assert(start);
3524                 assert(end);
3525                 SAVEFREEPV(newframe);
3526                 newframe->next = regnext(scan);
3527                 newframe->last = last;
3528                 newframe->stop = stopparen;
3529                 newframe->prev = frame;
3530
3531                 frame = newframe;
3532                 scan =  start;
3533                 stopparen = paren;
3534                 last = end;
3535
3536                 continue;
3537             }
3538         }
3539         else if (OP(scan) == EXACT) {
3540             I32 l = STR_LEN(scan);
3541             UV uc;
3542             if (UTF) {
3543                 const U8 * const s = (U8*)STRING(scan);
3544                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3545                 l = utf8_length(s, s + l);
3546             } else {
3547                 uc = *((U8*)STRING(scan));
3548             }
3549             min += l;
3550             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3551                 /* The code below prefers earlier match for fixed
3552                    offset, later match for variable offset.  */
3553                 if (data->last_end == -1) { /* Update the start info. */
3554                     data->last_start_min = data->pos_min;
3555                     data->last_start_max = is_inf
3556                         ? I32_MAX : data->pos_min + data->pos_delta;
3557                 }
3558                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3559                 if (UTF)
3560                     SvUTF8_on(data->last_found);
3561                 {
3562                     SV * const sv = data->last_found;
3563                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3564                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3565                     if (mg && mg->mg_len >= 0)
3566                         mg->mg_len += utf8_length((U8*)STRING(scan),
3567                                                   (U8*)STRING(scan)+STR_LEN(scan));
3568                 }
3569                 data->last_end = data->pos_min + l;
3570                 data->pos_min += l; /* As in the first entry. */
3571                 data->flags &= ~SF_BEFORE_EOL;
3572             }
3573             if (flags & SCF_DO_STCLASS_AND) {
3574                 /* Check whether it is compatible with what we know already! */
3575                 int compat = 1;
3576
3577
3578                 /* If compatible, we or it in below.  It is compatible if is
3579                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3580                  * it's for a locale.  Even if there isn't unicode semantics
3581                  * here, at runtime there may be because of matching against a
3582                  * utf8 string, so accept a possible false positive for
3583                  * latin1-range folds */
3584                 if (uc >= 0x100 ||
3585                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3586                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3587                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3588                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3589                     )
3590                 {
3591                     compat = 0;
3592                 }
3593                 ANYOF_CLASS_ZERO(data->start_class);
3594                 ANYOF_BITMAP_ZERO(data->start_class);
3595                 if (compat)
3596                     ANYOF_BITMAP_SET(data->start_class, uc);
3597                 else if (uc >= 0x100) {
3598                     int i;
3599
3600                     /* Some Unicode code points fold to the Latin1 range; as
3601                      * XXX temporary code, instead of figuring out if this is
3602                      * one, just assume it is and set all the start class bits
3603                      * that could be some such above 255 code point's fold
3604                      * which will generate fals positives.  As the code
3605                      * elsewhere that does compute the fold settles down, it
3606                      * can be extracted out and re-used here */
3607                     for (i = 0; i < 256; i++){
3608                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3609                             ANYOF_BITMAP_SET(data->start_class, i);
3610                         }
3611                     }
3612                 }
3613                 data->start_class->flags &= ~ANYOF_EOS;
3614                 if (uc < 0x100)
3615                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3616             }
3617             else if (flags & SCF_DO_STCLASS_OR) {
3618                 /* false positive possible if the class is case-folded */
3619                 if (uc < 0x100)
3620                     ANYOF_BITMAP_SET(data->start_class, uc);
3621                 else
3622                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3623                 data->start_class->flags &= ~ANYOF_EOS;
3624                 cl_and(data->start_class, and_withp);
3625             }
3626             flags &= ~SCF_DO_STCLASS;
3627         }
3628         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3629             I32 l = STR_LEN(scan);
3630             UV uc = *((U8*)STRING(scan));
3631
3632             /* Search for fixed substrings supports EXACT only. */
3633             if (flags & SCF_DO_SUBSTR) {
3634                 assert(data);
3635                 SCAN_COMMIT(pRExC_state, data, minlenp);
3636             }
3637             if (UTF) {
3638                 const U8 * const s = (U8 *)STRING(scan);
3639                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3640                 l = utf8_length(s, s + l);
3641             }
3642             if (has_exactf_sharp_s) {
3643                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3644             }
3645             min += l - min_subtract;
3646             assert (min >= 0);
3647             delta += min_subtract;
3648             if (flags & SCF_DO_SUBSTR) {
3649                 data->pos_min += l - min_subtract;
3650                 if (data->pos_min < 0) {
3651                     data->pos_min = 0;
3652                 }
3653                 data->pos_delta += min_subtract;
3654                 if (min_subtract) {
3655                     data->longest = &(data->longest_float);
3656                 }
3657             }
3658             if (flags & SCF_DO_STCLASS_AND) {
3659                 /* Check whether it is compatible with what we know already! */
3660                 int compat = 1;
3661                 if (uc >= 0x100 ||
3662                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3663                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3664                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3665                 {
3666                     compat = 0;
3667                 }
3668                 ANYOF_CLASS_ZERO(data->start_class);
3669                 ANYOF_BITMAP_ZERO(data->start_class);
3670                 if (compat) {
3671                     ANYOF_BITMAP_SET(data->start_class, uc);
3672                     data->start_class->flags &= ~ANYOF_EOS;
3673                     if (OP(scan) == EXACTFL) {
3674                         /* XXX This set is probably no longer necessary, and
3675                          * probably wrong as LOCALE now is on in the initial
3676                          * state */
3677                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3678                     }
3679                     else {
3680
3681                         /* Also set the other member of the fold pair.  In case
3682                          * that unicode semantics is called for at runtime, use
3683                          * the full latin1 fold.  (Can't do this for locale,
3684                          * because not known until runtime) */
3685                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3686
3687                         /* All other (EXACTFL handled above) folds except under
3688                          * /iaa that include s, S, and sharp_s also may include
3689                          * the others */
3690                         if (OP(scan) != EXACTFA) {
3691                             if (uc == 's' || uc == 'S') {
3692                                 ANYOF_BITMAP_SET(data->start_class,
3693                                                  LATIN_SMALL_LETTER_SHARP_S);
3694                             }
3695                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3696                                 ANYOF_BITMAP_SET(data->start_class, 's');
3697                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3698                             }
3699                         }
3700                     }
3701                 }
3702                 else if (uc >= 0x100) {
3703                     int i;
3704                     for (i = 0; i < 256; i++){
3705                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3706                             ANYOF_BITMAP_SET(data->start_class, i);
3707                         }
3708                     }
3709                 }
3710             }
3711             else if (flags & SCF_DO_STCLASS_OR) {
3712                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3713                     /* false positive possible if the class is case-folded.
3714                        Assume that the locale settings are the same... */
3715                     if (uc < 0x100) {
3716                         ANYOF_BITMAP_SET(data->start_class, uc);
3717                         if (OP(scan) != EXACTFL) {
3718
3719                             /* And set the other member of the fold pair, but
3720                              * can't do that in locale because not known until
3721                              * run-time */
3722                             ANYOF_BITMAP_SET(data->start_class,
3723                                              PL_fold_latin1[uc]);
3724
3725                             /* All folds except under /iaa that include s, S,
3726                              * and sharp_s also may include the others */
3727                             if (OP(scan) != EXACTFA) {
3728                                 if (uc == 's' || uc == 'S') {
3729                                     ANYOF_BITMAP_SET(data->start_class,
3730                                                    LATIN_SMALL_LETTER_SHARP_S);
3731                                 }
3732                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3733                                     ANYOF_BITMAP_SET(data->start_class, 's');
3734                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3735                                 }
3736                             }
3737                         }
3738                     }
3739                     data->start_class->flags &= ~ANYOF_EOS;
3740                 }
3741                 cl_and(data->start_class, and_withp);
3742             }
3743             flags &= ~SCF_DO_STCLASS;
3744         }
3745         else if (REGNODE_VARIES(OP(scan))) {
3746             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3747             I32 f = flags, pos_before = 0;
3748             regnode * const oscan = scan;
3749             struct regnode_charclass_class this_class;
3750             struct regnode_charclass_class *oclass = NULL;
3751             I32 next_is_eval = 0;
3752
3753             switch (PL_regkind[OP(scan)]) {
3754             case WHILEM:                /* End of (?:...)* . */
3755                 scan = NEXTOPER(scan);
3756                 goto finish;
3757             case PLUS:
3758                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3759                     next = NEXTOPER(scan);
3760                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3761                         mincount = 1;
3762                         maxcount = REG_INFTY;
3763                         next = regnext(scan);
3764                         scan = NEXTOPER(scan);
3765                         goto do_curly;
3766                     }
3767                 }
3768                 if (flags & SCF_DO_SUBSTR)
3769                     data->pos_min++;
3770                 min++;
3771                 /* Fall through. */
3772             case STAR:
3773                 if (flags & SCF_DO_STCLASS) {
3774                     mincount = 0;
3775                     maxcount = REG_INFTY;
3776                     next = regnext(scan);
3777                     scan = NEXTOPER(scan);
3778                     goto do_curly;
3779                 }
3780                 is_inf = is_inf_internal = 1;
3781                 scan = regnext(scan);
3782                 if (flags & SCF_DO_SUBSTR) {
3783                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3784                     data->longest = &(data->longest_float);
3785                 }
3786                 goto optimize_curly_tail;
3787             case CURLY:
3788                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3789                     && (scan->flags == stopparen))
3790                 {
3791                     mincount = 1;
3792                     maxcount = 1;
3793                 } else {
3794                     mincount = ARG1(scan);
3795                     maxcount = ARG2(scan);
3796                 }
3797                 next = regnext(scan);
3798                 if (OP(scan) == CURLYX) {
3799                     I32 lp = (data ? *(data->last_closep) : 0);
3800                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3801                 }
3802                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3803                 next_is_eval = (OP(scan) == EVAL);
3804               do_curly:
3805                 if (flags & SCF_DO_SUBSTR) {
3806                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3807                     pos_before = data->pos_min;
3808                 }
3809                 if (data) {
3810                     fl = data->flags;
3811                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3812                     if (is_inf)
3813                         data->flags |= SF_IS_INF;
3814                 }
3815                 if (flags & SCF_DO_STCLASS) {
3816                     cl_init(pRExC_state, &this_class);
3817                     oclass = data->start_class;
3818                     data->start_class = &this_class;
3819                     f |= SCF_DO_STCLASS_AND;
3820                     f &= ~SCF_DO_STCLASS_OR;
3821                 }
3822                 /* Exclude from super-linear cache processing any {n,m}
3823                    regops for which the combination of input pos and regex
3824                    pos is not enough information to determine if a match
3825                    will be possible.
3826
3827                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3828                    regex pos at the \s*, the prospects for a match depend not
3829                    only on the input position but also on how many (bar\s*)
3830                    repeats into the {4,8} we are. */
3831                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3832                     f &= ~SCF_WHILEM_VISITED_POS;
3833
3834                 /* This will finish on WHILEM, setting scan, or on NULL: */
3835                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3836                                       last, data, stopparen, recursed, NULL,
3837                                       (mincount == 0
3838                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3839
3840                 if (flags & SCF_DO_STCLASS)
3841                     data->start_class = oclass;
3842                 if (mincount == 0 || minnext == 0) {
3843                     if (flags & SCF_DO_STCLASS_OR) {
3844                         cl_or(pRExC_state, data->start_class, &this_class);
3845                     }
3846                     else if (flags & SCF_DO_STCLASS_AND) {
3847                         /* Switch to OR mode: cache the old value of
3848                          * data->start_class */
3849                         INIT_AND_WITHP;
3850                         StructCopy(data->start_class, and_withp,
3851                                    struct regnode_charclass_class);
3852                         flags &= ~SCF_DO_STCLASS_AND;
3853                         StructCopy(&this_class, data->start_class,
3854                                    struct regnode_charclass_class);
3855                         flags |= SCF_DO_STCLASS_OR;
3856                         data->start_class->flags |= ANYOF_EOS;
3857                     }
3858                 } else {                /* Non-zero len */
3859                     if (flags & SCF_DO_STCLASS_OR) {
3860                         cl_or(pRExC_state, data->start_class, &this_class);
3861                         cl_and(data->start_class, and_withp);
3862                     }
3863                     else if (flags & SCF_DO_STCLASS_AND)
3864                         cl_and(data->start_class, &this_class);
3865                     flags &= ~SCF_DO_STCLASS;
3866                 }
3867                 if (!scan)              /* It was not CURLYX, but CURLY. */
3868                     scan = next;
3869                 if ( /* ? quantifier ok, except for (?{ ... }) */
3870                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3871                     && (minnext == 0) && (deltanext == 0)
3872                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3873                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3874                 {
3875                     ckWARNreg(RExC_parse,
3876                               "Quantifier unexpected on zero-length expression");
3877                 }
3878
3879                 min += minnext * mincount;
3880                 is_inf_internal |= ((maxcount == REG_INFTY
3881                                      && (minnext + deltanext) > 0)
3882                                     || deltanext == I32_MAX);
3883                 is_inf |= is_inf_internal;
3884                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3885
3886                 /* Try powerful optimization CURLYX => CURLYN. */
3887                 if (  OP(oscan) == CURLYX && data
3888                       && data->flags & SF_IN_PAR
3889                       && !(data->flags & SF_HAS_EVAL)
3890                       && !deltanext && minnext == 1 ) {
3891                     /* Try to optimize to CURLYN.  */
3892                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3893                     regnode * const nxt1 = nxt;
3894 #ifdef DEBUGGING
3895                     regnode *nxt2;
3896 #endif
3897
3898                     /* Skip open. */
3899                     nxt = regnext(nxt);
3900                     if (!REGNODE_SIMPLE(OP(nxt))
3901                         && !(PL_regkind[OP(nxt)] == EXACT
3902                              && STR_LEN(nxt) == 1))
3903                         goto nogo;
3904 #ifdef DEBUGGING
3905                     nxt2 = nxt;
3906 #endif
3907                     nxt = regnext(nxt);
3908                     if (OP(nxt) != CLOSE)
3909                         goto nogo;
3910                     if (RExC_open_parens) {
3911                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3912                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3913                     }
3914                     /* Now we know that nxt2 is the only contents: */
3915                     oscan->flags = (U8)ARG(nxt);
3916                     OP(oscan) = CURLYN;
3917                     OP(nxt1) = NOTHING; /* was OPEN. */
3918
3919 #ifdef DEBUGGING
3920                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3921                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3922                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3923                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3924                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3925                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3926 #endif
3927                 }
3928               nogo:
3929
3930                 /* Try optimization CURLYX => CURLYM. */
3931                 if (  OP(oscan) == CURLYX && data
3932                       && !(data->flags & SF_HAS_PAR)
3933                       && !(data->flags & SF_HAS_EVAL)
3934                       && !deltanext     /* atom is fixed width */
3935                       && minnext != 0   /* CURLYM can't handle zero width */
3936                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3937                 ) {
3938                     /* XXXX How to optimize if data == 0? */
3939                     /* Optimize to a simpler form.  */
3940                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3941                     regnode *nxt2;
3942
3943                     OP(oscan) = CURLYM;
3944                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3945                             && (OP(nxt2) != WHILEM))
3946                         nxt = nxt2;
3947                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3948                     /* Need to optimize away parenths. */
3949                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3950                         /* Set the parenth number.  */
3951                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3952
3953                         oscan->flags = (U8)ARG(nxt);
3954                         if (RExC_open_parens) {
3955                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3956                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3957                         }
3958                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3959                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3960
3961 #ifdef DEBUGGING
3962                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3963                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3964                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3965                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3966 #endif
3967 #if 0
3968                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3969                             regnode *nnxt = regnext(nxt1);
3970                             if (nnxt == nxt) {
3971                                 if (reg_off_by_arg[OP(nxt1)])
3972                                     ARG_SET(nxt1, nxt2 - nxt1);
3973                                 else if (nxt2 - nxt1 < U16_MAX)
3974                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3975                                 else
3976                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3977                             }
3978                             nxt1 = nnxt;
3979                         }
3980 #endif
3981                         /* Optimize again: */
3982                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3983                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3984                     }
3985                     else
3986                         oscan->flags = 0;
3987                 }
3988                 else if ((OP(oscan) == CURLYX)
3989                          && (flags & SCF_WHILEM_VISITED_POS)
3990                          /* See the comment on a similar expression above.
3991                             However, this time it's not a subexpression
3992                             we care about, but the expression itself. */
3993                          && (maxcount == REG_INFTY)
3994                          && data && ++data->whilem_c < 16) {
3995                     /* This stays as CURLYX, we can put the count/of pair. */
3996                     /* Find WHILEM (as in regexec.c) */
3997                     regnode *nxt = oscan + NEXT_OFF(oscan);
3998
3999                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4000                         nxt += ARG(nxt);
4001                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4002                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4003                 }
4004                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4005                     pars++;
4006                 if (flags & SCF_DO_SUBSTR) {
4007                     SV *last_str = NULL;
4008                     int counted = mincount != 0;
4009
4010                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4011 #if defined(SPARC64_GCC_WORKAROUND)
4012                         I32 b = 0;
4013                         STRLEN l = 0;
4014                         const char *s = NULL;
4015                         I32 old = 0;
4016
4017                         if (pos_before >= data->last_start_min)
4018                             b = pos_before;
4019                         else
4020                             b = data->last_start_min;
4021
4022                         l = 0;
4023                         s = SvPV_const(data->last_found, l);
4024                         old = b - data->last_start_min;
4025
4026 #else
4027                         I32 b = pos_before >= data->last_start_min
4028                             ? pos_before : data->last_start_min;
4029                         STRLEN l;
4030                         const char * const s = SvPV_const(data->last_found, l);
4031                         I32 old = b - data->last_start_min;
4032 #endif
4033
4034                         if (UTF)
4035                             old = utf8_hop((U8*)s, old) - (U8*)s;
4036                         l -= old;
4037                         /* Get the added string: */
4038                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4039                         if (deltanext == 0 && pos_before == b) {
4040                             /* What was added is a constant string */
4041                             if (mincount > 1) {
4042                                 SvGROW(last_str, (mincount * l) + 1);
4043                                 repeatcpy(SvPVX(last_str) + l,
4044                                           SvPVX_const(last_str), l, mincount - 1);
4045                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4046                                 /* Add additional parts. */
4047                                 SvCUR_set(data->last_found,
4048                                           SvCUR(data->last_found) - l);
4049                                 sv_catsv(data->last_found, last_str);
4050                                 {
4051                                     SV * sv = data->last_found;
4052                                     MAGIC *mg =
4053                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4054                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4055                                     if (mg && mg->mg_len >= 0)
4056                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4057                                 }
4058                                 data->last_end += l * (mincount - 1);
4059                             }
4060                         } else {
4061                             /* start offset must point into the last copy */
4062                             data->last_start_min += minnext * (mincount - 1);
4063                             data->last_start_max += is_inf ? I32_MAX
4064                                 : (maxcount - 1) * (minnext + data->pos_delta);
4065                         }
4066                     }
4067                     /* It is counted once already... */
4068                     data->pos_min += minnext * (mincount - counted);
4069                     data->pos_delta += - counted * deltanext +
4070                         (minnext + deltanext) * maxcount - minnext * mincount;
4071                     if (mincount != maxcount) {
4072                          /* Cannot extend fixed substrings found inside
4073                             the group.  */
4074                         SCAN_COMMIT(pRExC_state,data,minlenp);
4075                         if (mincount && last_str) {
4076                             SV * const sv = data->last_found;
4077                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4078                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4079
4080                             if (mg)
4081                                 mg->mg_len = -1;
4082                             sv_setsv(sv, last_str);
4083                             data->last_end = data->pos_min;
4084                             data->last_start_min =
4085                                 data->pos_min - CHR_SVLEN(last_str);
4086                             data->last_start_max = is_inf
4087                                 ? I32_MAX
4088                                 : data->pos_min + data->pos_delta
4089                                 - CHR_SVLEN(last_str);
4090                         }
4091                         data->longest = &(data->longest_float);
4092                     }
4093                     SvREFCNT_dec(last_str);
4094                 }
4095                 if (data && (fl & SF_HAS_EVAL))
4096                     data->flags |= SF_HAS_EVAL;
4097               optimize_curly_tail:
4098                 if (OP(oscan) != CURLYX) {
4099                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4100                            && NEXT_OFF(next))
4101                         NEXT_OFF(oscan) += NEXT_OFF(next);
4102                 }
4103                 continue;
4104             default:                    /* REF, ANYOFV, and CLUMP only? */
4105                 if (flags & SCF_DO_SUBSTR) {
4106                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4107                     data->longest = &(data->longest_float);
4108                 }
4109                 is_inf = is_inf_internal = 1;
4110                 if (flags & SCF_DO_STCLASS_OR)
4111                     cl_anything(pRExC_state, data->start_class);
4112                 flags &= ~SCF_DO_STCLASS;
4113                 break;
4114             }
4115         }
4116         else if (OP(scan) == LNBREAK) {
4117             if (flags & SCF_DO_STCLASS) {
4118                 int value = 0;
4119                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4120                 if (flags & SCF_DO_STCLASS_AND) {
4121                     for (value = 0; value < 256; value++)
4122                         if (!is_VERTWS_cp(value))
4123                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4124                 }
4125                 else {
4126                     for (value = 0; value < 256; value++)
4127                         if (is_VERTWS_cp(value))
4128                             ANYOF_BITMAP_SET(data->start_class, value);
4129                 }
4130                 if (flags & SCF_DO_STCLASS_OR)
4131                     cl_and(data->start_class, and_withp);
4132                 flags &= ~SCF_DO_STCLASS;
4133             }
4134             min++;
4135             delta++;    /* Because of the 2 char string cr-lf */
4136             if (flags & SCF_DO_SUBSTR) {
4137                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4138                 data->pos_min += 1;
4139                 data->pos_delta += 1;
4140                 data->longest = &(data->longest_float);
4141             }
4142         }
4143         else if (REGNODE_SIMPLE(OP(scan))) {
4144             int value = 0;
4145
4146             if (flags & SCF_DO_SUBSTR) {
4147                 SCAN_COMMIT(pRExC_state,data,minlenp);
4148                 data->pos_min++;
4149             }
4150             min++;
4151             if (flags & SCF_DO_STCLASS) {
4152                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4153
4154                 /* Some of the logic below assumes that switching
4155                    locale on will only add false positives. */
4156                 switch (PL_regkind[OP(scan)]) {
4157                 case SANY:
4158                 default:
4159                   do_default:
4160                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4161                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4162                         cl_anything(pRExC_state, data->start_class);
4163                     break;
4164                 case REG_ANY:
4165                     if (OP(scan) == SANY)
4166                         goto do_default;
4167                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4168                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4169                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4170                         cl_anything(pRExC_state, data->start_class);
4171                     }
4172                     if (flags & SCF_DO_STCLASS_AND || !value)
4173                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4174                     break;
4175                 case ANYOF:
4176                     if (flags & SCF_DO_STCLASS_AND)
4177                         cl_and(data->start_class,
4178                                (struct regnode_charclass_class*)scan);
4179                     else
4180                         cl_or(pRExC_state, data->start_class,
4181                               (struct regnode_charclass_class*)scan);
4182                     break;
4183                 case ALNUM:
4184                     if (flags & SCF_DO_STCLASS_AND) {
4185                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4186                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
4187                             if (OP(scan) == ALNUMU) {
4188                                 for (value = 0; value < 256; value++) {
4189                                     if (!isWORDCHAR_L1(value)) {
4190                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4191                                     }
4192                                 }
4193                             } else {
4194                                 for (value = 0; value < 256; value++) {
4195                                     if (!isALNUM(value)) {
4196                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4197                                     }
4198                                 }
4199                             }
4200                         }
4201                     }
4202                     else {
4203                         if (data->start_class->flags & ANYOF_LOCALE)
4204                             ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
4205
4206                         /* Even if under locale, set the bits for non-locale
4207                          * in case it isn't a true locale-node.  This will
4208                          * create false positives if it truly is locale */
4209                         if (OP(scan) == ALNUMU) {
4210                             for (value = 0; value < 256; value++) {
4211                                 if (isWORDCHAR_L1(value)) {
4212                                     ANYOF_BITMAP_SET(data->start_class, value);
4213                                 }
4214                             }
4215                         } else {
4216                             for (value = 0; value < 256; value++) {
4217                                 if (isALNUM(value)) {
4218                                     ANYOF_BITMAP_SET(data->start_class, value);
4219                                 }
4220                             }
4221                         }
4222                     }
4223                     break;
4224                 case NALNUM:
4225                     if (flags & SCF_DO_STCLASS_AND) {
4226                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4227                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
4228                             if (OP(scan) == NALNUMU) {
4229                                 for (value = 0; value < 256; value++) {
4230                                     if (isWORDCHAR_L1(value)) {
4231                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4232                                     }
4233                                 }
4234                             } else {
4235                                 for (value = 0; value < 256; value++) {
4236                                     if (isALNUM(value)) {
4237                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4238                                     }
4239                                 }
4240                             }
4241                         }
4242                     }
4243                     else {
4244                         if (data->start_class->flags & ANYOF_LOCALE)
4245                             ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
4246
4247                         /* Even if under locale, set the bits for non-locale in
4248                          * case it isn't a true locale-node.  This will create
4249                          * false positives if it truly is locale */
4250                         if (OP(scan) == NALNUMU) {
4251                             for (value = 0; value < 256; value++) {
4252                                 if (! isWORDCHAR_L1(value)) {
4253                                     ANYOF_BITMAP_SET(data->start_class, value);
4254                                 }
4255                             }
4256                         } else {
4257                             for (value = 0; value < 256; value++) {
4258                                 if (! isALNUM(value)) {
4259                                     ANYOF_BITMAP_SET(data->start_class, value);
4260                                 }
4261                             }
4262                         }
4263                     }
4264                     break;
4265                 case SPACE:
4266                     if (flags & SCF_DO_STCLASS_AND) {
4267                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4268                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4269                             if (OP(scan) == SPACEU) {
4270                                 for (value = 0; value < 256; value++) {
4271                                     if (!isSPACE_L1(value)) {
4272                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4273                                     }
4274                                 }
4275                             } else {
4276                                 for (value = 0; value < 256; value++) {
4277                                     if (!isSPACE(value)) {
4278                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4279                                     }
4280                                 }
4281                             }
4282                         }
4283                     }
4284                     else {
4285                         if (data->start_class->flags & ANYOF_LOCALE) {
4286                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4287                         }
4288                         if (OP(scan) == SPACEU) {
4289                             for (value = 0; value < 256; value++) {
4290                                 if (isSPACE_L1(value)) {
4291                                     ANYOF_BITMAP_SET(data->start_class, value);
4292                                 }
4293                             }
4294                         } else {
4295                             for (value = 0; value < 256; value++) {
4296                                 if (isSPACE(value)) {
4297                                     ANYOF_BITMAP_SET(data->start_class, value);
4298                                 }
4299                             }
4300                         }
4301                     }
4302                     break;
4303                 case NSPACE:
4304                     if (flags & SCF_DO_STCLASS_AND) {
4305                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4306                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4307                             if (OP(scan) == NSPACEU) {
4308                                 for (value = 0; value < 256; value++) {
4309                                     if (isSPACE_L1(value)) {
4310                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4311                                     }
4312                                 }
4313                             } else {
4314                                 for (value = 0; value < 256; value++) {
4315                                     if (isSPACE(value)) {
4316                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4317                                     }
4318                                 }
4319                             }
4320                         }
4321                     }
4322                     else {
4323                         if (data->start_class->flags & ANYOF_LOCALE)
4324                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4325                         if (OP(scan) == NSPACEU) {
4326                             for (value = 0; value < 256; value++) {
4327                                 if (!isSPACE_L1(value)) {
4328                                     ANYOF_BITMAP_SET(data->start_class, value);
4329                                 }
4330                             }
4331                         }
4332                         else {
4333                             for (value = 0; value < 256; value++) {
4334                                 if (!isSPACE(value)) {
4335                                     ANYOF_BITMAP_SET(data->start_class, value);
4336                                 }
4337                             }
4338                         }
4339                     }
4340                     break;
4341                 case DIGIT:
4342                     if (flags & SCF_DO_STCLASS_AND) {
4343                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4344                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4345                             for (value = 0; value < 256; value++)
4346                                 if (!isDIGIT(value))
4347                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4348                         }
4349                     }
4350                     else {
4351                         if (data->start_class->flags & ANYOF_LOCALE)
4352                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4353                         for (value = 0; value < 256; value++)
4354                             if (isDIGIT(value))
4355                                 ANYOF_BITMAP_SET(data->start_class, value);
4356                     }
4357                     break;
4358                 case NDIGIT:
4359                     if (flags & SCF_DO_STCLASS_AND) {
4360                         if (!(data->start_class->flags & ANYOF_LOCALE))
4361                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4362                         for (value = 0; value < 256; value++)
4363                             if (isDIGIT(value))
4364                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4365                     }
4366                     else {
4367                         if (data->start_class->flags & ANYOF_LOCALE)
4368                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4369                         for (value = 0; value < 256; value++)
4370                             if (!isDIGIT(value))
4371                                 ANYOF_BITMAP_SET(data->start_class, value);
4372                     }
4373                     break;
4374                 CASE_SYNST_FNC(VERTWS);
4375                 CASE_SYNST_FNC(HORIZWS);
4376
4377                 }
4378                 if (flags & SCF_DO_STCLASS_OR)
4379                     cl_and(data->start_class, and_withp);
4380                 flags &= ~SCF_DO_STCLASS;
4381             }
4382         }
4383         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4384             data->flags |= (OP(scan) == MEOL
4385                             ? SF_BEFORE_MEOL
4386                             : SF_BEFORE_SEOL);
4387             SCAN_COMMIT(pRExC_state, data, minlenp);
4388
4389         }
4390         else if (  PL_regkind[OP(scan)] == BRANCHJ
4391                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4392                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4393                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4394             if ( OP(scan) == UNLESSM &&
4395                  scan->flags == 0 &&
4396                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4397                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4398             ) {
4399                 regnode *opt;
4400                 regnode *upto= regnext(scan);
4401                 DEBUG_PARSE_r({
4402                     SV * const mysv_val=sv_newmortal();
4403                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4404
4405                     /*DEBUG_PARSE_MSG("opfail");*/
4406                     regprop(RExC_rx, mysv_val, upto);
4407                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4408                                   SvPV_nolen_const(mysv_val),
4409                                   (IV)REG_NODE_NUM(upto),
4410                                   (IV)(upto - scan)
4411                     );
4412                 });
4413                 OP(scan) = OPFAIL;
4414                 NEXT_OFF(scan) = upto - scan;
4415                 for (opt= scan + 1; opt < upto ; opt++)
4416                     OP(opt) = OPTIMIZED;
4417                 scan= upto;
4418                 continue;
4419             }
4420             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4421                 || OP(scan) == UNLESSM )
4422             {
4423                 /* Negative Lookahead/lookbehind
4424                    In this case we can't do fixed string optimisation.
4425                 */
4426
4427                 I32 deltanext, minnext, fake = 0;
4428                 regnode *nscan;
4429                 struct regnode_charclass_class intrnl;
4430                 int f = 0;
4431
4432                 data_fake.flags = 0;
4433                 if (data) {
4434                     data_fake.whilem_c = data->whilem_c;
4435                     data_fake.last_closep = data->last_closep;
4436                 }
4437                 else
4438                     data_fake.last_closep = &fake;
4439                 data_fake.pos_delta = delta;
4440                 if ( flags & SCF_DO_STCLASS && !scan->flags
4441                      && OP(scan) == IFMATCH ) { /* Lookahead */
4442                     cl_init(pRExC_state, &intrnl);
4443                     data_fake.start_class = &intrnl;
4444                     f |= SCF_DO_STCLASS_AND;
4445                 }
4446                 if (flags & SCF_WHILEM_VISITED_POS)
4447                     f |= SCF_WHILEM_VISITED_POS;
4448                 next = regnext(scan);
4449                 nscan = NEXTOPER(NEXTOPER(scan));
4450                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4451                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4452                 if (scan->flags) {
4453                     if (deltanext) {
4454                         FAIL("Variable length lookbehind not implemented");
4455                     }
4456                     else if (minnext > (I32)U8_MAX) {
4457                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4458                     }
4459                     scan->flags = (U8)minnext;
4460                 }
4461                 if (data) {
4462                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4463                         pars++;
4464                     if (data_fake.flags & SF_HAS_EVAL)
4465                         data->flags |= SF_HAS_EVAL;
4466                     data->whilem_c = data_fake.whilem_c;
4467                 }
4468                 if (f & SCF_DO_STCLASS_AND) {
4469                     if (flags & SCF_DO_STCLASS_OR) {
4470                         /* OR before, AND after: ideally we would recurse with
4471                          * data_fake to get the AND applied by study of the
4472                          * remainder of the pattern, and then derecurse;
4473                          * *** HACK *** for now just treat as "no information".
4474                          * See [perl #56690].
4475                          */
4476                         cl_init(pRExC_state, data->start_class);
4477                     }  else {
4478                         /* AND before and after: combine and continue */
4479                         const int was = (data->start_class->flags & ANYOF_EOS);
4480
4481                         cl_and(data->start_class, &intrnl);
4482                         if (was)
4483                             data->start_class->flags |= ANYOF_EOS;
4484                     }
4485                 }
4486             }
4487 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4488             else {
4489                 /* Positive Lookahead/lookbehind
4490                    In this case we can do fixed string optimisation,
4491                    but we must be careful about it. Note in the case of
4492                    lookbehind the positions will be offset by the minimum
4493                    length of the pattern, something we won't know about
4494                    until after the recurse.
4495                 */
4496                 I32 deltanext, fake = 0;
4497                 regnode *nscan;
4498                 struct regnode_charclass_class intrnl;
4499                 int f = 0;
4500                 /* We use SAVEFREEPV so that when the full compile
4501                     is finished perl will clean up the allocated
4502                     minlens when it's all done. This way we don't
4503                     have to worry about freeing them when we know
4504                     they wont be used, which would be a pain.
4505                  */
4506                 I32 *minnextp;
4507                 Newx( minnextp, 1, I32 );
4508                 SAVEFREEPV(minnextp);
4509
4510                 if (data) {
4511                     StructCopy(data, &data_fake, scan_data_t);
4512                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4513                         f |= SCF_DO_SUBSTR;
4514                         if (scan->flags)
4515                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4516                         data_fake.last_found=newSVsv(data->last_found);
4517                     }
4518                 }
4519                 else
4520                     data_fake.last_closep = &fake;
4521                 data_fake.flags = 0;
4522                 data_fake.pos_delta = delta;
4523                 if (is_inf)
4524                     data_fake.flags |= SF_IS_INF;
4525                 if ( flags & SCF_DO_STCLASS && !scan->flags
4526                      && OP(scan) == IFMATCH ) { /* Lookahead */
4527                     cl_init(pRExC_state, &intrnl);
4528                     data_fake.start_class = &intrnl;
4529                     f |= SCF_DO_STCLASS_AND;
4530                 }
4531                 if (flags & SCF_WHILEM_VISITED_POS)
4532                     f |= SCF_WHILEM_VISITED_POS;
4533                 next = regnext(scan);
4534                 nscan = NEXTOPER(NEXTOPER(scan));
4535
4536                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4537                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4538                 if (scan->flags) {
4539                     if (deltanext) {
4540                         FAIL("Variable length lookbehind not implemented");
4541                     }
4542                     else if (*minnextp > (I32)U8_MAX) {
4543                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4544                     }
4545                     scan->flags = (U8)*minnextp;
4546                 }
4547
4548                 *minnextp += min;
4549
4550                 if (f & SCF_DO_STCLASS_AND) {
4551                     const int was = (data->start_class->flags & ANYOF_EOS);
4552
4553                     cl_and(data->start_class, &intrnl);
4554                     if (was)
4555                         data->start_class->flags |= ANYOF_EOS;
4556                 }
4557                 if (data) {
4558                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4559                         pars++;
4560                     if (data_fake.flags & SF_HAS_EVAL)
4561                         data->flags |= SF_HAS_EVAL;
4562                     data->whilem_c = data_fake.whilem_c;
4563                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4564                         if (RExC_rx->minlen<*minnextp)
4565                             RExC_rx->minlen=*minnextp;
4566                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4567                         SvREFCNT_dec(data_fake.last_found);
4568
4569                         if ( data_fake.minlen_fixed != minlenp )
4570                         {
4571                             data->offset_fixed= data_fake.offset_fixed;
4572                             data->minlen_fixed= data_fake.minlen_fixed;
4573                             data->lookbehind_fixed+= scan->flags;
4574                         }
4575                         if ( data_fake.minlen_float != minlenp )
4576                         {
4577                             data->minlen_float= data_fake.minlen_float;
4578                             data->offset_float_min=data_fake.offset_float_min;
4579                             data->offset_float_max=data_fake.offset_float_max;
4580                             data->lookbehind_float+= scan->flags;
4581                         }
4582                     }
4583                 }
4584             }
4585 #endif
4586         }
4587         else if (OP(scan) == OPEN) {
4588             if (stopparen != (I32)ARG(scan))
4589                 pars++;
4590         }
4591         else if (OP(scan) == CLOSE) {
4592             if (stopparen == (I32)ARG(scan)) {
4593                 break;
4594             }
4595             if ((I32)ARG(scan) == is_par) {
4596                 next = regnext(scan);
4597
4598                 if ( next && (OP(next) != WHILEM) && next < last)
4599                     is_par = 0;         /* Disable optimization */
4600             }
4601             if (data)
4602                 *(data->last_closep) = ARG(scan);
4603         }
4604         else if (OP(scan) == EVAL) {
4605                 if (data)
4606                     data->flags |= SF_HAS_EVAL;
4607         }
4608         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4609             if (flags & SCF_DO_SUBSTR) {
4610                 SCAN_COMMIT(pRExC_state,data,minlenp);
4611                 flags &= ~SCF_DO_SUBSTR;
4612             }
4613             if (data && OP(scan)==ACCEPT) {
4614                 data->flags |= SCF_SEEN_ACCEPT;
4615                 if (stopmin > min)
4616                     stopmin = min;
4617             }
4618         }
4619         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4620         {
4621                 if (flags & SCF_DO_SUBSTR) {
4622                     SCAN_COMMIT(pRExC_state,data,minlenp);
4623                     data->longest = &(data->longest_float);
4624                 }
4625                 is_inf = is_inf_internal = 1;
4626                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4627                     cl_anything(pRExC_state, data->start_class);
4628                 flags &= ~SCF_DO_STCLASS;
4629         }
4630         else if (OP(scan) == GPOS) {
4631             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4632                 !(delta || is_inf || (data && data->pos_delta)))
4633             {
4634                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4635                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4636                 if (RExC_rx->gofs < (U32)min)
4637                     RExC_rx->gofs = min;
4638             } else {
4639                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4640                 RExC_rx->gofs = 0;
4641             }
4642         }
4643 #ifdef TRIE_STUDY_OPT
4644 #ifdef FULL_TRIE_STUDY
4645         else if (PL_regkind[OP(scan)] == TRIE) {
4646             /* NOTE - There is similar code to this block above for handling
4647                BRANCH nodes on the initial study.  If you change stuff here
4648                check there too. */
4649             regnode *trie_node= scan;
4650             regnode *tail= regnext(scan);
4651             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4652             I32 max1 = 0, min1 = I32_MAX;
4653             struct regnode_charclass_class accum;
4654
4655             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4656                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4657             if (flags & SCF_DO_STCLASS)
4658                 cl_init_zero(pRExC_state, &accum);
4659
4660             if (!trie->jump) {
4661                 min1= trie->minlen;
4662                 max1= trie->maxlen;
4663             } else {
4664                 const regnode *nextbranch= NULL;
4665                 U32 word;
4666
4667                 for ( word=1 ; word <= trie->wordcount ; word++)
4668                 {
4669                     I32 deltanext=0, minnext=0, f = 0, fake;
4670                     struct regnode_charclass_class this_class;
4671
4672                     data_fake.flags = 0;
4673                     if (data) {
4674                         data_fake.whilem_c = data->whilem_c;
4675                         data_fake.last_closep = data->last_closep;
4676                     }
4677                     else
4678                         data_fake.last_closep = &fake;
4679                     data_fake.pos_delta = delta;
4680                     if (flags & SCF_DO_STCLASS) {
4681                         cl_init(pRExC_state, &this_class);
4682                         data_fake.start_class = &this_class;
4683                         f = SCF_DO_STCLASS_AND;
4684                     }
4685                     if (flags & SCF_WHILEM_VISITED_POS)
4686                         f |= SCF_WHILEM_VISITED_POS;
4687
4688                     if (trie->jump[word]) {
4689                         if (!nextbranch)
4690                             nextbranch = trie_node + trie->jump[0];
4691                         scan= trie_node + trie->jump[word];
4692                         /* We go from the jump point to the branch that follows
4693                            it. Note this means we need the vestigal unused branches
4694                            even though they arent otherwise used.
4695                          */
4696                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4697                             &deltanext, (regnode *)nextbranch, &data_fake,
4698                             stopparen, recursed, NULL, f,depth+1);
4699                     }
4700                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4701                         nextbranch= regnext((regnode*)nextbranch);
4702
4703                     if (min1 > (I32)(minnext + trie->minlen))
4704                         min1 = minnext + trie->minlen;
4705                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4706                         max1 = minnext + deltanext + trie->maxlen;
4707                     if (deltanext == I32_MAX)
4708                         is_inf = is_inf_internal = 1;
4709
4710                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4711                         pars++;
4712                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4713                         if ( stopmin > min + min1)
4714                             stopmin = min + min1;
4715                         flags &= ~SCF_DO_SUBSTR;
4716                         if (data)
4717                             data->flags |= SCF_SEEN_ACCEPT;
4718                     }
4719                     if (data) {
4720                         if (data_fake.flags & SF_HAS_EVAL)
4721                             data->flags |= SF_HAS_EVAL;
4722                         data->whilem_c = data_fake.whilem_c;
4723                     }
4724                     if (flags & SCF_DO_STCLASS)
4725                         cl_or(pRExC_state, &accum, &this_class);
4726                 }
4727             }
4728             if (flags & SCF_DO_SUBSTR) {
4729                 data->pos_min += min1;
4730                 data->pos_delta += max1 - min1;
4731                 if (max1 != min1 || is_inf)
4732                     data->longest = &(data->longest_float);
4733             }
4734             min += min1;
4735             delta += max1 - min1;
4736             if (flags & SCF_DO_STCLASS_OR) {
4737                 cl_or(pRExC_state, data->start_class, &accum);
4738                 if (min1) {
4739                     cl_and(data->start_class, and_withp);
4740                     flags &= ~SCF_DO_STCLASS;
4741                 }
4742             }
4743             else if (flags & SCF_DO_STCLASS_AND) {
4744                 if (min1) {
4745                     cl_and(data->start_class, &accum);
4746                     flags &= ~SCF_DO_STCLASS;
4747                 }
4748                 else {
4749                     /* Switch to OR mode: cache the old value of
4750                      * data->start_class */
4751                     INIT_AND_WITHP;
4752                     StructCopy(data->start_class, and_withp,
4753                                struct regnode_charclass_class);
4754                     flags &= ~SCF_DO_STCLASS_AND;
4755                     StructCopy(&accum, data->start_class,
4756                                struct regnode_charclass_class);
4757                     flags |= SCF_DO_STCLASS_OR;
4758                     data->start_class->flags |= ANYOF_EOS;
4759                 }
4760             }
4761             scan= tail;
4762             continue;
4763         }
4764 #else
4765         else if (PL_regkind[OP(scan)] == TRIE) {
4766             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4767             U8*bang=NULL;
4768
4769             min += trie->minlen;
4770             delta += (trie->maxlen - trie->minlen);
4771             flags &= ~SCF_DO_STCLASS; /* xxx */
4772             if (flags & SCF_DO_SUBSTR) {
4773                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4774                 data->pos_min += trie->minlen;
4775                 data->pos_delta += (trie->maxlen - trie->minlen);
4776                 if (trie->maxlen != trie->minlen)
4777                     data->longest = &(data->longest_float);
4778             }
4779             if (trie->jump) /* no more substrings -- for now /grr*/
4780                 flags &= ~SCF_DO_SUBSTR;
4781         }
4782 #endif /* old or new */
4783 #endif /* TRIE_STUDY_OPT */
4784
4785         /* Else: zero-length, ignore. */
4786         scan = regnext(scan);
4787     }
4788     if (frame) {
4789         last = frame->last;
4790         scan = frame->next;
4791         stopparen = frame->stop;
4792         frame = frame->prev;
4793         goto fake_study_recurse;
4794     }
4795
4796   finish:
4797     assert(!frame);
4798     DEBUG_STUDYDATA("pre-fin:",data,depth);
4799
4800     *scanp = scan;
4801     *deltap = is_inf_internal ? I32_MAX : delta;
4802     if (flags & SCF_DO_SUBSTR && is_inf)
4803         data->pos_delta = I32_MAX - data->pos_min;
4804     if (is_par > (I32)U8_MAX)
4805         is_par = 0;
4806     if (is_par && pars==1 && data) {
4807         data->flags |= SF_IN_PAR;
4808         data->flags &= ~SF_HAS_PAR;
4809     }
4810     else if (pars && data) {
4811         data->flags |= SF_HAS_PAR;
4812         data->flags &= ~SF_IN_PAR;
4813     }
4814     if (flags & SCF_DO_STCLASS_OR)
4815         cl_and(data->start_class, and_withp);
4816     if (flags & SCF_TRIE_RESTUDY)
4817         data->flags |=  SCF_TRIE_RESTUDY;
4818
4819     DEBUG_STUDYDATA("post-fin:",data,depth);
4820
4821     return min < stopmin ? min : stopmin;
4822 }
4823
4824 STATIC U32
4825 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4826 {
4827     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4828
4829     PERL_ARGS_ASSERT_ADD_DATA;
4830
4831     Renewc(RExC_rxi->data,
4832            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4833            char, struct reg_data);
4834     if(count)
4835         Renew(RExC_rxi->data->what, count + n, U8);
4836     else
4837         Newx(RExC_rxi->data->what, n, U8);
4838     RExC_rxi->data->count = count + n;
4839     Copy(s, RExC_rxi->data->what + count, n, U8);
4840     return count;
4841 }
4842
4843 /*XXX: todo make this not included in a non debugging perl */
4844 #ifndef PERL_IN_XSUB_RE
4845 void
4846 Perl_reginitcolors(pTHX)
4847 {
4848     dVAR;
4849     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4850     if (s) {
4851         char *t = savepv(s);
4852         int i = 0;
4853         PL_colors[0] = t;
4854         while (++i < 6) {
4855             t = strchr(t, '\t');
4856             if (t) {
4857                 *t = '\0';
4858                 PL_colors[i] = ++t;
4859             }
4860             else
4861                 PL_colors[i] = t = (char *)"";
4862         }
4863     } else {
4864         int i = 0;
4865         while (i < 6)
4866             PL_colors[i++] = (char *)"";
4867     }
4868     PL_colorset = 1;
4869 }
4870 #endif
4871
4872
4873 #ifdef TRIE_STUDY_OPT
4874 #define CHECK_RESTUDY_GOTO                                  \
4875         if (                                                \
4876               (data.flags & SCF_TRIE_RESTUDY)               \
4877               && ! restudied++                              \
4878         )     goto reStudy
4879 #else
4880 #define CHECK_RESTUDY_GOTO
4881 #endif
4882
4883 /*
4884  * pregcomp - compile a regular expression into internal code
4885  *
4886  * Decides which engine's compiler to call based on the hint currently in
4887  * scope
4888  */
4889
4890 #ifndef PERL_IN_XSUB_RE
4891
4892 /* return the currently in-scope regex engine (or the default if none)  */
4893
4894 regexp_engine const *
4895 Perl_current_re_engine(pTHX)
4896 {
4897     dVAR;
4898
4899     if (IN_PERL_COMPILETIME) {
4900         HV * const table = GvHV(PL_hintgv);
4901         SV **ptr;
4902
4903         if (!table)
4904             return &reh_regexp_engine;
4905         ptr = hv_fetchs(table, "regcomp", FALSE);
4906         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4907             return &reh_regexp_engine;
4908         return INT2PTR(regexp_engine*,SvIV(*ptr));
4909     }
4910     else {
4911         SV *ptr;
4912         if (!PL_curcop->cop_hints_hash)
4913             return &reh_regexp_engine;
4914         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4915         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4916             return &reh_regexp_engine;
4917         return INT2PTR(regexp_engine*,SvIV(ptr));
4918     }
4919 }
4920
4921
4922 REGEXP *
4923 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4924 {
4925     dVAR;
4926     regexp_engine const *eng = current_re_engine();
4927     GET_RE_DEBUG_FLAGS_DECL;
4928
4929     PERL_ARGS_ASSERT_PREGCOMP;
4930
4931     /* Dispatch a request to compile a regexp to correct regexp engine. */
4932     DEBUG_COMPILE_r({
4933         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4934                         PTR2UV(eng));
4935     });
4936     return CALLREGCOMP_ENG(eng, pattern, flags);
4937 }
4938 #endif
4939
4940 /* public(ish) entry point for the perl core's own regex compiling code.
4941  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4942  * pattern rather than a list of OPs, and uses the internal engine rather
4943  * than the current one */
4944
4945 REGEXP *
4946 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4947 {
4948     SV *pat = pattern; /* defeat constness! */
4949     PERL_ARGS_ASSERT_RE_COMPILE;
4950     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4951 #ifdef PERL_IN_XSUB_RE
4952                                 &my_reg_engine,
4953 #else
4954                                 &reh_regexp_engine,
4955 #endif
4956                                 NULL, NULL, rx_flags, 0);
4957 }
4958
4959 /* see if there are any run-time code blocks in the pattern.
4960  * False positives are allowed */
4961
4962 static bool
4963 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4964                     U32 pm_flags, char *pat, STRLEN plen)
4965 {
4966     int n = 0;
4967     STRLEN s;
4968
4969     /* avoid infinitely recursing when we recompile the pattern parcelled up
4970      * as qr'...'. A single constant qr// string can't have have any
4971      * run-time component in it, and thus, no runtime code. (A non-qr
4972      * string, however, can, e.g. $x =~ '(?{})') */
4973     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4974         return 0;
4975
4976     for (s = 0; s < plen; s++) {
4977         if (n < pRExC_state->num_code_blocks
4978             && s == pRExC_state->code_blocks[n].start)
4979         {
4980             s = pRExC_state->code_blocks[n].end;
4981             n++;
4982             continue;
4983         }
4984         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4985          * positives here */
4986         if (pat[s] == '(' && pat[s+1] == '?' &&
4987             (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4988         )
4989             return 1;
4990     }
4991     return 0;
4992 }
4993
4994 /* Handle run-time code blocks. We will already have compiled any direct
4995  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4996  * copy of it, but with any literal code blocks blanked out and
4997  * appropriate chars escaped; then feed it into
4998  *
4999  *    eval "qr'modified_pattern'"
5000  *
5001  * For example,
5002  *
5003  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5004  *
5005  * becomes
5006  *
5007  *    qr'a\\bc                       def\'ghi\\\\jkl(?{"this is runtime"})mno'
5008  *
5009  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5010  * and merge them with any code blocks of the original regexp.
5011  *
5012  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5013  * instead, just save the qr and return FALSE; this tells our caller that
5014  * the original pattern needs upgrading to utf8.
5015  */
5016
5017 static bool
5018 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5019     char *pat, STRLEN plen)
5020 {
5021     SV *qr;
5022
5023     GET_RE_DEBUG_FLAGS_DECL;
5024
5025     if (pRExC_state->runtime_code_qr) {
5026         /* this is the second time we've been called; this should
5027          * only happen if the main pattern got upgraded to utf8
5028          * during compilation; re-use the qr we compiled first time
5029          * round (which should be utf8 too)
5030          */
5031         qr = pRExC_state->runtime_code_qr;
5032         pRExC_state->runtime_code_qr = NULL;
5033         assert(RExC_utf8 && SvUTF8(qr));
5034     }
5035     else {
5036         int n = 0;
5037         STRLEN s;
5038         char *p, *newpat;
5039         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5040         SV *sv, *qr_ref;
5041         dSP;
5042
5043         /* determine how many extra chars we need for ' and \ escaping */
5044         for (s = 0; s < plen; s++) {
5045             if (pat[s] == '\'' || pat[s] == '\\')
5046                 newlen++;
5047         }
5048
5049         Newx(newpat, newlen, char);
5050         p = newpat;
5051         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5052
5053         for (s = 0; s < plen; s++) {
5054             if (n < pRExC_state->num_code_blocks
5055                 && s == pRExC_state->code_blocks[n].start)
5056             {
5057                 /* blank out literal code block */
5058                 assert(pat[s] == '(');
5059                 while (s <= pRExC_state->code_blocks[n].end) {
5060                     *p++ = ' ';
5061                     s++;
5062                 }
5063                 s--;
5064                 n++;
5065                 continue;
5066             }
5067             if (pat[s] == '\'' || pat[s] == '\\')
5068                 *p++ = '\\';
5069             *p++ = pat[s];
5070         }
5071         *p++ = '\'';
5072         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5073             *p++ = 'x';
5074         *p++ = '\0';
5075         DEBUG_COMPILE_r({
5076             PerlIO_printf(Perl_debug_log,
5077                 "%sre-parsing pattern for runtime code:%s %s\n",
5078                 PL_colors[4],PL_colors[5],newpat);
5079         });
5080
5081         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5082         Safefree(newpat);
5083
5084         ENTER;
5085         SAVETMPS;
5086         save_re_context();
5087         PUSHSTACKi(PERLSI_REQUIRE);
5088         /* this causes the toker to collapse \\ into \ when parsing
5089          * qr''; normally only q'' does this. It also alters hints
5090          * handling */
5091         PL_reg_state.re_reparsing = TRUE;
5092         eval_sv(sv, G_SCALAR);
5093         SvREFCNT_dec(sv);
5094         SPAGAIN;
5095         qr_ref = POPs;
5096         PUTBACK;
5097         if (SvTRUE(ERRSV))
5098             Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5099         assert(SvROK(qr_ref));
5100         qr = SvRV(qr_ref);
5101         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5102         /* the leaving below frees the tmp qr_ref.
5103          * Give qr a life of its own */
5104         SvREFCNT_inc(qr);
5105         POPSTACK;
5106         FREETMPS;
5107         LEAVE;
5108
5109     }
5110
5111     if (!RExC_utf8 && SvUTF8(qr)) {
5112         /* first time through; the pattern got upgraded; save the
5113          * qr for the next time through */
5114         assert(!pRExC_state->runtime_code_qr);
5115         pRExC_state->runtime_code_qr = qr;
5116         return 0;
5117     }
5118
5119
5120     /* extract any code blocks within the returned qr//  */
5121
5122
5123     /* merge the main (r1) and run-time (r2) code blocks into one */
5124     {
5125         RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5126         struct reg_code_block *new_block, *dst;
5127         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5128         int i1 = 0, i2 = 0;
5129
5130         if (!r2->num_code_blocks) /* we guessed wrong */
5131             return 1;
5132
5133         Newx(new_block,
5134             r1->num_code_blocks + r2->num_code_blocks,
5135             struct reg_code_block);
5136         dst = new_block;
5137
5138         while (    i1 < r1->num_code_blocks
5139                 || i2 < r2->num_code_blocks)
5140         {
5141             struct reg_code_block *src;
5142             bool is_qr = 0;
5143
5144             if (i1 == r1->num_code_blocks) {
5145                 src = &r2->code_blocks[i2++];
5146                 is_qr = 1;
5147             }
5148             else if (i2 == r2->num_code_blocks)
5149                 src = &r1->code_blocks[i1++];
5150             else if (  r1->code_blocks[i1].start
5151                      < r2->code_blocks[i2].start)
5152             {
5153                 src = &r1->code_blocks[i1++];
5154                 assert(src->end < r2->code_blocks[i2].start);
5155             }
5156             else {
5157                 assert(  r1->code_blocks[i1].start
5158                        > r2->code_blocks[i2].start);
5159                 src = &r2->code_blocks[i2++];
5160                 is_qr = 1;
5161                 assert(src->end < r1->code_blocks[i1].start);
5162             }
5163
5164             assert(pat[src->start] == '(');
5165             assert(pat[src->end]   == ')');
5166             dst->start      = src->start;
5167             dst->end        = src->end;
5168             dst->block      = src->block;
5169             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5170                                     : src->src_regex;
5171             dst++;
5172         }
5173         r1->num_code_blocks += r2->num_code_blocks;
5174         Safefree(r1->code_blocks);
5175         r1->code_blocks = new_block;
5176     }
5177
5178     SvREFCNT_dec(qr);
5179     return 1;
5180 }
5181
5182
5183 STATIC bool
5184 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5185 {
5186     /* This is the common code for setting up the floating and fixed length
5187      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5188      * as to whether succeeded or not */
5189
5190     I32 t,ml;
5191
5192     if (! (longest_length
5193            || (eol /* Can't have SEOL and MULTI */
5194                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5195           )
5196             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5197         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5198     {
5199         return FALSE;
5200     }
5201
5202     /* copy the information about the longest from the reg_scan_data
5203         over to the program. */
5204     if (SvUTF8(sv_longest)) {
5205         *rx_utf8 = sv_longest;
5206         *rx_substr = NULL;
5207     } else {
5208         *rx_substr = sv_longest;
5209         *rx_utf8 = NULL;
5210     }
5211     /* end_shift is how many chars that must be matched that
5212         follow this item. We calculate it ahead of time as once the
5213         lookbehind offset is added in we lose the ability to correctly
5214         calculate it.*/
5215     ml = minlen ? *(minlen) : (I32)longest_length;
5216     *rx_end_shift = ml - offset
5217         - longest_length + (SvTAIL(sv_longest) != 0)
5218         + lookbehind;
5219
5220     t = (eol/* Can't have SEOL and MULTI */
5221          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5222     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5223
5224     return TRUE;
5225 }
5226
5227 /*
5228  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5229  * regular expression into internal code.
5230  * The pattern may be passed either as:
5231  *    a list of SVs (patternp plus pat_count)
5232  *    a list of OPs (expr)
5233  * If both are passed, the SV list is used, but the OP list indicates
5234  * which SVs are actually pre-compiled code blocks
5235  *
5236  * The SVs in the list have magic and qr overloading applied to them (and
5237  * the list may be modified in-place with replacement SVs in the latter
5238  * case).
5239  *
5240  * If the pattern hasn't changed from old_re, then old_re will be
5241  * returned.
5242  *
5243  * eng is the current engine. If that engine has an op_comp method, then
5244  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5245  * do the initial concatenation of arguments and pass on to the external
5246  * engine.
5247  *
5248  * If is_bare_re is not null, set it to a boolean indicating whether the
5249  * arg list reduced (after overloading) to a single bare regex which has
5250  * been returned (i.e. /$qr/).
5251  *
5252  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5253  *
5254  * pm_flags contains the PMf_* flags, typically based on those from the
5255  * pm_flags field of the related PMOP. Currently we're only interested in
5256  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5257  *
5258  * We can't allocate space until we know how big the compiled form will be,
5259  * but we can't compile it (and thus know how big it is) until we've got a
5260  * place to put the code.  So we cheat:  we compile it twice, once with code
5261  * generation turned off and size counting turned on, and once "for real".
5262  * This also means that we don't allocate space until we are sure that the
5263  * thing really will compile successfully, and we never have to move the
5264  * code and thus invalidate pointers into it.  (Note that it has to be in
5265  * one piece because free() must be able to free it all.) [NB: not true in perl]
5266  *
5267  * Beware that the optimization-preparation code in here knows about some
5268  * of the structure of the compiled regexp.  [I'll say.]
5269  */
5270
5271 REGEXP *
5272 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5273                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5274                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5275 {
5276     dVAR;
5277     REGEXP *rx;
5278     struct regexp *r;
5279     regexp_internal *ri;
5280     STRLEN plen;
5281     char  * VOL exp;
5282     char* xend;
5283     regnode *scan;
5284     I32 flags;
5285     I32 minlen = 0;
5286     U32 rx_flags;
5287     SV * VOL pat;
5288
5289     /* these are all flags - maybe they should be turned
5290      * into a single int with different bit masks */
5291     I32 sawlookahead = 0;
5292     I32 sawplus = 0;
5293     I32 sawopen = 0;
5294     bool used_setjump = FALSE;
5295     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5296     bool code_is_utf8 = 0;
5297     bool VOL recompile = 0;
5298     bool runtime_code = 0;
5299     U8 jump_ret = 0;
5300     dJMPENV;
5301     scan_data_t data;
5302     RExC_state_t RExC_state;
5303     RExC_state_t * const pRExC_state = &RExC_state;
5304 #ifdef TRIE_STUDY_OPT
5305     int restudied;
5306     RExC_state_t copyRExC_state;
5307 #endif
5308     GET_RE_DEBUG_FLAGS_DECL;
5309
5310     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5311
5312     DEBUG_r(if (!PL_colorset) reginitcolors());
5313
5314 #ifndef PERL_IN_XSUB_RE
5315     /* Initialize these here instead of as-needed, as is quick and avoids
5316      * having to test them each time otherwise */
5317     if (! PL_AboveLatin1) {
5318         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5319         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5320         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5321
5322         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5323         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5324
5325         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5326         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5327
5328         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5329         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5330
5331         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5332
5333         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5334         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5335
5336         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5337
5338         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5339         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5340
5341         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5342         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5343
5344         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5345         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5346
5347         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5348         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5349
5350         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5351         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5352
5353         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5354         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5355
5356         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5357         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5358
5359         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5360
5361         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5362         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5363
5364         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5365         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5366
5367         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5368     }
5369 #endif
5370
5371     pRExC_state->code_blocks = NULL;
5372     pRExC_state->num_code_blocks = 0;
5373
5374     if (is_bare_re)
5375         *is_bare_re = FALSE;
5376
5377     if (expr && (expr->op_type == OP_LIST ||
5378                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5379
5380         /* is the source UTF8, and how many code blocks are there? */
5381         OP *o;
5382         int ncode = 0;
5383
5384         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5385             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5386                 code_is_utf8 = 1;
5387             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5388                 /* count of DO blocks */
5389                 ncode++;
5390         }
5391         if (ncode) {
5392             pRExC_state->num_code_blocks = ncode;
5393             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5394         }
5395     }
5396
5397     if (pat_count) {
5398         /* handle a list of SVs */
5399
5400         SV **svp;
5401
5402         /* apply magic and RE overloading to each arg */
5403         for (svp = patternp; svp < patternp + pat_count; svp++) {
5404             SV *rx = *svp;
5405             SvGETMAGIC(rx);
5406             if (SvROK(rx) && SvAMAGIC(rx)) {
5407                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5408                 if (sv) {
5409                     if (SvROK(sv))
5410                         sv = SvRV(sv);
5411                     if (SvTYPE(sv) != SVt_REGEXP)
5412                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5413                     *svp = sv;
5414                 }
5415             }
5416         }
5417
5418         if (pat_count > 1) {
5419             /* concat multiple args and find any code block indexes */
5420
5421             OP *o = NULL;
5422             int n = 0;
5423             bool utf8 = 0;
5424             STRLEN orig_patlen = 0;
5425
5426             if (pRExC_state->num_code_blocks) {
5427                 o = cLISTOPx(expr)->op_first;
5428                 assert(o->op_type == OP_PUSHMARK);
5429                 o = o->op_sibling;
5430             }
5431
5432             pat = newSVpvn("", 0);
5433             SAVEFREESV(pat);
5434
5435             /* determine if the pattern is going to be utf8 (needed
5436              * in advance to align code block indices correctly).
5437              * XXX This could fail to be detected for an arg with
5438              * overloading but not concat overloading; but the main effect
5439              * in this obscure case is to need a 'use re eval' for a
5440              * literal code block */
5441             for (svp = patternp; svp < patternp + pat_count; svp++) {
5442                 if (SvUTF8(*svp))
5443                     utf8 = 1;
5444             }
5445             if (utf8)
5446                 SvUTF8_on(pat);
5447
5448             for (svp = patternp; svp < patternp + pat_count; svp++) {
5449                 SV *sv, *msv = *svp;
5450                 SV *rx;
5451                 bool code = 0;
5452                 if (o) {
5453                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5454                         assert(n < pRExC_state->num_code_blocks);
5455                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5456                         pRExC_state->code_blocks[n].block = o;
5457                         pRExC_state->code_blocks[n].src_regex = NULL;
5458                         n++;
5459                         code = 1;
5460                         o = o->op_sibling; /* skip CONST */
5461                         assert(o);
5462                     }
5463                     o = o->op_sibling;;
5464                 }
5465
5466                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5467                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5468                 {
5469                     sv_setsv(pat, sv);
5470                     /* overloading involved: all bets are off over literal
5471                      * code. Pretend we haven't seen it */
5472                     pRExC_state->num_code_blocks -= n;
5473                     n = 0;
5474                     rx = NULL;
5475
5476                 }
5477                 else  {
5478                     while (SvAMAGIC(msv)
5479                             && (sv = AMG_CALLunary(msv, string_amg))
5480                             && sv != msv
5481                             &&  !(   SvROK(msv)
5482                                   && SvROK(sv)
5483                                   && SvRV(msv) == SvRV(sv))
5484                     ) {
5485                         msv = sv;
5486                         SvGETMAGIC(msv);
5487                     }
5488                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5489                         msv = SvRV(msv);
5490                     orig_patlen = SvCUR(pat);
5491                     sv_catsv_nomg(pat, msv);
5492                     rx = msv;
5493                     if (code)
5494                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5495                 }
5496
5497                 /* extract any code blocks within any embedded qr//'s */
5498                 if (rx && SvTYPE(rx) == SVt_REGEXP
5499                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5500                 {
5501
5502                     RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5503                     if (ri->num_code_blocks) {
5504                         int i;
5505                         /* the presence of an embedded qr// with code means
5506                          * we should always recompile: the text of the
5507                          * qr// may not have changed, but it may be a
5508                          * different closure than last time */
5509                         recompile = 1;
5510                         Renew(pRExC_state->code_blocks,
5511                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5512                             struct reg_code_block);
5513                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5514                         for (i=0; i < ri->num_code_blocks; i++) {
5515                             struct reg_code_block *src, *dst;
5516                             STRLEN offset =  orig_patlen
5517                                 + ((struct regexp *)SvANY(rx))->pre_prefix;
5518                             assert(n < pRExC_state->num_code_blocks);
5519                             src = &ri->code_blocks[i];
5520                             dst = &pRExC_state->code_blocks[n];
5521                             dst->start      = src->start + offset;
5522                             dst->end        = src->end   + offset;
5523                             dst->block      = src->block;
5524                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5525                                                     src->src_regex
5526                                                         ? src->src_regex
5527                                                         : (REGEXP*)rx);
5528                             n++;
5529                         }
5530                     }
5531                 }
5532             }
5533             SvSETMAGIC(pat);
5534         }
5535         else {
5536             SV *sv;
5537             pat = *patternp;
5538             while (SvAMAGIC(pat)
5539                     && (sv = AMG_CALLunary(pat, string_amg))
5540                     && sv != pat)
5541             {
5542                 pat = sv;
5543                 SvGETMAGIC(pat);
5544             }
5545         }
5546
5547         /* handle bare regex: foo =~ $re */
5548         {
5549             SV *re = pat;
5550             if (SvROK(re))
5551                 re = SvRV(re);
5552             if (SvTYPE(re) == SVt_REGEXP) {
5553                 if (is_bare_re)
5554                     *is_bare_re = TRUE;
5555                 SvREFCNT_inc(re);
5556                 Safefree(pRExC_state->code_blocks);
5557                 return (REGEXP*)re;
5558             }
5559         }
5560     }
5561     else {
5562         /* not a list of SVs, so must be a list of OPs */
5563         assert(expr);
5564         if (expr->op_type == OP_LIST) {
5565             int i = -1;
5566             bool is_code = 0;
5567             OP *o;
5568
5569             pat = newSVpvn("", 0);
5570             SAVEFREESV(pat);
5571             if (code_is_utf8)
5572                 SvUTF8_on(pat);
5573
5574             /* given a list of CONSTs and DO blocks in expr, append all
5575              * the CONSTs to pat, and record the start and end of each
5576              * code block in code_blocks[] (each DO{} op is followed by an
5577              * OP_CONST containing the corresponding literal '(?{...})
5578              * text)
5579              */
5580             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5581                 if (o->op_type == OP_CONST) {
5582                     sv_catsv(pat, cSVOPo_sv);
5583                     if (is_code) {
5584                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5585                         is_code = 0;
5586                     }
5587                 }
5588                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5589                     assert(i+1 < pRExC_state->num_code_blocks);
5590                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5591                     pRExC_state->code_blocks[i].block = o;
5592                     pRExC_state->code_blocks[i].src_regex = NULL;
5593                     is_code = 1;
5594                 }
5595             }
5596         }
5597         else {
5598             assert(expr->op_type == OP_CONST);
5599             pat = cSVOPx_sv(expr);
5600         }
5601     }
5602
5603     exp = SvPV_nomg(pat, plen);
5604
5605     if (!eng->op_comp) {
5606         if ((SvUTF8(pat) && IN_BYTES)
5607                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5608         {
5609             /* make a temporary copy; either to convert to bytes,
5610              * or to avoid repeating get-magic / overloaded stringify */
5611             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5612                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5613         }
5614         Safefree(pRExC_state->code_blocks);
5615         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5616     }
5617
5618     /* ignore the utf8ness if the pattern is 0 length */
5619     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5620     RExC_uni_semantics = 0;
5621     RExC_contains_locale = 0;
5622     pRExC_state->runtime_code_qr = NULL;
5623
5624     /****************** LONG JUMP TARGET HERE***********************/
5625     /* Longjmp back to here if have to switch in midstream to utf8 */
5626     if (! RExC_orig_utf8) {
5627         JMPENV_PUSH(jump_ret);
5628         used_setjump = TRUE;
5629     }
5630
5631     if (jump_ret == 0) {    /* First time through */
5632         xend = exp + plen;
5633
5634         DEBUG_COMPILE_r({
5635             SV *dsv= sv_newmortal();
5636             RE_PV_QUOTED_DECL(s, RExC_utf8,
5637                 dsv, exp, plen, 60);
5638             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5639                            PL_colors[4],PL_colors[5],s);
5640         });
5641     }
5642     else {  /* longjumped back */
5643         U8 *src, *dst;
5644         int n=0;
5645         STRLEN s = 0, d = 0;
5646         bool do_end = 0;
5647
5648         /* If the cause for the longjmp was other than changing to utf8, pop
5649          * our own setjmp, and longjmp to the correct handler */
5650         if (jump_ret != UTF8_LONGJMP) {
5651             JMPENV_POP;
5652             JMPENV_JUMP(jump_ret);
5653         }
5654
5655         GET_RE_DEBUG_FLAGS;
5656
5657         /* It's possible to write a regexp in ascii that represents Unicode
5658         codepoints outside of the byte range, such as via \x{100}. If we
5659         detect such a sequence we have to convert the entire pattern to utf8
5660         and then recompile, as our sizing calculation will have been based
5661         on 1 byte == 1 character, but we will need to use utf8 to encode
5662         at least some part of the pattern, and therefore must convert the whole
5663         thing.
5664         -- dmq */
5665         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5666             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5667
5668         /* upgrade pattern to UTF8, and if there are code blocks,
5669          * recalculate the indices.
5670          * This is essentially an unrolled Perl_bytes_to_utf8() */
5671
5672         src = (U8*)SvPV_nomg(pat, plen);
5673         Newx(dst, plen * 2 + 1, U8);
5674
5675         while (s < plen) {
5676             const UV uv = NATIVE_TO_ASCII(src[s]);
5677             if (UNI_IS_INVARIANT(uv))
5678                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5679             else {
5680                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5681                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5682             }
5683             if (n < pRExC_state->num_code_blocks) {
5684                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5685                     pRExC_state->code_blocks[n].start = d;
5686                     assert(dst[d] == '(');
5687                     do_end = 1;
5688                 }
5689                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5690                     pRExC_state->code_blocks[n].end = d;
5691                     assert(dst[d] == ')');
5692                     do_end = 0;
5693                     n++;
5694                 }
5695             }
5696             s++;
5697             d++;
5698         }
5699         dst[d] = '\0';
5700         plen = d;
5701         exp = (char*) dst;
5702         xend = exp + plen;
5703         SAVEFREEPV(exp);
5704         RExC_orig_utf8 = RExC_utf8 = 1;
5705     }
5706
5707     /* return old regex if pattern hasn't changed */
5708
5709     if (   old_re
5710         && !recompile
5711         && !!RX_UTF8(old_re) == !!RExC_utf8
5712         && RX_PRECOMP(old_re)
5713         && RX_PRELEN(old_re) == plen
5714         && memEQ(RX_PRECOMP(old_re), exp, plen))
5715     {
5716         /* with runtime code, always recompile */
5717         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5718                                             exp, plen);
5719         if (!runtime_code) {
5720             if (used_setjump) {
5721                 JMPENV_POP;
5722             }
5723             Safefree(pRExC_state->code_blocks);
5724             return old_re;
5725         }
5726     }
5727     else if ((pm_flags & PMf_USE_RE_EVAL)
5728                 /* this second condition covers the non-regex literal case,
5729                  * i.e.  $foo =~ '(?{})'. */
5730                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5731                     && (PL_hints & HINT_RE_EVAL))
5732     )
5733         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5734                             exp, plen);
5735
5736 #ifdef TRIE_STUDY_OPT
5737     restudied = 0;
5738 #endif
5739
5740     rx_flags = orig_rx_flags;
5741
5742     if (initial_charset == REGEX_LOCALE_CHARSET) {
5743         RExC_contains_locale = 1;
5744     }
5745     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5746
5747         /* Set to use unicode semantics if the pattern is in utf8 and has the
5748          * 'depends' charset specified, as it means unicode when utf8  */
5749         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5750     }
5751
5752     RExC_precomp = exp;
5753     RExC_flags = rx_flags;
5754     RExC_pm_flags = pm_flags;
5755
5756     if (runtime_code) {
5757         if (PL_tainting && PL_tainted)
5758             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5759
5760         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5761             /* whoops, we have a non-utf8 pattern, whilst run-time code
5762              * got compiled as utf8. Try again with a utf8 pattern */
5763              JMPENV_JUMP(UTF8_LONGJMP);
5764         }
5765     }
5766     assert(!pRExC_state->runtime_code_qr);
5767
5768     RExC_sawback = 0;
5769
5770     RExC_seen = 0;
5771     RExC_in_lookbehind = 0;
5772     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5773     RExC_extralen = 0;
5774     RExC_override_recoding = 0;
5775     RExC_in_multi_char_class = 0;
5776
5777     /* First pass: determine size, legality. */
5778     RExC_parse = exp;
5779     RExC_start = exp;
5780     RExC_end = xend;
5781     RExC_naughty = 0;
5782     RExC_npar = 1;
5783     RExC_nestroot = 0;
5784     RExC_size = 0L;
5785     RExC_emit = &PL_regdummy;
5786     RExC_whilem_seen = 0;
5787     RExC_open_parens = NULL;
5788     RExC_close_parens = NULL;
5789     RExC_opend = NULL;
5790     RExC_paren_names = NULL;
5791 #ifdef DEBUGGING
5792     RExC_paren_name_list = NULL;
5793 #endif
5794     RExC_recurse = NULL;
5795     RExC_recurse_count = 0;
5796     pRExC_state->code_index = 0;
5797
5798 #if 0 /* REGC() is (currently) a NOP at the first pass.
5799        * Clever compilers notice this and complain. --jhi */
5800     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5801 #endif
5802     DEBUG_PARSE_r(
5803         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5804         RExC_lastnum=0;
5805         RExC_lastparse=NULL;
5806     );
5807     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5808         RExC_precomp = NULL;
5809         Safefree(pRExC_state->code_blocks);
5810         return(NULL);
5811     }
5812
5813     /* Here, finished first pass.  Get rid of any added setjmp */
5814     if (used_setjump) {
5815         JMPENV_POP;
5816     }
5817
5818     DEBUG_PARSE_r({
5819         PerlIO_printf(Perl_debug_log,
5820             "Required size %"IVdf" nodes\n"
5821             "Starting second pass (creation)\n",
5822             (IV)RExC_size);
5823         RExC_lastnum=0;
5824         RExC_lastparse=NULL;
5825     });
5826
5827     /* The first pass could have found things that force Unicode semantics */
5828     if ((RExC_utf8 || RExC_uni_semantics)
5829          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5830     {
5831         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5832     }
5833
5834     /* Small enough for pointer-storage convention?
5835        If extralen==0, this means that we will not need long jumps. */
5836     if (RExC_size >= 0x10000L && RExC_extralen)
5837         RExC_size += RExC_extralen;
5838     else
5839         RExC_extralen = 0;
5840     if (RExC_whilem_seen > 15)
5841         RExC_whilem_seen = 15;
5842
5843     /* Allocate space and zero-initialize. Note, the two step process
5844        of zeroing when in debug mode, thus anything assigned has to
5845        happen after that */
5846     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5847     r = (struct regexp*)SvANY(rx);
5848     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5849          char, regexp_internal);
5850     if ( r == NULL || ri == NULL )
5851         FAIL("Regexp out of space");
5852 #ifdef DEBUGGING
5853     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5854     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5855 #else
5856     /* bulk initialize base fields with 0. */
5857     Zero(ri, sizeof(regexp_internal), char);
5858 #endif
5859
5860     /* non-zero initialization begins here */
5861     RXi_SET( r, ri );
5862     r->engine= eng;
5863     r->extflags = rx_flags;
5864     if (pm_flags & PMf_IS_QR) {
5865         ri->code_blocks = pRExC_state->code_blocks;
5866         ri->num_code_blocks = pRExC_state->num_code_blocks;
5867     }
5868     else
5869         SAVEFREEPV(pRExC_state->code_blocks);
5870
5871     {
5872         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5873         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5874
5875         /* The caret is output if there are any defaults: if not all the STD
5876          * flags are set, or if no character set specifier is needed */
5877         bool has_default =
5878                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5879                     || ! has_charset);
5880         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5881         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5882                             >> RXf_PMf_STD_PMMOD_SHIFT);
5883         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5884         char *p;
5885         /* Allocate for the worst case, which is all the std flags are turned
5886          * on.  If more precision is desired, we could do a population count of
5887          * the flags set.  This could be done with a small lookup table, or by
5888          * shifting, masking and adding, or even, when available, assembly
5889          * language for a machine-language population count.
5890          * We never output a minus, as all those are defaults, so are
5891          * covered by the caret */
5892         const STRLEN wraplen = plen + has_p + has_runon
5893             + has_default       /* If needs a caret */
5894
5895                 /* If needs a character set specifier */
5896             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5897             + (sizeof(STD_PAT_MODS) - 1)
5898             + (sizeof("(?:)") - 1);
5899
5900         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5901         SvPOK_on(rx);
5902         if (RExC_utf8)
5903             SvFLAGS(rx) |= SVf_UTF8;
5904         *p++='('; *p++='?';
5905
5906         /* If a default, cover it using the caret */
5907         if (has_default) {
5908             *p++= DEFAULT_PAT_MOD;
5909         }
5910         if (has_charset) {
5911             STRLEN len;
5912             const char* const name = get_regex_charset_name(r->extflags, &len);
5913             Copy(name, p, len, char);
5914             p += len;
5915         }
5916         if (has_p)
5917             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5918         {
5919             char ch;
5920             while((ch = *fptr++)) {
5921                 if(reganch & 1)
5922                     *p++ = ch;
5923                 reganch >>= 1;
5924             }
5925         }
5926
5927         *p++ = ':';
5928         Copy(RExC_precomp, p, plen, char);
5929         assert ((RX_WRAPPED(rx) - p) < 16);
5930         r->pre_prefix = p - RX_WRAPPED(rx);
5931         p += plen;
5932         if (has_runon)
5933             *p++ = '\n';
5934         *p++ = ')';
5935         *p = 0;
5936         SvCUR_set(rx, p - SvPVX_const(rx));
5937     }
5938
5939     r->intflags = 0;
5940     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5941
5942     if (RExC_seen & REG_SEEN_RECURSE) {
5943         Newxz(RExC_open_parens, RExC_npar,regnode *);
5944         SAVEFREEPV(RExC_open_parens);
5945         Newxz(RExC_close_parens,RExC_npar,regnode *);
5946         SAVEFREEPV(RExC_close_parens);
5947     }
5948
5949     /* Useful during FAIL. */
5950 #ifdef RE_TRACK_PATTERN_OFFSETS
5951     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5952     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5953                           "%s %"UVuf" bytes for offset annotations.\n",
5954                           ri->u.offsets ? "Got" : "Couldn't get",
5955                           (UV)((2*RExC_size+1) * sizeof(U32))));
5956 #endif
5957     SetProgLen(ri,RExC_size);
5958     RExC_rx_sv = rx;
5959     RExC_rx = r;
5960     RExC_rxi = ri;
5961     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5962
5963     /* Second pass: emit code. */
5964     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5965     RExC_pm_flags = pm_flags;
5966     RExC_parse = exp;
5967     RExC_end = xend;
5968     RExC_naughty = 0;
5969     RExC_npar = 1;
5970     RExC_emit_start = ri->program;
5971     RExC_emit = ri->program;
5972     RExC_emit_bound = ri->program + RExC_size + 1;
5973     pRExC_state->code_index = 0;
5974
5975     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5976     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5977         ReREFCNT_dec(rx);
5978         return(NULL);
5979     }
5980     /* XXXX To minimize changes to RE engine we always allocate
5981        3-units-long substrs field. */
5982     Newx(r->substrs, 1, struct reg_substr_data);
5983     if (RExC_recurse_count) {
5984         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5985         SAVEFREEPV(RExC_recurse);
5986     }
5987
5988 reStudy:
5989     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5990     Zero(r->substrs, 1, struct reg_substr_data);
5991
5992 #ifdef TRIE_STUDY_OPT
5993     if (!restudied) {
5994         StructCopy(&zero_scan_data, &data, scan_data_t);
5995         copyRExC_state = RExC_state;
5996     } else {
5997         U32 seen=RExC_seen;
5998         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5999
6000         RExC_state = copyRExC_state;
6001         if (seen & REG_TOP_LEVEL_BRANCHES)
6002             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6003         else
6004             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6005         if (data.last_found) {
6006             SvREFCNT_dec(data.longest_fixed);
6007             SvREFCNT_dec(data.longest_float);
6008             SvREFCNT_dec(data.last_found);
6009         }
6010         StructCopy(&zero_scan_data, &data, scan_data_t);
6011     }
6012 #else
6013     StructCopy(&zero_scan_data, &data, scan_data_t);
6014 #endif
6015
6016     /* Dig out information for optimizations. */
6017     r->extflags = RExC_flags; /* was pm_op */
6018     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6019
6020     if (UTF)
6021         SvUTF8_on(rx);  /* Unicode in it? */
6022     ri->regstclass = NULL;
6023     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6024         r->intflags |= PREGf_NAUGHTY;
6025     scan = ri->program + 1;             /* First BRANCH. */
6026
6027     /* testing for BRANCH here tells us whether there is "must appear"
6028        data in the pattern. If there is then we can use it for optimisations */
6029     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6030         I32 fake;
6031         STRLEN longest_float_length, longest_fixed_length;
6032         struct regnode_charclass_class ch_class; /* pointed to by data */
6033         int stclass_flag;
6034         I32 last_close = 0; /* pointed to by data */
6035         regnode *first= scan;
6036         regnode *first_next= regnext(first);
6037         /*
6038          * Skip introductions and multiplicators >= 1
6039          * so that we can extract the 'meat' of the pattern that must
6040          * match in the large if() sequence following.
6041          * NOTE that EXACT is NOT covered here, as it is normally
6042          * picked up by the optimiser separately.
6043          *
6044          * This is unfortunate as the optimiser isnt handling lookahead
6045          * properly currently.
6046          *
6047          */
6048         while ((OP(first) == OPEN && (sawopen = 1)) ||
6049                /* An OR of *one* alternative - should not happen now. */
6050             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6051             /* for now we can't handle lookbehind IFMATCH*/
6052             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6053             (OP(first) == PLUS) ||
6054             (OP(first) == MINMOD) ||
6055                /* An {n,m} with n>0 */
6056             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6057             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6058         {
6059                 /*
6060                  * the only op that could be a regnode is PLUS, all the rest
6061                  * will be regnode_1 or regnode_2.
6062                  *
6063                  */
6064                 if (OP(first) == PLUS)
6065                     sawplus = 1;
6066                 else
6067                     first += regarglen[OP(first)];
6068
6069                 first = NEXTOPER(first);
6070                 first_next= regnext(first);
6071         }
6072
6073         /* Starting-point info. */
6074       again:
6075         DEBUG_PEEP("first:",first,0);
6076         /* Ignore EXACT as we deal with it later. */
6077         if (PL_regkind[OP(first)] == EXACT) {
6078             if (OP(first) == EXACT)
6079                 NOOP;   /* Empty, get anchored substr later. */
6080             else
6081                 ri->regstclass = first;
6082         }
6083 #ifdef TRIE_STCLASS
6084         else if (PL_regkind[OP(first)] == TRIE &&
6085                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6086         {
6087             regnode *trie_op;
6088             /* this can happen only on restudy */
6089             if ( OP(first) == TRIE ) {
6090                 struct regnode_1 *trieop = (struct regnode_1 *)
6091                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6092                 StructCopy(first,trieop,struct regnode_1);
6093                 trie_op=(regnode *)trieop;
6094             } else {
6095                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6096                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6097                 StructCopy(first,trieop,struct regnode_charclass);
6098                 trie_op=(regnode *)trieop;
6099             }
6100             OP(trie_op)+=2;
6101             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6102             ri->regstclass = trie_op;
6103         }
6104 #endif
6105         else if (REGNODE_SIMPLE(OP(first)))
6106             ri->regstclass = first;
6107         else if (PL_regkind[OP(first)] == BOUND ||
6108                  PL_regkind[OP(first)] == NBOUND)
6109             ri->regstclass = first;
6110         else if (PL_regkind[OP(first)] == BOL) {
6111             r->extflags |= (OP(first) == MBOL
6112                            ? RXf_ANCH_MBOL
6113                            : (OP(first) == SBOL
6114                               ? RXf_ANCH_SBOL
6115                               : RXf_ANCH_BOL));
6116             first = NEXTOPER(first);
6117             goto again;
6118         }
6119         else if (OP(first) == GPOS) {
6120             r->extflags |= RXf_ANCH_GPOS;
6121             first = NEXTOPER(first);
6122             goto again;
6123         }
6124         else if ((!sawopen || !RExC_sawback) &&
6125             (OP(first) == STAR &&
6126             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6127             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6128         {
6129             /* turn .* into ^.* with an implied $*=1 */
6130             const int type =
6131                 (OP(NEXTOPER(first)) == REG_ANY)
6132                     ? RXf_ANCH_MBOL
6133                     : RXf_ANCH_SBOL;
6134             r->extflags |= type;
6135             r->intflags |= PREGf_IMPLICIT;
6136             first = NEXTOPER(first);
6137             goto again;
6138         }
6139         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6140             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6141             /* x+ must match at the 1st pos of run of x's */
6142             r->intflags |= PREGf_SKIP;
6143
6144         /* Scan is after the zeroth branch, first is atomic matcher. */
6145 #ifdef TRIE_STUDY_OPT
6146         DEBUG_PARSE_r(
6147             if (!restudied)
6148                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6149                               (IV)(first - scan + 1))
6150         );
6151 #else
6152         DEBUG_PARSE_r(
6153             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6154                 (IV)(first - scan + 1))
6155         );
6156 #endif
6157
6158
6159         /*
6160         * If there's something expensive in the r.e., find the
6161         * longest literal string that must appear and make it the
6162         * regmust.  Resolve ties in favor of later strings, since
6163         * the regstart check works with the beginning of the r.e.
6164         * and avoiding duplication strengthens checking.  Not a
6165         * strong reason, but sufficient in the absence of others.
6166         * [Now we resolve ties in favor of the earlier string if
6167         * it happens that c_offset_min has been invalidated, since the
6168         * earlier string may buy us something the later one won't.]
6169         */
6170
6171         data.longest_fixed = newSVpvs("");
6172         data.longest_float = newSVpvs("");
6173         data.last_found = newSVpvs("");
6174         data.longest = &(data.longest_fixed);
6175         first = scan;
6176         if (!ri->regstclass) {
6177             cl_init(pRExC_state, &ch_class);
6178             data.start_class = &ch_class;
6179             stclass_flag = SCF_DO_STCLASS_AND;
6180         } else                          /* XXXX Check for BOUND? */
6181             stclass_flag = 0;
6182         data.last_closep = &last_close;
6183
6184         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6185             &data, -1, NULL, NULL,
6186             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6187
6188
6189         CHECK_RESTUDY_GOTO;
6190
6191
6192         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6193              && data.last_start_min == 0 && data.last_end > 0
6194              && !RExC_seen_zerolen
6195              && !(RExC_seen & REG_SEEN_VERBARG)
6196              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6197             r->extflags |= RXf_CHECK_ALL;
6198         scan_commit(pRExC_state, &data,&minlen,0);
6199         SvREFCNT_dec(data.last_found);
6200
6201         longest_float_length = CHR_SVLEN(data.longest_float);
6202
6203         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6204                    && data.offset_fixed == data.offset_float_min
6205                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6206             && S_setup_longest (aTHX_ pRExC_state,
6207                                     data.longest_float,
6208                                     &(r->float_utf8),
6209                                     &(r->float_substr),
6210                                     &(r->float_end_shift),
6211                                     data.lookbehind_float,
6212                                     data.offset_float_min,
6213                                     data.minlen_float,
6214                                     longest_float_length,
6215                                     data.flags & SF_FL_BEFORE_EOL,
6216                                     data.flags & SF_FL_BEFORE_MEOL))
6217         {
6218             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6219             r->float_max_offset = data.offset_float_max;
6220             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6221                 r->float_max_offset -= data.lookbehind_float;
6222         }
6223         else {
6224             r->float_substr = r->float_utf8 = NULL;
6225             SvREFCNT_dec(data.longest_float);
6226             longest_float_length = 0;
6227         }
6228
6229         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6230
6231         if (S_setup_longest (aTHX_ pRExC_state,
6232                                 data.longest_fixed,
6233                                 &(r->anchored_utf8),
6234                                 &(r->anchored_substr),
6235                                 &(r->anchored_end_shift),
6236                                 data.lookbehind_fixed,
6237                                 data.offset_fixed,
6238                                 data.minlen_fixed,
6239                                 longest_fixed_length,
6240                                 data.flags & SF_FIX_BEFORE_EOL,
6241                                 data.flags & SF_FIX_BEFORE_MEOL))
6242         {
6243             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6244         }
6245         else {
6246             r->anchored_substr = r->anchored_utf8 = NULL;
6247             SvREFCNT_dec(data.longest_fixed);
6248             longest_fixed_length = 0;
6249         }
6250
6251         if (ri->regstclass
6252             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6253             ri->regstclass = NULL;
6254
6255         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6256             && stclass_flag
6257             && !(data.start_class->flags & ANYOF_EOS)
6258             && !cl_is_anything(data.start_class))
6259         {
6260             const U32 n = add_data(pRExC_state, 1, "f");
6261             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6262
6263             Newx(RExC_rxi->data->data[n], 1,
6264                 struct regnode_charclass_class);
6265             StructCopy(data.start_class,
6266                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6267                        struct regnode_charclass_class);
6268             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6269             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6270             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6271                       regprop(r, sv, (regnode*)data.start_class);
6272                       PerlIO_printf(Perl_debug_log,
6273                                     "synthetic stclass \"%s\".\n",
6274                                     SvPVX_const(sv));});
6275         }
6276
6277         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6278         if (longest_fixed_length > longest_float_length) {
6279             r->check_end_shift = r->anchored_end_shift;
6280             r->check_substr = r->anchored_substr;
6281             r->check_utf8 = r->anchored_utf8;
6282             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6283             if (r->extflags & RXf_ANCH_SINGLE)
6284                 r->extflags |= RXf_NOSCAN;
6285         }
6286         else {
6287             r->check_end_shift = r->float_end_shift;
6288             r->check_substr = r->float_substr;
6289             r->check_utf8 = r->float_utf8;
6290             r->check_offset_min = r->float_min_offset;
6291             r->check_offset_max = r->float_max_offset;
6292         }
6293         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6294            This should be changed ASAP!  */
6295         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6296             r->extflags |= RXf_USE_INTUIT;
6297             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6298                 r->extflags |= RXf_INTUIT_TAIL;
6299         }
6300         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6301         if ( (STRLEN)minlen < longest_float_length )
6302             minlen= longest_float_length;
6303         if ( (STRLEN)minlen < longest_fixed_length )
6304             minlen= longest_fixed_length;
6305         */
6306     }
6307     else {
6308         /* Several toplevels. Best we can is to set minlen. */
6309         I32 fake;
6310         struct regnode_charclass_class ch_class;
6311         I32 last_close = 0;
6312
6313         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6314
6315         scan = ri->program + 1;
6316         cl_init(pRExC_state, &ch_class);
6317         data.start_class = &ch_class;
6318         data.last_closep = &last_close;
6319
6320
6321         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6322             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6323
6324         CHECK_RESTUDY_GOTO;
6325
6326         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6327                 = r->float_substr = r->float_utf8 = NULL;
6328
6329         if (!(data.start_class->flags & ANYOF_EOS)
6330             && !cl_is_anything(data.start_class))
6331         {
6332             const U32 n = add_data(pRExC_state, 1, "f");
6333             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6334
6335             Newx(RExC_rxi->data->data[n], 1,
6336                 struct regnode_charclass_class);
6337             StructCopy(data.start_class,
6338                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6339                        struct regnode_charclass_class);
6340             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6341             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6342             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6343                       regprop(r, sv, (regnode*)data.start_class);
6344                       PerlIO_printf(Perl_debug_log,
6345                                     "synthetic stclass \"%s\".\n",
6346                                     SvPVX_const(sv));});
6347         }
6348     }
6349
6350     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6351        the "real" pattern. */
6352     DEBUG_OPTIMISE_r({
6353         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6354                       (IV)minlen, (IV)r->minlen);
6355     });
6356     r->minlenret = minlen;
6357     if (r->minlen < minlen)
6358         r->minlen = minlen;
6359
6360     if (RExC_seen & REG_SEEN_GPOS)
6361         r->extflags |= RXf_GPOS_SEEN;
6362     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6363         r->extflags |= RXf_LOOKBEHIND_SEEN;
6364     if (pRExC_state->num_code_blocks)
6365         r->extflags |= RXf_EVAL_SEEN;
6366     if (RExC_seen & REG_SEEN_CANY)
6367         r->extflags |= RXf_CANY_SEEN;
6368     if (RExC_seen & REG_SEEN_VERBARG)
6369     {
6370         r->intflags |= PREGf_VERBARG_SEEN;
6371         r->extflags |= RXf_MODIFIES_VARS;
6372     }
6373     if (RExC_seen & REG_SEEN_CUTGROUP)
6374         r->intflags |= PREGf_CUTGROUP_SEEN;
6375     if (pm_flags & PMf_USE_RE_EVAL)
6376         r->intflags |= PREGf_USE_RE_EVAL;
6377     if (RExC_paren_names)
6378         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6379     else
6380         RXp_PAREN_NAMES(r) = NULL;
6381
6382 #ifdef STUPID_PATTERN_CHECKS
6383     if (RX_PRELEN(rx) == 0)
6384         r->extflags |= RXf_NULL;
6385     if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6386         r->extflags |= RXf_WHITE;
6387     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6388         r->extflags |= RXf_START_ONLY;
6389 #else
6390     {
6391         regnode *first = ri->program + 1;
6392         U8 fop = OP(first);
6393
6394         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6395             r->extflags |= RXf_NULL;
6396         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6397             r->extflags |= RXf_START_ONLY;
6398         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6399                              && OP(regnext(first)) == END)
6400             r->extflags |= RXf_WHITE;
6401     }
6402 #endif
6403 #ifdef DEBUGGING
6404     if (RExC_paren_names) {
6405         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6406         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6407     } else
6408 #endif
6409         ri->name_list_idx = 0;
6410
6411     if (RExC_recurse_count) {
6412         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6413             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6414             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6415         }
6416     }
6417     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6418     /* assume we don't need to swap parens around before we match */
6419
6420     DEBUG_DUMP_r({
6421         PerlIO_printf(Perl_debug_log,"Final program:\n");
6422         regdump(r);
6423     });
6424 #ifdef RE_TRACK_PATTERN_OFFSETS
6425     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6426         const U32 len = ri->u.offsets[0];
6427         U32 i;
6428         GET_RE_DEBUG_FLAGS_DECL;
6429         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6430         for (i = 1; i <= len; i++) {
6431             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6432                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6433                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6434             }
6435         PerlIO_printf(Perl_debug_log, "\n");
6436     });
6437 #endif
6438     return rx;
6439 }
6440
6441
6442 SV*
6443 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6444                     const U32 flags)
6445 {
6446     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6447
6448     PERL_UNUSED_ARG(value);
6449
6450     if (flags & RXapif_FETCH) {
6451         return reg_named_buff_fetch(rx, key, flags);
6452     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6453         Perl_croak_no_modify(aTHX);
6454         return NULL;
6455     } else if (flags & RXapif_EXISTS) {
6456         return reg_named_buff_exists(rx, key, flags)
6457             ? &PL_sv_yes
6458             : &PL_sv_no;
6459     } else if (flags & RXapif_REGNAMES) {
6460         return reg_named_buff_all(rx, flags);
6461     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6462         return reg_named_buff_scalar(rx, flags);
6463     } else {
6464         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6465         return NULL;
6466     }
6467 }
6468
6469 SV*
6470 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6471                          const U32 flags)
6472 {
6473     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6474     PERL_UNUSED_ARG(lastkey);
6475
6476     if (flags & RXapif_FIRSTKEY)
6477         return reg_named_buff_firstkey(rx, flags);
6478     else if (flags & RXapif_NEXTKEY)
6479         return reg_named_buff_nextkey(rx, flags);
6480     else {
6481         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6482         return NULL;
6483     }
6484 }
6485
6486 SV*
6487 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6488                           const U32 flags)
6489 {
6490     AV *retarray = NULL;
6491     SV *ret;
6492     struct regexp *const rx = (struct regexp *)SvANY(r);
6493
6494     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6495
6496     if (flags & RXapif_ALL)
6497         retarray=newAV();
6498
6499     if (rx && RXp_PAREN_NAMES(rx)) {
6500         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6501         if (he_str) {
6502             IV i;
6503             SV* sv_dat=HeVAL(he_str);
6504             I32 *nums=(I32*)SvPVX(sv_dat);
6505             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6506                 if ((I32)(rx->nparens) >= nums[i]
6507                     && rx->offs[nums[i]].start != -1
6508                     && rx->offs[nums[i]].end != -1)
6509                 {
6510                     ret = newSVpvs("");
6511                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6512                     if (!retarray)
6513                         return ret;
6514                 } else {
6515                     if (retarray)
6516                         ret = newSVsv(&PL_sv_undef);
6517                 }
6518                 if (retarray)
6519                     av_push(retarray, ret);
6520             }
6521             if (retarray)
6522                 return newRV_noinc(MUTABLE_SV(retarray));
6523         }
6524     }
6525     return NULL;
6526 }
6527
6528 bool
6529 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6530                            const U32 flags)
6531 {
6532     struct regexp *const rx = (struct regexp *)SvANY(r);
6533
6534     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6535
6536     if (rx && RXp_PAREN_NAMES(rx)) {
6537         if (flags & RXapif_ALL) {
6538             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6539         } else {
6540             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6541             if (sv) {
6542                 SvREFCNT_dec(sv);
6543                 return TRUE;
6544             } else {
6545                 return FALSE;
6546             }
6547         }
6548     } else {
6549         return FALSE;
6550     }
6551 }
6552
6553 SV*
6554 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6555 {
6556     struct regexp *const rx = (struct regexp *)SvANY(r);
6557
6558     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6559
6560     if ( rx && RXp_PAREN_NAMES(rx) ) {
6561         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6562
6563         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6564     } else {
6565         return FALSE;
6566     }
6567 }
6568
6569 SV*
6570 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6571 {
6572     struct regexp *const rx = (struct regexp *)SvANY(r);
6573     GET_RE_DEBUG_FLAGS_DECL;
6574
6575     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6576
6577     if (rx && RXp_PAREN_NAMES(rx)) {
6578         HV *hv = RXp_PAREN_NAMES(rx);
6579         HE *temphe;
6580         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6581             IV i;
6582             IV parno = 0;
6583             SV* sv_dat = HeVAL(temphe);
6584             I32 *nums = (I32*)SvPVX(sv_dat);
6585             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6586                 if ((I32)(rx->lastparen) >= nums[i] &&
6587                     rx->offs[nums[i]].start != -1 &&
6588                     rx->offs[nums[i]].end != -1)
6589                 {
6590                     parno = nums[i];
6591                     break;
6592                 }
6593             }
6594             if (parno || flags & RXapif_ALL) {
6595                 return newSVhek(HeKEY_hek(temphe));
6596             }
6597         }
6598     }
6599     return NULL;
6600 }
6601
6602 SV*
6603 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6604 {
6605     SV *ret;
6606     AV *av;
6607     I32 length;
6608     struct regexp *const rx = (struct regexp *)SvANY(r);
6609
6610     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6611
6612     if (rx && RXp_PAREN_NAMES(rx)) {
6613         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6614             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6615         } else if (flags & RXapif_ONE) {
6616             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6617             av = MUTABLE_AV(SvRV(ret));
6618             length = av_len(av);
6619             SvREFCNT_dec(ret);
6620             return newSViv(length + 1);
6621         } else {
6622             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6623             return NULL;
6624         }
6625     }
6626     return &PL_sv_undef;
6627 }
6628
6629 SV*
6630 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6631 {
6632     struct regexp *const rx = (struct regexp *)SvANY(r);
6633     AV *av = newAV();
6634
6635     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6636
6637     if (rx && RXp_PAREN_NAMES(rx)) {
6638         HV *hv= RXp_PAREN_NAMES(rx);
6639         HE *temphe;
6640         (void)hv_iterinit(hv);
6641         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6642             IV i;
6643             IV parno = 0;
6644             SV* sv_dat = HeVAL(temphe);
6645             I32 *nums = (I32*)SvPVX(sv_dat);
6646             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6647                 if ((I32)(rx->lastparen) >= nums[i] &&
6648                     rx->offs[nums[i]].start != -1 &&
6649                     rx->offs[nums[i]].end != -1)
6650                 {
6651                     parno = nums[i];
6652                     break;
6653                 }
6654             }
6655             if (parno || flags & RXapif_ALL) {
6656                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6657             }
6658         }
6659     }
6660
6661     return newRV_noinc(MUTABLE_SV(av));
6662 }
6663
6664 void
6665 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6666                              SV * const sv)
6667 {
6668     struct regexp *const rx = (struct regexp *)SvANY(r);
6669     char *s = NULL;
6670     I32 i = 0;
6671     I32 s1, t1;
6672     I32 n = paren;
6673
6674     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6675
6676     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6677            || n == RX_BUFF_IDX_CARET_FULLMATCH
6678            || n == RX_BUFF_IDX_CARET_POSTMATCH
6679          )
6680          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6681     )
6682         goto ret_undef;
6683
6684     if (!rx->subbeg)
6685         goto ret_undef;
6686
6687     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6688         /* no need to distinguish between them any more */
6689         n = RX_BUFF_IDX_FULLMATCH;
6690
6691     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6692         && rx->offs[0].start != -1)
6693     {
6694         /* $`, ${^PREMATCH} */
6695         i = rx->offs[0].start;
6696         s = rx->subbeg;
6697     }
6698     else
6699     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6700         && rx->offs[0].end != -1)
6701     {
6702         /* $', ${^POSTMATCH} */
6703         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6704         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6705     }
6706     else
6707     if ( 0 <= n && n <= (I32)rx->nparens &&
6708         (s1 = rx->offs[n].start) != -1 &&
6709         (t1 = rx->offs[n].end) != -1)
6710     {
6711         /* $&, ${^MATCH},  $1 ... */
6712         i = t1 - s1;
6713         s = rx->subbeg + s1 - rx->suboffset;
6714     } else {
6715         goto ret_undef;
6716     }
6717
6718     assert(s >= rx->subbeg);
6719     assert(rx->sublen >= (s - rx->subbeg) + i );
6720     if (i >= 0) {
6721         const int oldtainted = PL_tainted;
6722         TAINT_NOT;
6723         sv_setpvn(sv, s, i);
6724         PL_tainted = oldtainted;
6725         if ( (rx->extflags & RXf_CANY_SEEN)
6726             ? (RXp_MATCH_UTF8(rx)
6727                         && (!i || is_utf8_string((U8*)s, i)))
6728             : (RXp_MATCH_UTF8(rx)) )
6729         {
6730             SvUTF8_on(sv);
6731         }
6732         else
6733             SvUTF8_off(sv);
6734         if (PL_tainting) {
6735             if (RXp_MATCH_TAINTED(rx)) {
6736                 if (SvTYPE(sv) >= SVt_PVMG) {
6737                     MAGIC* const mg = SvMAGIC(sv);
6738                     MAGIC* mgt;
6739                     PL_tainted = 1;
6740                     SvMAGIC_set(sv, mg->mg_moremagic);
6741                     SvTAINT(sv);
6742                     if ((mgt = SvMAGIC(sv))) {
6743                         mg->mg_moremagic = mgt;
6744                         SvMAGIC_set(sv, mg);
6745                     }
6746                 } else {
6747                     PL_tainted = 1;
6748                     SvTAINT(sv);
6749                 }
6750             } else
6751                 SvTAINTED_off(sv);
6752         }
6753     } else {
6754       ret_undef:
6755         sv_setsv(sv,&PL_sv_undef);
6756         return;
6757     }
6758 }
6759
6760 void
6761 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6762                                                          SV const * const value)
6763 {
6764     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6765
6766     PERL_UNUSED_ARG(rx);
6767     PERL_UNUSED_ARG(paren);
6768     PERL_UNUSED_ARG(value);
6769
6770     if (!PL_localizing)
6771         Perl_croak_no_modify(aTHX);
6772 }
6773
6774 I32
6775 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6776                               const I32 paren)
6777 {
6778     struct regexp *const rx = (struct regexp *)SvANY(r);
6779     I32 i;
6780     I32 s1, t1;
6781
6782     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6783
6784     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6785     switch (paren) {
6786       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6787          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6788             goto warn_undef;
6789         /*FALLTHROUGH*/
6790
6791       case RX_BUFF_IDX_PREMATCH:       /* $` */
6792         if (rx->offs[0].start != -1) {
6793                         i = rx->offs[0].start;
6794                         if (i > 0) {
6795                                 s1 = 0;
6796                                 t1 = i;
6797                                 goto getlen;
6798                         }
6799             }
6800         return 0;
6801
6802       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6803          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6804             goto warn_undef;
6805       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6806             if (rx->offs[0].end != -1) {
6807                         i = rx->sublen - rx->offs[0].end;
6808                         if (i > 0) {
6809                                 s1 = rx->offs[0].end;
6810                                 t1 = rx->sublen;
6811                                 goto getlen;
6812                         }
6813             }
6814         return 0;
6815
6816       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6817          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6818             goto warn_undef;
6819         /*FALLTHROUGH*/
6820
6821       /* $& / ${^MATCH}, $1, $2, ... */
6822       default:
6823             if (paren <= (I32)rx->nparens &&
6824             (s1 = rx->offs[paren].start) != -1 &&
6825             (t1 = rx->offs[paren].end) != -1)
6826             {
6827             i = t1 - s1;
6828             goto getlen;
6829         } else {
6830           warn_undef:
6831             if (ckWARN(WARN_UNINITIALIZED))
6832                 report_uninit((const SV *)sv);
6833             return 0;
6834         }
6835     }
6836   getlen:
6837     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6838         const char * const s = rx->subbeg - rx->suboffset + s1;
6839         const U8 *ep;
6840         STRLEN el;
6841
6842         i = t1 - s1;
6843         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6844                         i = el;
6845     }
6846     return i;
6847 }
6848
6849 SV*
6850 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6851 {
6852     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6853         PERL_UNUSED_ARG(rx);
6854         if (0)
6855             return NULL;
6856         else
6857             return newSVpvs("Regexp");
6858 }
6859
6860 /* Scans the name of a named buffer from the pattern.
6861  * If flags is REG_RSN_RETURN_NULL returns null.
6862  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6863  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6864  * to the parsed name as looked up in the RExC_paren_names hash.
6865  * If there is an error throws a vFAIL().. type exception.
6866  */
6867
6868 #define REG_RSN_RETURN_NULL    0
6869 #define REG_RSN_RETURN_NAME    1
6870 #define REG_RSN_RETURN_DATA    2
6871
6872 STATIC SV*
6873 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6874 {
6875     char *name_start = RExC_parse;
6876
6877     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6878
6879     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6880          /* skip IDFIRST by using do...while */
6881         if (UTF)
6882             do {
6883                 RExC_parse += UTF8SKIP(RExC_parse);
6884             } while (isALNUM_utf8((U8*)RExC_parse));
6885         else
6886             do {
6887                 RExC_parse++;
6888             } while (isALNUM(*RExC_parse));
6889     } else {
6890         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6891         vFAIL("Group name must start with a non-digit word character");
6892     }
6893     if ( flags ) {
6894         SV* sv_name
6895             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6896                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6897         if ( flags == REG_RSN_RETURN_NAME)
6898             return sv_name;
6899         else if (flags==REG_RSN_RETURN_DATA) {
6900             HE *he_str = NULL;
6901             SV *sv_dat = NULL;
6902             if ( ! sv_name )      /* should not happen*/
6903                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6904             if (RExC_paren_names)
6905                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6906             if ( he_str )
6907                 sv_dat = HeVAL(he_str);
6908             if ( ! sv_dat )
6909                 vFAIL("Reference to nonexistent named group");
6910             return sv_dat;
6911         }
6912         else {
6913             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6914                        (unsigned long) flags);
6915         }
6916         assert(0); /* NOT REACHED */
6917     }
6918     return NULL;
6919 }
6920
6921 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6922     int rem=(int)(RExC_end - RExC_parse);                       \
6923     int cut;                                                    \
6924     int num;                                                    \
6925     int iscut=0;                                                \
6926     if (rem>10) {                                               \
6927         rem=10;                                                 \
6928         iscut=1;                                                \
6929     }                                                           \
6930     cut=10-rem;                                                 \
6931     if (RExC_lastparse!=RExC_parse)                             \
6932         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6933             rem, RExC_parse,                                    \
6934             cut + 4,                                            \
6935             iscut ? "..." : "<"                                 \
6936         );                                                      \
6937     else                                                        \
6938         PerlIO_printf(Perl_debug_log,"%16s","");                \
6939                                                                 \
6940     if (SIZE_ONLY)                                              \
6941        num = RExC_size + 1;                                     \
6942     else                                                        \
6943        num=REG_NODE_NUM(RExC_emit);                             \
6944     if (RExC_lastnum!=num)                                      \
6945        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6946     else                                                        \
6947        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6948     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6949         (int)((depth*2)), "",                                   \
6950         (funcname)                                              \
6951     );                                                          \
6952     RExC_lastnum=num;                                           \
6953     RExC_lastparse=RExC_parse;                                  \
6954 })
6955
6956
6957
6958 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6959     DEBUG_PARSE_MSG((funcname));                            \
6960     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6961 })
6962 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6963     DEBUG_PARSE_MSG((funcname));                            \
6964     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6965 })
6966
6967 /* This section of code defines the inversion list object and its methods.  The
6968  * interfaces are highly subject to change, so as much as possible is static to
6969  * this file.  An inversion list is here implemented as a malloc'd C UV array
6970  * with some added info that is placed as UVs at the beginning in a header
6971  * portion.  An inversion list for Unicode is an array of code points, sorted
6972  * by ordinal number.  The zeroth element is the first code point in the list.
6973  * The 1th element is the first element beyond that not in the list.  In other
6974  * words, the first range is
6975  *  invlist[0]..(invlist[1]-1)
6976  * The other ranges follow.  Thus every element whose index is divisible by two
6977  * marks the beginning of a range that is in the list, and every element not
6978  * divisible by two marks the beginning of a range not in the list.  A single
6979  * element inversion list that contains the single code point N generally
6980  * consists of two elements
6981  *  invlist[0] == N
6982  *  invlist[1] == N+1
6983  * (The exception is when N is the highest representable value on the
6984  * machine, in which case the list containing just it would be a single
6985  * element, itself.  By extension, if the last range in the list extends to
6986  * infinity, then the first element of that range will be in the inversion list
6987  * at a position that is divisible by two, and is the final element in the
6988  * list.)
6989  * Taking the complement (inverting) an inversion list is quite simple, if the
6990  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6991  * This implementation reserves an element at the beginning of each inversion
6992  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
6993  * actual beginning of the list is either that element if 0, or the next one if
6994  * 1.
6995  *
6996  * More about inversion lists can be found in "Unicode Demystified"
6997  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6998  * More will be coming when functionality is added later.
6999  *
7000  * The inversion list data structure is currently implemented as an SV pointing
7001  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7002  * array of UV whose memory management is automatically handled by the existing
7003  * facilities for SV's.
7004  *
7005  * Some of the methods should always be private to the implementation, and some
7006  * should eventually be made public */
7007
7008 /* The header definitions are in F<inline_invlist.c> */
7009
7010 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7011 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7012
7013 #define INVLIST_INITIAL_LEN 10
7014
7015 PERL_STATIC_INLINE UV*
7016 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7017 {
7018     /* Returns a pointer to the first element in the inversion list's array.
7019      * This is called upon initialization of an inversion list.  Where the
7020      * array begins depends on whether the list has the code point U+0000
7021      * in it or not.  The other parameter tells it whether the code that
7022      * follows this call is about to put a 0 in the inversion list or not.
7023      * The first element is either the element with 0, if 0, or the next one,
7024      * if 1 */
7025
7026     UV* zero = get_invlist_zero_addr(invlist);
7027
7028     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7029
7030     /* Must be empty */
7031     assert(! *_get_invlist_len_addr(invlist));
7032
7033     /* 1^1 = 0; 1^0 = 1 */
7034     *zero = 1 ^ will_have_0;
7035     return zero + *zero;
7036 }
7037
7038 PERL_STATIC_INLINE UV*
7039 S_invlist_array(pTHX_ SV* const invlist)
7040 {
7041     /* Returns the pointer to the inversion list's array.  Every time the
7042      * length changes, this needs to be called in case malloc or realloc moved
7043      * it */
7044
7045     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7046
7047     /* Must not be empty.  If these fail, you probably didn't check for <len>
7048      * being non-zero before trying to get the array */
7049     assert(*_get_invlist_len_addr(invlist));
7050     assert(*get_invlist_zero_addr(invlist) == 0
7051            || *get_invlist_zero_addr(invlist) == 1);
7052
7053     /* The array begins either at the element reserved for zero if the
7054      * list contains 0 (that element will be set to 0), or otherwise the next
7055      * element (in which case the reserved element will be set to 1). */
7056     return (UV *) (get_invlist_zero_addr(invlist)
7057                    + *get_invlist_zero_addr(invlist));
7058 }
7059
7060 PERL_STATIC_INLINE void
7061 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7062 {
7063     /* Sets the current number of elements stored in the inversion list */
7064
7065     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7066
7067     *_get_invlist_len_addr(invlist) = len;
7068
7069     assert(len <= SvLEN(invlist));
7070
7071     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7072     /* If the list contains U+0000, that element is part of the header,
7073      * and should not be counted as part of the array.  It will contain
7074      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7075      * subtract:
7076      *  SvCUR_set(invlist,
7077      *            TO_INTERNAL_SIZE(len
7078      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7079      * But, this is only valid if len is not 0.  The consequences of not doing
7080      * this is that the memory allocation code may think that 1 more UV is
7081      * being used than actually is, and so might do an unnecessary grow.  That
7082      * seems worth not bothering to make this the precise amount.
7083      *
7084      * Note that when inverting, SvCUR shouldn't change */
7085 }
7086
7087 PERL_STATIC_INLINE IV*
7088 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7089 {
7090     /* Return the address of the UV that is reserved to hold the cached index
7091      * */
7092
7093     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7094
7095     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7096 }
7097
7098 PERL_STATIC_INLINE IV
7099 S_invlist_previous_index(pTHX_ SV* const invlist)
7100 {
7101     /* Returns cached index of previous search */
7102
7103     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7104
7105     return *get_invlist_previous_index_addr(invlist);
7106 }
7107
7108 PERL_STATIC_INLINE void
7109 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7110 {
7111     /* Caches <index> for later retrieval */
7112
7113     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7114
7115     assert(index == 0 || index < (int) _invlist_len(invlist));
7116
7117     *get_invlist_previous_index_addr(invlist) = index;
7118 }
7119
7120 PERL_STATIC_INLINE UV
7121 S_invlist_max(pTHX_ SV* const invlist)
7122 {
7123     /* Returns the maximum number of elements storable in the inversion list's
7124      * array, without having to realloc() */
7125
7126     PERL_ARGS_ASSERT_INVLIST_MAX;
7127
7128     return FROM_INTERNAL_SIZE(SvLEN(invlist));
7129 }
7130
7131 PERL_STATIC_INLINE UV*
7132 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7133 {
7134     /* Return the address of the UV that is reserved to hold 0 if the inversion
7135      * list contains 0.  This has to be the last element of the heading, as the
7136      * list proper starts with either it if 0, or the next element if not.
7137      * (But we force it to contain either 0 or 1) */
7138
7139     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7140
7141     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7142 }
7143
7144 #ifndef PERL_IN_XSUB_RE
7145 SV*
7146 Perl__new_invlist(pTHX_ IV initial_size)
7147 {
7148
7149     /* Return a pointer to a newly constructed inversion list, with enough
7150      * space to store 'initial_size' elements.  If that number is negative, a
7151      * system default is used instead */
7152
7153     SV* new_list;
7154
7155     if (initial_size < 0) {
7156         initial_size = INVLIST_INITIAL_LEN;
7157     }
7158
7159     /* Allocate the initial space */
7160     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7161     invlist_set_len(new_list, 0);
7162
7163     /* Force iterinit() to be used to get iteration to work */
7164     *get_invlist_iter_addr(new_list) = UV_MAX;
7165
7166     /* This should force a segfault if a method doesn't initialize this
7167      * properly */
7168     *get_invlist_zero_addr(new_list) = UV_MAX;
7169
7170     *get_invlist_previous_index_addr(new_list) = 0;
7171     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7172 #if HEADER_LENGTH != 5
7173 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7174 #endif
7175
7176     return new_list;
7177 }
7178 #endif
7179
7180 STATIC SV*
7181 S__new_invlist_C_array(pTHX_ UV* list)
7182 {
7183     /* Return a pointer to a newly constructed inversion list, initialized to
7184      * point to <list>, which has to be in the exact correct inversion list
7185      * form, including internal fields.  Thus this is a dangerous routine that
7186      * should not be used in the wrong hands */
7187
7188     SV* invlist = newSV_type(SVt_PV);
7189
7190     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7191
7192     SvPV_set(invlist, (char *) list);
7193     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7194                                shouldn't touch it */
7195     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7196
7197     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7198         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7199     }
7200
7201     return invlist;
7202 }
7203
7204 STATIC void
7205 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7206 {
7207     /* Grow the maximum size of an inversion list */
7208
7209     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7210
7211     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7212 }
7213
7214 PERL_STATIC_INLINE void
7215 S_invlist_trim(pTHX_ SV* const invlist)
7216 {
7217     PERL_ARGS_ASSERT_INVLIST_TRIM;
7218
7219     /* Change the length of the inversion list to how many entries it currently
7220      * has */
7221
7222     SvPV_shrink_to_cur((SV *) invlist);
7223 }
7224
7225 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7226
7227 STATIC void
7228 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7229 {
7230    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7231     * the end of the inversion list.  The range must be above any existing
7232     * ones. */
7233
7234     UV* array;
7235     UV max = invlist_max(invlist);
7236     UV len = _invlist_len(invlist);
7237
7238     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7239
7240     if (len == 0) { /* Empty lists must be initialized */
7241         array = _invlist_array_init(invlist, start == 0);
7242     }
7243     else {
7244         /* Here, the existing list is non-empty. The current max entry in the
7245          * list is generally the first value not in the set, except when the
7246          * set extends to the end of permissible values, in which case it is
7247          * the first entry in that final set, and so this call is an attempt to
7248          * append out-of-order */
7249
7250         UV final_element = len - 1;
7251         array = invlist_array(invlist);
7252         if (array[final_element] > start
7253             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7254         {
7255             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7256                        array[final_element], start,
7257                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7258         }
7259
7260         /* Here, it is a legal append.  If the new range begins with the first
7261          * value not in the set, it is extending the set, so the new first
7262          * value not in the set is one greater than the newly extended range.
7263          * */
7264         if (array[final_element] == start) {
7265             if (end != UV_MAX) {
7266                 array[final_element] = end + 1;
7267             }
7268             else {
7269                 /* But if the end is the maximum representable on the machine,
7270                  * just let the range that this would extend to have no end */
7271                 invlist_set_len(invlist, len - 1);
7272             }
7273             return;
7274         }
7275     }
7276
7277     /* Here the new range doesn't extend any existing set.  Add it */
7278
7279     len += 2;   /* Includes an element each for the start and end of range */
7280
7281     /* If overflows the existing space, extend, which may cause the array to be
7282      * moved */
7283     if (max < len) {
7284         invlist_extend(invlist, len);
7285         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7286                                            failure in invlist_array() */
7287         array = invlist_array(invlist);
7288     }
7289     else {
7290         invlist_set_len(invlist, len);
7291     }
7292
7293     /* The next item on the list starts the range, the one after that is
7294      * one past the new range.  */
7295     array[len - 2] = start;
7296     if (end != UV_MAX) {
7297         array[len - 1] = end + 1;
7298     }
7299     else {
7300         /* But if the end is the maximum representable on the machine, just let
7301          * the range have no end */
7302         invlist_set_len(invlist, len - 1);
7303     }
7304 }
7305
7306 #ifndef PERL_IN_XSUB_RE
7307
7308 IV
7309 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7310 {
7311     /* Searches the inversion list for the entry that contains the input code
7312      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7313      * return value is the index into the list's array of the range that
7314      * contains <cp> */
7315
7316     IV low = 0;
7317     IV mid;
7318     IV high = _invlist_len(invlist);
7319     const IV highest_element = high - 1;
7320     const UV* array;
7321
7322     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7323
7324     /* If list is empty, return failure. */
7325     if (high == 0) {
7326         return -1;
7327     }
7328
7329     /* If the code point is before the first element, return failure.  (We
7330      * can't combine this with the test above, because we can't get the array
7331      * unless we know the list is non-empty) */
7332     array = invlist_array(invlist);
7333
7334     mid = invlist_previous_index(invlist);
7335     assert(mid >=0 && mid <= highest_element);
7336
7337     /* <mid> contains the cache of the result of the previous call to this
7338      * function (0 the first time).  See if this call is for the same result,
7339      * or if it is for mid-1.  This is under the theory that calls to this
7340      * function will often be for related code points that are near each other.
7341      * And benchmarks show that caching gives better results.  We also test
7342      * here if the code point is within the bounds of the list.  These tests
7343      * replace others that would have had to be made anyway to make sure that
7344      * the array bounds were not exceeded, and give us extra information at the
7345      * same time */
7346     if (cp >= array[mid]) {
7347         if (cp >= array[highest_element]) {
7348             return highest_element;
7349         }
7350
7351         /* Here, array[mid] <= cp < array[highest_element].  This means that
7352          * the final element is not the answer, so can exclude it; it also
7353          * means that <mid> is not the final element, so can refer to 'mid + 1'
7354          * safely */
7355         if (cp < array[mid + 1]) {
7356             return mid;
7357         }
7358         high--;
7359         low = mid + 1;
7360     }
7361     else { /* cp < aray[mid] */
7362         if (cp < array[0]) { /* Fail if outside the array */
7363             return -1;
7364         }
7365         high = mid;
7366         if (cp >= array[mid - 1]) {
7367             goto found_entry;
7368         }
7369     }
7370
7371     /* Binary search.  What we are looking for is <i> such that
7372      *  array[i] <= cp < array[i+1]
7373      * The loop below converges on the i+1.  Note that there may not be an
7374      * (i+1)th element in the array, and things work nonetheless */
7375     while (low < high) {
7376         mid = (low + high) / 2;
7377         assert(mid <= highest_element);
7378         if (array[mid] <= cp) { /* cp >= array[mid] */
7379             low = mid + 1;
7380
7381             /* We could do this extra test to exit the loop early.
7382             if (cp < array[low]) {
7383                 return mid;
7384             }
7385             */
7386         }
7387         else { /* cp < array[mid] */
7388             high = mid;
7389         }
7390     }
7391
7392   found_entry:
7393     high--;
7394     invlist_set_previous_index(invlist, high);
7395     return high;
7396 }
7397
7398 void
7399 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7400 {
7401     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7402      * but is used when the swash has an inversion list.  This makes this much
7403      * faster, as it uses a binary search instead of a linear one.  This is
7404      * intimately tied to that function, and perhaps should be in utf8.c,
7405      * except it is intimately tied to inversion lists as well.  It assumes
7406      * that <swatch> is all 0's on input */
7407
7408     UV current = start;
7409     const IV len = _invlist_len(invlist);
7410     IV i;
7411     const UV * array;
7412
7413     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7414
7415     if (len == 0) { /* Empty inversion list */
7416         return;
7417     }
7418
7419     array = invlist_array(invlist);
7420
7421     /* Find which element it is */
7422     i = _invlist_search(invlist, start);
7423
7424     /* We populate from <start> to <end> */
7425     while (current < end) {
7426         UV upper;
7427
7428         /* The inversion list gives the results for every possible code point
7429          * after the first one in the list.  Only those ranges whose index is
7430          * even are ones that the inversion list matches.  For the odd ones,
7431          * and if the initial code point is not in the list, we have to skip
7432          * forward to the next element */
7433         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7434             i++;
7435             if (i >= len) { /* Finished if beyond the end of the array */
7436                 return;
7437             }
7438             current = array[i];
7439             if (current >= end) {   /* Finished if beyond the end of what we
7440                                        are populating */
7441                 if (LIKELY(end < UV_MAX)) {
7442                     return;
7443                 }
7444
7445                 /* We get here when the upper bound is the maximum
7446                  * representable on the machine, and we are looking for just
7447                  * that code point.  Have to special case it */
7448                 i = len;
7449                 goto join_end_of_list;
7450             }
7451         }
7452         assert(current >= start);
7453
7454         /* The current range ends one below the next one, except don't go past
7455          * <end> */
7456         i++;
7457         upper = (i < len && array[i] < end) ? array[i] : end;
7458
7459         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7460          * for each code point in it */
7461         for (; current < upper; current++) {
7462             const STRLEN offset = (STRLEN)(current - start);
7463             swatch[offset >> 3] |= 1 << (offset & 7);
7464         }
7465
7466     join_end_of_list:
7467
7468         /* Quit if at the end of the list */
7469         if (i >= len) {
7470
7471             /* But first, have to deal with the highest possible code point on
7472              * the platform.  The previous code assumes that <end> is one
7473              * beyond where we want to populate, but that is impossible at the
7474              * platform's infinity, so have to handle it specially */
7475             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7476             {
7477                 const STRLEN offset = (STRLEN)(end - start);
7478                 swatch[offset >> 3] |= 1 << (offset & 7);
7479             }
7480             return;
7481         }
7482
7483         /* Advance to the next range, which will be for code points not in the
7484          * inversion list */
7485         current = array[i];
7486     }
7487
7488     return;
7489 }
7490
7491 void
7492 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7493 {
7494     /* Take the union of two inversion lists and point <output> to it.  *output
7495      * should be defined upon input, and if it points to one of the two lists,
7496      * the reference count to that list will be decremented.  The first list,
7497      * <a>, may be NULL, in which case a copy of the second list is returned.
7498      * If <complement_b> is TRUE, the union is taken of the complement
7499      * (inversion) of <b> instead of b itself.
7500      *
7501      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7502      * Richard Gillam, published by Addison-Wesley, and explained at some
7503      * length there.  The preface says to incorporate its examples into your
7504      * code at your own risk.
7505      *
7506      * The algorithm is like a merge sort.
7507      *
7508      * XXX A potential performance improvement is to keep track as we go along
7509      * if only one of the inputs contributes to the result, meaning the other
7510      * is a subset of that one.  In that case, we can skip the final copy and
7511      * return the larger of the input lists, but then outside code might need
7512      * to keep track of whether to free the input list or not */
7513
7514     UV* array_a;    /* a's array */
7515     UV* array_b;
7516     UV len_a;       /* length of a's array */
7517     UV len_b;
7518
7519     SV* u;                      /* the resulting union */
7520     UV* array_u;
7521     UV len_u;
7522
7523     UV i_a = 0;             /* current index into a's array */
7524     UV i_b = 0;
7525     UV i_u = 0;
7526
7527     /* running count, as explained in the algorithm source book; items are
7528      * stopped accumulating and are output when the count changes to/from 0.
7529      * The count is incremented when we start a range that's in the set, and
7530      * decremented when we start a range that's not in the set.  So its range
7531      * is 0 to 2.  Only when the count is zero is something not in the set.
7532      */
7533     UV count = 0;
7534
7535     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7536     assert(a != b);
7537
7538     /* If either one is empty, the union is the other one */
7539     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7540         if (*output == a) {
7541             if (a != NULL) {
7542                 SvREFCNT_dec(a);
7543             }
7544         }
7545         if (*output != b) {
7546             *output = invlist_clone(b);
7547             if (complement_b) {
7548                 _invlist_invert(*output);
7549             }
7550         } /* else *output already = b; */
7551         return;
7552     }
7553     else if ((len_b = _invlist_len(b)) == 0) {
7554         if (*output == b) {
7555             SvREFCNT_dec(b);
7556         }
7557
7558         /* The complement of an empty list is a list that has everything in it,
7559          * so the union with <a> includes everything too */
7560         if (complement_b) {
7561             if (a == *output) {
7562                 SvREFCNT_dec(a);
7563             }
7564             *output = _new_invlist(1);
7565             _append_range_to_invlist(*output, 0, UV_MAX);
7566         }
7567         else if (*output != a) {
7568             *output = invlist_clone(a);
7569         }
7570         /* else *output already = a; */
7571         return;
7572     }
7573
7574     /* Here both lists exist and are non-empty */
7575     array_a = invlist_array(a);
7576     array_b = invlist_array(b);
7577
7578     /* If are to take the union of 'a' with the complement of b, set it
7579      * up so are looking at b's complement. */
7580     if (complement_b) {
7581
7582         /* To complement, we invert: if the first element is 0, remove it.  To
7583          * do this, we just pretend the array starts one later, and clear the
7584          * flag as we don't have to do anything else later */
7585         if (array_b[0] == 0) {
7586             array_b++;
7587             len_b--;
7588             complement_b = FALSE;
7589         }
7590         else {
7591
7592             /* But if the first element is not zero, we unshift a 0 before the
7593              * array.  The data structure reserves a space for that 0 (which
7594              * should be a '1' right now), so physical shifting is unneeded,
7595              * but temporarily change that element to 0.  Before exiting the
7596              * routine, we must restore the element to '1' */
7597             array_b--;
7598             len_b++;
7599             array_b[0] = 0;
7600         }
7601     }
7602
7603     /* Size the union for the worst case: that the sets are completely
7604      * disjoint */
7605     u = _new_invlist(len_a + len_b);
7606
7607     /* Will contain U+0000 if either component does */
7608     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7609                                       || (len_b > 0 && array_b[0] == 0));
7610
7611     /* Go through each list item by item, stopping when exhausted one of
7612      * them */
7613     while (i_a < len_a && i_b < len_b) {
7614         UV cp;      /* The element to potentially add to the union's array */
7615         bool cp_in_set;   /* is it in the the input list's set or not */
7616
7617         /* We need to take one or the other of the two inputs for the union.
7618          * Since we are merging two sorted lists, we take the smaller of the
7619          * next items.  In case of a tie, we take the one that is in its set
7620          * first.  If we took one not in the set first, it would decrement the
7621          * count, possibly to 0 which would cause it to be output as ending the
7622          * range, and the next time through we would take the same number, and
7623          * output it again as beginning the next range.  By doing it the
7624          * opposite way, there is no possibility that the count will be
7625          * momentarily decremented to 0, and thus the two adjoining ranges will
7626          * be seamlessly merged.  (In a tie and both are in the set or both not
7627          * in the set, it doesn't matter which we take first.) */
7628         if (array_a[i_a] < array_b[i_b]
7629             || (array_a[i_a] == array_b[i_b]
7630                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7631         {
7632             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7633             cp= array_a[i_a++];
7634         }
7635         else {
7636             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7637             cp= array_b[i_b++];
7638         }
7639
7640         /* Here, have chosen which of the two inputs to look at.  Only output
7641          * if the running count changes to/from 0, which marks the
7642          * beginning/end of a range in that's in the set */
7643         if (cp_in_set) {
7644             if (count == 0) {
7645                 array_u[i_u++] = cp;
7646             }
7647             count++;
7648         }
7649         else {
7650             count--;
7651             if (count == 0) {
7652                 array_u[i_u++] = cp;
7653             }
7654         }
7655     }
7656
7657     /* Here, we are finished going through at least one of the lists, which
7658      * means there is something remaining in at most one.  We check if the list
7659      * that hasn't been exhausted is positioned such that we are in the middle
7660      * of a range in its set or not.  (i_a and i_b point to the element beyond
7661      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7662      * is potentially more to output.
7663      * There are four cases:
7664      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7665      *     in the union is entirely from the non-exhausted set.
7666      *  2) Both were in their sets, count is 2.  Nothing further should
7667      *     be output, as everything that remains will be in the exhausted
7668      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7669      *     that
7670      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7671      *     Nothing further should be output because the union includes
7672      *     everything from the exhausted set.  Not decrementing ensures that.
7673      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7674      *     decrementing to 0 insures that we look at the remainder of the
7675      *     non-exhausted set */
7676     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7677         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7678     {
7679         count--;
7680     }
7681
7682     /* The final length is what we've output so far, plus what else is about to
7683      * be output.  (If 'count' is non-zero, then the input list we exhausted
7684      * has everything remaining up to the machine's limit in its set, and hence
7685      * in the union, so there will be no further output. */
7686     len_u = i_u;
7687     if (count == 0) {
7688         /* At most one of the subexpressions will be non-zero */
7689         len_u += (len_a - i_a) + (len_b - i_b);
7690     }
7691
7692     /* Set result to final length, which can change the pointer to array_u, so
7693      * re-find it */
7694     if (len_u != _invlist_len(u)) {
7695         invlist_set_len(u, len_u);
7696         invlist_trim(u);
7697         array_u = invlist_array(u);
7698     }
7699
7700     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7701      * the other) ended with everything above it not in its set.  That means
7702      * that the remaining part of the union is precisely the same as the
7703      * non-exhausted list, so can just copy it unchanged.  (If both list were
7704      * exhausted at the same time, then the operations below will be both 0.)
7705      */
7706     if (count == 0) {
7707         IV copy_count; /* At most one will have a non-zero copy count */
7708         if ((copy_count = len_a - i_a) > 0) {
7709             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7710         }
7711         else if ((copy_count = len_b - i_b) > 0) {
7712             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7713         }
7714     }
7715
7716     /*  We may be removing a reference to one of the inputs */
7717     if (a == *output || b == *output) {
7718         SvREFCNT_dec(*output);
7719     }
7720
7721     /* If we've changed b, restore it */
7722     if (complement_b) {
7723         array_b[0] = 1;
7724     }
7725
7726     *output = u;
7727     return;
7728 }
7729
7730 void
7731 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7732 {
7733     /* Take the intersection of two inversion lists and point <i> to it.  *i
7734      * should be defined upon input, and if it points to one of the two lists,
7735      * the reference count to that list will be decremented.
7736      * If <complement_b> is TRUE, the result will be the intersection of <a>
7737      * and the complement (or inversion) of <b> instead of <b> directly.
7738      *
7739      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7740      * Richard Gillam, published by Addison-Wesley, and explained at some
7741      * length there.  The preface says to incorporate its examples into your
7742      * code at your own risk.  In fact, it had bugs
7743      *
7744      * The algorithm is like a merge sort, and is essentially the same as the
7745      * union above
7746      */
7747
7748     UV* array_a;                /* a's array */
7749     UV* array_b;
7750     UV len_a;   /* length of a's array */
7751     UV len_b;
7752
7753     SV* r;                   /* the resulting intersection */
7754     UV* array_r;
7755     UV len_r;
7756
7757     UV i_a = 0;             /* current index into a's array */
7758     UV i_b = 0;
7759     UV i_r = 0;
7760
7761     /* running count, as explained in the algorithm source book; items are
7762      * stopped accumulating and are output when the count changes to/from 2.
7763      * The count is incremented when we start a range that's in the set, and
7764      * decremented when we start a range that's not in the set.  So its range
7765      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7766      */
7767     UV count = 0;
7768
7769     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7770     assert(a != b);
7771
7772     /* Special case if either one is empty */
7773     len_a = _invlist_len(a);
7774     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7775
7776         if (len_a != 0 && complement_b) {
7777
7778             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7779              * be empty.  Here, also we are using 'b's complement, which hence
7780              * must be every possible code point.  Thus the intersection is
7781              * simply 'a'. */
7782             if (*i != a) {
7783                 *i = invlist_clone(a);
7784
7785                 if (*i == b) {
7786                     SvREFCNT_dec(b);
7787                 }
7788             }
7789             /* else *i is already 'a' */
7790             return;
7791         }
7792
7793         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7794          * intersection must be empty */
7795         if (*i == a) {
7796             SvREFCNT_dec(a);
7797         }
7798         else if (*i == b) {
7799             SvREFCNT_dec(b);
7800         }
7801         *i = _new_invlist(0);
7802         return;
7803     }
7804
7805     /* Here both lists exist and are non-empty */
7806     array_a = invlist_array(a);
7807     array_b = invlist_array(b);
7808
7809     /* If are to take the intersection of 'a' with the complement of b, set it
7810      * up so are looking at b's complement. */
7811     if (complement_b) {
7812
7813         /* To complement, we invert: if the first element is 0, remove it.  To
7814          * do this, we just pretend the array starts one later, and clear the
7815          * flag as we don't have to do anything else later */
7816         if (array_b[0] == 0) {
7817             array_b++;
7818             len_b--;
7819             complement_b = FALSE;
7820         }
7821         else {
7822
7823             /* But if the first element is not zero, we unshift a 0 before the
7824              * array.  The data structure reserves a space for that 0 (which
7825              * should be a '1' right now), so physical shifting is unneeded,
7826              * but temporarily change that element to 0.  Before exiting the
7827              * routine, we must restore the element to '1' */
7828             array_b--;
7829             len_b++;
7830             array_b[0] = 0;
7831         }
7832     }
7833
7834     /* Size the intersection for the worst case: that the intersection ends up
7835      * fragmenting everything to be completely disjoint */
7836     r= _new_invlist(len_a + len_b);
7837
7838     /* Will contain U+0000 iff both components do */
7839     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7840                                      && len_b > 0 && array_b[0] == 0);
7841
7842     /* Go through each list item by item, stopping when exhausted one of
7843      * them */
7844     while (i_a < len_a && i_b < len_b) {
7845         UV cp;      /* The element to potentially add to the intersection's
7846                        array */
7847         bool cp_in_set; /* Is it in the input list's set or not */
7848
7849         /* We need to take one or the other of the two inputs for the
7850          * intersection.  Since we are merging two sorted lists, we take the
7851          * smaller of the next items.  In case of a tie, we take the one that
7852          * is not in its set first (a difference from the union algorithm).  If
7853          * we took one in the set first, it would increment the count, possibly
7854          * to 2 which would cause it to be output as starting a range in the
7855          * intersection, and the next time through we would take that same
7856          * number, and output it again as ending the set.  By doing it the
7857          * opposite of this, there is no possibility that the count will be
7858          * momentarily incremented to 2.  (In a tie and both are in the set or
7859          * both not in the set, it doesn't matter which we take first.) */
7860         if (array_a[i_a] < array_b[i_b]
7861             || (array_a[i_a] == array_b[i_b]
7862                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7863         {
7864             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7865             cp= array_a[i_a++];
7866         }
7867         else {
7868             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7869             cp= array_b[i_b++];
7870         }
7871
7872         /* Here, have chosen which of the two inputs to look at.  Only output
7873          * if the running count changes to/from 2, which marks the
7874          * beginning/end of a range that's in the intersection */
7875         if (cp_in_set) {
7876             count++;
7877             if (count == 2) {
7878                 array_r[i_r++] = cp;
7879             }
7880         }
7881         else {
7882             if (count == 2) {
7883                 array_r[i_r++] = cp;
7884             }
7885             count--;
7886         }
7887     }
7888
7889     /* Here, we are finished going through at least one of the lists, which
7890      * means there is something remaining in at most one.  We check if the list
7891      * that has been exhausted is positioned such that we are in the middle
7892      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7893      * the ones we care about.)  There are four cases:
7894      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7895      *     nothing left in the intersection.
7896      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7897      *     above 2.  What should be output is exactly that which is in the
7898      *     non-exhausted set, as everything it has is also in the intersection
7899      *     set, and everything it doesn't have can't be in the intersection
7900      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7901      *     gets incremented to 2.  Like the previous case, the intersection is
7902      *     everything that remains in the non-exhausted set.
7903      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7904      *     remains 1.  And the intersection has nothing more. */
7905     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7906         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7907     {
7908         count++;
7909     }
7910
7911     /* The final length is what we've output so far plus what else is in the
7912      * intersection.  At most one of the subexpressions below will be non-zero */
7913     len_r = i_r;
7914     if (count >= 2) {
7915         len_r += (len_a - i_a) + (len_b - i_b);
7916     }
7917
7918     /* Set result to final length, which can change the pointer to array_r, so
7919      * re-find it */
7920     if (len_r != _invlist_len(r)) {
7921         invlist_set_len(r, len_r);
7922         invlist_trim(r);
7923         array_r = invlist_array(r);
7924     }
7925
7926     /* Finish outputting any remaining */
7927     if (count >= 2) { /* At most one will have a non-zero copy count */
7928         IV copy_count;
7929         if ((copy_count = len_a - i_a) > 0) {
7930             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7931         }
7932         else if ((copy_count = len_b - i_b) > 0) {
7933             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7934         }
7935     }
7936
7937     /*  We may be removing a reference to one of the inputs */
7938     if (a == *i || b == *i) {
7939         SvREFCNT_dec(*i);
7940     }
7941
7942     /* If we've changed b, restore it */
7943     if (complement_b) {
7944         array_b[0] = 1;
7945     }
7946
7947     *i = r;
7948     return;
7949 }
7950
7951 SV*
7952 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7953 {
7954     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7955      * set.  A pointer to the inversion list is returned.  This may actually be
7956      * a new list, in which case the passed in one has been destroyed.  The
7957      * passed in inversion list can be NULL, in which case a new one is created
7958      * with just the one range in it */
7959
7960     SV* range_invlist;
7961     UV len;
7962
7963     if (invlist == NULL) {
7964         invlist = _new_invlist(2);
7965         len = 0;
7966     }
7967     else {
7968         len = _invlist_len(invlist);
7969     }
7970
7971     /* If comes after the final entry, can just append it to the end */
7972     if (len == 0
7973         || start >= invlist_array(invlist)
7974                                     [_invlist_len(invlist) - 1])
7975     {
7976         _append_range_to_invlist(invlist, start, end);
7977         return invlist;
7978     }
7979
7980     /* Here, can't just append things, create and return a new inversion list
7981      * which is the union of this range and the existing inversion list */
7982     range_invlist = _new_invlist(2);
7983     _append_range_to_invlist(range_invlist, start, end);
7984
7985     _invlist_union(invlist, range_invlist, &invlist);
7986
7987     /* The temporary can be freed */
7988     SvREFCNT_dec(range_invlist);
7989
7990     return invlist;
7991 }
7992
7993 #endif
7994
7995 PERL_STATIC_INLINE SV*
7996 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7997     return _add_range_to_invlist(invlist, cp, cp);
7998 }
7999
8000 #ifndef PERL_IN_XSUB_RE
8001 void
8002 Perl__invlist_invert(pTHX_ SV* const invlist)
8003 {
8004     /* Complement the input inversion list.  This adds a 0 if the list didn't
8005      * have a zero; removes it otherwise.  As described above, the data
8006      * structure is set up so that this is very efficient */
8007
8008     UV* len_pos = _get_invlist_len_addr(invlist);
8009
8010     PERL_ARGS_ASSERT__INVLIST_INVERT;
8011
8012     /* The inverse of matching nothing is matching everything */
8013     if (*len_pos == 0) {
8014         _append_range_to_invlist(invlist, 0, UV_MAX);
8015         return;
8016     }
8017
8018     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
8019      * zero element was a 0, so it is being removed, so the length decrements
8020      * by 1; and vice-versa.  SvCUR is unaffected */
8021     if (*get_invlist_zero_addr(invlist) ^= 1) {
8022         (*len_pos)--;
8023     }
8024     else {
8025         (*len_pos)++;
8026     }
8027 }
8028
8029 void
8030 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8031 {
8032     /* Complement the input inversion list (which must be a Unicode property,
8033      * all of which don't match above the Unicode maximum code point.)  And
8034      * Perl has chosen to not have the inversion match above that either.  This
8035      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8036      */
8037
8038     UV len;
8039     UV* array;
8040
8041     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8042
8043     _invlist_invert(invlist);
8044
8045     len = _invlist_len(invlist);
8046
8047     if (len != 0) { /* If empty do nothing */
8048         array = invlist_array(invlist);
8049         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8050             /* Add 0x110000.  First, grow if necessary */
8051             len++;
8052             if (invlist_max(invlist) < len) {
8053                 invlist_extend(invlist, len);
8054                 array = invlist_array(invlist);
8055             }
8056             invlist_set_len(invlist, len);
8057             array[len - 1] = PERL_UNICODE_MAX + 1;
8058         }
8059         else {  /* Remove the 0x110000 */
8060             invlist_set_len(invlist, len - 1);
8061         }
8062     }
8063
8064     return;
8065 }
8066 #endif
8067
8068 PERL_STATIC_INLINE SV*
8069 S_invlist_clone(pTHX_ SV* const invlist)
8070 {
8071
8072     /* Return a new inversion list that is a copy of the input one, which is
8073      * unchanged */
8074
8075     /* Need to allocate extra space to accommodate Perl's addition of a
8076      * trailing NUL to SvPV's, since it thinks they are always strings */
8077     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8078     STRLEN length = SvCUR(invlist);
8079
8080     PERL_ARGS_ASSERT_INVLIST_CLONE;
8081
8082     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8083     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8084
8085     return new_invlist;
8086 }
8087
8088 PERL_STATIC_INLINE UV*
8089 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8090 {
8091     /* Return the address of the UV that contains the current iteration
8092      * position */
8093
8094     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8095
8096     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8097 }
8098
8099 PERL_STATIC_INLINE UV*
8100 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8101 {
8102     /* Return the address of the UV that contains the version id. */
8103
8104     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8105
8106     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8107 }
8108
8109 PERL_STATIC_INLINE void
8110 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8111 {
8112     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8113
8114     *get_invlist_iter_addr(invlist) = 0;
8115 }
8116
8117 STATIC bool
8118 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8119 {
8120     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8121      * This call sets in <*start> and <*end>, the next range in <invlist>.
8122      * Returns <TRUE> if successful and the next call will return the next
8123      * range; <FALSE> if was already at the end of the list.  If the latter,
8124      * <*start> and <*end> are unchanged, and the next call to this function
8125      * will start over at the beginning of the list */
8126
8127     UV* pos = get_invlist_iter_addr(invlist);
8128     UV len = _invlist_len(invlist);
8129     UV *array;
8130
8131     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8132
8133     if (*pos >= len) {
8134         *pos = UV_MAX;  /* Force iternit() to be required next time */
8135         return FALSE;
8136     }
8137
8138     array = invlist_array(invlist);
8139
8140     *start = array[(*pos)++];
8141
8142     if (*pos >= len) {
8143         *end = UV_MAX;
8144     }
8145     else {
8146         *end = array[(*pos)++] - 1;
8147     }
8148
8149     return TRUE;
8150 }
8151
8152 PERL_STATIC_INLINE UV
8153 S_invlist_highest(pTHX_ SV* const invlist)
8154 {
8155     /* Returns the highest code point that matches an inversion list.  This API
8156      * has an ambiguity, as it returns 0 under either the highest is actually
8157      * 0, or if the list is empty.  If this distinction matters to you, check
8158      * for emptiness before calling this function */
8159
8160     UV len = _invlist_len(invlist);
8161     UV *array;
8162
8163     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8164
8165     if (len == 0) {
8166         return 0;
8167     }
8168
8169     array = invlist_array(invlist);
8170
8171     /* The last element in the array in the inversion list always starts a
8172      * range that goes to infinity.  That range may be for code points that are
8173      * matched in the inversion list, or it may be for ones that aren't
8174      * matched.  In the latter case, the highest code point in the set is one
8175      * less than the beginning of this range; otherwise it is the final element
8176      * of this range: infinity */
8177     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8178            ? UV_MAX
8179            : array[len - 1] - 1;
8180 }
8181
8182 #ifndef PERL_IN_XSUB_RE
8183 SV *
8184 Perl__invlist_contents(pTHX_ SV* const invlist)
8185 {
8186     /* Get the contents of an inversion list into a string SV so that they can
8187      * be printed out.  It uses the format traditionally done for debug tracing
8188      */
8189
8190     UV start, end;
8191     SV* output = newSVpvs("\n");
8192
8193     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8194
8195     invlist_iterinit(invlist);
8196     while (invlist_iternext(invlist, &start, &end)) {
8197         if (end == UV_MAX) {
8198             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8199         }
8200         else if (end != start) {
8201             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8202                     start,       end);
8203         }
8204         else {
8205             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8206         }
8207     }
8208
8209     return output;
8210 }
8211 #endif
8212
8213 #if 0
8214 void
8215 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8216 {
8217     /* Dumps out the ranges in an inversion list.  The string 'header'
8218      * if present is output on a line before the first range */
8219
8220     UV start, end;
8221
8222     if (header && strlen(header)) {
8223         PerlIO_printf(Perl_debug_log, "%s\n", header);
8224     }
8225     invlist_iterinit(invlist);
8226     while (invlist_iternext(invlist, &start, &end)) {
8227         if (end == UV_MAX) {
8228             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8229         }
8230         else {
8231             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8232         }
8233     }
8234 }
8235 #endif
8236
8237 #if 0
8238 bool
8239 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8240 {
8241     /* Return a boolean as to if the two passed in inversion lists are
8242      * identical.  The final argument, if TRUE, says to take the complement of
8243      * the second inversion list before doing the comparison */
8244
8245     UV* array_a = invlist_array(a);
8246     UV* array_b = invlist_array(b);
8247     UV len_a = _invlist_len(a);
8248     UV len_b = _invlist_len(b);
8249
8250     UV i = 0;               /* current index into the arrays */
8251     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8252
8253     PERL_ARGS_ASSERT__INVLISTEQ;
8254
8255     /* If are to compare 'a' with the complement of b, set it
8256      * up so are looking at b's complement. */
8257     if (complement_b) {
8258
8259         /* The complement of nothing is everything, so <a> would have to have
8260          * just one element, starting at zero (ending at infinity) */
8261         if (len_b == 0) {
8262             return (len_a == 1 && array_a[0] == 0);
8263         }
8264         else if (array_b[0] == 0) {
8265
8266             /* Otherwise, to complement, we invert.  Here, the first element is
8267              * 0, just remove it.  To do this, we just pretend the array starts
8268              * one later, and clear the flag as we don't have to do anything
8269              * else later */
8270
8271             array_b++;
8272             len_b--;
8273             complement_b = FALSE;
8274         }
8275         else {
8276
8277             /* But if the first element is not zero, we unshift a 0 before the
8278              * array.  The data structure reserves a space for that 0 (which
8279              * should be a '1' right now), so physical shifting is unneeded,
8280              * but temporarily change that element to 0.  Before exiting the
8281              * routine, we must restore the element to '1' */
8282             array_b--;
8283             len_b++;
8284             array_b[0] = 0;
8285         }
8286     }
8287
8288     /* Make sure that the lengths are the same, as well as the final element
8289      * before looping through the remainder.  (Thus we test the length, final,
8290      * and first elements right off the bat) */
8291     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8292         retval = FALSE;
8293     }
8294     else for (i = 0; i < len_a - 1; i++) {
8295         if (array_a[i] != array_b[i]) {
8296             retval = FALSE;
8297             break;
8298         }
8299     }
8300
8301     if (complement_b) {
8302         array_b[0] = 1;
8303     }
8304     return retval;
8305 }
8306 #endif
8307
8308 #undef HEADER_LENGTH
8309 #undef INVLIST_INITIAL_LENGTH
8310 #undef TO_INTERNAL_SIZE
8311 #undef FROM_INTERNAL_SIZE
8312 #undef INVLIST_LEN_OFFSET
8313 #undef INVLIST_ZERO_OFFSET
8314 #undef INVLIST_ITER_OFFSET
8315 #undef INVLIST_VERSION_ID
8316
8317 /* End of inversion list object */
8318
8319 /*
8320  - reg - regular expression, i.e. main body or parenthesized thing
8321  *
8322  * Caller must absorb opening parenthesis.
8323  *
8324  * Combining parenthesis handling with the base level of regular expression
8325  * is a trifle forced, but the need to tie the tails of the branches to what
8326  * follows makes it hard to avoid.
8327  */
8328 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8329 #ifdef DEBUGGING
8330 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8331 #else
8332 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8333 #endif
8334
8335 STATIC regnode *
8336 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8337     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8338 {
8339     dVAR;
8340     regnode *ret;               /* Will be the head of the group. */
8341     regnode *br;
8342     regnode *lastbr;
8343     regnode *ender = NULL;
8344     I32 parno = 0;
8345     I32 flags;
8346     U32 oregflags = RExC_flags;
8347     bool have_branch = 0;
8348     bool is_open = 0;
8349     I32 freeze_paren = 0;
8350     I32 after_freeze = 0;
8351
8352     /* for (?g), (?gc), and (?o) warnings; warning
8353        about (?c) will warn about (?g) -- japhy    */
8354
8355 #define WASTED_O  0x01
8356 #define WASTED_G  0x02
8357 #define WASTED_C  0x04
8358 #define WASTED_GC (0x02|0x04)
8359     I32 wastedflags = 0x00;
8360
8361     char * parse_start = RExC_parse; /* MJD */
8362     char * const oregcomp_parse = RExC_parse;
8363
8364     GET_RE_DEBUG_FLAGS_DECL;
8365
8366     PERL_ARGS_ASSERT_REG;
8367     DEBUG_PARSE("reg ");
8368
8369     *flagp = 0;                         /* Tentatively. */
8370
8371
8372     /* Make an OPEN node, if parenthesized. */
8373     if (paren) {
8374         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8375             char *start_verb = RExC_parse;
8376             STRLEN verb_len = 0;
8377             char *start_arg = NULL;
8378             unsigned char op = 0;
8379             int argok = 1;
8380             int internal_argval = 0; /* internal_argval is only useful if !argok */
8381             while ( *RExC_parse && *RExC_parse != ')' ) {
8382                 if ( *RExC_parse == ':' ) {
8383                     start_arg = RExC_parse + 1;
8384                     break;
8385                 }
8386                 RExC_parse++;
8387             }
8388             ++start_verb;
8389             verb_len = RExC_parse - start_verb;
8390             if ( start_arg ) {
8391                 RExC_parse++;
8392                 while ( *RExC_parse && *RExC_parse != ')' )
8393                     RExC_parse++;
8394                 if ( *RExC_parse != ')' )
8395                     vFAIL("Unterminated verb pattern argument");
8396                 if ( RExC_parse == start_arg )
8397                     start_arg = NULL;
8398             } else {
8399                 if ( *RExC_parse != ')' )
8400                     vFAIL("Unterminated verb pattern");
8401             }
8402
8403             switch ( *start_verb ) {
8404             case 'A':  /* (*ACCEPT) */
8405                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8406                     op = ACCEPT;
8407                     internal_argval = RExC_nestroot;
8408                 }
8409                 break;
8410             case 'C':  /* (*COMMIT) */
8411                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8412                     op = COMMIT;
8413                 break;
8414             case 'F':  /* (*FAIL) */
8415                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8416                     op = OPFAIL;
8417                     argok = 0;
8418                 }
8419                 break;
8420             case ':':  /* (*:NAME) */
8421             case 'M':  /* (*MARK:NAME) */
8422                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8423                     op = MARKPOINT;
8424                     argok = -1;
8425                 }
8426                 break;
8427             case 'P':  /* (*PRUNE) */
8428                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8429                     op = PRUNE;
8430                 break;
8431             case 'S':   /* (*SKIP) */
8432                 if ( memEQs(start_verb,verb_len,"SKIP") )
8433                     op = SKIP;
8434                 break;
8435             case 'T':  /* (*THEN) */
8436                 /* [19:06] <TimToady> :: is then */
8437                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8438                     op = CUTGROUP;
8439                     RExC_seen |= REG_SEEN_CUTGROUP;
8440                 }
8441                 break;
8442             }
8443             if ( ! op ) {
8444                 RExC_parse++;
8445                 vFAIL3("Unknown verb pattern '%.*s'",
8446                     verb_len, start_verb);
8447             }
8448             if ( argok ) {
8449                 if ( start_arg && internal_argval ) {
8450                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8451                         verb_len, start_verb);
8452                 } else if ( argok < 0 && !start_arg ) {
8453                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8454                         verb_len, start_verb);
8455                 } else {
8456                     ret = reganode(pRExC_state, op, internal_argval);
8457                     if ( ! internal_argval && ! SIZE_ONLY ) {
8458                         if (start_arg) {
8459                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8460                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8461                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8462                             ret->flags = 0;
8463                         } else {
8464                             ret->flags = 1;
8465                         }
8466                     }
8467                 }
8468                 if (!internal_argval)
8469                     RExC_seen |= REG_SEEN_VERBARG;
8470             } else if ( start_arg ) {
8471                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8472                         verb_len, start_verb);
8473             } else {
8474                 ret = reg_node(pRExC_state, op);
8475             }
8476             nextchar(pRExC_state);
8477             return ret;
8478         } else
8479         if (*RExC_parse == '?') { /* (?...) */
8480             bool is_logical = 0;
8481             const char * const seqstart = RExC_parse;
8482             bool has_use_defaults = FALSE;
8483
8484             RExC_parse++;
8485             paren = *RExC_parse++;
8486             ret = NULL;                 /* For look-ahead/behind. */
8487             switch (paren) {
8488
8489             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8490                 paren = *RExC_parse++;
8491                 if ( paren == '<')         /* (?P<...>) named capture */
8492                     goto named_capture;
8493                 else if (paren == '>') {   /* (?P>name) named recursion */
8494                     goto named_recursion;
8495                 }
8496                 else if (paren == '=') {   /* (?P=...)  named backref */
8497                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8498                        you change this make sure you change that */
8499                     char* name_start = RExC_parse;
8500                     U32 num = 0;
8501                     SV *sv_dat = reg_scan_name(pRExC_state,
8502                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8503                     if (RExC_parse == name_start || *RExC_parse != ')')
8504                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8505
8506                     if (!SIZE_ONLY) {
8507                         num = add_data( pRExC_state, 1, "S" );
8508                         RExC_rxi->data->data[num]=(void*)sv_dat;
8509                         SvREFCNT_inc_simple_void(sv_dat);
8510                     }
8511                     RExC_sawback = 1;
8512                     ret = reganode(pRExC_state,
8513                                    ((! FOLD)
8514                                      ? NREF
8515                                      : (ASCII_FOLD_RESTRICTED)
8516                                        ? NREFFA
8517                                        : (AT_LEAST_UNI_SEMANTICS)
8518                                          ? NREFFU
8519                                          : (LOC)
8520                                            ? NREFFL
8521                                            : NREFF),
8522                                     num);
8523                     *flagp |= HASWIDTH;
8524
8525                     Set_Node_Offset(ret, parse_start+1);
8526                     Set_Node_Cur_Length(ret); /* MJD */
8527
8528                     nextchar(pRExC_state);
8529                     return ret;
8530                 }
8531                 RExC_parse++;
8532                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8533                 /*NOTREACHED*/
8534             case '<':           /* (?<...) */
8535                 if (*RExC_parse == '!')
8536                     paren = ',';
8537                 else if (*RExC_parse != '=')
8538               named_capture:
8539                 {               /* (?<...>) */
8540                     char *name_start;
8541                     SV *svname;
8542                     paren= '>';
8543             case '\'':          /* (?'...') */
8544                     name_start= RExC_parse;
8545                     svname = reg_scan_name(pRExC_state,
8546                         SIZE_ONLY ?  /* reverse test from the others */
8547                         REG_RSN_RETURN_NAME :
8548                         REG_RSN_RETURN_NULL);
8549                     if (RExC_parse == name_start) {
8550                         RExC_parse++;
8551                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8552                         /*NOTREACHED*/
8553                     }
8554                     if (*RExC_parse != paren)
8555                         vFAIL2("Sequence (?%c... not terminated",
8556                             paren=='>' ? '<' : paren);
8557                     if (SIZE_ONLY) {
8558                         HE *he_str;
8559                         SV *sv_dat = NULL;
8560                         if (!svname) /* shouldn't happen */
8561                             Perl_croak(aTHX_
8562                                 "panic: reg_scan_name returned NULL");
8563                         if (!RExC_paren_names) {
8564                             RExC_paren_names= newHV();
8565                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8566 #ifdef DEBUGGING
8567                             RExC_paren_name_list= newAV();
8568                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8569 #endif
8570                         }
8571                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8572                         if ( he_str )
8573                             sv_dat = HeVAL(he_str);
8574                         if ( ! sv_dat ) {
8575                             /* croak baby croak */
8576                             Perl_croak(aTHX_
8577                                 "panic: paren_name hash element allocation failed");
8578                         } else if ( SvPOK(sv_dat) ) {
8579                             /* (?|...) can mean we have dupes so scan to check
8580                                its already been stored. Maybe a flag indicating
8581                                we are inside such a construct would be useful,
8582                                but the arrays are likely to be quite small, so
8583                                for now we punt -- dmq */
8584                             IV count = SvIV(sv_dat);
8585                             I32 *pv = (I32*)SvPVX(sv_dat);
8586                             IV i;
8587                             for ( i = 0 ; i < count ; i++ ) {
8588                                 if ( pv[i] == RExC_npar ) {
8589                                     count = 0;
8590                                     break;
8591                                 }
8592                             }
8593                             if ( count ) {
8594                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8595                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8596                                 pv[count] = RExC_npar;
8597                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8598                             }
8599                         } else {
8600                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8601                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8602                             SvIOK_on(sv_dat);
8603                             SvIV_set(sv_dat, 1);
8604                         }
8605 #ifdef DEBUGGING
8606                         /* Yes this does cause a memory leak in debugging Perls */
8607                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8608                             SvREFCNT_dec(svname);
8609 #endif
8610
8611                         /*sv_dump(sv_dat);*/
8612                     }
8613                     nextchar(pRExC_state);
8614                     paren = 1;
8615                     goto capturing_parens;
8616                 }
8617                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8618                 RExC_in_lookbehind++;
8619                 RExC_parse++;
8620             case '=':           /* (?=...) */
8621                 RExC_seen_zerolen++;
8622                 break;
8623             case '!':           /* (?!...) */
8624                 RExC_seen_zerolen++;
8625                 if (*RExC_parse == ')') {
8626                     ret=reg_node(pRExC_state, OPFAIL);
8627                     nextchar(pRExC_state);
8628                     return ret;
8629                 }
8630                 break;
8631             case '|':           /* (?|...) */
8632                 /* branch reset, behave like a (?:...) except that
8633                    buffers in alternations share the same numbers */
8634                 paren = ':';
8635                 after_freeze = freeze_paren = RExC_npar;
8636                 break;
8637             case ':':           /* (?:...) */
8638             case '>':           /* (?>...) */
8639                 break;
8640             case '$':           /* (?$...) */
8641             case '@':           /* (?@...) */
8642                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8643                 break;
8644             case '#':           /* (?#...) */
8645                 while (*RExC_parse && *RExC_parse != ')')
8646                     RExC_parse++;
8647                 if (*RExC_parse != ')')
8648                     FAIL("Sequence (?#... not terminated");
8649                 nextchar(pRExC_state);
8650                 *flagp = TRYAGAIN;
8651                 return NULL;
8652             case '0' :           /* (?0) */
8653             case 'R' :           /* (?R) */
8654                 if (*RExC_parse != ')')
8655                     FAIL("Sequence (?R) not terminated");
8656                 ret = reg_node(pRExC_state, GOSTART);
8657                 *flagp |= POSTPONED;
8658                 nextchar(pRExC_state);
8659                 return ret;
8660                 /*notreached*/
8661             { /* named and numeric backreferences */
8662                 I32 num;
8663             case '&':            /* (?&NAME) */
8664                 parse_start = RExC_parse - 1;
8665               named_recursion:
8666                 {
8667                     SV *sv_dat = reg_scan_name(pRExC_state,
8668                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8669                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8670                 }
8671                 goto gen_recurse_regop;
8672                 assert(0); /* NOT REACHED */
8673             case '+':
8674                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8675                     RExC_parse++;
8676                     vFAIL("Illegal pattern");
8677                 }
8678                 goto parse_recursion;
8679                 /* NOT REACHED*/
8680             case '-': /* (?-1) */
8681                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8682                     RExC_parse--; /* rewind to let it be handled later */
8683                     goto parse_flags;
8684                 }
8685                 /*FALLTHROUGH */
8686             case '1': case '2': case '3': case '4': /* (?1) */
8687             case '5': case '6': case '7': case '8': case '9':
8688                 RExC_parse--;
8689               parse_recursion:
8690                 num = atoi(RExC_parse);
8691                 parse_start = RExC_parse - 1; /* MJD */
8692                 if (*RExC_parse == '-')
8693                     RExC_parse++;
8694                 while (isDIGIT(*RExC_parse))
8695                         RExC_parse++;
8696                 if (*RExC_parse!=')')
8697                     vFAIL("Expecting close bracket");
8698
8699               gen_recurse_regop:
8700                 if ( paren == '-' ) {
8701                     /*
8702                     Diagram of capture buffer numbering.
8703                     Top line is the normal capture buffer numbers
8704                     Bottom line is the negative indexing as from
8705                     the X (the (?-2))
8706
8707                     +   1 2    3 4 5 X          6 7
8708                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8709                     -   5 4    3 2 1 X          x x
8710
8711                     */
8712                     num = RExC_npar + num;
8713                     if (num < 1)  {
8714                         RExC_parse++;
8715                         vFAIL("Reference to nonexistent group");
8716                     }
8717                 } else if ( paren == '+' ) {
8718                     num = RExC_npar + num - 1;
8719                 }
8720
8721                 ret = reganode(pRExC_state, GOSUB, num);
8722                 if (!SIZE_ONLY) {
8723                     if (num > (I32)RExC_rx->nparens) {
8724                         RExC_parse++;
8725                         vFAIL("Reference to nonexistent group");
8726                     }
8727                     ARG2L_SET( ret, RExC_recurse_count++);
8728                     RExC_emit++;
8729                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8730                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8731                 } else {
8732                     RExC_size++;
8733                 }
8734                 RExC_seen |= REG_SEEN_RECURSE;
8735                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8736                 Set_Node_Offset(ret, parse_start); /* MJD */
8737
8738                 *flagp |= POSTPONED;
8739                 nextchar(pRExC_state);
8740                 return ret;
8741             } /* named and numeric backreferences */
8742             assert(0); /* NOT REACHED */
8743
8744             case '?':           /* (??...) */
8745                 is_logical = 1;
8746                 if (*RExC_parse != '{') {
8747                     RExC_parse++;
8748                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8749                     /*NOTREACHED*/
8750                 }
8751                 *flagp |= POSTPONED;
8752                 paren = *RExC_parse++;
8753                 /* FALL THROUGH */
8754             case '{':           /* (?{...}) */
8755             {
8756                 U32 n = 0;
8757                 struct reg_code_block *cb;
8758
8759                 RExC_seen_zerolen++;
8760
8761                 if (   !pRExC_state->num_code_blocks
8762                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8763                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8764                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8765                             - RExC_start)
8766                 ) {
8767                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8768                         FAIL("panic: Sequence (?{...}): no code block found\n");
8769                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8770                 }
8771                 /* this is a pre-compiled code block (?{...}) */
8772                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8773                 RExC_parse = RExC_start + cb->end;
8774                 if (!SIZE_ONLY) {
8775                     OP *o = cb->block;
8776                     if (cb->src_regex) {
8777                         n = add_data(pRExC_state, 2, "rl");
8778                         RExC_rxi->data->data[n] =
8779                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8780                         RExC_rxi->data->data[n+1] = (void*)o;
8781                     }
8782                     else {
8783                         n = add_data(pRExC_state, 1,
8784                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8785                         RExC_rxi->data->data[n] = (void*)o;
8786                     }
8787                 }
8788                 pRExC_state->code_index++;
8789                 nextchar(pRExC_state);
8790
8791                 if (is_logical) {
8792                     regnode *eval;
8793                     ret = reg_node(pRExC_state, LOGICAL);
8794                     eval = reganode(pRExC_state, EVAL, n);
8795                     if (!SIZE_ONLY) {
8796                         ret->flags = 2;
8797                         /* for later propagation into (??{}) return value */
8798                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8799                     }
8800                     REGTAIL(pRExC_state, ret, eval);
8801                     /* deal with the length of this later - MJD */
8802                     return ret;
8803                 }
8804                 ret = reganode(pRExC_state, EVAL, n);
8805                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8806                 Set_Node_Offset(ret, parse_start);
8807                 return ret;
8808             }
8809             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8810             {
8811                 int is_define= 0;
8812                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8813                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8814                         || RExC_parse[1] == '<'
8815                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8816                         I32 flag;
8817
8818                         ret = reg_node(pRExC_state, LOGICAL);
8819                         if (!SIZE_ONLY)
8820                             ret->flags = 1;
8821                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8822                         goto insert_if;
8823                     }
8824                 }
8825                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8826                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8827                 {
8828                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8829                     char *name_start= RExC_parse++;
8830                     U32 num = 0;
8831                     SV *sv_dat=reg_scan_name(pRExC_state,
8832                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8833                     if (RExC_parse == name_start || *RExC_parse != ch)
8834                         vFAIL2("Sequence (?(%c... not terminated",
8835                             (ch == '>' ? '<' : ch));
8836                     RExC_parse++;
8837                     if (!SIZE_ONLY) {
8838                         num = add_data( pRExC_state, 1, "S" );
8839                         RExC_rxi->data->data[num]=(void*)sv_dat;
8840                         SvREFCNT_inc_simple_void(sv_dat);
8841                     }
8842                     ret = reganode(pRExC_state,NGROUPP,num);
8843                     goto insert_if_check_paren;
8844                 }
8845                 else if (RExC_parse[0] == 'D' &&
8846                          RExC_parse[1] == 'E' &&
8847                          RExC_parse[2] == 'F' &&
8848                          RExC_parse[3] == 'I' &&
8849                          RExC_parse[4] == 'N' &&
8850                          RExC_parse[5] == 'E')
8851                 {
8852                     ret = reganode(pRExC_state,DEFINEP,0);
8853                     RExC_parse +=6 ;
8854                     is_define = 1;
8855                     goto insert_if_check_paren;
8856                 }
8857                 else if (RExC_parse[0] == 'R') {
8858                     RExC_parse++;
8859                     parno = 0;
8860                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8861                         parno = atoi(RExC_parse++);
8862                         while (isDIGIT(*RExC_parse))
8863                             RExC_parse++;
8864                     } else if (RExC_parse[0] == '&') {
8865                         SV *sv_dat;
8866                         RExC_parse++;
8867                         sv_dat = reg_scan_name(pRExC_state,
8868                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8869                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8870                     }
8871                     ret = reganode(pRExC_state,INSUBP,parno);
8872                     goto insert_if_check_paren;
8873                 }
8874                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8875                     /* (?(1)...) */
8876                     char c;
8877                     parno = atoi(RExC_parse++);
8878
8879                     while (isDIGIT(*RExC_parse))
8880                         RExC_parse++;
8881                     ret = reganode(pRExC_state, GROUPP, parno);
8882
8883                  insert_if_check_paren:
8884                     if ((c = *nextchar(pRExC_state)) != ')')
8885                         vFAIL("Switch condition not recognized");
8886                   insert_if:
8887                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8888                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8889                     if (br == NULL)
8890                         br = reganode(pRExC_state, LONGJMP, 0);
8891                     else
8892                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8893                     c = *nextchar(pRExC_state);
8894                     if (flags&HASWIDTH)
8895                         *flagp |= HASWIDTH;
8896                     if (c == '|') {
8897                         if (is_define)
8898                             vFAIL("(?(DEFINE)....) does not allow branches");
8899                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8900                         regbranch(pRExC_state, &flags, 1,depth+1);
8901                         REGTAIL(pRExC_state, ret, lastbr);
8902                         if (flags&HASWIDTH)
8903                             *flagp |= HASWIDTH;
8904                         c = *nextchar(pRExC_state);
8905                     }
8906                     else
8907                         lastbr = NULL;
8908                     if (c != ')')
8909                         vFAIL("Switch (?(condition)... contains too many branches");
8910                     ender = reg_node(pRExC_state, TAIL);
8911                     REGTAIL(pRExC_state, br, ender);
8912                     if (lastbr) {
8913                         REGTAIL(pRExC_state, lastbr, ender);
8914                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8915                     }
8916                     else
8917                         REGTAIL(pRExC_state, ret, ender);
8918                     RExC_size++; /* XXX WHY do we need this?!!
8919                                     For large programs it seems to be required
8920                                     but I can't figure out why. -- dmq*/
8921                     return ret;
8922                 }
8923                 else {
8924                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8925                 }
8926             }
8927             case 0:
8928                 RExC_parse--; /* for vFAIL to print correctly */
8929                 vFAIL("Sequence (? incomplete");
8930                 break;
8931             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8932                                        that follow */
8933                 has_use_defaults = TRUE;
8934                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8935                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8936                                                 ? REGEX_UNICODE_CHARSET
8937                                                 : REGEX_DEPENDS_CHARSET);
8938                 goto parse_flags;
8939             default:
8940                 --RExC_parse;
8941                 parse_flags:      /* (?i) */
8942             {
8943                 U32 posflags = 0, negflags = 0;
8944                 U32 *flagsp = &posflags;
8945                 char has_charset_modifier = '\0';
8946                 regex_charset cs = get_regex_charset(RExC_flags);
8947                 if (cs == REGEX_DEPENDS_CHARSET
8948                     && (RExC_utf8 || RExC_uni_semantics))
8949                 {
8950                     cs = REGEX_UNICODE_CHARSET;
8951                 }
8952
8953                 while (*RExC_parse) {
8954                     /* && strchr("iogcmsx", *RExC_parse) */
8955                     /* (?g), (?gc) and (?o) are useless here
8956                        and must be globally applied -- japhy */
8957                     switch (*RExC_parse) {
8958                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8959                     case LOCALE_PAT_MOD:
8960                         if (has_charset_modifier) {
8961                             goto excess_modifier;
8962                         }
8963                         else if (flagsp == &negflags) {
8964                             goto neg_modifier;
8965                         }
8966                         cs = REGEX_LOCALE_CHARSET;
8967                         has_charset_modifier = LOCALE_PAT_MOD;
8968                         RExC_contains_locale = 1;
8969                         break;
8970                     case UNICODE_PAT_MOD:
8971                         if (has_charset_modifier) {
8972                             goto excess_modifier;
8973                         }
8974                         else if (flagsp == &negflags) {
8975                             goto neg_modifier;
8976                         }
8977                         cs = REGEX_UNICODE_CHARSET;
8978                         has_charset_modifier = UNICODE_PAT_MOD;
8979                         break;
8980                     case ASCII_RESTRICT_PAT_MOD:
8981                         if (flagsp == &negflags) {
8982                             goto neg_modifier;
8983                         }
8984                         if (has_charset_modifier) {
8985                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8986                                 goto excess_modifier;
8987                             }
8988                             /* Doubled modifier implies more restricted */
8989                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8990                         }
8991                         else {
8992                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8993                         }
8994                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8995                         break;
8996                     case DEPENDS_PAT_MOD:
8997                         if (has_use_defaults) {
8998                             goto fail_modifiers;
8999                         }
9000                         else if (flagsp == &negflags) {
9001                             goto neg_modifier;
9002                         }
9003                         else if (has_charset_modifier) {
9004                             goto excess_modifier;
9005                         }
9006
9007                         /* The dual charset means unicode semantics if the
9008                          * pattern (or target, not known until runtime) are
9009                          * utf8, or something in the pattern indicates unicode
9010                          * semantics */
9011                         cs = (RExC_utf8 || RExC_uni_semantics)
9012                              ? REGEX_UNICODE_CHARSET
9013                              : REGEX_DEPENDS_CHARSET;
9014                         has_charset_modifier = DEPENDS_PAT_MOD;
9015                         break;
9016                     excess_modifier:
9017                         RExC_parse++;
9018                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9019                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9020                         }
9021                         else if (has_charset_modifier == *(RExC_parse - 1)) {
9022                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9023                         }
9024                         else {
9025                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9026                         }
9027                         /*NOTREACHED*/
9028                     neg_modifier:
9029                         RExC_parse++;
9030                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9031                         /*NOTREACHED*/
9032                     case ONCE_PAT_MOD: /* 'o' */
9033                     case GLOBAL_PAT_MOD: /* 'g' */
9034                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9035                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9036                             if (! (wastedflags & wflagbit) ) {
9037                                 wastedflags |= wflagbit;
9038                                 vWARN5(
9039                                     RExC_parse + 1,
9040                                     "Useless (%s%c) - %suse /%c modifier",
9041                                     flagsp == &negflags ? "?-" : "?",
9042                                     *RExC_parse,
9043                                     flagsp == &negflags ? "don't " : "",
9044                                     *RExC_parse
9045                                 );
9046                             }
9047                         }
9048                         break;
9049
9050                     case CONTINUE_PAT_MOD: /* 'c' */
9051                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9052                             if (! (wastedflags & WASTED_C) ) {
9053                                 wastedflags |= WASTED_GC;
9054                                 vWARN3(
9055                                     RExC_parse + 1,
9056                                     "Useless (%sc) - %suse /gc modifier",
9057                                     flagsp == &negflags ? "?-" : "?",
9058                                     flagsp == &negflags ? "don't " : ""
9059                                 );
9060                             }
9061                         }
9062                         break;
9063                     case KEEPCOPY_PAT_MOD: /* 'p' */
9064                         if (flagsp == &negflags) {
9065                             if (SIZE_ONLY)
9066                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9067                         } else {
9068                             *flagsp |= RXf_PMf_KEEPCOPY;
9069                         }
9070                         break;
9071                     case '-':
9072                         /* A flag is a default iff it is following a minus, so
9073                          * if there is a minus, it means will be trying to
9074                          * re-specify a default which is an error */
9075                         if (has_use_defaults || flagsp == &negflags) {
9076             fail_modifiers:
9077                             RExC_parse++;
9078                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9079                             /*NOTREACHED*/
9080                         }
9081                         flagsp = &negflags;
9082                         wastedflags = 0;  /* reset so (?g-c) warns twice */
9083                         break;
9084                     case ':':
9085                         paren = ':';
9086                         /*FALLTHROUGH*/
9087                     case ')':
9088                         RExC_flags |= posflags;
9089                         RExC_flags &= ~negflags;
9090                         set_regex_charset(&RExC_flags, cs);
9091                         if (paren != ':') {
9092                             oregflags |= posflags;
9093                             oregflags &= ~negflags;
9094                             set_regex_charset(&oregflags, cs);
9095                         }
9096                         nextchar(pRExC_state);
9097                         if (paren != ':') {
9098                             *flagp = TRYAGAIN;
9099                             return NULL;
9100                         } else {
9101                             ret = NULL;
9102                             goto parse_rest;
9103                         }
9104                         /*NOTREACHED*/
9105                     default:
9106                         RExC_parse++;
9107                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9108                         /*NOTREACHED*/
9109                     }
9110                     ++RExC_parse;
9111                 }
9112             }} /* one for the default block, one for the switch */
9113         }
9114         else {                  /* (...) */
9115           capturing_parens:
9116             parno = RExC_npar;
9117             RExC_npar++;
9118
9119             ret = reganode(pRExC_state, OPEN, parno);
9120             if (!SIZE_ONLY ){
9121                 if (!RExC_nestroot)
9122                     RExC_nestroot = parno;
9123                 if (RExC_seen & REG_SEEN_RECURSE
9124                     && !RExC_open_parens[parno-1])
9125                 {
9126                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9127                         "Setting open paren #%"IVdf" to %d\n",
9128                         (IV)parno, REG_NODE_NUM(ret)));
9129                     RExC_open_parens[parno-1]= ret;
9130                 }
9131             }
9132             Set_Node_Length(ret, 1); /* MJD */
9133             Set_Node_Offset(ret, RExC_parse); /* MJD */
9134             is_open = 1;
9135         }
9136     }
9137     else                        /* ! paren */
9138         ret = NULL;
9139
9140    parse_rest:
9141     /* Pick up the branches, linking them together. */
9142     parse_start = RExC_parse;   /* MJD */
9143     br = regbranch(pRExC_state, &flags, 1,depth+1);
9144
9145     /*     branch_len = (paren != 0); */
9146
9147     if (br == NULL)
9148         return(NULL);
9149     if (*RExC_parse == '|') {
9150         if (!SIZE_ONLY && RExC_extralen) {
9151             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9152         }
9153         else {                  /* MJD */
9154             reginsert(pRExC_state, BRANCH, br, depth+1);
9155             Set_Node_Length(br, paren != 0);
9156             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9157         }
9158         have_branch = 1;
9159         if (SIZE_ONLY)
9160             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9161     }
9162     else if (paren == ':') {
9163         *flagp |= flags&SIMPLE;
9164     }
9165     if (is_open) {                              /* Starts with OPEN. */
9166         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9167     }
9168     else if (paren != '?')              /* Not Conditional */
9169         ret = br;
9170     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9171     lastbr = br;
9172     while (*RExC_parse == '|') {
9173         if (!SIZE_ONLY && RExC_extralen) {
9174             ender = reganode(pRExC_state, LONGJMP,0);
9175             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9176         }
9177         if (SIZE_ONLY)
9178             RExC_extralen += 2;         /* Account for LONGJMP. */
9179         nextchar(pRExC_state);
9180         if (freeze_paren) {
9181             if (RExC_npar > after_freeze)
9182                 after_freeze = RExC_npar;
9183             RExC_npar = freeze_paren;
9184         }
9185         br = regbranch(pRExC_state, &flags, 0, depth+1);
9186
9187         if (br == NULL)
9188             return(NULL);
9189         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9190         lastbr = br;
9191         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9192     }
9193
9194     if (have_branch || paren != ':') {
9195         /* Make a closing node, and hook it on the end. */
9196         switch (paren) {
9197         case ':':
9198             ender = reg_node(pRExC_state, TAIL);
9199             break;
9200         case 1:
9201             ender = reganode(pRExC_state, CLOSE, parno);
9202             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9203                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9204                         "Setting close paren #%"IVdf" to %d\n",
9205                         (IV)parno, REG_NODE_NUM(ender)));
9206                 RExC_close_parens[parno-1]= ender;
9207                 if (RExC_nestroot == parno)
9208                     RExC_nestroot = 0;
9209             }
9210             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9211             Set_Node_Length(ender,1); /* MJD */
9212             break;
9213         case '<':
9214         case ',':
9215         case '=':
9216         case '!':
9217             *flagp &= ~HASWIDTH;
9218             /* FALL THROUGH */
9219         case '>':
9220             ender = reg_node(pRExC_state, SUCCEED);
9221             break;
9222         case 0:
9223             ender = reg_node(pRExC_state, END);
9224             if (!SIZE_ONLY) {
9225                 assert(!RExC_opend); /* there can only be one! */
9226                 RExC_opend = ender;
9227             }
9228             break;
9229         }
9230         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9231             SV * const mysv_val1=sv_newmortal();
9232             SV * const mysv_val2=sv_newmortal();
9233             DEBUG_PARSE_MSG("lsbr");
9234             regprop(RExC_rx, mysv_val1, lastbr);
9235             regprop(RExC_rx, mysv_val2, ender);
9236             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9237                           SvPV_nolen_const(mysv_val1),
9238                           (IV)REG_NODE_NUM(lastbr),
9239                           SvPV_nolen_const(mysv_val2),
9240                           (IV)REG_NODE_NUM(ender),
9241                           (IV)(ender - lastbr)
9242             );
9243         });
9244         REGTAIL(pRExC_state, lastbr, ender);
9245
9246         if (have_branch && !SIZE_ONLY) {
9247             char is_nothing= 1;
9248             if (depth==1)
9249                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9250
9251             /* Hook the tails of the branches to the closing node. */
9252             for (br = ret; br; br = regnext(br)) {
9253                 const U8 op = PL_regkind[OP(br)];
9254                 if (op == BRANCH) {
9255                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9256                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9257                         is_nothing= 0;
9258                 }
9259                 else if (op == BRANCHJ) {
9260                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9261                     /* for now we always disable this optimisation * /
9262                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9263                     */
9264                         is_nothing= 0;
9265                 }
9266             }
9267             if (is_nothing) {
9268                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9269                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9270                     SV * const mysv_val1=sv_newmortal();
9271                     SV * const mysv_val2=sv_newmortal();
9272                     DEBUG_PARSE_MSG("NADA");
9273                     regprop(RExC_rx, mysv_val1, ret);
9274                     regprop(RExC_rx, mysv_val2, ender);
9275                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9276                                   SvPV_nolen_const(mysv_val1),
9277                                   (IV)REG_NODE_NUM(ret),
9278                                   SvPV_nolen_const(mysv_val2),
9279                                   (IV)REG_NODE_NUM(ender),
9280                                   (IV)(ender - ret)
9281                     );
9282                 });
9283                 OP(br)= NOTHING;
9284                 if (OP(ender) == TAIL) {
9285                     NEXT_OFF(br)= 0;
9286                     RExC_emit= br + 1;
9287                 } else {
9288                     regnode *opt;
9289                     for ( opt= br + 1; opt < ender ; opt++ )
9290                         OP(opt)= OPTIMIZED;
9291                     NEXT_OFF(br)= ender - br;
9292                 }
9293             }
9294         }
9295     }
9296
9297     {
9298         const char *p;
9299         static const char parens[] = "=!<,>";
9300
9301         if (paren && (p = strchr(parens, paren))) {
9302             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9303             int flag = (p - parens) > 1;
9304
9305             if (paren == '>')
9306                 node = SUSPEND, flag = 0;
9307             reginsert(pRExC_state, node,ret, depth+1);
9308             Set_Node_Cur_Length(ret);
9309             Set_Node_Offset(ret, parse_start + 1);
9310             ret->flags = flag;
9311             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9312         }
9313     }
9314
9315     /* Check for proper termination. */
9316     if (paren) {
9317         RExC_flags = oregflags;
9318         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9319             RExC_parse = oregcomp_parse;
9320             vFAIL("Unmatched (");
9321         }
9322     }
9323     else if (!paren && RExC_parse < RExC_end) {
9324         if (*RExC_parse == ')') {
9325             RExC_parse++;
9326             vFAIL("Unmatched )");
9327         }
9328         else
9329             FAIL("Junk on end of regexp");      /* "Can't happen". */
9330         assert(0); /* NOTREACHED */
9331     }
9332
9333     if (RExC_in_lookbehind) {
9334         RExC_in_lookbehind--;
9335     }
9336     if (after_freeze > RExC_npar)
9337         RExC_npar = after_freeze;
9338     return(ret);
9339 }
9340
9341 /*
9342  - regbranch - one alternative of an | operator
9343  *
9344  * Implements the concatenation operator.
9345  */
9346 STATIC regnode *
9347 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9348 {
9349     dVAR;
9350     regnode *ret;
9351     regnode *chain = NULL;
9352     regnode *latest;
9353     I32 flags = 0, c = 0;
9354     GET_RE_DEBUG_FLAGS_DECL;
9355
9356     PERL_ARGS_ASSERT_REGBRANCH;
9357
9358     DEBUG_PARSE("brnc");
9359
9360     if (first)
9361         ret = NULL;
9362     else {
9363         if (!SIZE_ONLY && RExC_extralen)
9364             ret = reganode(pRExC_state, BRANCHJ,0);
9365         else {
9366             ret = reg_node(pRExC_state, BRANCH);
9367             Set_Node_Length(ret, 1);
9368         }
9369     }
9370
9371     if (!first && SIZE_ONLY)
9372         RExC_extralen += 1;                     /* BRANCHJ */
9373
9374     *flagp = WORST;                     /* Tentatively. */
9375
9376     RExC_parse--;
9377     nextchar(pRExC_state);
9378     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9379         flags &= ~TRYAGAIN;
9380         latest = regpiece(pRExC_state, &flags,depth+1);
9381         if (latest == NULL) {
9382             if (flags & TRYAGAIN)
9383                 continue;
9384             return(NULL);
9385         }
9386         else if (ret == NULL)
9387             ret = latest;
9388         *flagp |= flags&(HASWIDTH|POSTPONED);
9389         if (chain == NULL)      /* First piece. */
9390             *flagp |= flags&SPSTART;
9391         else {
9392             RExC_naughty++;
9393             REGTAIL(pRExC_state, chain, latest);
9394         }
9395         chain = latest;
9396         c++;
9397     }
9398     if (chain == NULL) {        /* Loop ran zero times. */
9399         chain = reg_node(pRExC_state, NOTHING);
9400         if (ret == NULL)
9401             ret = chain;
9402     }
9403     if (c == 1) {
9404         *flagp |= flags&SIMPLE;
9405     }
9406
9407     return ret;
9408 }
9409
9410 /*
9411  - regpiece - something followed by possible [*+?]
9412  *
9413  * Note that the branching code sequences used for ? and the general cases
9414  * of * and + are somewhat optimized:  they use the same NOTHING node as
9415  * both the endmarker for their branch list and the body of the last branch.
9416  * It might seem that this node could be dispensed with entirely, but the
9417  * endmarker role is not redundant.
9418  */
9419 STATIC regnode *
9420 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9421 {
9422     dVAR;
9423     regnode *ret;
9424     char op;
9425     char *next;
9426     I32 flags;
9427     const char * const origparse = RExC_parse;
9428     I32 min;
9429     I32 max = REG_INFTY;
9430 #ifdef RE_TRACK_PATTERN_OFFSETS
9431     char *parse_start;
9432 #endif
9433     const char *maxpos = NULL;
9434
9435     /* Save the original in case we change the emitted regop to a FAIL. */
9436     regnode * const orig_emit = RExC_emit;
9437
9438     GET_RE_DEBUG_FLAGS_DECL;
9439
9440     PERL_ARGS_ASSERT_REGPIECE;
9441
9442     DEBUG_PARSE("piec");
9443
9444     ret = regatom(pRExC_state, &flags,depth+1);
9445     if (ret == NULL) {
9446         if (flags & TRYAGAIN)
9447             *flagp |= TRYAGAIN;
9448         return(NULL);
9449     }
9450
9451     op = *RExC_parse;
9452
9453     if (op == '{' && regcurly(RExC_parse)) {
9454         maxpos = NULL;
9455 #ifdef RE_TRACK_PATTERN_OFFSETS
9456         parse_start = RExC_parse; /* MJD */
9457 #endif
9458         next = RExC_parse + 1;
9459         while (isDIGIT(*next) || *next == ',') {
9460             if (*next == ',') {
9461                 if (maxpos)
9462                     break;
9463                 else
9464                     maxpos = next;
9465             }
9466             next++;
9467         }
9468         if (*next == '}') {             /* got one */
9469             if (!maxpos)
9470                 maxpos = next;
9471             RExC_parse++;
9472             min = atoi(RExC_parse);
9473             if (*maxpos == ',')
9474                 maxpos++;
9475             else
9476                 maxpos = RExC_parse;
9477             max = atoi(maxpos);
9478             if (!max && *maxpos != '0')
9479                 max = REG_INFTY;                /* meaning "infinity" */
9480             else if (max >= REG_INFTY)
9481                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9482             RExC_parse = next;
9483             nextchar(pRExC_state);
9484             if (max < min) {    /* If can't match, warn and optimize to fail
9485                                    unconditionally */
9486                 if (SIZE_ONLY) {
9487                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9488
9489                     /* We can't back off the size because we have to reserve
9490                      * enough space for all the things we are about to throw
9491                      * away, but we can shrink it by the ammount we are about
9492                      * to re-use here */
9493                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9494                 }
9495                 else {
9496                     RExC_emit = orig_emit;
9497                 }
9498                 ret = reg_node(pRExC_state, OPFAIL);
9499                 return ret;
9500             }
9501
9502         do_curly:
9503             if ((flags&SIMPLE)) {
9504                 RExC_naughty += 2 + RExC_naughty / 2;
9505                 reginsert(pRExC_state, CURLY, ret, depth+1);
9506                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9507                 Set_Node_Cur_Length(ret);
9508             }
9509             else {
9510                 regnode * const w = reg_node(pRExC_state, WHILEM);
9511
9512                 w->flags = 0;
9513                 REGTAIL(pRExC_state, ret, w);
9514                 if (!SIZE_ONLY && RExC_extralen) {
9515                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9516                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9517                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9518                 }
9519                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9520                                 /* MJD hk */
9521                 Set_Node_Offset(ret, parse_start+1);
9522                 Set_Node_Length(ret,
9523                                 op == '{' ? (RExC_parse - parse_start) : 1);
9524
9525                 if (!SIZE_ONLY && RExC_extralen)
9526                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9527                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9528                 if (SIZE_ONLY)
9529                     RExC_whilem_seen++, RExC_extralen += 3;
9530                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9531             }
9532             ret->flags = 0;
9533
9534             if (min > 0)
9535                 *flagp = WORST;
9536             if (max > 0)
9537                 *flagp |= HASWIDTH;
9538             if (!SIZE_ONLY) {
9539                 ARG1_SET(ret, (U16)min);
9540                 ARG2_SET(ret, (U16)max);
9541             }
9542
9543             goto nest_check;
9544         }
9545     }
9546
9547     if (!ISMULT1(op)) {
9548         *flagp = flags;
9549         return(ret);
9550     }
9551
9552 #if 0                           /* Now runtime fix should be reliable. */
9553
9554     /* if this is reinstated, don't forget to put this back into perldiag:
9555
9556             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9557
9558            (F) The part of the regexp subject to either the * or + quantifier
9559            could match an empty string. The {#} shows in the regular
9560            expression about where the problem was discovered.
9561
9562     */
9563
9564     if (!(flags&HASWIDTH) && op != '?')
9565       vFAIL("Regexp *+ operand could be empty");
9566 #endif
9567
9568 #ifdef RE_TRACK_PATTERN_OFFSETS
9569     parse_start = RExC_parse;
9570 #endif
9571     nextchar(pRExC_state);
9572
9573     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9574
9575     if (op == '*' && (flags&SIMPLE)) {
9576         reginsert(pRExC_state, STAR, ret, depth+1);
9577         ret->flags = 0;
9578         RExC_naughty += 4;
9579     }
9580     else if (op == '*') {
9581         min = 0;
9582         goto do_curly;
9583     }
9584     else if (op == '+' && (flags&SIMPLE)) {
9585         reginsert(pRExC_state, PLUS, ret, depth+1);
9586         ret->flags = 0;
9587         RExC_naughty += 3;
9588     }
9589     else if (op == '+') {
9590         min = 1;
9591         goto do_curly;
9592     }
9593     else if (op == '?') {
9594         min = 0; max = 1;
9595         goto do_curly;
9596     }
9597   nest_check:
9598     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9599         ckWARN3reg(RExC_parse,
9600                    "%.*s matches null string many times",
9601                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9602                    origparse);
9603     }
9604
9605     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9606         nextchar(pRExC_state);
9607         reginsert(pRExC_state, MINMOD, ret, depth+1);
9608         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9609     }
9610 #ifndef REG_ALLOW_MINMOD_SUSPEND
9611     else
9612 #endif
9613     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9614         regnode *ender;
9615         nextchar(pRExC_state);
9616         ender = reg_node(pRExC_state, SUCCEED);
9617         REGTAIL(pRExC_state, ret, ender);
9618         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9619         ret->flags = 0;
9620         ender = reg_node(pRExC_state, TAIL);
9621         REGTAIL(pRExC_state, ret, ender);
9622         /*ret= ender;*/
9623     }
9624
9625     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9626         RExC_parse++;
9627         vFAIL("Nested quantifiers");
9628     }
9629
9630     return(ret);
9631 }
9632
9633 STATIC bool
9634 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9635 {
9636
9637  /* This is expected to be called by a parser routine that has recognized '\N'
9638    and needs to handle the rest. RExC_parse is expected to point at the first
9639    char following the N at the time of the call.  On successful return,
9640    RExC_parse has been updated to point to just after the sequence identified
9641    by this routine, and <*flagp> has been updated.
9642
9643    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9644    character class.
9645
9646    \N may begin either a named sequence, or if outside a character class, mean
9647    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9648    attempted to decide which, and in the case of a named sequence, converted it
9649    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9650    where c1... are the characters in the sequence.  For single-quoted regexes,
9651    the tokenizer passes the \N sequence through unchanged; this code will not
9652    attempt to determine this nor expand those, instead raising a syntax error.
9653    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9654    or there is no '}', it signals that this \N occurrence means to match a
9655    non-newline.
9656
9657    Only the \N{U+...} form should occur in a character class, for the same
9658    reason that '.' inside a character class means to just match a period: it
9659    just doesn't make sense.
9660
9661    The function raises an error (via vFAIL), and doesn't return for various
9662    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9663    success; it returns FALSE otherwise.
9664
9665    If <valuep> is non-null, it means the caller can accept an input sequence
9666    consisting of a just a single code point; <*valuep> is set to that value
9667    if the input is such.
9668
9669    If <node_p> is non-null it signifies that the caller can accept any other
9670    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9671    is set as follows:
9672     1) \N means not-a-NL: points to a newly created REG_ANY node;
9673     2) \N{}:              points to a new NOTHING node;
9674     3) otherwise:         points to a new EXACT node containing the resolved
9675                           string.
9676    Note that FALSE is returned for single code point sequences if <valuep> is
9677    null.
9678  */
9679
9680     char * endbrace;    /* '}' following the name */
9681     char* p;
9682     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9683                            stream */
9684     bool has_multiple_chars; /* true if the input stream contains a sequence of
9685                                 more than one character */
9686
9687     GET_RE_DEBUG_FLAGS_DECL;
9688
9689     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9690
9691     GET_RE_DEBUG_FLAGS;
9692
9693     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9694
9695     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9696      * modifier.  The other meaning does not */
9697     p = (RExC_flags & RXf_PMf_EXTENDED)
9698         ? regwhite( pRExC_state, RExC_parse )
9699         : RExC_parse;
9700
9701     /* Disambiguate between \N meaning a named character versus \N meaning
9702      * [^\n].  The former is assumed when it can't be the latter. */
9703     if (*p != '{' || regcurly(p)) {
9704         RExC_parse = p;
9705         if (! node_p) {
9706             /* no bare \N in a charclass */
9707             if (in_char_class) {
9708                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9709             }
9710             return FALSE;
9711         }
9712         nextchar(pRExC_state);
9713         *node_p = reg_node(pRExC_state, REG_ANY);
9714         *flagp |= HASWIDTH|SIMPLE;
9715         RExC_naughty++;
9716         RExC_parse--;
9717         Set_Node_Length(*node_p, 1); /* MJD */
9718         return TRUE;
9719     }
9720
9721     /* Here, we have decided it should be a named character or sequence */
9722
9723     /* The test above made sure that the next real character is a '{', but
9724      * under the /x modifier, it could be separated by space (or a comment and
9725      * \n) and this is not allowed (for consistency with \x{...} and the
9726      * tokenizer handling of \N{NAME}). */
9727     if (*RExC_parse != '{') {
9728         vFAIL("Missing braces on \\N{}");
9729     }
9730
9731     RExC_parse++;       /* Skip past the '{' */
9732
9733     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9734         || ! (endbrace == RExC_parse            /* nothing between the {} */
9735               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9736                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9737     {
9738         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9739         vFAIL("\\N{NAME} must be resolved by the lexer");
9740     }
9741
9742     if (endbrace == RExC_parse) {   /* empty: \N{} */
9743         bool ret = TRUE;
9744         if (node_p) {
9745             *node_p = reg_node(pRExC_state,NOTHING);
9746         }
9747         else if (in_char_class) {
9748             if (SIZE_ONLY && in_char_class) {
9749                 ckWARNreg(RExC_parse,
9750                         "Ignoring zero length \\N{} in character class"
9751                 );
9752             }
9753             ret = FALSE;
9754         }
9755         else {
9756             return FALSE;
9757         }
9758         nextchar(pRExC_state);
9759         return ret;
9760     }
9761
9762     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9763     RExC_parse += 2;    /* Skip past the 'U+' */
9764
9765     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9766
9767     /* Code points are separated by dots.  If none, there is only one code
9768      * point, and is terminated by the brace */
9769     has_multiple_chars = (endchar < endbrace);
9770
9771     if (valuep && (! has_multiple_chars || in_char_class)) {
9772         /* We only pay attention to the first char of
9773         multichar strings being returned in char classes. I kinda wonder
9774         if this makes sense as it does change the behaviour
9775         from earlier versions, OTOH that behaviour was broken
9776         as well. XXX Solution is to recharacterize as
9777         [rest-of-class]|multi1|multi2... */
9778
9779         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9780         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9781             | PERL_SCAN_DISALLOW_PREFIX
9782             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9783
9784         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9785
9786         /* The tokenizer should have guaranteed validity, but it's possible to
9787          * bypass it by using single quoting, so check */
9788         if (length_of_hex == 0
9789             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9790         {
9791             RExC_parse += length_of_hex;        /* Includes all the valid */
9792             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9793                             ? UTF8SKIP(RExC_parse)
9794                             : 1;
9795             /* Guard against malformed utf8 */
9796             if (RExC_parse >= endchar) {
9797                 RExC_parse = endchar;
9798             }
9799             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9800         }
9801
9802         if (in_char_class && has_multiple_chars) {
9803             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9804         }
9805
9806         RExC_parse = endbrace + 1;
9807     }
9808     else if (! node_p || ! has_multiple_chars) {
9809
9810         /* Here, the input is legal, but not according to the caller's
9811          * options.  We fail without advancing the parse, so that the
9812          * caller can try again */
9813         RExC_parse = p;
9814         return FALSE;
9815     }
9816     else {
9817
9818         /* What is done here is to convert this to a sub-pattern of the form
9819          * (?:\x{char1}\x{char2}...)
9820          * and then call reg recursively.  That way, it retains its atomicness,
9821          * while not having to worry about special handling that some code
9822          * points may have.  toke.c has converted the original Unicode values
9823          * to native, so that we can just pass on the hex values unchanged.  We
9824          * do have to set a flag to keep recoding from happening in the
9825          * recursion */
9826
9827         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9828         STRLEN len;
9829         char *orig_end = RExC_end;
9830         I32 flags;
9831
9832         while (RExC_parse < endbrace) {
9833
9834             /* Convert to notation the rest of the code understands */
9835             sv_catpv(substitute_parse, "\\x{");
9836             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9837             sv_catpv(substitute_parse, "}");
9838
9839             /* Point to the beginning of the next character in the sequence. */
9840             RExC_parse = endchar + 1;
9841             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9842         }
9843         sv_catpv(substitute_parse, ")");
9844
9845         RExC_parse = SvPV(substitute_parse, len);
9846
9847         /* Don't allow empty number */
9848         if (len < 8) {
9849             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9850         }
9851         RExC_end = RExC_parse + len;
9852
9853         /* The values are Unicode, and therefore not subject to recoding */
9854         RExC_override_recoding = 1;
9855
9856         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9857         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9858
9859         RExC_parse = endbrace;
9860         RExC_end = orig_end;
9861         RExC_override_recoding = 0;
9862
9863         nextchar(pRExC_state);
9864     }
9865
9866     return TRUE;
9867 }
9868
9869
9870 /*
9871  * reg_recode
9872  *
9873  * It returns the code point in utf8 for the value in *encp.
9874  *    value: a code value in the source encoding
9875  *    encp:  a pointer to an Encode object
9876  *
9877  * If the result from Encode is not a single character,
9878  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9879  */
9880 STATIC UV
9881 S_reg_recode(pTHX_ const char value, SV **encp)
9882 {
9883     STRLEN numlen = 1;
9884     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9885     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9886     const STRLEN newlen = SvCUR(sv);
9887     UV uv = UNICODE_REPLACEMENT;
9888
9889     PERL_ARGS_ASSERT_REG_RECODE;
9890
9891     if (newlen)
9892         uv = SvUTF8(sv)
9893              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9894              : *(U8*)s;
9895
9896     if (!newlen || numlen != newlen) {
9897         uv = UNICODE_REPLACEMENT;
9898         *encp = NULL;
9899     }
9900     return uv;
9901 }
9902
9903 PERL_STATIC_INLINE U8
9904 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9905 {
9906     U8 op;
9907
9908     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9909
9910     if (! FOLD) {
9911         return EXACT;
9912     }
9913
9914     op = get_regex_charset(RExC_flags);
9915     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9916         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9917                  been, so there is no hole */
9918     }
9919
9920     return op + EXACTF;
9921 }
9922
9923 PERL_STATIC_INLINE void
9924 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9925 {
9926     /* This knows the details about sizing an EXACTish node, setting flags for
9927      * it (by setting <*flagp>, and potentially populating it with a single
9928      * character.
9929      *
9930      * If <len> (the length in bytes) is non-zero, this function assumes that
9931      * the node has already been populated, and just does the sizing.  In this
9932      * case <code_point> should be the final code point that has already been
9933      * placed into the node.  This value will be ignored except that under some
9934      * circumstances <*flagp> is set based on it.
9935      *
9936      * If <len> is zero, the function assumes that the node is to contain only
9937      * the single character given by <code_point> and calculates what <len>
9938      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9939      * additionally will populate the node's STRING with <code_point>, if <len>
9940      * is 0.  In both cases <*flagp> is appropriately set
9941      *
9942      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9943      * folded (the latter only when the rules indicate it can match 'ss') */
9944
9945     bool len_passed_in = cBOOL(len != 0);
9946     U8 character[UTF8_MAXBYTES_CASE+1];
9947
9948     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9949
9950     if (! len_passed_in) {
9951         if (UTF) {
9952             if (FOLD) {
9953                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9954             }
9955             else {
9956                 uvchr_to_utf8( character, code_point);
9957                 len = UTF8SKIP(character);
9958             }
9959         }
9960         else if (! FOLD
9961                  || code_point != LATIN_SMALL_LETTER_SHARP_S
9962                  || ASCII_FOLD_RESTRICTED
9963                  || ! AT_LEAST_UNI_SEMANTICS)
9964         {
9965             *character = (U8) code_point;
9966             len = 1;
9967         }
9968         else {
9969             *character = 's';
9970             *(character + 1) = 's';
9971             len = 2;
9972         }
9973     }
9974
9975     if (SIZE_ONLY) {
9976         RExC_size += STR_SZ(len);
9977     }
9978     else {
9979         RExC_emit += STR_SZ(len);
9980         STR_LEN(node) = len;
9981         if (! len_passed_in) {
9982             Copy((char *) character, STRING(node), len, char);
9983         }
9984     }
9985
9986     *flagp |= HASWIDTH;
9987
9988     /* A single character node is SIMPLE, except for the special-cased SHARP S
9989      * under /di. */
9990     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
9991         && (code_point != LATIN_SMALL_LETTER_SHARP_S
9992             || ! FOLD || ! DEPENDS_SEMANTICS))
9993     {
9994         *flagp |= SIMPLE;
9995     }
9996 }
9997
9998 /*
9999  - regatom - the lowest level
10000
10001    Try to identify anything special at the start of the pattern. If there
10002    is, then handle it as required. This may involve generating a single regop,
10003    such as for an assertion; or it may involve recursing, such as to
10004    handle a () structure.
10005
10006    If the string doesn't start with something special then we gobble up
10007    as much literal text as we can.
10008
10009    Once we have been able to handle whatever type of thing started the
10010    sequence, we return.
10011
10012    Note: we have to be careful with escapes, as they can be both literal
10013    and special, and in the case of \10 and friends, context determines which.
10014
10015    A summary of the code structure is:
10016
10017    switch (first_byte) {
10018         cases for each special:
10019             handle this special;
10020             break;
10021         case '\\':
10022             switch (2nd byte) {
10023                 cases for each unambiguous special:
10024                     handle this special;
10025                     break;
10026                 cases for each ambigous special/literal:
10027                     disambiguate;
10028                     if (special)  handle here
10029                     else goto defchar;
10030                 default: // unambiguously literal:
10031                     goto defchar;
10032             }
10033         default:  // is a literal char
10034             // FALL THROUGH
10035         defchar:
10036             create EXACTish node for literal;
10037             while (more input and node isn't full) {
10038                 switch (input_byte) {
10039                    cases for each special;
10040                        make sure parse pointer is set so that the next call to
10041                            regatom will see this special first
10042                        goto loopdone; // EXACTish node terminated by prev. char
10043                    default:
10044                        append char to EXACTISH node;
10045                 }
10046                 get next input byte;
10047             }
10048         loopdone:
10049    }
10050    return the generated node;
10051
10052    Specifically there are two separate switches for handling
10053    escape sequences, with the one for handling literal escapes requiring
10054    a dummy entry for all of the special escapes that are actually handled
10055    by the other.
10056 */
10057
10058 STATIC regnode *
10059 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10060 {
10061     dVAR;
10062     regnode *ret = NULL;
10063     I32 flags;
10064     char *parse_start = RExC_parse;
10065     U8 op;
10066     GET_RE_DEBUG_FLAGS_DECL;
10067     DEBUG_PARSE("atom");
10068     *flagp = WORST;             /* Tentatively. */
10069
10070     PERL_ARGS_ASSERT_REGATOM;
10071
10072 tryagain:
10073     switch ((U8)*RExC_parse) {
10074     case '^':
10075         RExC_seen_zerolen++;
10076         nextchar(pRExC_state);
10077         if (RExC_flags & RXf_PMf_MULTILINE)
10078             ret = reg_node(pRExC_state, MBOL);
10079         else if (RExC_flags & RXf_PMf_SINGLELINE)
10080             ret = reg_node(pRExC_state, SBOL);
10081         else
10082             ret = reg_node(pRExC_state, BOL);
10083         Set_Node_Length(ret, 1); /* MJD */
10084         break;
10085     case '$':
10086         nextchar(pRExC_state);
10087         if (*RExC_parse)
10088             RExC_seen_zerolen++;
10089         if (RExC_flags & RXf_PMf_MULTILINE)
10090             ret = reg_node(pRExC_state, MEOL);
10091         else if (RExC_flags & RXf_PMf_SINGLELINE)
10092             ret = reg_node(pRExC_state, SEOL);
10093         else
10094             ret = reg_node(pRExC_state, EOL);
10095         Set_Node_Length(ret, 1); /* MJD */
10096         break;
10097     case '.':
10098         nextchar(pRExC_state);
10099         if (RExC_flags & RXf_PMf_SINGLELINE)
10100             ret = reg_node(pRExC_state, SANY);
10101         else
10102             ret = reg_node(pRExC_state, REG_ANY);
10103         *flagp |= HASWIDTH|SIMPLE;
10104         RExC_naughty++;
10105         Set_Node_Length(ret, 1); /* MJD */
10106         break;
10107     case '[':
10108     {
10109         char * const oregcomp_parse = ++RExC_parse;
10110         ret = regclass(pRExC_state, flagp,depth+1);
10111         if (*RExC_parse != ']') {
10112             RExC_parse = oregcomp_parse;
10113             vFAIL("Unmatched [");
10114         }
10115         nextchar(pRExC_state);
10116         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10117         break;
10118     }
10119     case '(':
10120         nextchar(pRExC_state);
10121         ret = reg(pRExC_state, 1, &flags,depth+1);
10122         if (ret == NULL) {
10123                 if (flags & TRYAGAIN) {
10124                     if (RExC_parse == RExC_end) {
10125                          /* Make parent create an empty node if needed. */
10126                         *flagp |= TRYAGAIN;
10127                         return(NULL);
10128                     }
10129                     goto tryagain;
10130                 }
10131                 return(NULL);
10132         }
10133         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10134         break;
10135     case '|':
10136     case ')':
10137         if (flags & TRYAGAIN) {
10138             *flagp |= TRYAGAIN;
10139             return NULL;
10140         }
10141         vFAIL("Internal urp");
10142                                 /* Supposed to be caught earlier. */
10143         break;
10144     case '?':
10145     case '+':
10146     case '*':
10147         RExC_parse++;
10148         vFAIL("Quantifier follows nothing");
10149         break;
10150     case '\\':
10151         /* Special Escapes
10152
10153            This switch handles escape sequences that resolve to some kind
10154            of special regop and not to literal text. Escape sequnces that
10155            resolve to literal text are handled below in the switch marked
10156            "Literal Escapes".
10157
10158            Every entry in this switch *must* have a corresponding entry
10159            in the literal escape switch. However, the opposite is not
10160            required, as the default for this switch is to jump to the
10161            literal text handling code.
10162         */
10163         switch ((U8)*++RExC_parse) {
10164         /* Special Escapes */
10165         case 'A':
10166             RExC_seen_zerolen++;
10167             ret = reg_node(pRExC_state, SBOL);
10168             *flagp |= SIMPLE;
10169             goto finish_meta_pat;
10170         case 'G':
10171             ret = reg_node(pRExC_state, GPOS);
10172             RExC_seen |= REG_SEEN_GPOS;
10173             *flagp |= SIMPLE;
10174             goto finish_meta_pat;
10175         case 'K':
10176             RExC_seen_zerolen++;
10177             ret = reg_node(pRExC_state, KEEPS);
10178             *flagp |= SIMPLE;
10179             /* XXX:dmq : disabling in-place substitution seems to
10180              * be necessary here to avoid cases of memory corruption, as
10181              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10182              */
10183             RExC_seen |= REG_SEEN_LOOKBEHIND;
10184             goto finish_meta_pat;
10185         case 'Z':
10186             ret = reg_node(pRExC_state, SEOL);
10187             *flagp |= SIMPLE;
10188             RExC_seen_zerolen++;                /* Do not optimize RE away */
10189             goto finish_meta_pat;
10190         case 'z':
10191             ret = reg_node(pRExC_state, EOS);
10192             *flagp |= SIMPLE;
10193             RExC_seen_zerolen++;                /* Do not optimize RE away */
10194             goto finish_meta_pat;
10195         case 'C':
10196             ret = reg_node(pRExC_state, CANY);
10197             RExC_seen |= REG_SEEN_CANY;
10198             *flagp |= HASWIDTH|SIMPLE;
10199             goto finish_meta_pat;
10200         case 'X':
10201             ret = reg_node(pRExC_state, CLUMP);
10202             *flagp |= HASWIDTH;
10203             goto finish_meta_pat;
10204         case 'w':
10205             op = ALNUM + get_regex_charset(RExC_flags);
10206             if (op > ALNUMA) {  /* /aa is same as /a */
10207                 op = ALNUMA;
10208             }
10209             ret = reg_node(pRExC_state, op);
10210             *flagp |= HASWIDTH|SIMPLE;
10211             goto finish_meta_pat;
10212         case 'W':
10213             op = NALNUM + get_regex_charset(RExC_flags);
10214             if (op > NALNUMA) { /* /aa is same as /a */
10215                 op = NALNUMA;
10216             }
10217             ret = reg_node(pRExC_state, op);
10218             *flagp |= HASWIDTH|SIMPLE;
10219             goto finish_meta_pat;
10220         case 'b':
10221             RExC_seen_zerolen++;
10222             RExC_seen |= REG_SEEN_LOOKBEHIND;
10223             op = BOUND + get_regex_charset(RExC_flags);
10224             if (op > BOUNDA) {  /* /aa is same as /a */
10225                 op = BOUNDA;
10226             }
10227             ret = reg_node(pRExC_state, op);
10228             FLAGS(ret) = get_regex_charset(RExC_flags);
10229             *flagp |= SIMPLE;
10230             goto finish_meta_pat;
10231         case 'B':
10232             RExC_seen_zerolen++;
10233             RExC_seen |= REG_SEEN_LOOKBEHIND;
10234             op = NBOUND + get_regex_charset(RExC_flags);
10235             if (op > NBOUNDA) { /* /aa is same as /a */
10236                 op = NBOUNDA;
10237             }
10238             ret = reg_node(pRExC_state, op);
10239             FLAGS(ret) = get_regex_charset(RExC_flags);
10240             *flagp |= SIMPLE;
10241             goto finish_meta_pat;
10242         case 's':
10243             op = SPACE + get_regex_charset(RExC_flags);
10244             if (op > SPACEA) {  /* /aa is same as /a */
10245                 op = SPACEA;
10246             }
10247             ret = reg_node(pRExC_state, op);
10248             *flagp |= HASWIDTH|SIMPLE;
10249             goto finish_meta_pat;
10250         case 'S':
10251             op = NSPACE + get_regex_charset(RExC_flags);
10252             if (op > NSPACEA) { /* /aa is same as /a */
10253                 op = NSPACEA;
10254             }
10255             ret = reg_node(pRExC_state, op);
10256             *flagp |= HASWIDTH|SIMPLE;
10257             goto finish_meta_pat;
10258         case 'D':
10259             op = NDIGIT;
10260             goto join_D_and_d;
10261         case 'd':
10262             op = DIGIT;
10263         join_D_and_d:
10264             {
10265                 U8 offset = get_regex_charset(RExC_flags);
10266                 if (offset == REGEX_UNICODE_CHARSET) {
10267                     offset = REGEX_DEPENDS_CHARSET;
10268                 }
10269                 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10270                     offset = REGEX_ASCII_RESTRICTED_CHARSET;
10271                 }
10272                 op += offset;
10273             }
10274             ret = reg_node(pRExC_state, op);
10275             *flagp |= HASWIDTH|SIMPLE;
10276             goto finish_meta_pat;
10277         case 'R':
10278             ret = reg_node(pRExC_state, LNBREAK);
10279             *flagp |= HASWIDTH;
10280             goto finish_meta_pat;
10281         case 'h':
10282             ret = reg_node(pRExC_state, HORIZWS);
10283             *flagp |= HASWIDTH|SIMPLE;
10284             goto finish_meta_pat;
10285         case 'H':
10286             ret = reg_node(pRExC_state, NHORIZWS);
10287             *flagp |= HASWIDTH|SIMPLE;
10288             goto finish_meta_pat;
10289         case 'v':
10290             ret = reg_node(pRExC_state, VERTWS);
10291             *flagp |= HASWIDTH|SIMPLE;
10292             goto finish_meta_pat;
10293         case 'V':
10294             ret = reg_node(pRExC_state, NVERTWS);
10295             *flagp |= HASWIDTH|SIMPLE;
10296          finish_meta_pat:
10297             nextchar(pRExC_state);
10298             Set_Node_Length(ret, 2); /* MJD */
10299             break;
10300         case 'p':
10301         case 'P':
10302             {
10303                 char* const oldregxend = RExC_end;
10304 #ifdef DEBUGGING
10305                 char* parse_start = RExC_parse - 2;
10306 #endif
10307
10308                 if (RExC_parse[1] == '{') {
10309                   /* a lovely hack--pretend we saw [\pX] instead */
10310                     RExC_end = strchr(RExC_parse, '}');
10311                     if (!RExC_end) {
10312                         const U8 c = (U8)*RExC_parse;
10313                         RExC_parse += 2;
10314                         RExC_end = oldregxend;
10315                         vFAIL2("Missing right brace on \\%c{}", c);
10316                     }
10317                     RExC_end++;
10318                 }
10319                 else {
10320                     RExC_end = RExC_parse + 2;
10321                     if (RExC_end > oldregxend)
10322                         RExC_end = oldregxend;
10323                 }
10324                 RExC_parse--;
10325
10326                 ret = regclass(pRExC_state, flagp,depth+1);
10327
10328                 RExC_end = oldregxend;
10329                 RExC_parse--;
10330
10331                 Set_Node_Offset(ret, parse_start + 2);
10332                 Set_Node_Cur_Length(ret);
10333                 nextchar(pRExC_state);
10334             }
10335             break;
10336         case 'N':
10337             /* Handle \N and \N{NAME} with multiple code points here and not
10338              * below because it can be multicharacter. join_exact() will join
10339              * them up later on.  Also this makes sure that things like
10340              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10341              * The options to the grok function call causes it to fail if the
10342              * sequence is just a single code point.  We then go treat it as
10343              * just another character in the current EXACT node, and hence it
10344              * gets uniform treatment with all the other characters.  The
10345              * special treatment for quantifiers is not needed for such single
10346              * character sequences */
10347             ++RExC_parse;
10348             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10349                 RExC_parse--;
10350                 goto defchar;
10351             }
10352             break;
10353         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10354         parse_named_seq:
10355         {
10356             char ch= RExC_parse[1];
10357             if (ch != '<' && ch != '\'' && ch != '{') {
10358                 RExC_parse++;
10359                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10360             } else {
10361                 /* this pretty much dupes the code for (?P=...) in reg(), if
10362                    you change this make sure you change that */
10363                 char* name_start = (RExC_parse += 2);
10364                 U32 num = 0;
10365                 SV *sv_dat = reg_scan_name(pRExC_state,
10366                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10367                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10368                 if (RExC_parse == name_start || *RExC_parse != ch)
10369                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10370
10371                 if (!SIZE_ONLY) {
10372                     num = add_data( pRExC_state, 1, "S" );
10373                     RExC_rxi->data->data[num]=(void*)sv_dat;
10374                     SvREFCNT_inc_simple_void(sv_dat);
10375                 }
10376
10377                 RExC_sawback = 1;
10378                 ret = reganode(pRExC_state,
10379                                ((! FOLD)
10380                                  ? NREF
10381                                  : (ASCII_FOLD_RESTRICTED)
10382                                    ? NREFFA
10383                                    : (AT_LEAST_UNI_SEMANTICS)
10384                                      ? NREFFU
10385                                      : (LOC)
10386                                        ? NREFFL
10387                                        : NREFF),
10388                                 num);
10389                 *flagp |= HASWIDTH;
10390
10391                 /* override incorrect value set in reganode MJD */
10392                 Set_Node_Offset(ret, parse_start+1);
10393                 Set_Node_Cur_Length(ret); /* MJD */
10394                 nextchar(pRExC_state);
10395
10396             }
10397             break;
10398         }
10399         case 'g':
10400         case '1': case '2': case '3': case '4':
10401         case '5': case '6': case '7': case '8': case '9':
10402             {
10403                 I32 num;
10404                 bool isg = *RExC_parse == 'g';
10405                 bool isrel = 0;
10406                 bool hasbrace = 0;
10407                 if (isg) {
10408                     RExC_parse++;
10409                     if (*RExC_parse == '{') {
10410                         RExC_parse++;
10411                         hasbrace = 1;
10412                     }
10413                     if (*RExC_parse == '-') {
10414                         RExC_parse++;
10415                         isrel = 1;
10416                     }
10417                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10418                         if (isrel) RExC_parse--;
10419                         RExC_parse -= 2;
10420                         goto parse_named_seq;
10421                 }   }
10422                 num = atoi(RExC_parse);
10423                 if (isg && num == 0)
10424                     vFAIL("Reference to invalid group 0");
10425                 if (isrel) {
10426                     num = RExC_npar - num;
10427                     if (num < 1)
10428                         vFAIL("Reference to nonexistent or unclosed group");
10429                 }
10430                 if (!isg && num > 9 && num >= RExC_npar)
10431                     /* Probably a character specified in octal, e.g. \35 */
10432                     goto defchar;
10433                 else {
10434                     char * const parse_start = RExC_parse - 1; /* MJD */
10435                     while (isDIGIT(*RExC_parse))
10436                         RExC_parse++;
10437                     if (parse_start == RExC_parse - 1)
10438                         vFAIL("Unterminated \\g... pattern");
10439                     if (hasbrace) {
10440                         if (*RExC_parse != '}')
10441                             vFAIL("Unterminated \\g{...} pattern");
10442                         RExC_parse++;
10443                     }
10444                     if (!SIZE_ONLY) {
10445                         if (num > (I32)RExC_rx->nparens)
10446                             vFAIL("Reference to nonexistent group");
10447                     }
10448                     RExC_sawback = 1;
10449                     ret = reganode(pRExC_state,
10450                                    ((! FOLD)
10451                                      ? REF
10452                                      : (ASCII_FOLD_RESTRICTED)
10453                                        ? REFFA
10454                                        : (AT_LEAST_UNI_SEMANTICS)
10455                                          ? REFFU
10456                                          : (LOC)
10457                                            ? REFFL
10458                                            : REFF),
10459                                     num);
10460                     *flagp |= HASWIDTH;
10461
10462                     /* override incorrect value set in reganode MJD */
10463                     Set_Node_Offset(ret, parse_start+1);
10464                     Set_Node_Cur_Length(ret); /* MJD */
10465                     RExC_parse--;
10466                     nextchar(pRExC_state);
10467                 }
10468             }
10469             break;
10470         case '\0':
10471             if (RExC_parse >= RExC_end)
10472                 FAIL("Trailing \\");
10473             /* FALL THROUGH */
10474         default:
10475             /* Do not generate "unrecognized" warnings here, we fall
10476                back into the quick-grab loop below */
10477             parse_start--;
10478             goto defchar;
10479         }
10480         break;
10481
10482     case '#':
10483         if (RExC_flags & RXf_PMf_EXTENDED) {
10484             if ( reg_skipcomment( pRExC_state ) )
10485                 goto tryagain;
10486         }
10487         /* FALL THROUGH */
10488
10489     default:
10490
10491             parse_start = RExC_parse - 1;
10492
10493             RExC_parse++;
10494
10495         defchar: {
10496             STRLEN len = 0;
10497             UV ender;
10498             char *p;
10499             char *s;
10500 #define MAX_NODE_STRING_SIZE 127
10501             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10502             char *s0;
10503             U8 upper_parse = MAX_NODE_STRING_SIZE;
10504             STRLEN foldlen;
10505             U8 node_type;
10506             bool next_is_quantifier;
10507             char * oldp = NULL;
10508
10509             /* If a folding node contains only code points that don't
10510              * participate in folds, it can be changed into an EXACT node,
10511              * which allows the optimizer more things to look for */
10512             bool maybe_exact;
10513
10514             ender = 0;
10515             node_type = compute_EXACTish(pRExC_state);
10516             ret = reg_node(pRExC_state, node_type);
10517
10518             /* In pass1, folded, we use a temporary buffer instead of the
10519              * actual node, as the node doesn't exist yet */
10520             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10521
10522             s0 = s;
10523
10524         reparse:
10525
10526             /* We do the EXACTFish to EXACT node only if folding, and not if in
10527              * locale, as whether a character folds or not isn't known until
10528              * runtime */
10529             maybe_exact = FOLD && ! LOC;
10530
10531             /* XXX The node can hold up to 255 bytes, yet this only goes to
10532              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10533              * 255 allows us to not have to worry about overflow due to
10534              * converting to utf8 and fold expansion, but that value is
10535              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10536              * split up by this limit into a single one using the real max of
10537              * 255.  Even at 127, this breaks under rare circumstances.  If
10538              * folding, we do not want to split a node at a character that is a
10539              * non-final in a multi-char fold, as an input string could just
10540              * happen to want to match across the node boundary.  The join
10541              * would solve that problem if the join actually happens.  But a
10542              * series of more than two nodes in a row each of 127 would cause
10543              * the first join to succeed to get to 254, but then there wouldn't
10544              * be room for the next one, which could at be one of those split
10545              * multi-char folds.  I don't know of any fool-proof solution.  One
10546              * could back off to end with only a code point that isn't such a
10547              * non-final, but it is possible for there not to be any in the
10548              * entire node. */
10549             for (p = RExC_parse - 1;
10550                  len < upper_parse && p < RExC_end;
10551                  len++)
10552             {
10553                 oldp = p;
10554
10555                 if (RExC_flags & RXf_PMf_EXTENDED)
10556                     p = regwhite( pRExC_state, p );
10557                 switch ((U8)*p) {
10558                 case '^':
10559                 case '$':
10560                 case '.':
10561                 case '[':
10562                 case '(':
10563                 case ')':
10564                 case '|':
10565                     goto loopdone;
10566                 case '\\':
10567                     /* Literal Escapes Switch
10568
10569                        This switch is meant to handle escape sequences that
10570                        resolve to a literal character.
10571
10572                        Every escape sequence that represents something
10573                        else, like an assertion or a char class, is handled
10574                        in the switch marked 'Special Escapes' above in this
10575                        routine, but also has an entry here as anything that
10576                        isn't explicitly mentioned here will be treated as
10577                        an unescaped equivalent literal.
10578                     */
10579
10580                     switch ((U8)*++p) {
10581                     /* These are all the special escapes. */
10582                     case 'A':             /* Start assertion */
10583                     case 'b': case 'B':   /* Word-boundary assertion*/
10584                     case 'C':             /* Single char !DANGEROUS! */
10585                     case 'd': case 'D':   /* digit class */
10586                     case 'g': case 'G':   /* generic-backref, pos assertion */
10587                     case 'h': case 'H':   /* HORIZWS */
10588                     case 'k': case 'K':   /* named backref, keep marker */
10589                     case 'p': case 'P':   /* Unicode property */
10590                               case 'R':   /* LNBREAK */
10591                     case 's': case 'S':   /* space class */
10592                     case 'v': case 'V':   /* VERTWS */
10593                     case 'w': case 'W':   /* word class */
10594                     case 'X':             /* eXtended Unicode "combining character sequence" */
10595                     case 'z': case 'Z':   /* End of line/string assertion */
10596                         --p;
10597                         goto loopdone;
10598
10599                     /* Anything after here is an escape that resolves to a
10600                        literal. (Except digits, which may or may not)
10601                      */
10602                     case 'n':
10603                         ender = '\n';
10604                         p++;
10605                         break;
10606                     case 'N': /* Handle a single-code point named character. */
10607                         /* The options cause it to fail if a multiple code
10608                          * point sequence.  Handle those in the switch() above
10609                          * */
10610                         RExC_parse = p + 1;
10611                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10612                                             flagp, depth, FALSE))
10613                         {
10614                             RExC_parse = p = oldp;
10615                             goto loopdone;
10616                         }
10617                         p = RExC_parse;
10618                         if (ender > 0xff) {
10619                             REQUIRE_UTF8;
10620                         }
10621                         break;
10622                     case 'r':
10623                         ender = '\r';
10624                         p++;
10625                         break;
10626                     case 't':
10627                         ender = '\t';
10628                         p++;
10629                         break;
10630                     case 'f':
10631                         ender = '\f';
10632                         p++;
10633                         break;
10634                     case 'e':
10635                           ender = ASCII_TO_NATIVE('\033');
10636                         p++;
10637                         break;
10638                     case 'a':
10639                           ender = ASCII_TO_NATIVE('\007');
10640                         p++;
10641                         break;
10642                     case 'o':
10643                         {
10644                             STRLEN brace_len = len;
10645                             UV result;
10646                             const char* error_msg;
10647
10648                             bool valid = grok_bslash_o(p,
10649                                                        &result,
10650                                                        &brace_len,
10651                                                        &error_msg,
10652                                                        1);
10653                             p += brace_len;
10654                             if (! valid) {
10655                                 RExC_parse = p; /* going to die anyway; point
10656                                                    to exact spot of failure */
10657                                 vFAIL(error_msg);
10658                             }
10659                             else
10660                             {
10661                                 ender = result;
10662                             }
10663                             if (PL_encoding && ender < 0x100) {
10664                                 goto recode_encoding;
10665                             }
10666                             if (ender > 0xff) {
10667                                 REQUIRE_UTF8;
10668                             }
10669                             break;
10670                         }
10671                     case 'x':
10672                         {
10673                             STRLEN brace_len = len;
10674                             UV result;
10675                             const char* error_msg;
10676
10677                             bool valid = grok_bslash_x(p,
10678                                                        &result,
10679                                                        &brace_len,
10680                                                        &error_msg,
10681                                                        1);
10682                             p += brace_len;
10683                             if (! valid) {
10684                                 RExC_parse = p; /* going to die anyway; point
10685                                                    to exact spot of failure */
10686                                 vFAIL(error_msg);
10687                             }
10688                             else {
10689                                 ender = result;
10690                             }
10691                             if (PL_encoding && ender < 0x100) {
10692                                 goto recode_encoding;
10693                             }
10694                             if (ender > 0xff) {
10695                                 REQUIRE_UTF8;
10696                             }
10697                             break;
10698                         }
10699                     case 'c':
10700                         p++;
10701                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10702                         break;
10703                     case '0': case '1': case '2': case '3':case '4':
10704                     case '5': case '6': case '7':
10705                         if (*p == '0' ||
10706                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10707                         {
10708                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10709                             STRLEN numlen = 3;
10710                             ender = grok_oct(p, &numlen, &flags, NULL);
10711                             if (ender > 0xff) {
10712                                 REQUIRE_UTF8;
10713                             }
10714                             p += numlen;
10715                         }
10716                         else {
10717                             --p;
10718                             goto loopdone;
10719                         }
10720                         if (PL_encoding && ender < 0x100)
10721                             goto recode_encoding;
10722                         break;
10723                     recode_encoding:
10724                         if (! RExC_override_recoding) {
10725                             SV* enc = PL_encoding;
10726                             ender = reg_recode((const char)(U8)ender, &enc);
10727                             if (!enc && SIZE_ONLY)
10728                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10729                             REQUIRE_UTF8;
10730                         }
10731                         break;
10732                     case '\0':
10733                         if (p >= RExC_end)
10734                             FAIL("Trailing \\");
10735                         /* FALL THROUGH */
10736                     default:
10737                         if (!SIZE_ONLY&& isALNUMC(*p)) {
10738                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10739                         }
10740                         goto normal_default;
10741                     }
10742                     break;
10743                 case '{':
10744                     /* Currently we don't warn when the lbrace is at the start
10745                      * of a construct.  This catches it in the middle of a
10746                      * literal string, or when its the first thing after
10747                      * something like "\b" */
10748                     if (! SIZE_ONLY
10749                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10750                     {
10751                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10752                     }
10753                     /*FALLTHROUGH*/
10754                 default:
10755                   normal_default:
10756                     if (UTF8_IS_START(*p) && UTF) {
10757                         STRLEN numlen;
10758                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10759                                                &numlen, UTF8_ALLOW_DEFAULT);
10760                         p += numlen;
10761                     }
10762                     else
10763                         ender = (U8) *p++;
10764                     break;
10765                 } /* End of switch on the literal */
10766
10767                 /* Here, have looked at the literal character and <ender>
10768                  * contains its ordinal, <p> points to the character after it
10769                  */
10770
10771                 if ( RExC_flags & RXf_PMf_EXTENDED)
10772                     p = regwhite( pRExC_state, p );
10773
10774                 /* If the next thing is a quantifier, it applies to this
10775                  * character only, which means that this character has to be in
10776                  * its own node and can't just be appended to the string in an
10777                  * existing node, so if there are already other characters in
10778                  * the node, close the node with just them, and set up to do
10779                  * this character again next time through, when it will be the
10780                  * only thing in its new node */
10781                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10782                 {
10783                     p = oldp;
10784                     goto loopdone;
10785                 }
10786
10787                 if (FOLD) {
10788                     if (UTF
10789                             /* See comments for join_exact() as to why we fold
10790                              * this non-UTF at compile time */
10791                         || (node_type == EXACTFU
10792                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10793                     {
10794
10795
10796                         /* Prime the casefolded buffer.  Locale rules, which
10797                          * apply only to code points < 256, aren't known until
10798                          * execution, so for them, just output the original
10799                          * character using utf8.  If we start to fold non-UTF
10800                          * patterns, be sure to update join_exact() */
10801                         if (LOC && ender < 256) {
10802                             if (UNI_IS_INVARIANT(ender)) {
10803                                 *s = (U8) ender;
10804                                 foldlen = 1;
10805                             } else {
10806                                 *s = UTF8_TWO_BYTE_HI(ender);
10807                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10808                                 foldlen = 2;
10809                             }
10810                         }
10811                         else {
10812                             UV folded = _to_uni_fold_flags(
10813                                            ender,
10814                                            (U8 *) s,
10815                                            &foldlen,
10816                                            FOLD_FLAGS_FULL
10817                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10818                                                     : (ASCII_FOLD_RESTRICTED)
10819                                                       ? FOLD_FLAGS_NOMIX_ASCII
10820                                                       : 0)
10821                                             );
10822
10823                             /* If this node only contains non-folding code
10824                              * points so far, see if this new one is also
10825                              * non-folding */
10826                             if (maybe_exact) {
10827                                 if (folded != ender) {
10828                                     maybe_exact = FALSE;
10829                                 }
10830                                 else {
10831                                     /* Here the fold is the original; we have
10832                                      * to check further to see if anything
10833                                      * folds to it */
10834                                     if (! PL_utf8_foldable) {
10835                                         SV* swash = swash_init("utf8",
10836                                                            "_Perl_Any_Folds",
10837                                                            &PL_sv_undef, 1, 0);
10838                                         PL_utf8_foldable =
10839                                                     _get_swash_invlist(swash);
10840                                         SvREFCNT_dec(swash);
10841                                     }
10842                                     if (_invlist_contains_cp(PL_utf8_foldable,
10843                                                              ender))
10844                                     {
10845                                         maybe_exact = FALSE;
10846                                     }
10847                                 }
10848                             }
10849                             ender = folded;
10850                         }
10851                         s += foldlen;
10852
10853                         /* The loop increments <len> each time, as all but this
10854                          * path (and the one just below for UTF) through it add
10855                          * a single byte to the EXACTish node.  But this one
10856                          * has changed len to be the correct final value, so
10857                          * subtract one to cancel out the increment that
10858                          * follows */
10859                         len += foldlen - 1;
10860                     }
10861                     else {
10862                         *(s++) = ender;
10863                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10864                     }
10865                 }
10866                 else if (UTF) {
10867                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10868                     if (unilen > 0) {
10869                        s   += unilen;
10870                        len += unilen;
10871                     }
10872
10873                     /* See comment just above for - 1 */
10874                     len--;
10875                 }
10876                 else {
10877                     REGC((char)ender, s++);
10878                 }
10879
10880                 if (next_is_quantifier) {
10881
10882                     /* Here, the next input is a quantifier, and to get here,
10883                      * the current character is the only one in the node.
10884                      * Also, here <len> doesn't include the final byte for this
10885                      * character */
10886                     len++;
10887                     goto loopdone;
10888                 }
10889
10890             } /* End of loop through literal characters */
10891
10892             /* Here we have either exhausted the input or ran out of room in
10893              * the node.  (If we encountered a character that can't be in the
10894              * node, transfer is made directly to <loopdone>, and so we
10895              * wouldn't have fallen off the end of the loop.)  In the latter
10896              * case, we artificially have to split the node into two, because
10897              * we just don't have enough space to hold everything.  This
10898              * creates a problem if the final character participates in a
10899              * multi-character fold in the non-final position, as a match that
10900              * should have occurred won't, due to the way nodes are matched,
10901              * and our artificial boundary.  So back off until we find a non-
10902              * problematic character -- one that isn't at the beginning or
10903              * middle of such a fold.  (Either it doesn't participate in any
10904              * folds, or appears only in the final position of all the folds it
10905              * does participate in.)  A better solution with far fewer false
10906              * positives, and that would fill the nodes more completely, would
10907              * be to actually have available all the multi-character folds to
10908              * test against, and to back-off only far enough to be sure that
10909              * this node isn't ending with a partial one.  <upper_parse> is set
10910              * further below (if we need to reparse the node) to include just
10911              * up through that final non-problematic character that this code
10912              * identifies, so when it is set to less than the full node, we can
10913              * skip the rest of this */
10914             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10915
10916                 const STRLEN full_len = len;
10917
10918                 assert(len >= MAX_NODE_STRING_SIZE);
10919
10920                 /* Here, <s> points to the final byte of the final character.
10921                  * Look backwards through the string until find a non-
10922                  * problematic character */
10923
10924                 if (! UTF) {
10925
10926                     /* These two have no multi-char folds to non-UTF characters
10927                      */
10928                     if (ASCII_FOLD_RESTRICTED || LOC) {
10929                         goto loopdone;
10930                     }
10931
10932                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10933                     len = s - s0 + 1;
10934                 }
10935                 else {
10936                     if (!  PL_NonL1NonFinalFold) {
10937                         PL_NonL1NonFinalFold = _new_invlist_C_array(
10938                                         NonL1_Perl_Non_Final_Folds_invlist);
10939                     }
10940
10941                     /* Point to the first byte of the final character */
10942                     s = (char *) utf8_hop((U8 *) s, -1);
10943
10944                     while (s >= s0) {   /* Search backwards until find
10945                                            non-problematic char */
10946                         if (UTF8_IS_INVARIANT(*s)) {
10947
10948                             /* There are no ascii characters that participate
10949                              * in multi-char folds under /aa.  In EBCDIC, the
10950                              * non-ascii invariants are all control characters,
10951                              * so don't ever participate in any folds. */
10952                             if (ASCII_FOLD_RESTRICTED
10953                                 || ! IS_NON_FINAL_FOLD(*s))
10954                             {
10955                                 break;
10956                             }
10957                         }
10958                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10959
10960                             /* No Latin1 characters participate in multi-char
10961                              * folds under /l */
10962                             if (LOC
10963                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10964                                                                 *s, *(s+1))))
10965                             {
10966                                 break;
10967                             }
10968                         }
10969                         else if (! _invlist_contains_cp(
10970                                         PL_NonL1NonFinalFold,
10971                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
10972                         {
10973                             break;
10974                         }
10975
10976                         /* Here, the current character is problematic in that
10977                          * it does occur in the non-final position of some
10978                          * fold, so try the character before it, but have to
10979                          * special case the very first byte in the string, so
10980                          * we don't read outside the string */
10981                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10982                     } /* End of loop backwards through the string */
10983
10984                     /* If there were only problematic characters in the string,
10985                      * <s> will point to before s0, in which case the length
10986                      * should be 0, otherwise include the length of the
10987                      * non-problematic character just found */
10988                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10989                 }
10990
10991                 /* Here, have found the final character, if any, that is
10992                  * non-problematic as far as ending the node without splitting
10993                  * it across a potential multi-char fold.  <len> contains the
10994                  * number of bytes in the node up-to and including that
10995                  * character, or is 0 if there is no such character, meaning
10996                  * the whole node contains only problematic characters.  In
10997                  * this case, give up and just take the node as-is.  We can't
10998                  * do any better */
10999                 if (len == 0) {
11000                     len = full_len;
11001                 } else {
11002
11003                     /* Here, the node does contain some characters that aren't
11004                      * problematic.  If one such is the final character in the
11005                      * node, we are done */
11006                     if (len == full_len) {
11007                         goto loopdone;
11008                     }
11009                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11010
11011                         /* If the final character is problematic, but the
11012                          * penultimate is not, back-off that last character to
11013                          * later start a new node with it */
11014                         p = oldp;
11015                         goto loopdone;
11016                     }
11017
11018                     /* Here, the final non-problematic character is earlier
11019                      * in the input than the penultimate character.  What we do
11020                      * is reparse from the beginning, going up only as far as
11021                      * this final ok one, thus guaranteeing that the node ends
11022                      * in an acceptable character.  The reason we reparse is
11023                      * that we know how far in the character is, but we don't
11024                      * know how to correlate its position with the input parse.
11025                      * An alternate implementation would be to build that
11026                      * correlation as we go along during the original parse,
11027                      * but that would entail extra work for every node, whereas
11028                      * this code gets executed only when the string is too
11029                      * large for the node, and the final two characters are
11030                      * problematic, an infrequent occurrence.  Yet another
11031                      * possible strategy would be to save the tail of the
11032                      * string, and the next time regatom is called, initialize
11033                      * with that.  The problem with this is that unless you
11034                      * back off one more character, you won't be guaranteed
11035                      * regatom will get called again, unless regbranch,
11036                      * regpiece ... are also changed.  If you do back off that
11037                      * extra character, so that there is input guaranteed to
11038                      * force calling regatom, you can't handle the case where
11039                      * just the first character in the node is acceptable.  I
11040                      * (khw) decided to try this method which doesn't have that
11041                      * pitfall; if performance issues are found, we can do a
11042                      * combination of the current approach plus that one */
11043                     upper_parse = len;
11044                     len = 0;
11045                     s = s0;
11046                     goto reparse;
11047                 }
11048             }   /* End of verifying node ends with an appropriate char */
11049
11050         loopdone:   /* Jumped to when encounters something that shouldn't be in
11051                        the node */
11052
11053             /* If 'maybe_exact' is still set here, means there are no
11054              * code points in the node that participate in folds */
11055             if (FOLD && maybe_exact) {
11056                 OP(ret) = EXACT;
11057             }
11058
11059             /* I (khw) don't know if you can get here with zero length, but the
11060              * old code handled this situation by creating a zero-length EXACT
11061              * node.  Might as well be NOTHING instead */
11062             if (len == 0) {
11063                 OP(ret) = NOTHING;
11064             }
11065             else{
11066                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11067             }
11068
11069             RExC_parse = p - 1;
11070             Set_Node_Cur_Length(ret); /* MJD */
11071             nextchar(pRExC_state);
11072             {
11073                 /* len is STRLEN which is unsigned, need to copy to signed */
11074                 IV iv = len;
11075                 if (iv < 0)
11076                     vFAIL("Internal disaster");
11077             }
11078
11079         } /* End of label 'defchar:' */
11080         break;
11081     } /* End of giant switch on input character */
11082
11083     return(ret);
11084 }
11085
11086 STATIC char *
11087 S_regwhite( RExC_state_t *pRExC_state, char *p )
11088 {
11089     const char *e = RExC_end;
11090
11091     PERL_ARGS_ASSERT_REGWHITE;
11092
11093     while (p < e) {
11094         if (isSPACE(*p))
11095             ++p;
11096         else if (*p == '#') {
11097             bool ended = 0;
11098             do {
11099                 if (*p++ == '\n') {
11100                     ended = 1;
11101                     break;
11102                 }
11103             } while (p < e);
11104             if (!ended)
11105                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11106         }
11107         else
11108             break;
11109     }
11110     return p;
11111 }
11112
11113 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11114    Character classes ([:foo:]) can also be negated ([:^foo:]).
11115    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11116    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11117    but trigger failures because they are currently unimplemented. */
11118
11119 #define POSIXCC_DONE(c)   ((c) == ':')
11120 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11121 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11122
11123 STATIC I32
11124 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
11125 {
11126     dVAR;
11127     I32 namedclass = OOB_NAMEDCLASS;
11128
11129     PERL_ARGS_ASSERT_REGPPOSIXCC;
11130
11131     if (value == '[' && RExC_parse + 1 < RExC_end &&
11132         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11133         POSIXCC(UCHARAT(RExC_parse))) {
11134         const char c = UCHARAT(RExC_parse);
11135         char* const s = RExC_parse++;
11136
11137         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11138             RExC_parse++;
11139         if (RExC_parse == RExC_end)
11140             /* Grandfather lone [:, [=, [. */
11141             RExC_parse = s;
11142         else {
11143             const char* const t = RExC_parse++; /* skip over the c */
11144             assert(*t == c);
11145
11146             if (UCHARAT(RExC_parse) == ']') {
11147                 const char *posixcc = s + 1;
11148                 RExC_parse++; /* skip over the ending ] */
11149
11150                 if (*s == ':') {
11151                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11152                     const I32 skip = t - posixcc;
11153
11154                     /* Initially switch on the length of the name.  */
11155                     switch (skip) {
11156                     case 4:
11157                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11158                             namedclass = ANYOF_WORDCHAR;
11159                         break;
11160                     case 5:
11161                         /* Names all of length 5.  */
11162                         /* alnum alpha ascii blank cntrl digit graph lower
11163                            print punct space upper  */
11164                         /* Offset 4 gives the best switch position.  */
11165                         switch (posixcc[4]) {
11166                         case 'a':
11167                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11168                                 namedclass = ANYOF_ALPHA;
11169                             break;
11170                         case 'e':
11171                             if (memEQ(posixcc, "spac", 4)) /* space */
11172                                 namedclass = ANYOF_PSXSPC;
11173                             break;
11174                         case 'h':
11175                             if (memEQ(posixcc, "grap", 4)) /* graph */
11176                                 namedclass = ANYOF_GRAPH;
11177                             break;
11178                         case 'i':
11179                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11180                                 namedclass = ANYOF_ASCII;
11181                             break;
11182                         case 'k':
11183                             if (memEQ(posixcc, "blan", 4)) /* blank */
11184                                 namedclass = ANYOF_BLANK;
11185                             break;
11186                         case 'l':
11187                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11188                                 namedclass = ANYOF_CNTRL;
11189                             break;
11190                         case 'm':
11191                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11192                                 namedclass = ANYOF_ALNUMC;
11193                             break;
11194                         case 'r':
11195                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11196                                 namedclass = ANYOF_LOWER;
11197                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11198                                 namedclass = ANYOF_UPPER;
11199                             break;
11200                         case 't':
11201                             if (memEQ(posixcc, "digi", 4)) /* digit */
11202                                 namedclass = ANYOF_DIGIT;
11203                             else if (memEQ(posixcc, "prin", 4)) /* print */
11204                                 namedclass = ANYOF_PRINT;
11205                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11206                                 namedclass = ANYOF_PUNCT;
11207                             break;
11208                         }
11209                         break;
11210                     case 6:
11211                         if (memEQ(posixcc, "xdigit", 6))
11212                             namedclass = ANYOF_XDIGIT;
11213                         break;
11214                     }
11215
11216                     if (namedclass == OOB_NAMEDCLASS)
11217                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11218                                       t - s - 1, s + 1);
11219
11220                     /* The #defines are structured so each complement is +1 to
11221                      * the normal one */
11222                     if (complement) {
11223                         namedclass++;
11224                     }
11225                     assert (posixcc[skip] == ':');
11226                     assert (posixcc[skip+1] == ']');
11227                 } else if (!SIZE_ONLY) {
11228                     /* [[=foo=]] and [[.foo.]] are still future. */
11229
11230                     /* adjust RExC_parse so the warning shows after
11231                        the class closes */
11232                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11233                         RExC_parse++;
11234                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11235                 }
11236             } else {
11237                 /* Maternal grandfather:
11238                  * "[:" ending in ":" but not in ":]" */
11239                 RExC_parse = s;
11240             }
11241         }
11242     }
11243
11244     return namedclass;
11245 }
11246
11247 STATIC void
11248 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
11249 {
11250     dVAR;
11251
11252     PERL_ARGS_ASSERT_CHECKPOSIXCC;
11253
11254     if (POSIXCC(UCHARAT(RExC_parse))) {
11255         const char *s = RExC_parse;
11256         const char  c = *s++;
11257
11258         while (isALNUM(*s))
11259             s++;
11260         if (*s && c == *s && s[1] == ']') {
11261             ckWARN3reg(s+2,
11262                        "POSIX syntax [%c %c] belongs inside character classes",
11263                        c, c);
11264
11265             /* [[=foo=]] and [[.foo.]] are still future. */
11266             if (POSIXCC_NOTYET(c)) {
11267                 /* adjust RExC_parse so the error shows after
11268                    the class closes */
11269                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11270                     NOOP;
11271                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11272             }
11273         }
11274     }
11275 }
11276
11277 /* Generate the code to add a full posix character <class> to the bracketed
11278  * character class given by <node>.  (<node> is needed only under locale rules)
11279  * destlist     is the inversion list for non-locale rules that this class is
11280  *              to be added to
11281  * sourcelist   is the ASCII-range inversion list to add under /a rules
11282  * Xsourcelist  is the full Unicode range list to use otherwise. */
11283 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
11284     if (LOC) {                                                             \
11285         SV* scratch_list = NULL;                                           \
11286                                                                            \
11287         /* Set this class in the node for runtime matching */              \
11288         ANYOF_CLASS_SET(node, class);                                      \
11289                                                                            \
11290         /* For above Latin1 code points, we use the full Unicode range */  \
11291         _invlist_intersection(PL_AboveLatin1,                              \
11292                               Xsourcelist,                                 \
11293                               &scratch_list);                              \
11294         /* And set the output to it, adding instead if there already is an \
11295          * output.  Checking if <destlist> is NULL first saves an extra    \
11296          * clone.  Its reference count will be decremented at the next     \
11297          * union, etc, or if this is the only instance, at the end of the  \
11298          * routine */                                                      \
11299         if (! destlist) {                                                  \
11300             destlist = scratch_list;                                       \
11301         }                                                                  \
11302         else {                                                             \
11303             _invlist_union(destlist, scratch_list, &destlist);             \
11304             SvREFCNT_dec(scratch_list);                                    \
11305         }                                                                  \
11306     }                                                                      \
11307     else {                                                                 \
11308         /* For non-locale, just add it to any existing list */             \
11309         _invlist_union(destlist,                                           \
11310                        (AT_LEAST_ASCII_RESTRICTED)                         \
11311                            ? sourcelist                                    \
11312                            : Xsourcelist,                                  \
11313                        &destlist);                                         \
11314     }
11315
11316 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11317  */
11318 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
11319     if (LOC) {                                                             \
11320         SV* scratch_list = NULL;                                           \
11321         ANYOF_CLASS_SET(node, class);                                      \
11322         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
11323         if (! destlist) {                                                  \
11324             destlist = scratch_list;                                       \
11325         }                                                                  \
11326         else {                                                             \
11327             _invlist_union(destlist, scratch_list, &destlist);             \
11328             SvREFCNT_dec(scratch_list);                                    \
11329         }                                                                  \
11330     }                                                                      \
11331     else {                                                                 \
11332         _invlist_union_complement_2nd(destlist,                            \
11333                                     (AT_LEAST_ASCII_RESTRICTED)            \
11334                                         ? sourcelist                       \
11335                                         : Xsourcelist,                     \
11336                                     &destlist);                            \
11337         /* Under /d, everything in the upper half of the Latin1 range      \
11338          * matches this complement */                                      \
11339         if (DEPENDS_SEMANTICS) {                                           \
11340             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
11341         }                                                                  \
11342     }
11343
11344 /* Generate the code to add a posix character <class> to the bracketed
11345  * character class given by <node>.  (<node> is needed only under locale rules)
11346  * destlist       is the inversion list for non-locale rules that this class is
11347  *                to be added to
11348  * sourcelist     is the ASCII-range inversion list to add under /a rules
11349  * l1_sourcelist  is the Latin1 range list to use otherwise.
11350  * Xpropertyname  is the name to add to <run_time_list> of the property to
11351  *                specify the code points above Latin1 that will have to be
11352  *                determined at run-time
11353  * run_time_list  is a SV* that contains text names of properties that are to
11354  *                be computed at run time.  This concatenates <Xpropertyname>
11355  *                to it, appropriately
11356  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11357  * time */
11358 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
11359                               l1_sourcelist, Xpropertyname, run_time_list) \
11360         /* First, resolve whether to use the ASCII-only list or the L1     \
11361          * list */                                                         \
11362         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
11363                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11364                 Xpropertyname, run_time_list)
11365
11366 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11367                 Xpropertyname, run_time_list)                              \
11368     /* If not /a matching, there are going to be code points we will have  \
11369      * to defer to runtime to look-up */                                   \
11370     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
11371         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11372     }                                                                      \
11373     if (LOC) {                                                             \
11374         ANYOF_CLASS_SET(node, class);                                      \
11375     }                                                                      \
11376     else {                                                                 \
11377         _invlist_union(destlist, sourcelist, &destlist);                   \
11378     }
11379
11380 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
11381  * this and DO_N_POSIX.  Sets <matches_above_unicode> only if it can; unchanged
11382  * otherwise */
11383 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
11384        l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11385     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
11386         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
11387     }                                                                      \
11388     else {                                                                 \
11389         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11390         matches_above_unicode = TRUE;                                      \
11391         if (LOC) {                                                         \
11392             ANYOF_CLASS_SET(node, namedclass);                             \
11393         }                                                                  \
11394         else {                                                             \
11395             SV* scratch_list = NULL;                                       \
11396             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
11397             if (! destlist) {                                              \
11398                 destlist = scratch_list;                                   \
11399             }                                                              \
11400             else {                                                         \
11401                 _invlist_union(destlist, scratch_list, &destlist);         \
11402                 SvREFCNT_dec(scratch_list);                                \
11403             }                                                              \
11404             if (DEPENDS_SEMANTICS) {                                       \
11405                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
11406             }                                                              \
11407         }                                                                  \
11408     }
11409
11410 /* The names of properties whose definitions are not known at compile time are
11411  * stored in this SV, after a constant heading.  So if the length has been
11412  * changed since initialization, then there is a run-time definition. */
11413 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11414
11415 /* This converts the named class defined in regcomp.h to its equivalent class
11416  * number defined in handy.h. */
11417 #define namedclass_to_classnum(class)  ((class) / 2)
11418
11419 STATIC regnode *
11420 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11421 {
11422     /* parse a bracketed class specification.  Most of these will produce an ANYOF node;
11423      * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11424      * node; [[:ascii:]], a POSIXA node; etc.  It is more complex under /i with
11425      * multi-character folds: it will be rewritten following the paradigm of
11426      * this example, where the <multi-fold>s are characters which fold to
11427      * multiple character sequences:
11428      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11429      * gets effectively rewritten as:
11430      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11431      * reg() gets called (recursively) on the rewritten version, and this
11432      * function will return what it constructs.  (Actually the <multi-fold>s
11433      * aren't physically removed from the [abcdefghi], it's just that they are
11434      * ignored in the recursion by means of a a flag:
11435      * <RExC_in_multi_char_class>.)
11436      *
11437      * ANYOF nodes contain a bit map for the first 256 characters, with the
11438      * corresponding bit set if that character is in the list.  For characters
11439      * above 255, a range list or swash is used.  There are extra bits for \w,
11440      * etc. in locale ANYOFs, as what these match is not determinable at
11441      * compile time */
11442
11443     dVAR;
11444     UV nextvalue;
11445     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11446     IV range = 0;
11447     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11448     regnode *ret;
11449     STRLEN numlen;
11450     IV namedclass = OOB_NAMEDCLASS;
11451     char *rangebegin = NULL;
11452     bool need_class = 0;
11453     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
11454     SV *listsv = NULL;
11455     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11456                                       than just initialized.  */
11457     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11458     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11459                                extended beyond the Latin1 range */
11460     UV element_count = 0;   /* Number of distinct elements in the class.
11461                                Optimizations may be possible if this is tiny */
11462     AV * multi_char_matches = NULL; /* Code points that fold to more than one
11463                                        character; used under /i */
11464     UV n;
11465
11466     /* Unicode properties are stored in a swash; this holds the current one
11467      * being parsed.  If this swash is the only above-latin1 component of the
11468      * character class, an optimization is to pass it directly on to the
11469      * execution engine.  Otherwise, it is set to NULL to indicate that there
11470      * are other things in the class that have to be dealt with at execution
11471      * time */
11472     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11473
11474     /* Set if a component of this character class is user-defined; just passed
11475      * on to the engine */
11476     bool has_user_defined_property = FALSE;
11477
11478     /* inversion list of code points this node matches only when the target
11479      * string is in UTF-8.  (Because is under /d) */
11480     SV* depends_list = NULL;
11481
11482     /* inversion list of code points this node matches.  For much of the
11483      * function, it includes only those that match regardless of the utf8ness
11484      * of the target string */
11485     SV* cp_list = NULL;
11486
11487 #ifdef EBCDIC
11488     /* In a range, counts how many 0-2 of the ends of it came from literals,
11489      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11490     UV literal_endpoint = 0;
11491 #endif
11492     bool invert = FALSE;    /* Is this class to be complemented */
11493
11494     /* Is there any thing like \W or [:^digit:] that matches above the legal
11495      * Unicode range? */
11496     bool runtime_posix_matches_above_Unicode = FALSE;
11497
11498     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11499         case we need to change the emitted regop to an EXACT. */
11500     const char * orig_parse = RExC_parse;
11501     const I32 orig_size = RExC_size;
11502     GET_RE_DEBUG_FLAGS_DECL;
11503
11504     PERL_ARGS_ASSERT_REGCLASS;
11505 #ifndef DEBUGGING
11506     PERL_UNUSED_ARG(depth);
11507 #endif
11508
11509     DEBUG_PARSE("clas");
11510
11511     /* Assume we are going to generate an ANYOF node. */
11512     ret = reganode(pRExC_state, ANYOF, 0);
11513
11514     if (!SIZE_ONLY) {
11515         ANYOF_FLAGS(ret) = 0;
11516     }
11517
11518     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11519         RExC_parse++;
11520         if (! RExC_in_multi_char_class) {
11521             invert = TRUE;
11522             RExC_naughty++;
11523
11524             /* We have decided to not allow multi-char folds in inverted
11525              * character classes, due to the confusion that can happen,
11526              * especially with classes that are designed for a non-Unicode
11527              * world:  You have the peculiar case that:
11528                 "s s" =~ /^[^\xDF]+$/i => Y
11529                 "ss"  =~ /^[^\xDF]+$/i => N
11530             *
11531             * See [perl #89750] */
11532             allow_full_fold = FALSE;
11533         }
11534     }
11535
11536     if (SIZE_ONLY) {
11537         RExC_size += ANYOF_SKIP;
11538         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11539     }
11540     else {
11541         RExC_emit += ANYOF_SKIP;
11542         if (LOC) {
11543             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11544         }
11545         listsv = newSVpvs("# comment\n");
11546         initial_listsv_len = SvCUR(listsv);
11547     }
11548
11549     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11550
11551     if (!SIZE_ONLY && POSIXCC(nextvalue))
11552         checkposixcc(pRExC_state);
11553
11554     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11555     if (UCHARAT(RExC_parse) == ']')
11556         goto charclassloop;
11557
11558 parseit:
11559     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11560
11561     charclassloop:
11562
11563         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11564         save_value = value;
11565         save_prevvalue = prevvalue;
11566
11567         if (!range) {
11568             rangebegin = RExC_parse;
11569             element_count++;
11570         }
11571         if (UTF) {
11572             value = utf8n_to_uvchr((U8*)RExC_parse,
11573                                    RExC_end - RExC_parse,
11574                                    &numlen, UTF8_ALLOW_DEFAULT);
11575             RExC_parse += numlen;
11576         }
11577         else
11578             value = UCHARAT(RExC_parse++);
11579
11580         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11581         if (value == '[' && POSIXCC(nextvalue))
11582             namedclass = regpposixcc(pRExC_state, value);
11583         else if (value == '\\') {
11584             if (UTF) {
11585                 value = utf8n_to_uvchr((U8*)RExC_parse,
11586                                    RExC_end - RExC_parse,
11587                                    &numlen, UTF8_ALLOW_DEFAULT);
11588                 RExC_parse += numlen;
11589             }
11590             else
11591                 value = UCHARAT(RExC_parse++);
11592             /* Some compilers cannot handle switching on 64-bit integer
11593              * values, therefore value cannot be an UV.  Yes, this will
11594              * be a problem later if we want switch on Unicode.
11595              * A similar issue a little bit later when switching on
11596              * namedclass. --jhi */
11597             switch ((I32)value) {
11598             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
11599             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
11600             case 's':   namedclass = ANYOF_SPACE;       break;
11601             case 'S':   namedclass = ANYOF_NSPACE;      break;
11602             case 'd':   namedclass = ANYOF_DIGIT;       break;
11603             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11604             case 'v':   namedclass = ANYOF_VERTWS;      break;
11605             case 'V':   namedclass = ANYOF_NVERTWS;     break;
11606             case 'h':   namedclass = ANYOF_HORIZWS;     break;
11607             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
11608             case 'N':  /* Handle \N{NAME} in class */
11609                 {
11610                     /* We only pay attention to the first char of
11611                     multichar strings being returned. I kinda wonder
11612                     if this makes sense as it does change the behaviour
11613                     from earlier versions, OTOH that behaviour was broken
11614                     as well. */
11615                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11616                                       TRUE /* => charclass */))
11617                     {
11618                         goto parseit;
11619                     }
11620                 }
11621                 break;
11622             case 'p':
11623             case 'P':
11624                 {
11625                 char *e;
11626
11627                 /* This routine will handle any undefined properties */
11628                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11629
11630                 if (RExC_parse >= RExC_end)
11631                     vFAIL2("Empty \\%c{}", (U8)value);
11632                 if (*RExC_parse == '{') {
11633                     const U8 c = (U8)value;
11634                     e = strchr(RExC_parse++, '}');
11635                     if (!e)
11636                         vFAIL2("Missing right brace on \\%c{}", c);
11637                     while (isSPACE(UCHARAT(RExC_parse)))
11638                         RExC_parse++;
11639                     if (e == RExC_parse)
11640                         vFAIL2("Empty \\%c{}", c);
11641                     n = e - RExC_parse;
11642                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11643                         n--;
11644                 }
11645                 else {
11646                     e = RExC_parse;
11647                     n = 1;
11648                 }
11649                 if (!SIZE_ONLY) {
11650                     SV* invlist;
11651                     char* name;
11652
11653                     if (UCHARAT(RExC_parse) == '^') {
11654                          RExC_parse++;
11655                          n--;
11656                          value = value == 'p' ? 'P' : 'p'; /* toggle */
11657                          while (isSPACE(UCHARAT(RExC_parse))) {
11658                               RExC_parse++;
11659                               n--;
11660                          }
11661                     }
11662                     /* Try to get the definition of the property into
11663                      * <invlist>.  If /i is in effect, the effective property
11664                      * will have its name be <__NAME_i>.  The design is
11665                      * discussed in commit
11666                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11667                     Newx(name, n + sizeof("_i__\n"), char);
11668
11669                     sprintf(name, "%s%.*s%s\n",
11670                                     (FOLD) ? "__" : "",
11671                                     (int)n,
11672                                     RExC_parse,
11673                                     (FOLD) ? "_i" : ""
11674                     );
11675
11676                     /* Look up the property name, and get its swash and
11677                      * inversion list, if the property is found  */
11678                     if (swash) {
11679                         SvREFCNT_dec(swash);
11680                     }
11681                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
11682                                              1, /* binary */
11683                                              0, /* not tr/// */
11684                                              NULL, /* No inversion list */
11685                                              &swash_init_flags
11686                                             );
11687                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11688                         if (swash) {
11689                             SvREFCNT_dec(swash);
11690                             swash = NULL;
11691                         }
11692
11693                         /* Here didn't find it.  It could be a user-defined
11694                          * property that will be available at run-time.  Add it
11695                          * to the list to look up then */
11696                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11697                                         (value == 'p' ? '+' : '!'),
11698                                         name);
11699                         has_user_defined_property = TRUE;
11700
11701                         /* We don't know yet, so have to assume that the
11702                          * property could match something in the Latin1 range,
11703                          * hence something that isn't utf8.  Note that this
11704                          * would cause things in <depends_list> to match
11705                          * inappropriately, except that any \p{}, including
11706                          * this one forces Unicode semantics, which means there
11707                          * is <no depends_list> */
11708                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11709                     }
11710                     else {
11711
11712                         /* Here, did get the swash and its inversion list.  If
11713                          * the swash is from a user-defined property, then this
11714                          * whole character class should be regarded as such */
11715                         has_user_defined_property =
11716                                     (swash_init_flags
11717                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11718
11719                         /* Invert if asking for the complement */
11720                         if (value == 'P') {
11721                             _invlist_union_complement_2nd(properties,
11722                                                           invlist,
11723                                                           &properties);
11724
11725                             /* The swash can't be used as-is, because we've
11726                              * inverted things; delay removing it to here after
11727                              * have copied its invlist above */
11728                             SvREFCNT_dec(swash);
11729                             swash = NULL;
11730                         }
11731                         else {
11732                             _invlist_union(properties, invlist, &properties);
11733                         }
11734                     }
11735                     Safefree(name);
11736                 }
11737                 RExC_parse = e + 1;
11738                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
11739
11740                 /* \p means they want Unicode semantics */
11741                 RExC_uni_semantics = 1;
11742                 }
11743                 break;
11744             case 'n':   value = '\n';                   break;
11745             case 'r':   value = '\r';                   break;
11746             case 't':   value = '\t';                   break;
11747             case 'f':   value = '\f';                   break;
11748             case 'b':   value = '\b';                   break;
11749             case 'e':   value = ASCII_TO_NATIVE('\033');break;
11750             case 'a':   value = ASCII_TO_NATIVE('\007');break;
11751             case 'o':
11752                 RExC_parse--;   /* function expects to be pointed at the 'o' */
11753                 {
11754                     const char* error_msg;
11755                     bool valid = grok_bslash_o(RExC_parse,
11756                                                &value,
11757                                                &numlen,
11758                                                &error_msg,
11759                                                SIZE_ONLY);
11760                     RExC_parse += numlen;
11761                     if (! valid) {
11762                         vFAIL(error_msg);
11763                     }
11764                 }
11765                 if (PL_encoding && value < 0x100) {
11766                     goto recode_encoding;
11767                 }
11768                 break;
11769             case 'x':
11770                 RExC_parse--;   /* function expects to be pointed at the 'x' */
11771                 {
11772                     const char* error_msg;
11773                     bool valid = grok_bslash_x(RExC_parse,
11774                                                &value,
11775                                                &numlen,
11776                                                &error_msg,
11777                                                1);
11778                     RExC_parse += numlen;
11779                     if (! valid) {
11780                         vFAIL(error_msg);
11781                     }
11782                 }
11783                 if (PL_encoding && value < 0x100)
11784                     goto recode_encoding;
11785                 break;
11786             case 'c':
11787                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11788                 break;
11789             case '0': case '1': case '2': case '3': case '4':
11790             case '5': case '6': case '7':
11791                 {
11792                     /* Take 1-3 octal digits */
11793                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11794                     numlen = 3;
11795                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11796                     RExC_parse += numlen;
11797                     if (PL_encoding && value < 0x100)
11798                         goto recode_encoding;
11799                     break;
11800                 }
11801             recode_encoding:
11802                 if (! RExC_override_recoding) {
11803                     SV* enc = PL_encoding;
11804                     value = reg_recode((const char)(U8)value, &enc);
11805                     if (!enc && SIZE_ONLY)
11806                         ckWARNreg(RExC_parse,
11807                                   "Invalid escape in the specified encoding");
11808                     break;
11809                 }
11810             default:
11811                 /* Allow \_ to not give an error */
11812                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11813                     ckWARN2reg(RExC_parse,
11814                                "Unrecognized escape \\%c in character class passed through",
11815                                (int)value);
11816                 }
11817                 break;
11818             }
11819         } /* end of \blah */
11820 #ifdef EBCDIC
11821         else
11822             literal_endpoint++;
11823 #endif
11824
11825             /* What matches in a locale is not known until runtime.  This
11826              * includes what the Posix classes (like \w, [:space:]) match.
11827              * Room must be reserved (one time per class) to store such
11828              * classes, either if Perl is compiled so that locale nodes always
11829              * should have this space, or if there is such class info to be
11830              * stored.  The space will contain a bit for each named class that
11831              * is to be matched against.  This isn't needed for \p{} and
11832              * pseudo-classes, as they are not affected by locale, and hence
11833              * are dealt with separately */
11834             if (LOC
11835                 && ! need_class
11836                 && (ANYOF_LOCALE == ANYOF_CLASS
11837                     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11838             {
11839                 need_class = 1;
11840                 if (SIZE_ONLY) {
11841                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11842                 }
11843                 else {
11844                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11845                     ANYOF_CLASS_ZERO(ret);
11846                 }
11847                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11848             }
11849
11850         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11851
11852             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
11853              * literal, as is the character that began the false range, i.e.
11854              * the 'a' in the examples */
11855             if (range) {
11856                 if (!SIZE_ONLY) {
11857                     const int w =
11858                         RExC_parse >= rangebegin ?
11859                         RExC_parse - rangebegin : 0;
11860                     ckWARN4reg(RExC_parse,
11861                                "False [] range \"%*.*s\"",
11862                                w, w, rangebegin);
11863                     cp_list = add_cp_to_invlist(cp_list, '-');
11864                     cp_list = add_cp_to_invlist(cp_list, prevvalue);
11865                 }
11866
11867                 range = 0; /* this was not a true range */
11868                 element_count += 2; /* So counts for three values */
11869             }
11870
11871             if (! SIZE_ONLY) {
11872                 switch ((I32)namedclass) {
11873
11874                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11875                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11876                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11877                     break;
11878                 case ANYOF_NALNUMC:
11879                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11880                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11881                         runtime_posix_matches_above_Unicode);
11882                     break;
11883                 case ANYOF_ALPHA:
11884                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11885                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11886                     break;
11887                 case ANYOF_NALPHA:
11888                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11889                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11890                         runtime_posix_matches_above_Unicode);
11891                     break;
11892                 case ANYOF_ASCII:
11893 #ifdef HAS_ISASCII
11894                     if (LOC) {
11895                         ANYOF_CLASS_SET(ret, namedclass);
11896                     }
11897                     else
11898 #endif  /* Not isascii(); just use the hard-coded definition for it */
11899                         _invlist_union(posixes, PL_ASCII, &posixes);
11900                     break;
11901                 case ANYOF_NASCII:
11902 #ifdef HAS_ISASCII
11903                     if (LOC) {
11904                         ANYOF_CLASS_SET(ret, namedclass);
11905                     }
11906                     else {
11907 #endif
11908                         _invlist_union_complement_2nd(posixes,
11909                                                     PL_ASCII, &posixes);
11910                         if (DEPENDS_SEMANTICS) {
11911                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11912                         }
11913 #ifdef HAS_ISASCII
11914                     }
11915 #endif
11916                     break;
11917                 case ANYOF_BLANK:
11918                     if (hasISBLANK || ! LOC) {
11919                         DO_POSIX(ret, namedclass, posixes,
11920                                             PL_PosixBlank, PL_XPosixBlank);
11921                     }
11922                     else { /* There is no isblank() and we are in locale:  We
11923                               use the ASCII range and the above-Latin1 range
11924                               code points */
11925                         SV* scratch_list = NULL;
11926
11927                         /* Include all above-Latin1 blanks */
11928                         _invlist_intersection(PL_AboveLatin1,
11929                                               PL_XPosixBlank,
11930                                               &scratch_list);
11931                         /* Add it to the running total of posix classes */
11932                         if (! posixes) {
11933                             posixes = scratch_list;
11934                         }
11935                         else {
11936                             _invlist_union(posixes, scratch_list, &posixes);
11937                             SvREFCNT_dec(scratch_list);
11938                         }
11939                         /* Add the ASCII-range blanks to the running total. */
11940                         _invlist_union(posixes, PL_PosixBlank, &posixes);
11941                     }
11942                     break;
11943                 case ANYOF_NBLANK:
11944                     if (hasISBLANK || ! LOC) {
11945                         DO_N_POSIX(ret, namedclass, posixes,
11946                                                 PL_PosixBlank, PL_XPosixBlank);
11947                     }
11948                     else { /* There is no isblank() and we are in locale */
11949                         SV* scratch_list = NULL;
11950
11951                         /* Include all above-Latin1 non-blanks */
11952                         _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11953                                           &scratch_list);
11954
11955                         /* Add them to the running total of posix classes */
11956                         _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11957                                           &scratch_list);
11958                         if (! posixes) {
11959                             posixes = scratch_list;
11960                         }
11961                         else {
11962                             _invlist_union(posixes, scratch_list, &posixes);
11963                             SvREFCNT_dec(scratch_list);
11964                         }
11965
11966                         /* Get the list of all non-ASCII-blanks in Latin 1, and
11967                          * add them to the running total */
11968                         _invlist_subtract(PL_Latin1, PL_PosixBlank,
11969                                           &scratch_list);
11970                         _invlist_union(posixes, scratch_list, &posixes);
11971                         SvREFCNT_dec(scratch_list);
11972                     }
11973                     break;
11974                 case ANYOF_CNTRL:
11975                     DO_POSIX(ret, namedclass, posixes,
11976                                             PL_PosixCntrl, PL_XPosixCntrl);
11977                     break;
11978                 case ANYOF_NCNTRL:
11979                     DO_N_POSIX(ret, namedclass, posixes,
11980                                             PL_PosixCntrl, PL_XPosixCntrl);
11981                     break;
11982                 case ANYOF_DIGIT:
11983                     /* There are no digits in the Latin1 range outside of
11984                      * ASCII, so call the macro that doesn't have to resolve
11985                      * them */
11986                     DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11987                         PL_PosixDigit, "XPosixDigit", listsv);
11988                     break;
11989                 case ANYOF_NDIGIT:
11990                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11991                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11992                         runtime_posix_matches_above_Unicode);
11993                     break;
11994                 case ANYOF_GRAPH:
11995                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11996                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11997                     break;
11998                 case ANYOF_NGRAPH:
11999                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12000                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
12001                         runtime_posix_matches_above_Unicode);
12002                     break;
12003                 case ANYOF_HORIZWS:
12004                     /* For these, we use the cp_list, as /d doesn't make a
12005                      * difference in what these match.  There would be problems
12006                      * if these characters had folds other than themselves, as
12007                      * cp_list is subject to folding.  It turns out that \h
12008                      * is just a synonym for XPosixBlank */
12009                     _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
12010                     break;
12011                 case ANYOF_NHORIZWS:
12012                     _invlist_union_complement_2nd(cp_list,
12013                                                  PL_XPosixBlank, &cp_list);
12014                     break;
12015                 case ANYOF_LOWER:
12016                 case ANYOF_NLOWER:
12017                 {   /* These require special handling, as they differ under
12018                        folding, matching Cased there (which in the ASCII range
12019                        is the same as Alpha */
12020
12021                     SV* ascii_source;
12022                     SV* l1_source;
12023                     const char *Xname;
12024
12025                     if (FOLD && ! LOC) {
12026                         ascii_source = PL_PosixAlpha;
12027                         l1_source = PL_L1Cased;
12028                         Xname = "Cased";
12029                     }
12030                     else {
12031                         ascii_source = PL_PosixLower;
12032                         l1_source = PL_L1PosixLower;
12033                         Xname = "XPosixLower";
12034                     }
12035                     if (namedclass == ANYOF_LOWER) {
12036                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12037                                     ascii_source, l1_source, Xname, listsv);
12038                     }
12039                     else {
12040                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12041                             posixes, ascii_source, l1_source, Xname, listsv,
12042                             runtime_posix_matches_above_Unicode);
12043                     }
12044                     break;
12045                 }
12046                 case ANYOF_PRINT:
12047                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12048                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
12049                     break;
12050                 case ANYOF_NPRINT:
12051                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12052                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
12053                         runtime_posix_matches_above_Unicode);
12054                     break;
12055                 case ANYOF_PUNCT:
12056                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12057                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
12058                     break;
12059                 case ANYOF_NPUNCT:
12060                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12061                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
12062                         runtime_posix_matches_above_Unicode);
12063                     break;
12064                 case ANYOF_PSXSPC:
12065                     DO_POSIX(ret, namedclass, posixes,
12066                                             PL_PosixSpace, PL_XPosixSpace);
12067                     break;
12068                 case ANYOF_NPSXSPC:
12069                     DO_N_POSIX(ret, namedclass, posixes,
12070                                             PL_PosixSpace, PL_XPosixSpace);
12071                     break;
12072                 case ANYOF_SPACE:
12073                     DO_POSIX(ret, namedclass, posixes,
12074                                             PL_PerlSpace, PL_XPerlSpace);
12075                     break;
12076                 case ANYOF_NSPACE:
12077                     DO_N_POSIX(ret, namedclass, posixes,
12078                                             PL_PerlSpace, PL_XPerlSpace);
12079                     break;
12080                 case ANYOF_UPPER:   /* Same as LOWER, above */
12081                 case ANYOF_NUPPER:
12082                 {
12083                     SV* ascii_source;
12084                     SV* l1_source;
12085                     const char *Xname;
12086
12087                     if (FOLD && ! LOC) {
12088                         ascii_source = PL_PosixAlpha;
12089                         l1_source = PL_L1Cased;
12090                         Xname = "Cased";
12091                     }
12092                     else {
12093                         ascii_source = PL_PosixUpper;
12094                         l1_source = PL_L1PosixUpper;
12095                         Xname = "XPosixUpper";
12096                     }
12097                     if (namedclass == ANYOF_UPPER) {
12098                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12099                                     ascii_source, l1_source, Xname, listsv);
12100                     }
12101                     else {
12102                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12103                         posixes, ascii_source, l1_source, Xname, listsv,
12104                         runtime_posix_matches_above_Unicode);
12105                     }
12106                     break;
12107                 }
12108                 case ANYOF_WORDCHAR:
12109                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12110                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12111                     break;
12112                 case ANYOF_NWORDCHAR:
12113                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12114                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12115                             runtime_posix_matches_above_Unicode);
12116                     break;
12117                 case ANYOF_VERTWS:
12118                     /* For these, we use the cp_list, as /d doesn't make a
12119                      * difference in what these match.  There would be problems
12120                      * if these characters had folds other than themselves, as
12121                      * cp_list is subject to folding */
12122                     _invlist_union(cp_list, PL_VertSpace, &cp_list);
12123                     break;
12124                 case ANYOF_NVERTWS:
12125                     _invlist_union_complement_2nd(cp_list,
12126                                                     PL_VertSpace, &cp_list);
12127                     break;
12128                 case ANYOF_XDIGIT:
12129                     DO_POSIX(ret, namedclass, posixes,
12130                                             PL_PosixXDigit, PL_XPosixXDigit);
12131                     break;
12132                 case ANYOF_NXDIGIT:
12133                     DO_N_POSIX(ret, namedclass, posixes,
12134                                             PL_PosixXDigit, PL_XPosixXDigit);
12135                     break;
12136                 case ANYOF_MAX:
12137                     /* this is to handle \p and \P */
12138                     break;
12139                 default:
12140                     vFAIL("Invalid [::] class");
12141                     break;
12142                 }
12143
12144                 continue;   /* Go get next character */
12145             }
12146         } /* end of namedclass \blah */
12147
12148         if (range) {
12149             if (prevvalue > value) /* b-a */ {
12150                 const int w = RExC_parse - rangebegin;
12151                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12152                 range = 0; /* not a valid range */
12153             }
12154         }
12155         else {
12156             prevvalue = value; /* save the beginning of the potential range */
12157             if (RExC_parse+1 < RExC_end
12158                 && *RExC_parse == '-'
12159                 && RExC_parse[1] != ']')
12160             {
12161                 RExC_parse++;
12162
12163                 /* a bad range like \w-, [:word:]- ? */
12164                 if (namedclass > OOB_NAMEDCLASS) {
12165                     if (ckWARN(WARN_REGEXP)) {
12166                         const int w =
12167                             RExC_parse >= rangebegin ?
12168                             RExC_parse - rangebegin : 0;
12169                         vWARN4(RExC_parse,
12170                                "False [] range \"%*.*s\"",
12171                                w, w, rangebegin);
12172                     }
12173                     if (!SIZE_ONLY) {
12174                         cp_list = add_cp_to_invlist(cp_list, '-');
12175                     }
12176                     element_count++;
12177                 } else
12178                     range = 1;  /* yeah, it's a range! */
12179                 continue;       /* but do it the next time */
12180             }
12181         }
12182
12183         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12184          * if not */
12185
12186         /* non-Latin1 code point implies unicode semantics.  Must be set in
12187          * pass1 so is there for the whole of pass 2 */
12188         if (value > 255) {
12189             RExC_uni_semantics = 1;
12190         }
12191
12192         /* Ready to process either the single value, or the completed range.
12193          * For single-valued non-inverted ranges, we consider the possibility
12194          * of multi-char folds.  (We made a conscious decision to not do this
12195          * for the other cases because it can often lead to non-intuitive
12196          * results) */
12197         if (FOLD && ! invert && value == prevvalue) {
12198             if (value == LATIN_SMALL_LETTER_SHARP_S
12199                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12200                                                         value)))
12201             {
12202                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12203
12204                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12205                 STRLEN foldlen;
12206
12207                 UV folded = _to_uni_fold_flags(
12208                                 value,
12209                                 foldbuf,
12210                                 &foldlen,
12211                                 FOLD_FLAGS_FULL
12212                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12213                                             : (ASCII_FOLD_RESTRICTED)
12214                                               ? FOLD_FLAGS_NOMIX_ASCII
12215                                               : 0)
12216                                 );
12217
12218                 /* Here, <folded> should be the first character of the
12219                  * multi-char fold of <value>, with <foldbuf> containing the
12220                  * whole thing.  But, if this fold is not allowed (because of
12221                  * the flags), <fold> will be the same as <value>, and should
12222                  * be processed like any other character, so skip the special
12223                  * handling */
12224                 if (folded != value) {
12225
12226                     /* Skip if we are recursed, currently parsing the class
12227                      * again.  Otherwise add this character to the list of
12228                      * multi-char folds. */
12229                     if (! RExC_in_multi_char_class) {
12230                         AV** this_array_ptr;
12231                         AV* this_array;
12232                         STRLEN cp_count = utf8_length(foldbuf,
12233                                                       foldbuf + foldlen);
12234                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12235
12236                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12237
12238
12239                         if (! multi_char_matches) {
12240                             multi_char_matches = newAV();
12241                         }
12242
12243                         /* <multi_char_matches> is actually an array of arrays.
12244                          * There will be one or two top-level elements: [2],
12245                          * and/or [3].  The [2] element is an array, each
12246                          * element thereof is a character which folds to two
12247                          * characters; likewise for [3].  (Unicode guarantees a
12248                          * maximum of 3 characters in any fold.)  When we
12249                          * rewrite the character class below, we will do so
12250                          * such that the longest folds are written first, so
12251                          * that it prefers the longest matching strings first.
12252                          * This is done even if it turns out that any
12253                          * quantifier is non-greedy, out of programmer
12254                          * laziness.  Tom Christiansen has agreed that this is
12255                          * ok.  This makes the test for the ligature 'ffi' come
12256                          * before the test for 'ff' */
12257                         if (av_exists(multi_char_matches, cp_count)) {
12258                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12259                                                              cp_count, FALSE);
12260                             this_array = *this_array_ptr;
12261                         }
12262                         else {
12263                             this_array = newAV();
12264                             av_store(multi_char_matches, cp_count,
12265                                      (SV*) this_array);
12266                         }
12267                         av_push(this_array, multi_fold);
12268                     }
12269
12270                     /* This element should not be processed further in this
12271                      * class */
12272                     element_count--;
12273                     value = save_value;
12274                     prevvalue = save_prevvalue;
12275                     continue;
12276                 }
12277             }
12278         }
12279
12280         /* Deal with this element of the class */
12281         if (! SIZE_ONLY) {
12282 #ifndef EBCDIC
12283             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12284 #else
12285             UV* this_range = _new_invlist(1);
12286             _append_range_to_invlist(this_range, prevvalue, value);
12287
12288             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12289              * If this range was specified using something like 'i-j', we want
12290              * to include only the 'i' and the 'j', and not anything in
12291              * between, so exclude non-ASCII, non-alphabetics from it.
12292              * However, if the range was specified with something like
12293              * [\x89-\x91] or [\x89-j], all code points within it should be
12294              * included.  literal_endpoint==2 means both ends of the range used
12295              * a literal character, not \x{foo} */
12296             if (literal_endpoint == 2
12297                 && (prevvalue >= 'a' && value <= 'z')
12298                     || (prevvalue >= 'A' && value <= 'Z'))
12299             {
12300                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12301                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12302             }
12303             _invlist_union(cp_list, this_range, &cp_list);
12304             literal_endpoint = 0;
12305 #endif
12306         }
12307
12308         range = 0; /* this range (if it was one) is done now */
12309     } /* End of loop through all the text within the brackets */
12310
12311     /* If anything in the class expands to more than one character, we have to
12312      * deal with them by building up a substitute parse string, and recursively
12313      * calling reg() on it, instead of proceeding */
12314     if (multi_char_matches) {
12315         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12316         I32 cp_count;
12317         STRLEN len;
12318         char *save_end = RExC_end;
12319         char *save_parse = RExC_parse;
12320         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
12321                                        a "|" */
12322         I32 reg_flags;
12323
12324         assert(! invert);
12325 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
12326            because too confusing */
12327         if (invert) {
12328             sv_catpv(substitute_parse, "(?:");
12329         }
12330 #endif
12331
12332         /* Look at the longest folds first */
12333         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12334
12335             if (av_exists(multi_char_matches, cp_count)) {
12336                 AV** this_array_ptr;
12337                 SV* this_sequence;
12338
12339                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12340                                                  cp_count, FALSE);
12341                 while ((this_sequence = av_pop(*this_array_ptr)) !=
12342                                                                 &PL_sv_undef)
12343                 {
12344                     if (! first_time) {
12345                         sv_catpv(substitute_parse, "|");
12346                     }
12347                     first_time = FALSE;
12348
12349                     sv_catpv(substitute_parse, SvPVX(this_sequence));
12350                 }
12351             }
12352         }
12353
12354         /* If the character class contains anything else besides these
12355          * multi-character folds, have to include it in recursive parsing */
12356         if (element_count) {
12357             sv_catpv(substitute_parse, "|[");
12358             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12359             sv_catpv(substitute_parse, "]");
12360         }
12361
12362         sv_catpv(substitute_parse, ")");
12363 #if 0
12364         if (invert) {
12365             /* This is a way to get the parse to skip forward a whole named
12366              * sequence instead of matching the 2nd character when it fails the
12367              * first */
12368             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12369         }
12370 #endif
12371
12372         RExC_parse = SvPV(substitute_parse, len);
12373         RExC_end = RExC_parse + len;
12374         RExC_in_multi_char_class = 1;
12375         RExC_emit = (regnode *)orig_emit;
12376
12377         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
12378
12379         *flagp |= reg_flags&(HASWIDTH|SPSTART|POSTPONED);
12380
12381         RExC_parse = save_parse;
12382         RExC_end = save_end;
12383         RExC_in_multi_char_class = 0;
12384         SvREFCNT_dec(multi_char_matches);
12385         return ret;
12386     }
12387
12388     /* If the character class contains only a single element, it may be
12389      * optimizable into another node type which is smaller and runs faster.
12390      * Check if this is the case for this class */
12391     if (element_count == 1) {
12392         U8 op = END;
12393         U8 arg = 0;
12394
12395         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12396                                               [:digit:] or \p{foo} */
12397
12398             /* Certain named classes have equivalents that can appear outside a
12399              * character class, e.g. \w, \H.  We use these instead of a
12400              * character class. */
12401             switch ((I32)namedclass) {
12402                 U8 offset;
12403
12404                 /* The first group is for node types that depend on the charset
12405                  * modifier to the regex.  We first calculate the base node
12406                  * type, and if it should be inverted */
12407
12408                 case ANYOF_NWORDCHAR:
12409                     invert = ! invert;
12410                     /* FALLTHROUGH */
12411                 case ANYOF_WORDCHAR:
12412                     op = ALNUM;
12413                     goto join_charset_classes;
12414
12415                 case ANYOF_NSPACE:
12416                     invert = ! invert;
12417                     /* FALLTHROUGH */
12418                 case ANYOF_SPACE:
12419                     op = SPACE;
12420                     goto join_charset_classes;
12421
12422                 case ANYOF_NDIGIT:
12423                     invert = ! invert;
12424                     /* FALLTHROUGH */
12425                 case ANYOF_DIGIT:
12426                     op = DIGIT;
12427
12428                   join_charset_classes:
12429
12430                     /* Now that we have the base node type, we take advantage
12431                      * of the enum ordering of the charset modifiers to get the
12432                      * exact node type,  For example the base SPACE also has
12433                      * SPACEL, SPACEU, and SPACEA */
12434
12435                     offset = get_regex_charset(RExC_flags);
12436
12437                     /* /aa is the same as /a for these */
12438                     if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12439                         offset = REGEX_ASCII_RESTRICTED_CHARSET;
12440                     }
12441                     else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12442                         offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12443                     }
12444
12445                     op += offset;
12446
12447                     /* The number of varieties of each of these is the same,
12448                      * hence, so is the delta between the normal and
12449                      * complemented nodes */
12450                     if (invert) {
12451                         op += NALNUM - ALNUM;
12452                     }
12453                     *flagp |= HASWIDTH|SIMPLE;
12454                     break;
12455
12456                 /* The second group doesn't depend of the charset modifiers.
12457                  * We just have normal and complemented */
12458                 case ANYOF_NHORIZWS:
12459                     invert = ! invert;
12460                     /* FALLTHROUGH */
12461                 case ANYOF_HORIZWS:
12462                   is_horizws:
12463                     op = (invert) ? NHORIZWS : HORIZWS;
12464                     *flagp |= HASWIDTH|SIMPLE;
12465                     break;
12466
12467                 case ANYOF_NVERTWS:
12468                     invert = ! invert;
12469                     /* FALLTHROUGH */
12470                 case ANYOF_VERTWS:
12471                     op = (invert) ? NVERTWS : VERTWS;
12472                     *flagp |= HASWIDTH|SIMPLE;
12473                     break;
12474
12475                 case ANYOF_MAX:
12476                     break;
12477
12478                 case ANYOF_NBLANK:
12479                     invert = ! invert;
12480                     /* FALLTHROUGH */
12481                 case ANYOF_BLANK:
12482                     if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12483                         goto is_horizws;
12484                     }
12485                     /* FALLTHROUGH */
12486                 default:
12487                     /* A generic posix class.  All the /a ones can be handled
12488                      * by the POSIXA opcode.  And all are closed under folding
12489                      * in the ASCII range, so FOLD doesn't matter */
12490                     if (AT_LEAST_ASCII_RESTRICTED
12491                         || (! LOC && namedclass == ANYOF_ASCII))
12492                     {
12493                         /* The odd numbered ones are the complements of the
12494                          * next-lower even number one */
12495                         if (namedclass % 2 == 1) {
12496                             invert = ! invert;
12497                             namedclass--;
12498                         }
12499                         arg = namedclass_to_classnum(namedclass);
12500                         op = (invert) ? NPOSIXA : POSIXA;
12501                     }
12502                     break;
12503             }
12504         }
12505         else if (value == prevvalue) {
12506
12507             /* Here, the class consists of just a single code point */
12508
12509             if (invert) {
12510                 if (! LOC && value == '\n') {
12511                     op = REG_ANY; /* Optimize [^\n] */
12512                     *flagp |= HASWIDTH|SIMPLE;
12513                     RExC_naughty++;
12514                 }
12515             }
12516             else if (value < 256 || UTF) {
12517
12518                 /* Optimize a single value into an EXACTish node, but not if it
12519                  * would require converting the pattern to UTF-8. */
12520                 op = compute_EXACTish(pRExC_state);
12521             }
12522         } /* Otherwise is a range */
12523         else if (! LOC) {   /* locale could vary these */
12524             if (prevvalue == '0') {
12525                 if (value == '9') {
12526                     op = (invert) ? NDIGITA : DIGITA;
12527                     *flagp |= HASWIDTH|SIMPLE;
12528                 }
12529             }
12530         }
12531
12532         /* Here, we have changed <op> away from its initial value iff we found
12533          * an optimization */
12534         if (op != END) {
12535
12536             /* Throw away this ANYOF regnode, and emit the calculated one,
12537              * which should correspond to the beginning, not current, state of
12538              * the parse */
12539             const char * cur_parse = RExC_parse;
12540             RExC_parse = (char *)orig_parse;
12541             if ( SIZE_ONLY) {
12542                 if (! LOC) {
12543
12544                     /* To get locale nodes to not use the full ANYOF size would
12545                      * require moving the code above that writes the portions
12546                      * of it that aren't in other nodes to after this point.
12547                      * e.g.  ANYOF_CLASS_SET */
12548                     RExC_size = orig_size;
12549                 }
12550             }
12551             else {
12552                 RExC_emit = (regnode *)orig_emit;
12553             }
12554
12555             ret = reg_node(pRExC_state, op);
12556
12557             if (PL_regkind[op] == POSIXD) {
12558                 if (! SIZE_ONLY) {
12559                     FLAGS(ret) = arg;
12560                 }
12561                 *flagp |= HASWIDTH|SIMPLE;
12562             }
12563             else if (PL_regkind[op] == EXACT) {
12564                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12565             }
12566
12567             RExC_parse = (char *) cur_parse;
12568
12569             SvREFCNT_dec(listsv);
12570             return ret;
12571         }
12572     }
12573
12574     if (SIZE_ONLY)
12575         return ret;
12576     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12577
12578     /* If folding, we calculate all characters that could fold to or from the
12579      * ones already on the list */
12580     if (FOLD && cp_list) {
12581         UV start, end;  /* End points of code point ranges */
12582
12583         SV* fold_intersection = NULL;
12584
12585         /* If the highest code point is within Latin1, we can use the
12586          * compiled-in Alphas list, and not have to go out to disk.  This
12587          * yields two false positives, the masculine and feminine oridinal
12588          * indicators, which are weeded out below using the
12589          * IS_IN_SOME_FOLD_L1() macro */
12590         if (invlist_highest(cp_list) < 256) {
12591             _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12592         }
12593         else {
12594
12595             /* Here, there are non-Latin1 code points, so we will have to go
12596              * fetch the list of all the characters that participate in folds
12597              */
12598             if (! PL_utf8_foldable) {
12599                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12600                                        &PL_sv_undef, 1, 0);
12601                 PL_utf8_foldable = _get_swash_invlist(swash);
12602                 SvREFCNT_dec(swash);
12603             }
12604
12605             /* This is a hash that for a particular fold gives all characters
12606              * that are involved in it */
12607             if (! PL_utf8_foldclosures) {
12608
12609                 /* If we were unable to find any folds, then we likely won't be
12610                  * able to find the closures.  So just create an empty list.
12611                  * Folding will effectively be restricted to the non-Unicode
12612                  * rules hard-coded into Perl.  (This case happens legitimately
12613                  * during compilation of Perl itself before the Unicode tables
12614                  * are generated) */
12615                 if (_invlist_len(PL_utf8_foldable) == 0) {
12616                     PL_utf8_foldclosures = newHV();
12617                 }
12618                 else {
12619                     /* If the folds haven't been read in, call a fold function
12620                      * to force that */
12621                     if (! PL_utf8_tofold) {
12622                         U8 dummy[UTF8_MAXBYTES+1];
12623
12624                         /* This string is just a short named one above \xff */
12625                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12626                         assert(PL_utf8_tofold); /* Verify that worked */
12627                     }
12628                     PL_utf8_foldclosures =
12629                                         _swash_inversion_hash(PL_utf8_tofold);
12630                 }
12631             }
12632
12633             /* Only the characters in this class that participate in folds need
12634              * be checked.  Get the intersection of this class and all the
12635              * possible characters that are foldable.  This can quickly narrow
12636              * down a large class */
12637             _invlist_intersection(PL_utf8_foldable, cp_list,
12638                                   &fold_intersection);
12639         }
12640
12641         /* Now look at the foldable characters in this class individually */
12642         invlist_iterinit(fold_intersection);
12643         while (invlist_iternext(fold_intersection, &start, &end)) {
12644             UV j;
12645
12646             /* Locale folding for Latin1 characters is deferred until runtime */
12647             if (LOC && start < 256) {
12648                 start = 256;
12649             }
12650
12651             /* Look at every character in the range */
12652             for (j = start; j <= end; j++) {
12653
12654                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12655                 STRLEN foldlen;
12656                 UV f;
12657                 SV** listp;
12658
12659                 if (j < 256) {
12660
12661                     /* We have the latin1 folding rules hard-coded here so that
12662                      * an innocent-looking character class, like /[ks]/i won't
12663                      * have to go out to disk to find the possible matches.
12664                      * XXX It would be better to generate these via regen, in
12665                      * case a new version of the Unicode standard adds new
12666                      * mappings, though that is not really likely, and may be
12667                      * caught by the default: case of the switch below. */
12668
12669                     if (IS_IN_SOME_FOLD_L1(j)) {
12670
12671                         /* ASCII is always matched; non-ASCII is matched only
12672                          * under Unicode rules */
12673                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12674                             cp_list =
12675                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12676                         }
12677                         else {
12678                             depends_list =
12679                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12680                         }
12681                     }
12682
12683                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12684                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12685                     {
12686                         /* Certain Latin1 characters have matches outside
12687                          * Latin1.  To get here, <j> is one of those
12688                          * characters.   None of these matches is valid for
12689                          * ASCII characters under /aa, which is why the 'if'
12690                          * just above excludes those.  These matches only
12691                          * happen when the target string is utf8.  The code
12692                          * below adds the single fold closures for <j> to the
12693                          * inversion list. */
12694                         switch (j) {
12695                             case 'k':
12696                             case 'K':
12697                                 cp_list =
12698                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
12699                                 break;
12700                             case 's':
12701                             case 'S':
12702                                 cp_list = add_cp_to_invlist(cp_list,
12703                                                     LATIN_SMALL_LETTER_LONG_S);
12704                                 break;
12705                             case MICRO_SIGN:
12706                                 cp_list = add_cp_to_invlist(cp_list,
12707                                                     GREEK_CAPITAL_LETTER_MU);
12708                                 cp_list = add_cp_to_invlist(cp_list,
12709                                                     GREEK_SMALL_LETTER_MU);
12710                                 break;
12711                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12712                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12713                                 cp_list =
12714                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12715                                 break;
12716                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12717                                 cp_list = add_cp_to_invlist(cp_list,
12718                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12719                                 break;
12720                             case LATIN_SMALL_LETTER_SHARP_S:
12721                                 cp_list = add_cp_to_invlist(cp_list,
12722                                                 LATIN_CAPITAL_LETTER_SHARP_S);
12723                                 break;
12724                             case 'F': case 'f':
12725                             case 'I': case 'i':
12726                             case 'L': case 'l':
12727                             case 'T': case 't':
12728                             case 'A': case 'a':
12729                             case 'H': case 'h':
12730                             case 'J': case 'j':
12731                             case 'N': case 'n':
12732                             case 'W': case 'w':
12733                             case 'Y': case 'y':
12734                                 /* These all are targets of multi-character
12735                                  * folds from code points that require UTF8 to
12736                                  * express, so they can't match unless the
12737                                  * target string is in UTF-8, so no action here
12738                                  * is necessary, as regexec.c properly handles
12739                                  * the general case for UTF-8 matching and
12740                                  * multi-char folds */
12741                                 break;
12742                             default:
12743                                 /* Use deprecated warning to increase the
12744                                  * chances of this being output */
12745                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12746                                 break;
12747                         }
12748                     }
12749                     continue;
12750                 }
12751
12752                 /* Here is an above Latin1 character.  We don't have the rules
12753                  * hard-coded for it.  First, get its fold.  This is the simple
12754                  * fold, as the multi-character folds have been handled earlier
12755                  * and separated out */
12756                 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12757                                         ((LOC)
12758                                         ? FOLD_FLAGS_LOCALE
12759                                         : (ASCII_FOLD_RESTRICTED)
12760                                             ? FOLD_FLAGS_NOMIX_ASCII
12761                                             : 0));
12762
12763                 /* Single character fold of above Latin1.  Add everything in
12764                  * its fold closure to the list that this node should match.
12765                  * The fold closures data structure is a hash with the keys
12766                  * being the UTF-8 of every character that is folded to, like
12767                  * 'k', and the values each an array of all code points that
12768                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
12769                  * Multi-character folds are not included */
12770                 if ((listp = hv_fetch(PL_utf8_foldclosures,
12771                                       (char *) foldbuf, foldlen, FALSE)))
12772                 {
12773                     AV* list = (AV*) *listp;
12774                     IV k;
12775                     for (k = 0; k <= av_len(list); k++) {
12776                         SV** c_p = av_fetch(list, k, FALSE);
12777                         UV c;
12778                         if (c_p == NULL) {
12779                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12780                         }
12781                         c = SvUV(*c_p);
12782
12783                         /* /aa doesn't allow folds between ASCII and non-; /l
12784                          * doesn't allow them between above and below 256 */
12785                         if ((ASCII_FOLD_RESTRICTED
12786                                   && (isASCII(c) != isASCII(j)))
12787                             || (LOC && ((c < 256) != (j < 256))))
12788                         {
12789                             continue;
12790                         }
12791
12792                         /* Folds involving non-ascii Latin1 characters
12793                          * under /d are added to a separate list */
12794                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12795                         {
12796                             cp_list = add_cp_to_invlist(cp_list, c);
12797                         }
12798                         else {
12799                           depends_list = add_cp_to_invlist(depends_list, c);
12800                         }
12801                     }
12802                 }
12803             }
12804         }
12805         SvREFCNT_dec(fold_intersection);
12806     }
12807
12808     /* And combine the result (if any) with any inversion list from posix
12809      * classes.  The lists are kept separate up to now because we don't want to
12810      * fold the classes (folding of those is automatically handled by the swash
12811      * fetching code) */
12812     if (posixes) {
12813         if (! DEPENDS_SEMANTICS) {
12814             if (cp_list) {
12815                 _invlist_union(cp_list, posixes, &cp_list);
12816                 SvREFCNT_dec(posixes);
12817             }
12818             else {
12819                 cp_list = posixes;
12820             }
12821         }
12822         else {
12823             /* Under /d, we put into a separate list the Latin1 things that
12824              * match only when the target string is utf8 */
12825             SV* nonascii_but_latin1_properties = NULL;
12826             _invlist_intersection(posixes, PL_Latin1,
12827                                   &nonascii_but_latin1_properties);
12828             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12829                               &nonascii_but_latin1_properties);
12830             _invlist_subtract(posixes, nonascii_but_latin1_properties,
12831                               &posixes);
12832             if (cp_list) {
12833                 _invlist_union(cp_list, posixes, &cp_list);
12834                 SvREFCNT_dec(posixes);
12835             }
12836             else {
12837                 cp_list = posixes;
12838             }
12839
12840             if (depends_list) {
12841                 _invlist_union(depends_list, nonascii_but_latin1_properties,
12842                                &depends_list);
12843                 SvREFCNT_dec(nonascii_but_latin1_properties);
12844             }
12845             else {
12846                 depends_list = nonascii_but_latin1_properties;
12847             }
12848         }
12849     }
12850
12851     /* And combine the result (if any) with any inversion list from properties.
12852      * The lists are kept separate up to now so that we can distinguish the two
12853      * in regards to matching above-Unicode.  A run-time warning is generated
12854      * if a Unicode property is matched against a non-Unicode code point. But,
12855      * we allow user-defined properties to match anything, without any warning,
12856      * and we also suppress the warning if there is a portion of the character
12857      * class that isn't a Unicode property, and which matches above Unicode, \W
12858      * or [\x{110000}] for example.
12859      * (Note that in this case, unlike the Posix one above, there is no
12860      * <depends_list>, because having a Unicode property forces Unicode
12861      * semantics */
12862     if (properties) {
12863         bool warn_super = ! has_user_defined_property;
12864         if (cp_list) {
12865
12866             /* If it matters to the final outcome, see if a non-property
12867              * component of the class matches above Unicode.  If so, the
12868              * warning gets suppressed.  This is true even if just a single
12869              * such code point is specified, as though not strictly correct if
12870              * another such code point is matched against, the fact that they
12871              * are using above-Unicode code points indicates they should know
12872              * the issues involved */
12873             if (warn_super) {
12874                 bool non_prop_matches_above_Unicode =
12875                             runtime_posix_matches_above_Unicode
12876                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12877                 if (invert) {
12878                     non_prop_matches_above_Unicode =
12879                                             !  non_prop_matches_above_Unicode;
12880                 }
12881                 warn_super = ! non_prop_matches_above_Unicode;
12882             }
12883
12884             _invlist_union(properties, cp_list, &cp_list);
12885             SvREFCNT_dec(properties);
12886         }
12887         else {
12888             cp_list = properties;
12889         }
12890
12891         if (warn_super) {
12892             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12893         }
12894     }
12895
12896     /* Here, we have calculated what code points should be in the character
12897      * class.
12898      *
12899      * Now we can see about various optimizations.  Fold calculation (which we
12900      * did above) needs to take place before inversion.  Otherwise /[^k]/i
12901      * would invert to include K, which under /i would match k, which it
12902      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
12903      * folded until runtime */
12904
12905     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12906      * at compile time.  Besides not inverting folded locale now, we can't
12907      * invert if there are things such as \w, which aren't known until runtime
12908      * */
12909     if (invert
12910         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12911         && ! depends_list
12912         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12913     {
12914         _invlist_invert(cp_list);
12915
12916         /* Any swash can't be used as-is, because we've inverted things */
12917         if (swash) {
12918             SvREFCNT_dec(swash);
12919             swash = NULL;
12920         }
12921
12922         /* Clear the invert flag since have just done it here */
12923         invert = FALSE;
12924     }
12925
12926     /* If we didn't do folding, it's because some information isn't available
12927      * until runtime; set the run-time fold flag for these.  (We don't have to
12928      * worry about properties folding, as that is taken care of by the swash
12929      * fetching) */
12930     if (FOLD && LOC)
12931     {
12932        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12933     }
12934
12935     /* Some character classes are equivalent to other nodes.  Such nodes take
12936      * up less room and generally fewer operations to execute than ANYOF nodes.
12937      * Above, we checked for and optimized into some such equivalents for
12938      * certain common classes that are easy to test.  Getting to this point in
12939      * the code means that the class didn't get optimized there.  Since this
12940      * code is only executed in Pass 2, it is too late to save space--it has
12941      * been allocated in Pass 1, and currently isn't given back.  But turning
12942      * things into an EXACTish node can allow the optimizer to join it to any
12943      * adjacent such nodes.  And if the class is equivalent to things like /./,
12944      * expensive run-time swashes can be avoided.  Now that we have more
12945      * complete information, we can find things necessarily missed by the
12946      * earlier code.  I (khw) am not sure how much to look for here.  It would
12947      * be easy, but perhaps too slow, to check any candidates against all the
12948      * node types they could possibly match using _invlistEQ(). */
12949
12950     if (cp_list
12951         && ! invert
12952         && ! depends_list
12953         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12954         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12955     {
12956        UV start, end;
12957        U8 op = END;  /* The optimzation node-type */
12958         const char * cur_parse= RExC_parse;
12959
12960        invlist_iterinit(cp_list);
12961        if (! invlist_iternext(cp_list, &start, &end)) {
12962
12963             /* Here, the list is empty.  This happens, for example, when a
12964              * Unicode property is the only thing in the character class, and
12965              * it doesn't match anything.  (perluniprops.pod notes such
12966              * properties) */
12967             op = OPFAIL;
12968             *flagp |= HASWIDTH|SIMPLE;
12969         }
12970         else if (start == end) {    /* The range is a single code point */
12971             if (! invlist_iternext(cp_list, &start, &end)
12972
12973                     /* Don't do this optimization if it would require changing
12974                      * the pattern to UTF-8 */
12975                 && (start < 256 || UTF))
12976             {
12977                 /* Here, the list contains a single code point.  Can optimize
12978                  * into an EXACT node */
12979
12980                 value = start;
12981
12982                 if (! FOLD) {
12983                     op = EXACT;
12984                 }
12985                 else if (LOC) {
12986
12987                     /* A locale node under folding with one code point can be
12988                      * an EXACTFL, as its fold won't be calculated until
12989                      * runtime */
12990                     op = EXACTFL;
12991                 }
12992                 else {
12993
12994                     /* Here, we are generally folding, but there is only one
12995                      * code point to match.  If we have to, we use an EXACT
12996                      * node, but it would be better for joining with adjacent
12997                      * nodes in the optimization pass if we used the same
12998                      * EXACTFish node that any such are likely to be.  We can
12999                      * do this iff the code point doesn't participate in any
13000                      * folds.  For example, an EXACTF of a colon is the same as
13001                      * an EXACT one, since nothing folds to or from a colon. */
13002                     if (value < 256) {
13003                         if (IS_IN_SOME_FOLD_L1(value)) {
13004                             op = EXACT;
13005                         }
13006                     }
13007                     else {
13008                         if (! PL_utf8_foldable) {
13009                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13010                                                 &PL_sv_undef, 1, 0);
13011                             PL_utf8_foldable = _get_swash_invlist(swash);
13012                             SvREFCNT_dec(swash);
13013                         }
13014                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13015                             op = EXACT;
13016                         }
13017                     }
13018
13019                     /* If we haven't found the node type, above, it means we
13020                      * can use the prevailing one */
13021                     if (op == END) {
13022                         op = compute_EXACTish(pRExC_state);
13023                     }
13024                 }
13025             }
13026         }
13027         else if (start == 0) {
13028             if (end == UV_MAX) {
13029                 op = SANY;
13030                 *flagp |= HASWIDTH|SIMPLE;
13031                 RExC_naughty++;
13032             }
13033             else if (end == '\n' - 1
13034                     && invlist_iternext(cp_list, &start, &end)
13035                     && start == '\n' + 1 && end == UV_MAX)
13036             {
13037                 op = REG_ANY;
13038                 *flagp |= HASWIDTH|SIMPLE;
13039                 RExC_naughty++;
13040             }
13041         }
13042
13043         if (op != END) {
13044             RExC_parse = (char *)orig_parse;
13045             RExC_emit = (regnode *)orig_emit;
13046
13047             ret = reg_node(pRExC_state, op);
13048
13049             RExC_parse = (char *)cur_parse;
13050
13051             if (PL_regkind[op] == EXACT) {
13052                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13053             }
13054
13055             SvREFCNT_dec(listsv);
13056             return ret;
13057         }
13058     }
13059
13060     /* Here, <cp_list> contains all the code points we can determine at
13061      * compile time that match under all conditions.  Go through it, and
13062      * for things that belong in the bitmap, put them there, and delete from
13063      * <cp_list>.  While we are at it, see if everything above 255 is in the
13064      * list, and if so, set a flag to speed up execution */
13065     ANYOF_BITMAP_ZERO(ret);
13066     if (cp_list) {
13067
13068         /* This gets set if we actually need to modify things */
13069         bool change_invlist = FALSE;
13070
13071         UV start, end;
13072
13073         /* Start looking through <cp_list> */
13074         invlist_iterinit(cp_list);
13075         while (invlist_iternext(cp_list, &start, &end)) {
13076             UV high;
13077             int i;
13078
13079             if (end == UV_MAX && start <= 256) {
13080                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13081             }
13082
13083             /* Quit if are above what we should change */
13084             if (start > 255) {
13085                 break;
13086             }
13087
13088             change_invlist = TRUE;
13089
13090             /* Set all the bits in the range, up to the max that we are doing */
13091             high = (end < 255) ? end : 255;
13092             for (i = start; i <= (int) high; i++) {
13093                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13094                     ANYOF_BITMAP_SET(ret, i);
13095                     prevvalue = value;
13096                     value = i;
13097                 }
13098             }
13099         }
13100
13101         /* Done with loop; remove any code points that are in the bitmap from
13102          * <cp_list> */
13103         if (change_invlist) {
13104             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13105         }
13106
13107         /* If have completely emptied it, remove it completely */
13108         if (_invlist_len(cp_list) == 0) {
13109             SvREFCNT_dec(cp_list);
13110             cp_list = NULL;
13111         }
13112     }
13113
13114     if (invert) {
13115         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13116     }
13117
13118     /* Here, the bitmap has been populated with all the Latin1 code points that
13119      * always match.  Can now add to the overall list those that match only
13120      * when the target string is UTF-8 (<depends_list>). */
13121     if (depends_list) {
13122         if (cp_list) {
13123             _invlist_union(cp_list, depends_list, &cp_list);
13124             SvREFCNT_dec(depends_list);
13125         }
13126         else {
13127             cp_list = depends_list;
13128         }
13129     }
13130
13131     /* If there is a swash and more than one element, we can't use the swash in
13132      * the optimization below. */
13133     if (swash && element_count > 1) {
13134         SvREFCNT_dec(swash);
13135         swash = NULL;
13136     }
13137
13138     if (! cp_list
13139         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13140     {
13141         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13142         SvREFCNT_dec(listsv);
13143     }
13144     else {
13145         /* av[0] stores the character class description in its textual form:
13146          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13147          *       appropriate swash, and is also useful for dumping the regnode.
13148          * av[1] if NULL, is a placeholder to later contain the swash computed
13149          *       from av[0].  But if no further computation need be done, the
13150          *       swash is stored there now.
13151          * av[2] stores the cp_list inversion list for use in addition or
13152          *       instead of av[0]; used only if av[1] is NULL
13153          * av[3] is set if any component of the class is from a user-defined
13154          *       property; used only if av[1] is NULL */
13155         AV * const av = newAV();
13156         SV *rv;
13157
13158         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13159                         ? listsv
13160                         : &PL_sv_undef);
13161         if (swash) {
13162             av_store(av, 1, swash);
13163             SvREFCNT_dec(cp_list);
13164         }
13165         else {
13166             av_store(av, 1, NULL);
13167             if (cp_list) {
13168                 av_store(av, 2, cp_list);
13169                 av_store(av, 3, newSVuv(has_user_defined_property));
13170             }
13171         }
13172
13173         rv = newRV_noinc(MUTABLE_SV(av));
13174         n = add_data(pRExC_state, 1, "s");
13175         RExC_rxi->data->data[n] = (void*)rv;
13176         ARG_SET(ret, n);
13177     }
13178
13179     *flagp |= HASWIDTH|SIMPLE;
13180     return ret;
13181 }
13182 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13183
13184
13185 /* reg_skipcomment()
13186
13187    Absorbs an /x style # comments from the input stream.
13188    Returns true if there is more text remaining in the stream.
13189    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13190    terminates the pattern without including a newline.
13191
13192    Note its the callers responsibility to ensure that we are
13193    actually in /x mode
13194
13195 */
13196
13197 STATIC bool
13198 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13199 {
13200     bool ended = 0;
13201
13202     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13203
13204     while (RExC_parse < RExC_end)
13205         if (*RExC_parse++ == '\n') {
13206             ended = 1;
13207             break;
13208         }
13209     if (!ended) {
13210         /* we ran off the end of the pattern without ending
13211            the comment, so we have to add an \n when wrapping */
13212         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13213         return 0;
13214     } else
13215         return 1;
13216 }
13217
13218 /* nextchar()
13219
13220    Advances the parse position, and optionally absorbs
13221    "whitespace" from the inputstream.
13222
13223    Without /x "whitespace" means (?#...) style comments only,
13224    with /x this means (?#...) and # comments and whitespace proper.
13225
13226    Returns the RExC_parse point from BEFORE the scan occurs.
13227
13228    This is the /x friendly way of saying RExC_parse++.
13229 */
13230
13231 STATIC char*
13232 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13233 {
13234     char* const retval = RExC_parse++;
13235
13236     PERL_ARGS_ASSERT_NEXTCHAR;
13237
13238     for (;;) {
13239         if (RExC_end - RExC_parse >= 3
13240             && *RExC_parse == '('
13241             && RExC_parse[1] == '?'
13242             && RExC_parse[2] == '#')
13243         {
13244             while (*RExC_parse != ')') {
13245                 if (RExC_parse == RExC_end)
13246                     FAIL("Sequence (?#... not terminated");
13247                 RExC_parse++;
13248             }
13249             RExC_parse++;
13250             continue;
13251         }
13252         if (RExC_flags & RXf_PMf_EXTENDED) {
13253             if (isSPACE(*RExC_parse)) {
13254                 RExC_parse++;
13255                 continue;
13256             }
13257             else if (*RExC_parse == '#') {
13258                 if ( reg_skipcomment( pRExC_state ) )
13259                     continue;
13260             }
13261         }
13262         return retval;
13263     }
13264 }
13265
13266 /*
13267 - reg_node - emit a node
13268 */
13269 STATIC regnode *                        /* Location. */
13270 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13271 {
13272     dVAR;
13273     regnode *ptr;
13274     regnode * const ret = RExC_emit;
13275     GET_RE_DEBUG_FLAGS_DECL;
13276
13277     PERL_ARGS_ASSERT_REG_NODE;
13278
13279     if (SIZE_ONLY) {
13280         SIZE_ALIGN(RExC_size);
13281         RExC_size += 1;
13282         return(ret);
13283     }
13284     if (RExC_emit >= RExC_emit_bound)
13285         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13286                    op, RExC_emit, RExC_emit_bound);
13287
13288     NODE_ALIGN_FILL(ret);
13289     ptr = ret;
13290     FILL_ADVANCE_NODE(ptr, op);
13291     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
13292 #ifdef RE_TRACK_PATTERN_OFFSETS
13293     if (RExC_offsets) {         /* MJD */
13294         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13295               "reg_node", __LINE__,
13296               PL_reg_name[op],
13297               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13298                 ? "Overwriting end of array!\n" : "OK",
13299               (UV)(RExC_emit - RExC_emit_start),
13300               (UV)(RExC_parse - RExC_start),
13301               (UV)RExC_offsets[0]));
13302         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13303     }
13304 #endif
13305     RExC_emit = ptr;
13306     return(ret);
13307 }
13308
13309 /*
13310 - reganode - emit a node with an argument
13311 */
13312 STATIC regnode *                        /* Location. */
13313 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13314 {
13315     dVAR;
13316     regnode *ptr;
13317     regnode * const ret = RExC_emit;
13318     GET_RE_DEBUG_FLAGS_DECL;
13319
13320     PERL_ARGS_ASSERT_REGANODE;
13321
13322     if (SIZE_ONLY) {
13323         SIZE_ALIGN(RExC_size);
13324         RExC_size += 2;
13325         /*
13326            We can't do this:
13327
13328            assert(2==regarglen[op]+1);
13329
13330            Anything larger than this has to allocate the extra amount.
13331            If we changed this to be:
13332
13333            RExC_size += (1 + regarglen[op]);
13334
13335            then it wouldn't matter. Its not clear what side effect
13336            might come from that so its not done so far.
13337            -- dmq
13338         */
13339         return(ret);
13340     }
13341     if (RExC_emit >= RExC_emit_bound)
13342         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13343                    op, RExC_emit, RExC_emit_bound);
13344
13345     NODE_ALIGN_FILL(ret);
13346     ptr = ret;
13347     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13348     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
13349 #ifdef RE_TRACK_PATTERN_OFFSETS
13350     if (RExC_offsets) {         /* MJD */
13351         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13352               "reganode",
13353               __LINE__,
13354               PL_reg_name[op],
13355               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13356               "Overwriting end of array!\n" : "OK",
13357               (UV)(RExC_emit - RExC_emit_start),
13358               (UV)(RExC_parse - RExC_start),
13359               (UV)RExC_offsets[0]));
13360         Set_Cur_Node_Offset;
13361     }
13362 #endif
13363     RExC_emit = ptr;
13364     return(ret);
13365 }
13366
13367 /*
13368 - reguni - emit (if appropriate) a Unicode character
13369 */
13370 STATIC STRLEN
13371 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13372 {
13373     dVAR;
13374
13375     PERL_ARGS_ASSERT_REGUNI;
13376
13377     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13378 }
13379
13380 /*
13381 - reginsert - insert an operator in front of already-emitted operand
13382 *
13383 * Means relocating the operand.
13384 */
13385 STATIC void
13386 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13387 {
13388     dVAR;
13389     regnode *src;
13390     regnode *dst;
13391     regnode *place;
13392     const int offset = regarglen[(U8)op];
13393     const int size = NODE_STEP_REGNODE + offset;
13394     GET_RE_DEBUG_FLAGS_DECL;
13395
13396     PERL_ARGS_ASSERT_REGINSERT;
13397     PERL_UNUSED_ARG(depth);
13398 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13399     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13400     if (SIZE_ONLY) {
13401         RExC_size += size;
13402         return;
13403     }
13404
13405     src = RExC_emit;
13406     RExC_emit += size;
13407     dst = RExC_emit;
13408     if (RExC_open_parens) {
13409         int paren;
13410         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13411         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13412             if ( RExC_open_parens[paren] >= opnd ) {
13413                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13414                 RExC_open_parens[paren] += size;
13415             } else {
13416                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13417             }
13418             if ( RExC_close_parens[paren] >= opnd ) {
13419                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13420                 RExC_close_parens[paren] += size;
13421             } else {
13422                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13423             }
13424         }
13425     }
13426
13427     while (src > opnd) {
13428         StructCopy(--src, --dst, regnode);
13429 #ifdef RE_TRACK_PATTERN_OFFSETS
13430         if (RExC_offsets) {     /* MJD 20010112 */
13431             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13432                   "reg_insert",
13433                   __LINE__,
13434                   PL_reg_name[op],
13435                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13436                     ? "Overwriting end of array!\n" : "OK",
13437                   (UV)(src - RExC_emit_start),
13438                   (UV)(dst - RExC_emit_start),
13439                   (UV)RExC_offsets[0]));
13440             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13441             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13442         }
13443 #endif
13444     }
13445
13446
13447     place = opnd;               /* Op node, where operand used to be. */
13448 #ifdef RE_TRACK_PATTERN_OFFSETS
13449     if (RExC_offsets) {         /* MJD */
13450         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13451               "reginsert",
13452               __LINE__,
13453               PL_reg_name[op],
13454               (UV)(place - RExC_emit_start) > RExC_offsets[0]
13455               ? "Overwriting end of array!\n" : "OK",
13456               (UV)(place - RExC_emit_start),
13457               (UV)(RExC_parse - RExC_start),
13458               (UV)RExC_offsets[0]));
13459         Set_Node_Offset(place, RExC_parse);
13460         Set_Node_Length(place, 1);
13461     }
13462 #endif
13463     src = NEXTOPER(place);
13464     FILL_ADVANCE_NODE(place, op);
13465     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
13466     Zero(src, offset, regnode);
13467 }
13468
13469 /*
13470 - regtail - set the next-pointer at the end of a node chain of p to val.
13471 - SEE ALSO: regtail_study
13472 */
13473 /* TODO: All three parms should be const */
13474 STATIC void
13475 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13476 {
13477     dVAR;
13478     regnode *scan;
13479     GET_RE_DEBUG_FLAGS_DECL;
13480
13481     PERL_ARGS_ASSERT_REGTAIL;
13482 #ifndef DEBUGGING
13483     PERL_UNUSED_ARG(depth);
13484 #endif
13485
13486     if (SIZE_ONLY)
13487         return;
13488
13489     /* Find last node. */
13490     scan = p;
13491     for (;;) {
13492         regnode * const temp = regnext(scan);
13493         DEBUG_PARSE_r({
13494             SV * const mysv=sv_newmortal();
13495             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13496             regprop(RExC_rx, mysv, scan);
13497             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13498                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13499                     (temp == NULL ? "->" : ""),
13500                     (temp == NULL ? PL_reg_name[OP(val)] : "")
13501             );
13502         });
13503         if (temp == NULL)
13504             break;
13505         scan = temp;
13506     }
13507
13508     if (reg_off_by_arg[OP(scan)]) {
13509         ARG_SET(scan, val - scan);
13510     }
13511     else {
13512         NEXT_OFF(scan) = val - scan;
13513     }
13514 }
13515
13516 #ifdef DEBUGGING
13517 /*
13518 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13519 - Look for optimizable sequences at the same time.
13520 - currently only looks for EXACT chains.
13521
13522 This is experimental code. The idea is to use this routine to perform
13523 in place optimizations on branches and groups as they are constructed,
13524 with the long term intention of removing optimization from study_chunk so
13525 that it is purely analytical.
13526
13527 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13528 to control which is which.
13529
13530 */
13531 /* TODO: All four parms should be const */
13532
13533 STATIC U8
13534 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13535 {
13536     dVAR;
13537     regnode *scan;
13538     U8 exact = PSEUDO;
13539 #ifdef EXPERIMENTAL_INPLACESCAN
13540     I32 min = 0;
13541 #endif
13542     GET_RE_DEBUG_FLAGS_DECL;
13543
13544     PERL_ARGS_ASSERT_REGTAIL_STUDY;
13545
13546
13547     if (SIZE_ONLY)
13548         return exact;
13549
13550     /* Find last node. */
13551
13552     scan = p;
13553     for (;;) {
13554         regnode * const temp = regnext(scan);
13555 #ifdef EXPERIMENTAL_INPLACESCAN
13556         if (PL_regkind[OP(scan)] == EXACT) {
13557             bool has_exactf_sharp_s;    /* Unexamined in this routine */
13558             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13559                 return EXACT;
13560         }
13561 #endif
13562         if ( exact ) {
13563             switch (OP(scan)) {
13564                 case EXACT:
13565                 case EXACTF:
13566                 case EXACTFA:
13567                 case EXACTFU:
13568                 case EXACTFU_SS:
13569                 case EXACTFU_TRICKYFOLD:
13570                 case EXACTFL:
13571                         if( exact == PSEUDO )
13572                             exact= OP(scan);
13573                         else if ( exact != OP(scan) )
13574                             exact= 0;
13575                 case NOTHING:
13576                     break;
13577                 default:
13578                     exact= 0;
13579             }
13580         }
13581         DEBUG_PARSE_r({
13582             SV * const mysv=sv_newmortal();
13583             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13584             regprop(RExC_rx, mysv, scan);
13585             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13586                 SvPV_nolen_const(mysv),
13587                 REG_NODE_NUM(scan),
13588                 PL_reg_name[exact]);
13589         });
13590         if (temp == NULL)
13591             break;
13592         scan = temp;
13593     }
13594     DEBUG_PARSE_r({
13595         SV * const mysv_val=sv_newmortal();
13596         DEBUG_PARSE_MSG("");
13597         regprop(RExC_rx, mysv_val, val);
13598         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13599                       SvPV_nolen_const(mysv_val),
13600                       (IV)REG_NODE_NUM(val),
13601                       (IV)(val - scan)
13602         );
13603     });
13604     if (reg_off_by_arg[OP(scan)]) {
13605         ARG_SET(scan, val - scan);
13606     }
13607     else {
13608         NEXT_OFF(scan) = val - scan;
13609     }
13610
13611     return exact;
13612 }
13613 #endif
13614
13615 /*
13616  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13617  */
13618 #ifdef DEBUGGING
13619 static void
13620 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13621 {
13622     int bit;
13623     int set=0;
13624     regex_charset cs;
13625
13626     for (bit=0; bit<32; bit++) {
13627         if (flags & (1<<bit)) {
13628             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
13629                 continue;
13630             }
13631             if (!set++ && lead)
13632                 PerlIO_printf(Perl_debug_log, "%s",lead);
13633             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13634         }
13635     }
13636     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13637             if (!set++ && lead) {
13638                 PerlIO_printf(Perl_debug_log, "%s",lead);
13639             }
13640             switch (cs) {
13641                 case REGEX_UNICODE_CHARSET:
13642                     PerlIO_printf(Perl_debug_log, "UNICODE");
13643                     break;
13644                 case REGEX_LOCALE_CHARSET:
13645                     PerlIO_printf(Perl_debug_log, "LOCALE");
13646                     break;
13647                 case REGEX_ASCII_RESTRICTED_CHARSET:
13648                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13649                     break;
13650                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13651                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13652                     break;
13653                 default:
13654                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13655                     break;
13656             }
13657     }
13658     if (lead)  {
13659         if (set)
13660             PerlIO_printf(Perl_debug_log, "\n");
13661         else
13662             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13663     }
13664 }
13665 #endif
13666
13667 void
13668 Perl_regdump(pTHX_ const regexp *r)
13669 {
13670 #ifdef DEBUGGING
13671     dVAR;
13672     SV * const sv = sv_newmortal();
13673     SV *dsv= sv_newmortal();
13674     RXi_GET_DECL(r,ri);
13675     GET_RE_DEBUG_FLAGS_DECL;
13676
13677     PERL_ARGS_ASSERT_REGDUMP;
13678
13679     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13680
13681     /* Header fields of interest. */
13682     if (r->anchored_substr) {
13683         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13684             RE_SV_DUMPLEN(r->anchored_substr), 30);
13685         PerlIO_printf(Perl_debug_log,
13686                       "anchored %s%s at %"IVdf" ",
13687                       s, RE_SV_TAIL(r->anchored_substr),
13688                       (IV)r->anchored_offset);
13689     } else if (r->anchored_utf8) {
13690         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13691             RE_SV_DUMPLEN(r->anchored_utf8), 30);
13692         PerlIO_printf(Perl_debug_log,
13693                       "anchored utf8 %s%s at %"IVdf" ",
13694                       s, RE_SV_TAIL(r->anchored_utf8),
13695                       (IV)r->anchored_offset);
13696     }
13697     if (r->float_substr) {
13698         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13699             RE_SV_DUMPLEN(r->float_substr), 30);
13700         PerlIO_printf(Perl_debug_log,
13701                       "floating %s%s at %"IVdf"..%"UVuf" ",
13702                       s, RE_SV_TAIL(r->float_substr),
13703                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13704     } else if (r->float_utf8) {
13705         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13706             RE_SV_DUMPLEN(r->float_utf8), 30);
13707         PerlIO_printf(Perl_debug_log,
13708                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13709                       s, RE_SV_TAIL(r->float_utf8),
13710                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13711     }
13712     if (r->check_substr || r->check_utf8)
13713         PerlIO_printf(Perl_debug_log,
13714                       (const char *)
13715                       (r->check_substr == r->float_substr
13716                        && r->check_utf8 == r->float_utf8
13717                        ? "(checking floating" : "(checking anchored"));
13718     if (r->extflags & RXf_NOSCAN)
13719         PerlIO_printf(Perl_debug_log, " noscan");
13720     if (r->extflags & RXf_CHECK_ALL)
13721         PerlIO_printf(Perl_debug_log, " isall");
13722     if (r->check_substr || r->check_utf8)
13723         PerlIO_printf(Perl_debug_log, ") ");
13724
13725     if (ri->regstclass) {
13726         regprop(r, sv, ri->regstclass);
13727         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13728     }
13729     if (r->extflags & RXf_ANCH) {
13730         PerlIO_printf(Perl_debug_log, "anchored");
13731         if (r->extflags & RXf_ANCH_BOL)
13732             PerlIO_printf(Perl_debug_log, "(BOL)");
13733         if (r->extflags & RXf_ANCH_MBOL)
13734             PerlIO_printf(Perl_debug_log, "(MBOL)");
13735         if (r->extflags & RXf_ANCH_SBOL)
13736             PerlIO_printf(Perl_debug_log, "(SBOL)");
13737         if (r->extflags & RXf_ANCH_GPOS)
13738             PerlIO_printf(Perl_debug_log, "(GPOS)");
13739         PerlIO_putc(Perl_debug_log, ' ');
13740     }
13741     if (r->extflags & RXf_GPOS_SEEN)
13742         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13743     if (r->intflags & PREGf_SKIP)
13744         PerlIO_printf(Perl_debug_log, "plus ");
13745     if (r->intflags & PREGf_IMPLICIT)
13746         PerlIO_printf(Perl_debug_log, "implicit ");
13747     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13748     if (r->extflags & RXf_EVAL_SEEN)
13749         PerlIO_printf(Perl_debug_log, "with eval ");
13750     PerlIO_printf(Perl_debug_log, "\n");
13751     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13752 #else
13753     PERL_ARGS_ASSERT_REGDUMP;
13754     PERL_UNUSED_CONTEXT;
13755     PERL_UNUSED_ARG(r);
13756 #endif  /* DEBUGGING */
13757 }
13758
13759 /*
13760 - regprop - printable representation of opcode
13761 */
13762 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13763 STMT_START { \
13764         if (do_sep) {                           \
13765             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13766             if (flags & ANYOF_INVERT)           \
13767                 /*make sure the invert info is in each */ \
13768                 sv_catpvs(sv, "^");             \
13769             do_sep = 0;                         \
13770         }                                       \
13771 } STMT_END
13772
13773 void
13774 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13775 {
13776 #ifdef DEBUGGING
13777     dVAR;
13778     int k;
13779
13780     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13781     static const char * const anyofs[] = {
13782         "\\w",
13783         "\\W",
13784         "\\s",
13785         "\\S",
13786         "\\d",
13787         "\\D",
13788         "[:alnum:]",
13789         "[:^alnum:]",
13790         "[:alpha:]",
13791         "[:^alpha:]",
13792         "[:ascii:]",
13793         "[:^ascii:]",
13794         "[:cntrl:]",
13795         "[:^cntrl:]",
13796         "[:graph:]",
13797         "[:^graph:]",
13798         "[:lower:]",
13799         "[:^lower:]",
13800         "[:print:]",
13801         "[:^print:]",
13802         "[:punct:]",
13803         "[:^punct:]",
13804         "[:upper:]",
13805         "[:^upper:]",
13806         "[:xdigit:]",
13807         "[:^xdigit:]",
13808         "[:space:]",
13809         "[:^space:]",
13810         "[:blank:]",
13811         "[:^blank:]"
13812     };
13813     RXi_GET_DECL(prog,progi);
13814     GET_RE_DEBUG_FLAGS_DECL;
13815
13816     PERL_ARGS_ASSERT_REGPROP;
13817
13818     sv_setpvs(sv, "");
13819
13820     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
13821         /* It would be nice to FAIL() here, but this may be called from
13822            regexec.c, and it would be hard to supply pRExC_state. */
13823         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13824     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13825
13826     k = PL_regkind[OP(o)];
13827
13828     if (k == EXACT) {
13829         sv_catpvs(sv, " ");
13830         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13831          * is a crude hack but it may be the best for now since
13832          * we have no flag "this EXACTish node was UTF-8"
13833          * --jhi */
13834         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13835                   PERL_PV_ESCAPE_UNI_DETECT |
13836                   PERL_PV_ESCAPE_NONASCII   |
13837                   PERL_PV_PRETTY_ELLIPSES   |
13838                   PERL_PV_PRETTY_LTGT       |
13839                   PERL_PV_PRETTY_NOCLEAR
13840                   );
13841     } else if (k == TRIE) {
13842         /* print the details of the trie in dumpuntil instead, as
13843          * progi->data isn't available here */
13844         const char op = OP(o);
13845         const U32 n = ARG(o);
13846         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13847                (reg_ac_data *)progi->data->data[n] :
13848                NULL;
13849         const reg_trie_data * const trie
13850             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13851
13852         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13853         DEBUG_TRIE_COMPILE_r(
13854             Perl_sv_catpvf(aTHX_ sv,
13855                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13856                 (UV)trie->startstate,
13857                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13858                 (UV)trie->wordcount,
13859                 (UV)trie->minlen,
13860                 (UV)trie->maxlen,
13861                 (UV)TRIE_CHARCOUNT(trie),
13862                 (UV)trie->uniquecharcount
13863             )
13864         );
13865         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13866             int i;
13867             int rangestart = -1;
13868             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13869             sv_catpvs(sv, "[");
13870             for (i = 0; i <= 256; i++) {
13871                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13872                     if (rangestart == -1)
13873                         rangestart = i;
13874                 } else if (rangestart != -1) {
13875                     if (i <= rangestart + 3)
13876                         for (; rangestart < i; rangestart++)
13877                             put_byte(sv, rangestart);
13878                     else {
13879                         put_byte(sv, rangestart);
13880                         sv_catpvs(sv, "-");
13881                         put_byte(sv, i - 1);
13882                     }
13883                     rangestart = -1;
13884                 }
13885             }
13886             sv_catpvs(sv, "]");
13887         }
13888
13889     } else if (k == CURLY) {
13890         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13891             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13892         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13893     }
13894     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
13895         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13896     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13897         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
13898         if ( RXp_PAREN_NAMES(prog) ) {
13899             if ( k != REF || (OP(o) < NREF)) {
13900                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13901                 SV **name= av_fetch(list, ARG(o), 0 );
13902                 if (name)
13903                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13904             }
13905             else {
13906                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13907                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13908                 I32 *nums=(I32*)SvPVX(sv_dat);
13909                 SV **name= av_fetch(list, nums[0], 0 );
13910                 I32 n;
13911                 if (name) {
13912                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
13913                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13914                                     (n ? "," : ""), (IV)nums[n]);
13915                     }
13916                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13917                 }
13918             }
13919         }
13920     } else if (k == GOSUB)
13921         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13922     else if (k == VERB) {
13923         if (!o->flags)
13924             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13925                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13926     } else if (k == LOGICAL)
13927         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
13928     else if (k == ANYOF) {
13929         int i, rangestart = -1;
13930         const U8 flags = ANYOF_FLAGS(o);
13931         int do_sep = 0;
13932
13933
13934         if (flags & ANYOF_LOCALE)
13935             sv_catpvs(sv, "{loc}");
13936         if (flags & ANYOF_LOC_FOLD)
13937             sv_catpvs(sv, "{i}");
13938         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13939         if (flags & ANYOF_INVERT)
13940             sv_catpvs(sv, "^");
13941
13942         /* output what the standard cp 0-255 bitmap matches */
13943         for (i = 0; i <= 256; i++) {
13944             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13945                 if (rangestart == -1)
13946                     rangestart = i;
13947             } else if (rangestart != -1) {
13948                 if (i <= rangestart + 3)
13949                     for (; rangestart < i; rangestart++)
13950                         put_byte(sv, rangestart);
13951                 else {
13952                     put_byte(sv, rangestart);
13953                     sv_catpvs(sv, "-");
13954                     put_byte(sv, i - 1);
13955                 }
13956                 do_sep = 1;
13957                 rangestart = -1;
13958             }
13959         }
13960
13961         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13962         /* output any special charclass tests (used entirely under use locale) */
13963         if (ANYOF_CLASS_TEST_ANY_SET(o))
13964             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13965                 if (ANYOF_CLASS_TEST(o,i)) {
13966                     sv_catpv(sv, anyofs[i]);
13967                     do_sep = 1;
13968                 }
13969
13970         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13971
13972         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13973             sv_catpvs(sv, "{non-utf8-latin1-all}");
13974         }
13975
13976         /* output information about the unicode matching */
13977         if (flags & ANYOF_UNICODE_ALL)
13978             sv_catpvs(sv, "{unicode_all}");
13979         else if (ANYOF_NONBITMAP(o))
13980             sv_catpvs(sv, "{unicode}");
13981         if (flags & ANYOF_NONBITMAP_NON_UTF8)
13982             sv_catpvs(sv, "{outside bitmap}");
13983
13984         if (ANYOF_NONBITMAP(o)) {
13985             SV *lv; /* Set if there is something outside the bit map */
13986             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
13987             bool byte_output = FALSE;   /* If something in the bitmap has been
13988                                            output */
13989
13990             if (lv && lv != &PL_sv_undef) {
13991                 if (sw) {
13992                     U8 s[UTF8_MAXBYTES_CASE+1];
13993
13994                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13995                         uvchr_to_utf8(s, i);
13996
13997                         if (i < 256
13998                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
13999                                                                things already
14000                                                                output as part
14001                                                                of the bitmap */
14002                             && swash_fetch(sw, s, TRUE))
14003                         {
14004                             if (rangestart == -1)
14005                                 rangestart = i;
14006                         } else if (rangestart != -1) {
14007                             byte_output = TRUE;
14008                             if (i <= rangestart + 3)
14009                                 for (; rangestart < i; rangestart++) {
14010                                     put_byte(sv, rangestart);
14011                                 }
14012                             else {
14013                                 put_byte(sv, rangestart);
14014                                 sv_catpvs(sv, "-");
14015                                 put_byte(sv, i-1);
14016                             }
14017                             rangestart = -1;
14018                         }
14019                     }
14020                 }
14021
14022                 {
14023                     char *s = savesvpv(lv);
14024                     char * const origs = s;
14025
14026                     while (*s && *s != '\n')
14027                         s++;
14028
14029                     if (*s == '\n') {
14030                         const char * const t = ++s;
14031
14032                         if (byte_output) {
14033                             sv_catpvs(sv, " ");
14034                         }
14035
14036                         while (*s) {
14037                             if (*s == '\n') {
14038
14039                                 /* Truncate very long output */
14040                                 if (s - origs > 256) {
14041                                     Perl_sv_catpvf(aTHX_ sv,
14042                                                    "%.*s...",
14043                                                    (int) (s - origs - 1),
14044                                                    t);
14045                                     goto out_dump;
14046                                 }
14047                                 *s = ' ';
14048                             }
14049                             else if (*s == '\t') {
14050                                 *s = '-';
14051                             }
14052                             s++;
14053                         }
14054                         if (s[-1] == ' ')
14055                             s[-1] = 0;
14056
14057                         sv_catpv(sv, t);
14058                     }
14059
14060                 out_dump:
14061
14062                     Safefree(origs);
14063                 }
14064                 SvREFCNT_dec(lv);
14065             }
14066         }
14067
14068         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14069     }
14070     else if (k == POSIXD) {
14071         U8 index = FLAGS(o) * 2;
14072         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14073             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14074         }
14075         else {
14076             sv_catpv(sv, anyofs[index]);
14077         }
14078     }
14079     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14080         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14081 #else
14082     PERL_UNUSED_CONTEXT;
14083     PERL_UNUSED_ARG(sv);
14084     PERL_UNUSED_ARG(o);
14085     PERL_UNUSED_ARG(prog);
14086 #endif  /* DEBUGGING */
14087 }
14088
14089 SV *
14090 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14091 {                               /* Assume that RE_INTUIT is set */
14092     dVAR;
14093     struct regexp *const prog = (struct regexp *)SvANY(r);
14094     GET_RE_DEBUG_FLAGS_DECL;
14095
14096     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14097     PERL_UNUSED_CONTEXT;
14098
14099     DEBUG_COMPILE_r(
14100         {
14101             const char * const s = SvPV_nolen_const(prog->check_substr
14102                       ? prog->check_substr : prog->check_utf8);
14103
14104             if (!PL_colorset) reginitcolors();
14105             PerlIO_printf(Perl_debug_log,
14106                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14107                       PL_colors[4],
14108                       prog->check_substr ? "" : "utf8 ",
14109                       PL_colors[5],PL_colors[0],
14110                       s,
14111                       PL_colors[1],
14112                       (strlen(s) > 60 ? "..." : ""));
14113         } );
14114
14115     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14116 }
14117
14118 /*
14119    pregfree()
14120
14121    handles refcounting and freeing the perl core regexp structure. When
14122    it is necessary to actually free the structure the first thing it
14123    does is call the 'free' method of the regexp_engine associated to
14124    the regexp, allowing the handling of the void *pprivate; member
14125    first. (This routine is not overridable by extensions, which is why
14126    the extensions free is called first.)
14127
14128    See regdupe and regdupe_internal if you change anything here.
14129 */
14130 #ifndef PERL_IN_XSUB_RE
14131 void
14132 Perl_pregfree(pTHX_ REGEXP *r)
14133 {
14134     SvREFCNT_dec(r);
14135 }
14136
14137 void
14138 Perl_pregfree2(pTHX_ REGEXP *rx)
14139 {
14140     dVAR;
14141     struct regexp *const r = (struct regexp *)SvANY(rx);
14142     GET_RE_DEBUG_FLAGS_DECL;
14143
14144     PERL_ARGS_ASSERT_PREGFREE2;
14145
14146     if (r->mother_re) {
14147         ReREFCNT_dec(r->mother_re);
14148     } else {
14149         CALLREGFREE_PVT(rx); /* free the private data */
14150         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14151     }
14152     if (r->substrs) {
14153         SvREFCNT_dec(r->anchored_substr);
14154         SvREFCNT_dec(r->anchored_utf8);
14155         SvREFCNT_dec(r->float_substr);
14156         SvREFCNT_dec(r->float_utf8);
14157         Safefree(r->substrs);
14158     }
14159     RX_MATCH_COPY_FREE(rx);
14160 #ifdef PERL_OLD_COPY_ON_WRITE
14161     SvREFCNT_dec(r->saved_copy);
14162 #endif
14163     Safefree(r->offs);
14164     SvREFCNT_dec(r->qr_anoncv);
14165 }
14166
14167 /*  reg_temp_copy()
14168
14169     This is a hacky workaround to the structural issue of match results
14170     being stored in the regexp structure which is in turn stored in
14171     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14172     could be PL_curpm in multiple contexts, and could require multiple
14173     result sets being associated with the pattern simultaneously, such
14174     as when doing a recursive match with (??{$qr})
14175
14176     The solution is to make a lightweight copy of the regexp structure
14177     when a qr// is returned from the code executed by (??{$qr}) this
14178     lightweight copy doesn't actually own any of its data except for
14179     the starp/end and the actual regexp structure itself.
14180
14181 */
14182
14183
14184 REGEXP *
14185 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14186 {
14187     struct regexp *ret;
14188     struct regexp *const r = (struct regexp *)SvANY(rx);
14189
14190     PERL_ARGS_ASSERT_REG_TEMP_COPY;
14191
14192     if (!ret_x)
14193         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14194     ret = (struct regexp *)SvANY(ret_x);
14195
14196     (void)ReREFCNT_inc(rx);
14197     /* We can take advantage of the existing "copied buffer" mechanism in SVs
14198        by pointing directly at the buffer, but flagging that the allocated
14199        space in the copy is zero. As we've just done a struct copy, it's now
14200        a case of zero-ing that, rather than copying the current length.  */
14201     SvPV_set(ret_x, RX_WRAPPED(rx));
14202     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
14203     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14204            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14205     SvLEN_set(ret_x, 0);
14206     SvSTASH_set(ret_x, NULL);
14207     SvMAGIC_set(ret_x, NULL);
14208     if (r->offs) {
14209         const I32 npar = r->nparens+1;
14210         Newx(ret->offs, npar, regexp_paren_pair);
14211         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14212     }
14213     if (r->substrs) {
14214         Newx(ret->substrs, 1, struct reg_substr_data);
14215         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14216
14217         SvREFCNT_inc_void(ret->anchored_substr);
14218         SvREFCNT_inc_void(ret->anchored_utf8);
14219         SvREFCNT_inc_void(ret->float_substr);
14220         SvREFCNT_inc_void(ret->float_utf8);
14221
14222         /* check_substr and check_utf8, if non-NULL, point to either their
14223            anchored or float namesakes, and don't hold a second reference.  */
14224     }
14225     RX_MATCH_COPIED_off(ret_x);
14226 #ifdef PERL_OLD_COPY_ON_WRITE
14227     ret->saved_copy = NULL;
14228 #endif
14229     ret->mother_re = rx;
14230     SvREFCNT_inc_void(ret->qr_anoncv);
14231
14232     return ret_x;
14233 }
14234 #endif
14235
14236 /* regfree_internal()
14237
14238    Free the private data in a regexp. This is overloadable by
14239    extensions. Perl takes care of the regexp structure in pregfree(),
14240    this covers the *pprivate pointer which technically perl doesn't
14241    know about, however of course we have to handle the
14242    regexp_internal structure when no extension is in use.
14243
14244    Note this is called before freeing anything in the regexp
14245    structure.
14246  */
14247
14248 void
14249 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14250 {
14251     dVAR;
14252     struct regexp *const r = (struct regexp *)SvANY(rx);
14253     RXi_GET_DECL(r,ri);
14254     GET_RE_DEBUG_FLAGS_DECL;
14255
14256     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14257
14258     DEBUG_COMPILE_r({
14259         if (!PL_colorset)
14260             reginitcolors();
14261         {
14262             SV *dsv= sv_newmortal();
14263             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14264                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14265             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14266                 PL_colors[4],PL_colors[5],s);
14267         }
14268     });
14269 #ifdef RE_TRACK_PATTERN_OFFSETS
14270     if (ri->u.offsets)
14271         Safefree(ri->u.offsets);             /* 20010421 MJD */
14272 #endif
14273     if (ri->code_blocks) {
14274         int n;
14275         for (n = 0; n < ri->num_code_blocks; n++)
14276             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14277         Safefree(ri->code_blocks);
14278     }
14279
14280     if (ri->data) {
14281         int n = ri->data->count;
14282
14283         while (--n >= 0) {
14284           /* If you add a ->what type here, update the comment in regcomp.h */
14285             switch (ri->data->what[n]) {
14286             case 'a':
14287             case 'r':
14288             case 's':
14289             case 'S':
14290             case 'u':
14291                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14292                 break;
14293             case 'f':
14294                 Safefree(ri->data->data[n]);
14295                 break;
14296             case 'l':
14297             case 'L':
14298                 break;
14299             case 'T':
14300                 { /* Aho Corasick add-on structure for a trie node.
14301                      Used in stclass optimization only */
14302                     U32 refcount;
14303                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14304                     OP_REFCNT_LOCK;
14305                     refcount = --aho->refcount;
14306                     OP_REFCNT_UNLOCK;
14307                     if ( !refcount ) {
14308                         PerlMemShared_free(aho->states);
14309                         PerlMemShared_free(aho->fail);
14310                          /* do this last!!!! */
14311                         PerlMemShared_free(ri->data->data[n]);
14312                         PerlMemShared_free(ri->regstclass);
14313                     }
14314                 }
14315                 break;
14316             case 't':
14317                 {
14318                     /* trie structure. */
14319                     U32 refcount;
14320                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14321                     OP_REFCNT_LOCK;
14322                     refcount = --trie->refcount;
14323                     OP_REFCNT_UNLOCK;
14324                     if ( !refcount ) {
14325                         PerlMemShared_free(trie->charmap);
14326                         PerlMemShared_free(trie->states);
14327                         PerlMemShared_free(trie->trans);
14328                         if (trie->bitmap)
14329                             PerlMemShared_free(trie->bitmap);
14330                         if (trie->jump)
14331                             PerlMemShared_free(trie->jump);
14332                         PerlMemShared_free(trie->wordinfo);
14333                         /* do this last!!!! */
14334                         PerlMemShared_free(ri->data->data[n]);
14335                     }
14336                 }
14337                 break;
14338             default:
14339                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14340             }
14341         }
14342         Safefree(ri->data->what);
14343         Safefree(ri->data);
14344     }
14345
14346     Safefree(ri);
14347 }
14348
14349 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14350 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14351 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14352
14353 /*
14354    re_dup - duplicate a regexp.
14355
14356    This routine is expected to clone a given regexp structure. It is only
14357    compiled under USE_ITHREADS.
14358
14359    After all of the core data stored in struct regexp is duplicated
14360    the regexp_engine.dupe method is used to copy any private data
14361    stored in the *pprivate pointer. This allows extensions to handle
14362    any duplication it needs to do.
14363
14364    See pregfree() and regfree_internal() if you change anything here.
14365 */
14366 #if defined(USE_ITHREADS)
14367 #ifndef PERL_IN_XSUB_RE
14368 void
14369 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14370 {
14371     dVAR;
14372     I32 npar;
14373     const struct regexp *r = (const struct regexp *)SvANY(sstr);
14374     struct regexp *ret = (struct regexp *)SvANY(dstr);
14375
14376     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14377
14378     npar = r->nparens+1;
14379     Newx(ret->offs, npar, regexp_paren_pair);
14380     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14381     if(ret->swap) {
14382         /* no need to copy these */
14383         Newx(ret->swap, npar, regexp_paren_pair);
14384     }
14385
14386     if (ret->substrs) {
14387         /* Do it this way to avoid reading from *r after the StructCopy().
14388            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14389            cache, it doesn't matter.  */
14390         const bool anchored = r->check_substr
14391             ? r->check_substr == r->anchored_substr
14392             : r->check_utf8 == r->anchored_utf8;
14393         Newx(ret->substrs, 1, struct reg_substr_data);
14394         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14395
14396         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14397         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14398         ret->float_substr = sv_dup_inc(ret->float_substr, param);
14399         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14400
14401         /* check_substr and check_utf8, if non-NULL, point to either their
14402            anchored or float namesakes, and don't hold a second reference.  */
14403
14404         if (ret->check_substr) {
14405             if (anchored) {
14406                 assert(r->check_utf8 == r->anchored_utf8);
14407                 ret->check_substr = ret->anchored_substr;
14408                 ret->check_utf8 = ret->anchored_utf8;
14409             } else {
14410                 assert(r->check_substr == r->float_substr);
14411                 assert(r->check_utf8 == r->float_utf8);
14412                 ret->check_substr = ret->float_substr;
14413                 ret->check_utf8 = ret->float_utf8;
14414             }
14415         } else if (ret->check_utf8) {
14416             if (anchored) {
14417                 ret->check_utf8 = ret->anchored_utf8;
14418             } else {
14419                 ret->check_utf8 = ret->float_utf8;
14420             }
14421         }
14422     }
14423
14424     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14425     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14426
14427     if (ret->pprivate)
14428         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14429
14430     if (RX_MATCH_COPIED(dstr))
14431         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
14432     else
14433         ret->subbeg = NULL;
14434 #ifdef PERL_OLD_COPY_ON_WRITE
14435     ret->saved_copy = NULL;
14436 #endif
14437
14438     if (ret->mother_re) {
14439         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
14440             /* Our storage points directly to our mother regexp, but that's
14441                1: a buffer in a different thread
14442                2: something we no longer hold a reference on
14443                so we need to copy it locally.  */
14444             /* Note we need to use SvCUR(), rather than
14445                SvLEN(), on our mother_re, because it, in
14446                turn, may well be pointing to its own mother_re.  */
14447             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
14448                                    SvCUR(ret->mother_re)+1));
14449             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
14450         }
14451         ret->mother_re      = NULL;
14452     }
14453     ret->gofs = 0;
14454 }
14455 #endif /* PERL_IN_XSUB_RE */
14456
14457 /*
14458    regdupe_internal()
14459
14460    This is the internal complement to regdupe() which is used to copy
14461    the structure pointed to by the *pprivate pointer in the regexp.
14462    This is the core version of the extension overridable cloning hook.
14463    The regexp structure being duplicated will be copied by perl prior
14464    to this and will be provided as the regexp *r argument, however
14465    with the /old/ structures pprivate pointer value. Thus this routine
14466    may override any copying normally done by perl.
14467
14468    It returns a pointer to the new regexp_internal structure.
14469 */
14470
14471 void *
14472 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14473 {
14474     dVAR;
14475     struct regexp *const r = (struct regexp *)SvANY(rx);
14476     regexp_internal *reti;
14477     int len;
14478     RXi_GET_DECL(r,ri);
14479
14480     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14481
14482     len = ProgLen(ri);
14483
14484     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14485     Copy(ri->program, reti->program, len+1, regnode);
14486
14487     reti->num_code_blocks = ri->num_code_blocks;
14488     if (ri->code_blocks) {
14489         int n;
14490         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14491                 struct reg_code_block);
14492         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14493                 struct reg_code_block);
14494         for (n = 0; n < ri->num_code_blocks; n++)
14495              reti->code_blocks[n].src_regex = (REGEXP*)
14496                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14497     }
14498     else
14499         reti->code_blocks = NULL;
14500
14501     reti->regstclass = NULL;
14502
14503     if (ri->data) {
14504         struct reg_data *d;
14505         const int count = ri->data->count;
14506         int i;
14507
14508         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14509                 char, struct reg_data);
14510         Newx(d->what, count, U8);
14511
14512         d->count = count;
14513         for (i = 0; i < count; i++) {
14514             d->what[i] = ri->data->what[i];
14515             switch (d->what[i]) {
14516                 /* see also regcomp.h and regfree_internal() */
14517             case 'a': /* actually an AV, but the dup function is identical.  */
14518             case 'r':
14519             case 's':
14520             case 'S':
14521             case 'u': /* actually an HV, but the dup function is identical.  */
14522                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14523                 break;
14524             case 'f':
14525                 /* This is cheating. */
14526                 Newx(d->data[i], 1, struct regnode_charclass_class);
14527                 StructCopy(ri->data->data[i], d->data[i],
14528                             struct regnode_charclass_class);
14529                 reti->regstclass = (regnode*)d->data[i];
14530                 break;
14531             case 'T':
14532                 /* Trie stclasses are readonly and can thus be shared
14533                  * without duplication. We free the stclass in pregfree
14534                  * when the corresponding reg_ac_data struct is freed.
14535                  */
14536                 reti->regstclass= ri->regstclass;
14537                 /* Fall through */
14538             case 't':
14539                 OP_REFCNT_LOCK;
14540                 ((reg_trie_data*)ri->data->data[i])->refcount++;
14541                 OP_REFCNT_UNLOCK;
14542                 /* Fall through */
14543             case 'l':
14544             case 'L':
14545                 d->data[i] = ri->data->data[i];
14546                 break;
14547             default:
14548                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14549             }
14550         }
14551
14552         reti->data = d;
14553     }
14554     else
14555         reti->data = NULL;
14556
14557     reti->name_list_idx = ri->name_list_idx;
14558
14559 #ifdef RE_TRACK_PATTERN_OFFSETS
14560     if (ri->u.offsets) {
14561         Newx(reti->u.offsets, 2*len+1, U32);
14562         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14563     }
14564 #else
14565     SetProgLen(reti,len);
14566 #endif
14567
14568     return (void*)reti;
14569 }
14570
14571 #endif    /* USE_ITHREADS */
14572
14573 #ifndef PERL_IN_XSUB_RE
14574
14575 /*
14576  - regnext - dig the "next" pointer out of a node
14577  */
14578 regnode *
14579 Perl_regnext(pTHX_ register regnode *p)
14580 {
14581     dVAR;
14582     I32 offset;
14583
14584     if (!p)
14585         return(NULL);
14586
14587     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
14588         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14589     }
14590
14591     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14592     if (offset == 0)
14593         return(NULL);
14594
14595     return(p+offset);
14596 }
14597 #endif
14598
14599 STATIC void
14600 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14601 {
14602     va_list args;
14603     STRLEN l1 = strlen(pat1);
14604     STRLEN l2 = strlen(pat2);
14605     char buf[512];
14606     SV *msv;
14607     const char *message;
14608
14609     PERL_ARGS_ASSERT_RE_CROAK2;
14610
14611     if (l1 > 510)
14612         l1 = 510;
14613     if (l1 + l2 > 510)
14614         l2 = 510 - l1;
14615     Copy(pat1, buf, l1 , char);
14616     Copy(pat2, buf + l1, l2 , char);
14617     buf[l1 + l2] = '\n';
14618     buf[l1 + l2 + 1] = '\0';
14619 #ifdef I_STDARG
14620     /* ANSI variant takes additional second argument */
14621     va_start(args, pat2);
14622 #else
14623     va_start(args);
14624 #endif
14625     msv = vmess(buf, &args);
14626     va_end(args);
14627     message = SvPV_const(msv,l1);
14628     if (l1 > 512)
14629         l1 = 512;
14630     Copy(message, buf, l1 , char);
14631     buf[l1-1] = '\0';                   /* Overwrite \n */
14632     Perl_croak(aTHX_ "%s", buf);
14633 }
14634
14635 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
14636
14637 #ifndef PERL_IN_XSUB_RE
14638 void
14639 Perl_save_re_context(pTHX)
14640 {
14641     dVAR;
14642
14643     struct re_save_state *state;
14644
14645     SAVEVPTR(PL_curcop);
14646     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14647
14648     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14649     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14650     SSPUSHUV(SAVEt_RE_STATE);
14651
14652     Copy(&PL_reg_state, state, 1, struct re_save_state);
14653
14654     PL_reg_oldsaved = NULL;
14655     PL_reg_oldsavedlen = 0;
14656     PL_reg_oldsavedoffset = 0;
14657     PL_reg_oldsavedcoffset = 0;
14658     PL_reg_maxiter = 0;
14659     PL_reg_leftiter = 0;
14660     PL_reg_poscache = NULL;
14661     PL_reg_poscache_size = 0;
14662 #ifdef PERL_OLD_COPY_ON_WRITE
14663     PL_nrs = NULL;
14664 #endif
14665
14666     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14667     if (PL_curpm) {
14668         const REGEXP * const rx = PM_GETRE(PL_curpm);
14669         if (rx) {
14670             U32 i;
14671             for (i = 1; i <= RX_NPARENS(rx); i++) {
14672                 char digits[TYPE_CHARS(long)];
14673                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14674                 GV *const *const gvp
14675                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14676
14677                 if (gvp) {
14678                     GV * const gv = *gvp;
14679                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14680                         save_scalar(gv);
14681                 }
14682             }
14683         }
14684     }
14685 }
14686 #endif
14687
14688 static void
14689 clear_re(pTHX_ void *r)
14690 {
14691     dVAR;
14692     ReREFCNT_dec((REGEXP *)r);
14693 }
14694
14695 #ifdef DEBUGGING
14696
14697 STATIC void
14698 S_put_byte(pTHX_ SV *sv, int c)
14699 {
14700     PERL_ARGS_ASSERT_PUT_BYTE;
14701
14702     /* Our definition of isPRINT() ignores locales, so only bytes that are
14703        not part of UTF-8 are considered printable. I assume that the same
14704        holds for UTF-EBCDIC.
14705        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14706        which Wikipedia says:
14707
14708        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14709        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14710        identical, to the ASCII delete (DEL) or rubout control character.
14711        ) So the old condition can be simplified to !isPRINT(c)  */
14712     if (!isPRINT(c)) {
14713         if (c < 256) {
14714             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14715         }
14716         else {
14717             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14718         }
14719     }
14720     else {
14721         const char string = c;
14722         if (c == '-' || c == ']' || c == '\\' || c == '^')
14723             sv_catpvs(sv, "\\");
14724         sv_catpvn(sv, &string, 1);
14725     }
14726 }
14727
14728
14729 #define CLEAR_OPTSTART \
14730     if (optstart) STMT_START { \
14731             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14732             optstart=NULL; \
14733     } STMT_END
14734
14735 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14736
14737 STATIC const regnode *
14738 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14739             const regnode *last, const regnode *plast,
14740             SV* sv, I32 indent, U32 depth)
14741 {
14742     dVAR;
14743     U8 op = PSEUDO;     /* Arbitrary non-END op. */
14744     const regnode *next;
14745     const regnode *optstart= NULL;
14746
14747     RXi_GET_DECL(r,ri);
14748     GET_RE_DEBUG_FLAGS_DECL;
14749
14750     PERL_ARGS_ASSERT_DUMPUNTIL;
14751
14752 #ifdef DEBUG_DUMPUNTIL
14753     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14754         last ? last-start : 0,plast ? plast-start : 0);
14755 #endif
14756
14757     if (plast && plast < last)
14758         last= plast;
14759
14760     while (PL_regkind[op] != END && (!last || node < last)) {
14761         /* While that wasn't END last time... */
14762         NODE_ALIGN(node);
14763         op = OP(node);
14764         if (op == CLOSE || op == WHILEM)
14765             indent--;
14766         next = regnext((regnode *)node);
14767
14768         /* Where, what. */
14769         if (OP(node) == OPTIMIZED) {
14770             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14771                 optstart = node;
14772             else
14773                 goto after_print;
14774         } else
14775             CLEAR_OPTSTART;
14776
14777         regprop(r, sv, node);
14778         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14779                       (int)(2*indent + 1), "", SvPVX_const(sv));
14780
14781         if (OP(node) != OPTIMIZED) {
14782             if (next == NULL)           /* Next ptr. */
14783                 PerlIO_printf(Perl_debug_log, " (0)");
14784             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14785                 PerlIO_printf(Perl_debug_log, " (FAIL)");
14786             else
14787                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14788             (void)PerlIO_putc(Perl_debug_log, '\n');
14789         }
14790
14791       after_print:
14792         if (PL_regkind[(U8)op] == BRANCHJ) {
14793             assert(next);
14794             {
14795                 const regnode *nnode = (OP(next) == LONGJMP
14796                                        ? regnext((regnode *)next)
14797                                        : next);
14798                 if (last && nnode > last)
14799                     nnode = last;
14800                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14801             }
14802         }
14803         else if (PL_regkind[(U8)op] == BRANCH) {
14804             assert(next);
14805             DUMPUNTIL(NEXTOPER(node), next);
14806         }
14807         else if ( PL_regkind[(U8)op]  == TRIE ) {
14808             const regnode *this_trie = node;
14809             const char op = OP(node);
14810             const U32 n = ARG(node);
14811             const reg_ac_data * const ac = op>=AHOCORASICK ?
14812                (reg_ac_data *)ri->data->data[n] :
14813                NULL;
14814             const reg_trie_data * const trie =
14815                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14816 #ifdef DEBUGGING
14817             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14818 #endif
14819             const regnode *nextbranch= NULL;
14820             I32 word_idx;
14821             sv_setpvs(sv, "");
14822             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14823                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14824
14825                 PerlIO_printf(Perl_debug_log, "%*s%s ",
14826                    (int)(2*(indent+3)), "",
14827                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14828                             PL_colors[0], PL_colors[1],
14829                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14830                             PERL_PV_PRETTY_ELLIPSES    |
14831                             PERL_PV_PRETTY_LTGT
14832                             )
14833                             : "???"
14834                 );
14835                 if (trie->jump) {
14836                     U16 dist= trie->jump[word_idx+1];
14837                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14838                                   (UV)((dist ? this_trie + dist : next) - start));
14839                     if (dist) {
14840                         if (!nextbranch)
14841                             nextbranch= this_trie + trie->jump[0];
14842                         DUMPUNTIL(this_trie + dist, nextbranch);
14843                     }
14844                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14845                         nextbranch= regnext((regnode *)nextbranch);
14846                 } else {
14847                     PerlIO_printf(Perl_debug_log, "\n");
14848                 }
14849             }
14850             if (last && next > last)
14851                 node= last;
14852             else
14853                 node= next;
14854         }
14855         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
14856             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14857                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14858         }
14859         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14860             assert(next);
14861             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14862         }
14863         else if ( op == PLUS || op == STAR) {
14864             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14865         }
14866         else if (PL_regkind[(U8)op] == ANYOF) {
14867             /* arglen 1 + class block */
14868             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14869                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14870             node = NEXTOPER(node);
14871         }
14872         else if (PL_regkind[(U8)op] == EXACT) {
14873             /* Literal string, where present. */
14874             node += NODE_SZ_STR(node) - 1;
14875             node = NEXTOPER(node);
14876         }
14877         else {
14878             node = NEXTOPER(node);
14879             node += regarglen[(U8)op];
14880         }
14881         if (op == CURLYX || op == OPEN)
14882             indent++;
14883     }
14884     CLEAR_OPTSTART;
14885 #ifdef DEBUG_DUMPUNTIL
14886     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14887 #endif
14888     return node;
14889 }
14890
14891 #endif  /* DEBUGGING */
14892
14893 /*
14894  * Local variables:
14895  * c-indentation-style: bsd
14896  * c-basic-offset: 4
14897  * indent-tabs-mode: nil
14898  * End:
14899  *
14900  * ex: set ts=8 sts=4 sw=4 et:
14901  */