src/5017008/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC  static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117     U32         flags;                  /* RXf_* are we folding, multilining? */
 118     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 119     char        *precomp;               /* uncompiled string. */
 120     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 121     regexp      *rx;                    /* perl core regexp structure */
 122     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 123     char        *start;                 /* Start of input for compile */
 124     char        *end;                   /* End of input for compile */
 125     char        *parse;                 /* Input-scan pointer. */
 126     I32         whilem_seen;            /* number of WHILEM in this expr */
 127     regnode     *emit_start;            /* Start of emitted-code area */
 128     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 129     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 130     I32         naughty;                /* How bad is this pattern? */
 131     I32         sawback;                /* Did we see \1, ...? */
 132     U32         seen;
 133     I32         size;                   /* Code size. */
 134     I32         npar;                   /* Capture buffer count, (OPEN). */
 135     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 136     I32         nestroot;               /* root parens we are in - used by accept */
 137     I32         extralen;
 138     I32         seen_zerolen;
 139     regnode     **open_parens;          /* pointers to open parens */
 140     regnode     **close_parens;         /* pointers to close parens */
 141     regnode     *opend;                 /* END node in program */
 142     I32         utf8;           /* whether the pattern is utf8 or not */
 143     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 144                                 /* XXX use this for future optimisation of case
 145                                  * where pattern must be upgraded to utf8. */
 146     I32         uni_semantics;  /* If a d charset modifier should use unicode
 147                                    rules, even if the pattern is not in
 148                                    utf8 */
 149     HV          *paren_names;           /* Paren names */
 150
 151     regnode     **recurse;              /* Recurse regops */
 152     I32         recurse_count;          /* Number of recurse regops */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         override_recoding;
 156     I32         in_multi_char_class;
 157     struct reg_code_block *code_blocks; /* positions of literal (?{})
 158                                             within pattern */
 159     int         num_code_blocks;        /* size of code_blocks[] */
 160     int         code_index;             /* next code_blocks[] slot */
 161 #if ADD_TO_REGEXEC
 162     char        *starttry;              /* -Dr: where regtry was called. */
 163 #define RExC_starttry   (pRExC_state->starttry)
 164 #endif
 165     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 166 #ifdef DEBUGGING
 167     const char  *lastparse;
 168     I32         lastnum;
 169     AV          *paren_name_list;       /* idx -> name */
 170 #define RExC_lastparse  (pRExC_state->lastparse)
 171 #define RExC_lastnum    (pRExC_state->lastnum)
 172 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 173 #endif
 174 } RExC_state_t;
 175
 176 #define RExC_flags      (pRExC_state->flags)
 177 #define RExC_pm_flags   (pRExC_state->pm_flags)
 178 #define RExC_precomp    (pRExC_state->precomp)
 179 #define RExC_rx_sv      (pRExC_state->rx_sv)
 180 #define RExC_rx         (pRExC_state->rx)
 181 #define RExC_rxi        (pRExC_state->rxi)
 182 #define RExC_start      (pRExC_state->start)
 183 #define RExC_end        (pRExC_state->end)
 184 #define RExC_parse      (pRExC_state->parse)
 185 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 186 #ifdef RE_TRACK_PATTERN_OFFSETS
 187 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 188 #endif
 189 #define RExC_emit       (pRExC_state->emit)
 190 #define RExC_emit_start (pRExC_state->emit_start)
 191 #define RExC_emit_bound (pRExC_state->emit_bound)
 192 #define RExC_naughty    (pRExC_state->naughty)
 193 #define RExC_sawback    (pRExC_state->sawback)
 194 #define RExC_seen       (pRExC_state->seen)
 195 #define RExC_size       (pRExC_state->size)
 196 #define RExC_npar       (pRExC_state->npar)
 197 #define RExC_nestroot   (pRExC_state->nestroot)
 198 #define RExC_extralen   (pRExC_state->extralen)
 199 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 200 #define RExC_utf8       (pRExC_state->utf8)
 201 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 202 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 203 #define RExC_open_parens        (pRExC_state->open_parens)
 204 #define RExC_close_parens       (pRExC_state->close_parens)
 205 #define RExC_opend      (pRExC_state->opend)
 206 #define RExC_paren_names        (pRExC_state->paren_names)
 207 #define RExC_recurse    (pRExC_state->recurse)
 208 #define RExC_recurse_count      (pRExC_state->recurse_count)
 209 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 210 #define RExC_contains_locale    (pRExC_state->contains_locale)
 211 #define RExC_override_recoding (pRExC_state->override_recoding)
 212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 213
 214
 215 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 216 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 217         ((*s) == '{' && regcurly(s, FALSE)))
 218
 219 #ifdef SPSTART
 220 #undef SPSTART          /* dratted cpp namespace... */
 221 #endif
 222 /*
 223  * Flags to be passed up and down.
 224  */
 225 #define WORST           0       /* Worst case. */
 226 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 227
 228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 229  * character.  (There needs to be a case: in the switch statement in regexec.c
 230  * for any node marked SIMPLE.)  Note that this is not the same thing as
 231  * REGNODE_SIMPLE */
 232 #define SIMPLE          0x02
 233 #define SPSTART         0x04    /* Starts with * or + */
 234 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 235 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 236
 237 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 238
 239 /* whether trie related optimizations are enabled */
 240 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 241 #define TRIE_STUDY_OPT
 242 #define FULL_TRIE_STUDY
 243 #define TRIE_STCLASS
 244 #endif
 245
 246
 247
 248 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 249 #define PBITVAL(paren) (1 << ((paren) & 7))
 250 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 251 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 252 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 253
 254 /* If not already in utf8, do a longjmp back to the beginning */
 255 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 256 #define REQUIRE_UTF8    STMT_START {                                       \
 257                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 258                         } STMT_END
 259
 260 /* This converts the named class defined in regcomp.h to its equivalent class
 261  * number defined in handy.h. */
 262 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 263 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 264
 265 /* About scan_data_t.
 266
 267   During optimisation we recurse through the regexp program performing
 268   various inplace (keyhole style) optimisations. In addition study_chunk
 269   and scan_commit populate this data structure with information about
 270   what strings MUST appear in the pattern. We look for the longest
 271   string that must appear at a fixed location, and we look for the
 272   longest string that may appear at a floating location. So for instance
 273   in the pattern:
 274
 275     /FOO[xX]A.*B[xX]BAR/
 276
 277   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 278   strings (because they follow a .* construct). study_chunk will identify
 279   both FOO and BAR as being the longest fixed and floating strings respectively.
 280
 281   The strings can be composites, for instance
 282
 283      /(f)(o)(o)/
 284
 285   will result in a composite fixed substring 'foo'.
 286
 287   For each string some basic information is maintained:
 288
 289   - offset or min_offset
 290     This is the position the string must appear at, or not before.
 291     It also implicitly (when combined with minlenp) tells us how many
 292     characters must match before the string we are searching for.
 293     Likewise when combined with minlenp and the length of the string it
 294     tells us how many characters must appear after the string we have
 295     found.
 296
 297   - max_offset
 298     Only used for floating strings. This is the rightmost point that
 299     the string can appear at. If set to I32 max it indicates that the
 300     string can occur infinitely far to the right.
 301
 302   - minlenp
 303     A pointer to the minimum number of characters of the pattern that the
 304     string was found inside. This is important as in the case of positive
 305     lookahead or positive lookbehind we can have multiple patterns
 306     involved. Consider
 307
 308     /(?=FOO).*F/
 309
 310     The minimum length of the pattern overall is 3, the minimum length
 311     of the lookahead part is 3, but the minimum length of the part that
 312     will actually match is 1. So 'FOO's minimum length is 3, but the
 313     minimum length for the F is 1. This is important as the minimum length
 314     is used to determine offsets in front of and behind the string being
 315     looked for.  Since strings can be composites this is the length of the
 316     pattern at the time it was committed with a scan_commit. Note that
 317     the length is calculated by study_chunk, so that the minimum lengths
 318     are not known until the full pattern has been compiled, thus the
 319     pointer to the value.
 320
 321   - lookbehind
 322
 323     In the case of lookbehind the string being searched for can be
 324     offset past the start point of the final matching string.
 325     If this value was just blithely removed from the min_offset it would
 326     invalidate some of the calculations for how many chars must match
 327     before or after (as they are derived from min_offset and minlen and
 328     the length of the string being searched for).
 329     When the final pattern is compiled and the data is moved from the
 330     scan_data_t structure into the regexp structure the information
 331     about lookbehind is factored in, with the information that would
 332     have been lost precalculated in the end_shift field for the
 333     associated string.
 334
 335   The fields pos_min and pos_delta are used to store the minimum offset
 336   and the delta to the maximum offset at the current point in the pattern.
 337
 338 */
 339
 340 typedef struct scan_data_t {
 341     /*I32 len_min;      unused */
 342     /*I32 len_delta;    unused */
 343     I32 pos_min;
 344     I32 pos_delta;
 345     SV *last_found;
 346     I32 last_end;           /* min value, <0 unless valid. */
 347     I32 last_start_min;
 348     I32 last_start_max;
 349     SV **longest;           /* Either &l_fixed, or &l_float. */
 350     SV *longest_fixed;      /* longest fixed string found in pattern */
 351     I32 offset_fixed;       /* offset where it starts */
 352     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 353     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 354     SV *longest_float;      /* longest floating string found in pattern */
 355     I32 offset_float_min;   /* earliest point in string it can appear */
 356     I32 offset_float_max;   /* latest point in string it can appear */
 357     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 358     I32 lookbehind_float;   /* is the position of the string modified by LB */
 359     I32 flags;
 360     I32 whilem_c;
 361     I32 *last_closep;
 362     struct regnode_charclass_class *start_class;
 363 } scan_data_t;
 364
 365 /*
 366  * Forward declarations for pregcomp()'s friends.
 367  */
 368
 369 static const scan_data_t zero_scan_data =
 370   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 371
 372 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 373 #define SF_BEFORE_SEOL          0x0001
 374 #define SF_BEFORE_MEOL          0x0002
 375 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 376 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 377
 378 #ifdef NO_UNARY_PLUS
 379 #  define SF_FIX_SHIFT_EOL      (0+2)
 380 #  define SF_FL_SHIFT_EOL               (0+4)
 381 #else
 382 #  define SF_FIX_SHIFT_EOL      (+2)
 383 #  define SF_FL_SHIFT_EOL               (+4)
 384 #endif
 385
 386 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 387 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 388
 389 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 390 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 391 #define SF_IS_INF               0x0040
 392 #define SF_HAS_PAR              0x0080
 393 #define SF_IN_PAR               0x0100
 394 #define SF_HAS_EVAL             0x0200
 395 #define SCF_DO_SUBSTR           0x0400
 396 #define SCF_DO_STCLASS_AND      0x0800
 397 #define SCF_DO_STCLASS_OR       0x1000
 398 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 399 #define SCF_WHILEM_VISITED_POS  0x2000
 400
 401 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 402 #define SCF_SEEN_ACCEPT         0x8000
 403
 404 #define UTF cBOOL(RExC_utf8)
 405
 406 /* The enums for all these are ordered so things work out correctly */
 407 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 408 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 409 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 410 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 411 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 412 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 413 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 414
 415 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 416
 417 #define OOB_NAMEDCLASS          -1
 418
 419 /* There is no code point that is out-of-bounds, so this is problematic.  But
 420  * its only current use is to initialize a variable that is always set before
 421  * looked at. */
 422 #define OOB_UNICODE             0xDEADBEEF
 423
 424 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 425 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 426
 427
 428 /* length of regex to show in messages that don't mark a position within */
 429 #define RegexLengthToShowInErrorMessages 127
 430
 431 /*
 432  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 433  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 434  * op/pragma/warn/regcomp.
 435  */
 436 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 437 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 438
 439 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 440
 441 /*
 442  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 443  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 444  * "...".
 445  */
 446 #define _FAIL(code) STMT_START {                                        \
 447     const char *ellipses = "";                                          \
 448     IV len = RExC_end - RExC_precomp;                                   \
 449                                                                         \
 450     if (!SIZE_ONLY)                                                     \
 451         SAVEFREESV(RExC_rx_sv);                                         \
 452     if (len > RegexLengthToShowInErrorMessages) {                       \
 453         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 454         len = RegexLengthToShowInErrorMessages - 10;                    \
 455         ellipses = "...";                                               \
 456     }                                                                   \
 457     code;                                                               \
 458 } STMT_END
 459
 460 #define FAIL(msg) _FAIL(                            \
 461     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 462             msg, (int)len, RExC_precomp, ellipses))
 463
 464 #define FAIL2(msg,arg) _FAIL(                       \
 465     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 466             arg, (int)len, RExC_precomp, ellipses))
 467
 468 /*
 469  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 470  */
 471 #define Simple_vFAIL(m) STMT_START {                                    \
 472     const IV offset = RExC_parse - RExC_precomp;                        \
 473     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 474             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 475 } STMT_END
 476
 477 /*
 478  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 479  */
 480 #define vFAIL(m) STMT_START {                           \
 481     if (!SIZE_ONLY)                                     \
 482         SAVEFREESV(RExC_rx_sv);                         \
 483     Simple_vFAIL(m);                                    \
 484 } STMT_END
 485
 486 /*
 487  * Like Simple_vFAIL(), but accepts two arguments.
 488  */
 489 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 490     const IV offset = RExC_parse - RExC_precomp;                        \
 491     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 492             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 493 } STMT_END
 494
 495 /*
 496  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 497  */
 498 #define vFAIL2(m,a1) STMT_START {                       \
 499     if (!SIZE_ONLY)                                     \
 500         SAVEFREESV(RExC_rx_sv);                         \
 501     Simple_vFAIL2(m, a1);                               \
 502 } STMT_END
 503
 504
 505 /*
 506  * Like Simple_vFAIL(), but accepts three arguments.
 507  */
 508 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 509     const IV offset = RExC_parse - RExC_precomp;                \
 510     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 511             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 512 } STMT_END
 513
 514 /*
 515  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 516  */
 517 #define vFAIL3(m,a1,a2) STMT_START {                    \
 518     if (!SIZE_ONLY)                                     \
 519         SAVEFREESV(RExC_rx_sv);                         \
 520     Simple_vFAIL3(m, a1, a2);                           \
 521 } STMT_END
 522
 523 /*
 524  * Like Simple_vFAIL(), but accepts four arguments.
 525  */
 526 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 527     const IV offset = RExC_parse - RExC_precomp;                \
 528     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 529             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 530 } STMT_END
 531
 532 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 533     if (!SIZE_ONLY)                                     \
 534         SAVEFREESV(RExC_rx_sv);                         \
 535     Simple_vFAIL4(m, a1, a2, a3);                       \
 536 } STMT_END
 537
 538 /* m is not necessarily a "literal string", in this macro */
 539 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 540     const IV offset = loc - RExC_precomp;                               \
 541     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 542             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 543 } STMT_END
 544
 545 #define ckWARNreg(loc,m) STMT_START {                                   \
 546     const IV offset = loc - RExC_precomp;                               \
 547     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 548             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 549 } STMT_END
 550
 551 #define ckWARNregdep(loc,m) STMT_START {                                \
 552     const IV offset = loc - RExC_precomp;                               \
 553     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 554             m REPORT_LOCATION,                                          \
 555             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 556 } STMT_END
 557
 558 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 559     const IV offset = loc - RExC_precomp;                               \
 560     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 561             m REPORT_LOCATION,                                          \
 562             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 563 } STMT_END
 564
 565 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 566     const IV offset = loc - RExC_precomp;                               \
 567     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 568             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 569 } STMT_END
 570
 571 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 572     const IV offset = loc - RExC_precomp;                               \
 573     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 574             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 575 } STMT_END
 576
 577 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 578     const IV offset = loc - RExC_precomp;                               \
 579     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 580             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 581 } STMT_END
 582
 583 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 584     const IV offset = loc - RExC_precomp;                               \
 585     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 586             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 587 } STMT_END
 588
 589 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 590     const IV offset = loc - RExC_precomp;                               \
 591     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 592             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 593 } STMT_END
 594
 595 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 596     const IV offset = loc - RExC_precomp;                               \
 597     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 598             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 599 } STMT_END
 600
 601
 602 /* Allow for side effects in s */
 603 #define REGC(c,s) STMT_START {                  \
 604     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 605 } STMT_END
 606
 607 /* Macros for recording node offsets.   20001227 mjd@plover.com
 608  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 609  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 610  * Element 0 holds the number n.
 611  * Position is 1 indexed.
 612  */
 613 #ifndef RE_TRACK_PATTERN_OFFSETS
 614 #define Set_Node_Offset_To_R(node,byte)
 615 #define Set_Node_Offset(node,byte)
 616 #define Set_Cur_Node_Offset
 617 #define Set_Node_Length_To_R(node,len)
 618 #define Set_Node_Length(node,len)
 619 #define Set_Node_Cur_Length(node)
 620 #define Node_Offset(n)
 621 #define Node_Length(n)
 622 #define Set_Node_Offset_Length(node,offset,len)
 623 #define ProgLen(ri) ri->u.proglen
 624 #define SetProgLen(ri,x) ri->u.proglen = x
 625 #else
 626 #define ProgLen(ri) ri->u.offsets[0]
 627 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 628 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 629     if (! SIZE_ONLY) {                                                  \
 630         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 631                     __LINE__, (int)(node), (int)(byte)));               \
 632         if((node) < 0) {                                                \
 633             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 634         } else {                                                        \
 635             RExC_offsets[2*(node)-1] = (byte);                          \
 636         }                                                               \
 637     }                                                                   \
 638 } STMT_END
 639
 640 #define Set_Node_Offset(node,byte) \
 641     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 642 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 643
 644 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 645     if (! SIZE_ONLY) {                                                  \
 646         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 647                 __LINE__, (int)(node), (int)(len)));                    \
 648         if((node) < 0) {                                                \
 649             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 650         } else {                                                        \
 651             RExC_offsets[2*(node)] = (len);                             \
 652         }                                                               \
 653     }                                                                   \
 654 } STMT_END
 655
 656 #define Set_Node_Length(node,len) \
 657     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 658 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 659 #define Set_Node_Cur_Length(node) \
 660     Set_Node_Length(node, RExC_parse - parse_start)
 661
 662 /* Get offsets and lengths */
 663 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 664 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 665
 666 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 667     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 668     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 669 } STMT_END
 670 #endif
 671
 672 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 673 #define EXPERIMENTAL_INPLACESCAN
 674 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 675
 676 #define DEBUG_STUDYDATA(str,data,depth)                              \
 677 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 678     PerlIO_printf(Perl_debug_log,                                    \
 679         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 680         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 681         (int)(depth)*2, "",                                          \
 682         (IV)((data)->pos_min),                                       \
 683         (IV)((data)->pos_delta),                                     \
 684         (UV)((data)->flags),                                         \
 685         (IV)((data)->whilem_c),                                      \
 686         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 687         is_inf ? "INF " : ""                                         \
 688     );                                                               \
 689     if ((data)->last_found)                                          \
 690         PerlIO_printf(Perl_debug_log,                                \
 691             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 692             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 693             SvPVX_const((data)->last_found),                         \
 694             (IV)((data)->last_end),                                  \
 695             (IV)((data)->last_start_min),                            \
 696             (IV)((data)->last_start_max),                            \
 697             ((data)->longest &&                                      \
 698              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 699             SvPVX_const((data)->longest_fixed),                      \
 700             (IV)((data)->offset_fixed),                              \
 701             ((data)->longest &&                                      \
 702              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 703             SvPVX_const((data)->longest_float),                      \
 704             (IV)((data)->offset_float_min),                          \
 705             (IV)((data)->offset_float_max)                           \
 706         );                                                           \
 707     PerlIO_printf(Perl_debug_log,"\n");                              \
 708 });
 709
 710 /* Mark that we cannot extend a found fixed substring at this point.
 711    Update the longest found anchored substring and the longest found
 712    floating substrings if needed. */
 713
 714 STATIC void
 715 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 716 {
 717     const STRLEN l = CHR_SVLEN(data->last_found);
 718     const STRLEN old_l = CHR_SVLEN(*data->longest);
 719     GET_RE_DEBUG_FLAGS_DECL;
 720
 721     PERL_ARGS_ASSERT_SCAN_COMMIT;
 722
 723     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 724         SvSetMagicSV(*data->longest, data->last_found);
 725         if (*data->longest == data->longest_fixed) {
 726             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 727             if (data->flags & SF_BEFORE_EOL)
 728                 data->flags
 729                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 730             else
 731                 data->flags &= ~SF_FIX_BEFORE_EOL;
 732             data->minlen_fixed=minlenp;
 733             data->lookbehind_fixed=0;
 734         }
 735         else { /* *data->longest == data->longest_float */
 736             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 737             data->offset_float_max = (l
 738                                       ? data->last_start_max
 739                                       : data->pos_min + data->pos_delta);
 740             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 741                 data->offset_float_max = I32_MAX;
 742             if (data->flags & SF_BEFORE_EOL)
 743                 data->flags
 744                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 745             else
 746                 data->flags &= ~SF_FL_BEFORE_EOL;
 747             data->minlen_float=minlenp;
 748             data->lookbehind_float=0;
 749         }
 750     }
 751     SvCUR_set(data->last_found, 0);
 752     {
 753         SV * const sv = data->last_found;
 754         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 755             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 756             if (mg)
 757                 mg->mg_len = 0;
 758         }
 759     }
 760     data->last_end = -1;
 761     data->flags &= ~SF_BEFORE_EOL;
 762     DEBUG_STUDYDATA("commit: ",data,0);
 763 }
 764
 765 /* These macros set, clear and test whether the synthetic start class ('ssc',
 766  * given by the parameter) matches an empty string (EOS).  This uses the
 767  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 768  * stands alone, so there is never a next_off, so this field is otherwise
 769  * unused.  The EOS information is used only for compilation, but theoretically
 770  * it could be passed on to the execution code.  This could be used to store
 771  * more than one bit of information, but only this one is currently used. */
 772 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 773 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 774 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 775
 776 /* Can match anything (initialization) */
 777 STATIC void
 778 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 779 {
 780     PERL_ARGS_ASSERT_CL_ANYTHING;
 781
 782     ANYOF_BITMAP_SETALL(cl);
 783     cl->flags = ANYOF_UNICODE_ALL;
 784     SET_SSC_EOS(cl);
 785
 786     /* If any portion of the regex is to operate under locale rules,
 787      * initialization includes it.  The reason this isn't done for all regexes
 788      * is that the optimizer was written under the assumption that locale was
 789      * all-or-nothing.  Given the complexity and lack of documentation in the
 790      * optimizer, and that there are inadequate test cases for locale, so many
 791      * parts of it may not work properly, it is safest to avoid locale unless
 792      * necessary. */
 793     if (RExC_contains_locale) {
 794         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 795         cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 796     }
 797     else {
 798         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 799     }
 800 }
 801
 802 /* Can match anything (initialization) */
 803 STATIC int
 804 S_cl_is_anything(const struct regnode_charclass_class *cl)
 805 {
 806     int value;
 807
 808     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 809
 810     for (value = 0; value < ANYOF_MAX; value += 2)
 811         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 812             return 1;
 813     if (!(cl->flags & ANYOF_UNICODE_ALL))
 814         return 0;
 815     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 816         return 0;
 817     return 1;
 818 }
 819
 820 /* Can match anything (initialization) */
 821 STATIC void
 822 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 823 {
 824     PERL_ARGS_ASSERT_CL_INIT;
 825
 826     Zero(cl, 1, struct regnode_charclass_class);
 827     cl->type = ANYOF;
 828     cl_anything(pRExC_state, cl);
 829     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 830 }
 831
 832 /* These two functions currently do the exact same thing */
 833 #define cl_init_zero            S_cl_init
 834
 835 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 836  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 837  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 838 STATIC void
 839 S_cl_and(struct regnode_charclass_class *cl,
 840         const struct regnode_charclass_class *and_with)
 841 {
 842     PERL_ARGS_ASSERT_CL_AND;
 843
 844     assert(PL_regkind[and_with->type] == ANYOF);
 845
 846     /* I (khw) am not sure all these restrictions are necessary XXX */
 847     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 848         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 849         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 850         && !(and_with->flags & ANYOF_LOC_FOLD)
 851         && !(cl->flags & ANYOF_LOC_FOLD)) {
 852         int i;
 853
 854         if (and_with->flags & ANYOF_INVERT)
 855             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 856                 cl->bitmap[i] &= ~and_with->bitmap[i];
 857         else
 858             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 859                 cl->bitmap[i] &= and_with->bitmap[i];
 860     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 861
 862     if (and_with->flags & ANYOF_INVERT) {
 863
 864         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 865          * aren't affected by the inversion.  Those that are affected are
 866          * handled individually below */
 867         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 868         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 869         cl->flags |= affected_flags;
 870
 871         /* We currently don't know how to deal with things that aren't in the
 872          * bitmap, but we know that the intersection is no greater than what
 873          * is already in cl, so let there be false positives that get sorted
 874          * out after the synthetic start class succeeds, and the node is
 875          * matched for real. */
 876
 877         /* The inversion of these two flags indicate that the resulting
 878          * intersection doesn't have them */
 879         if (and_with->flags & ANYOF_UNICODE_ALL) {
 880             cl->flags &= ~ANYOF_UNICODE_ALL;
 881         }
 882         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 883             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 884         }
 885     }
 886     else {   /* and'd node is not inverted */
 887         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 888
 889         if (! ANYOF_NONBITMAP(and_with)) {
 890
 891             /* Here 'and_with' doesn't match anything outside the bitmap
 892              * (except possibly ANYOF_UNICODE_ALL), which means the
 893              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 894              * which case we don't know what the intersection is, but it's no
 895              * greater than what cl already has, so can just leave it alone,
 896              * with possible false positives */
 897             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 898                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 899                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 900             }
 901         }
 902         else if (! ANYOF_NONBITMAP(cl)) {
 903
 904             /* Here, 'and_with' does match something outside the bitmap, and cl
 905              * doesn't have a list of things to match outside the bitmap.  If
 906              * cl can match all code points above 255, the intersection will
 907              * be those above-255 code points that 'and_with' matches.  If cl
 908              * can't match all Unicode code points, it means that it can't
 909              * match anything outside the bitmap (since the 'if' that got us
 910              * into this block tested for that), so we leave the bitmap empty.
 911              */
 912             if (cl->flags & ANYOF_UNICODE_ALL) {
 913                 ARG_SET(cl, ARG(and_with));
 914
 915                 /* and_with's ARG may match things that don't require UTF8.
 916                  * And now cl's will too, in spite of this being an 'and'.  See
 917                  * the comments below about the kludge */
 918                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 919             }
 920         }
 921         else {
 922             /* Here, both 'and_with' and cl match something outside the
 923              * bitmap.  Currently we do not do the intersection, so just match
 924              * whatever cl had at the beginning.  */
 925         }
 926
 927
 928         /* Take the intersection of the two sets of flags.  However, the
 929          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 930          * kludge around the fact that this flag is not treated like the others
 931          * which are initialized in cl_anything().  The way the optimizer works
 932          * is that the synthetic start class (SSC) is initialized to match
 933          * anything, and then the first time a real node is encountered, its
 934          * values are AND'd with the SSC's with the result being the values of
 935          * the real node.  However, there are paths through the optimizer where
 936          * the AND never gets called, so those initialized bits are set
 937          * inappropriately, which is not usually a big deal, as they just cause
 938          * false positives in the SSC, which will just mean a probably
 939          * imperceptible slow down in execution.  However this bit has a
 940          * higher false positive consequence in that it can cause utf8.pm,
 941          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 942          * bigger slowdown and also causes significant extra memory to be used.
 943          * In order to prevent this, the code now takes a different tack.  The
 944          * bit isn't set unless some part of the regular expression needs it,
 945          * but once set it won't get cleared.  This means that these extra
 946          * modules won't get loaded unless there was some path through the
 947          * pattern that would have required them anyway, and  so any false
 948          * positives that occur by not ANDing them out when they could be
 949          * aren't as severe as they would be if we treated this bit like all
 950          * the others */
 951         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 952                                       & ANYOF_NONBITMAP_NON_UTF8;
 953         cl->flags &= and_with->flags;
 954         cl->flags |= outside_bitmap_but_not_utf8;
 955     }
 956 }
 957
 958 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 959  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 960  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 961 STATIC void
 962 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 963 {
 964     PERL_ARGS_ASSERT_CL_OR;
 965
 966     if (or_with->flags & ANYOF_INVERT) {
 967
 968         /* Here, the or'd node is to be inverted.  This means we take the
 969          * complement of everything not in the bitmap, but currently we don't
 970          * know what that is, so give up and match anything */
 971         if (ANYOF_NONBITMAP(or_with)) {
 972             cl_anything(pRExC_state, cl);
 973         }
 974         /* We do not use
 975          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 976          *   <= (B1 | !B2) | (CL1 | !CL2)
 977          * which is wasteful if CL2 is small, but we ignore CL2:
 978          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 979          * XXXX Can we handle case-fold?  Unclear:
 980          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 981          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 982          */
 983         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 984              && !(or_with->flags & ANYOF_LOC_FOLD)
 985              && !(cl->flags & ANYOF_LOC_FOLD) ) {
 986             int i;
 987
 988             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 989                 cl->bitmap[i] |= ~or_with->bitmap[i];
 990         } /* XXXX: logic is complicated otherwise */
 991         else {
 992             cl_anything(pRExC_state, cl);
 993         }
 994
 995         /* And, we can just take the union of the flags that aren't affected
 996          * by the inversion */
 997         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 998
 999         /* For the remaining flags:
1000             ANYOF_UNICODE_ALL and inverted means to not match anything above
1001                     255, which means that the union with cl should just be
1002                     what cl has in it, so can ignore this flag
1003             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1004                     is 127-255 to match them, but then invert that, so the
1005                     union with cl should just be what cl has in it, so can
1006                     ignore this flag
1007          */
1008     } else {    /* 'or_with' is not inverted */
1009         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1010         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1011              && (!(or_with->flags & ANYOF_LOC_FOLD)
1012                  || (cl->flags & ANYOF_LOC_FOLD)) ) {
1013             int i;
1014
1015             /* OR char bitmap and class bitmap separately */
1016             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1017                 cl->bitmap[i] |= or_with->bitmap[i];
1018             ANYOF_CLASS_OR(or_with, cl);
1019         }
1020         else { /* XXXX: logic is complicated, leave it along for a moment. */
1021             cl_anything(pRExC_state, cl);
1022         }
1023
1024         if (ANYOF_NONBITMAP(or_with)) {
1025
1026             /* Use the added node's outside-the-bit-map match if there isn't a
1027              * conflict.  If there is a conflict (both nodes match something
1028              * outside the bitmap, but what they match outside is not the same
1029              * pointer, and hence not easily compared until XXX we extend
1030              * inversion lists this far), give up and allow the start class to
1031              * match everything outside the bitmap.  If that stuff is all above
1032              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1033             if (! ANYOF_NONBITMAP(cl)) {
1034                 ARG_SET(cl, ARG(or_with));
1035             }
1036             else if (ARG(cl) != ARG(or_with)) {
1037
1038                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1039                     cl_anything(pRExC_state, cl);
1040                 }
1041                 else {
1042                     cl->flags |= ANYOF_UNICODE_ALL;
1043                 }
1044             }
1045         }
1046
1047         /* Take the union */
1048         cl->flags |= or_with->flags;
1049     }
1050 }
1051
1052 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1053 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1054 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1055 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1056
1057
1058 #ifdef DEBUGGING
1059 /*
1060    dump_trie(trie,widecharmap,revcharmap)
1061    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1062    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1063
1064    These routines dump out a trie in a somewhat readable format.
1065    The _interim_ variants are used for debugging the interim
1066    tables that are used to generate the final compressed
1067    representation which is what dump_trie expects.
1068
1069    Part of the reason for their existence is to provide a form
1070    of documentation as to how the different representations function.
1071
1072 */
1073
1074 /*
1075   Dumps the final compressed table form of the trie to Perl_debug_log.
1076   Used for debugging make_trie().
1077 */
1078
1079 STATIC void
1080 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1081             AV *revcharmap, U32 depth)
1082 {
1083     U32 state;
1084     SV *sv=sv_newmortal();
1085     int colwidth= widecharmap ? 6 : 4;
1086     U16 word;
1087     GET_RE_DEBUG_FLAGS_DECL;
1088
1089     PERL_ARGS_ASSERT_DUMP_TRIE;
1090
1091     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1092         (int)depth * 2 + 2,"",
1093         "Match","Base","Ofs" );
1094
1095     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1096         SV ** const tmp = av_fetch( revcharmap, state, 0);
1097         if ( tmp ) {
1098             PerlIO_printf( Perl_debug_log, "%*s",
1099                 colwidth,
1100                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1101                             PL_colors[0], PL_colors[1],
1102                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1103                             PERL_PV_ESCAPE_FIRSTCHAR
1104                 )
1105             );
1106         }
1107     }
1108     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1109         (int)depth * 2 + 2,"");
1110
1111     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1112         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1113     PerlIO_printf( Perl_debug_log, "\n");
1114
1115     for( state = 1 ; state < trie->statecount ; state++ ) {
1116         const U32 base = trie->states[ state ].trans.base;
1117
1118         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1119
1120         if ( trie->states[ state ].wordnum ) {
1121             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1122         } else {
1123             PerlIO_printf( Perl_debug_log, "%6s", "" );
1124         }
1125
1126         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1127
1128         if ( base ) {
1129             U32 ofs = 0;
1130
1131             while( ( base + ofs  < trie->uniquecharcount ) ||
1132                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1133                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1134                     ofs++;
1135
1136             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1137
1138             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1139                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1140                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1141                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1142                 {
1143                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1144                     colwidth,
1145                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1146                 } else {
1147                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1148                 }
1149             }
1150
1151             PerlIO_printf( Perl_debug_log, "]");
1152
1153         }
1154         PerlIO_printf( Perl_debug_log, "\n" );
1155     }
1156     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1157     for (word=1; word <= trie->wordcount; word++) {
1158         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1159             (int)word, (int)(trie->wordinfo[word].prev),
1160             (int)(trie->wordinfo[word].len));
1161     }
1162     PerlIO_printf(Perl_debug_log, "\n" );
1163 }
1164 /*
1165   Dumps a fully constructed but uncompressed trie in list form.
1166   List tries normally only are used for construction when the number of
1167   possible chars (trie->uniquecharcount) is very high.
1168   Used for debugging make_trie().
1169 */
1170 STATIC void
1171 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1172                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1173                          U32 depth)
1174 {
1175     U32 state;
1176     SV *sv=sv_newmortal();
1177     int colwidth= widecharmap ? 6 : 4;
1178     GET_RE_DEBUG_FLAGS_DECL;
1179
1180     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1181
1182     /* print out the table precompression.  */
1183     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1184         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1185         "------:-----+-----------------\n" );
1186
1187     for( state=1 ; state < next_alloc ; state ++ ) {
1188         U16 charid;
1189
1190         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1191             (int)depth * 2 + 2,"", (UV)state  );
1192         if ( ! trie->states[ state ].wordnum ) {
1193             PerlIO_printf( Perl_debug_log, "%5s| ","");
1194         } else {
1195             PerlIO_printf( Perl_debug_log, "W%4x| ",
1196                 trie->states[ state ].wordnum
1197             );
1198         }
1199         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1200             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1201             if ( tmp ) {
1202                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1203                     colwidth,
1204                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1205                             PL_colors[0], PL_colors[1],
1206                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1207                             PERL_PV_ESCAPE_FIRSTCHAR
1208                     ) ,
1209                     TRIE_LIST_ITEM(state,charid).forid,
1210                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1211                 );
1212                 if (!(charid % 10))
1213                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1214                         (int)((depth * 2) + 14), "");
1215             }
1216         }
1217         PerlIO_printf( Perl_debug_log, "\n");
1218     }
1219 }
1220
1221 /*
1222   Dumps a fully constructed but uncompressed trie in table form.
1223   This is the normal DFA style state transition table, with a few
1224   twists to facilitate compression later.
1225   Used for debugging make_trie().
1226 */
1227 STATIC void
1228 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1229                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1230                           U32 depth)
1231 {
1232     U32 state;
1233     U16 charid;
1234     SV *sv=sv_newmortal();
1235     int colwidth= widecharmap ? 6 : 4;
1236     GET_RE_DEBUG_FLAGS_DECL;
1237
1238     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1239
1240     /*
1241        print out the table precompression so that we can do a visual check
1242        that they are identical.
1243      */
1244
1245     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1246
1247     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1248         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1249         if ( tmp ) {
1250             PerlIO_printf( Perl_debug_log, "%*s",
1251                 colwidth,
1252                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1253                             PL_colors[0], PL_colors[1],
1254                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1255                             PERL_PV_ESCAPE_FIRSTCHAR
1256                 )
1257             );
1258         }
1259     }
1260
1261     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1262
1263     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1264         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1265     }
1266
1267     PerlIO_printf( Perl_debug_log, "\n" );
1268
1269     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1270
1271         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1272             (int)depth * 2 + 2,"",
1273             (UV)TRIE_NODENUM( state ) );
1274
1275         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1276             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1277             if (v)
1278                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1279             else
1280                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1281         }
1282         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1283             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1284         } else {
1285             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1286             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1287         }
1288     }
1289 }
1290
1291 #endif
1292
1293
1294 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1295   startbranch: the first branch in the whole branch sequence
1296   first      : start branch of sequence of branch-exact nodes.
1297                May be the same as startbranch
1298   last       : Thing following the last branch.
1299                May be the same as tail.
1300   tail       : item following the branch sequence
1301   count      : words in the sequence
1302   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1303   depth      : indent depth
1304
1305 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1306
1307 A trie is an N'ary tree where the branches are determined by digital
1308 decomposition of the key. IE, at the root node you look up the 1st character and
1309 follow that branch repeat until you find the end of the branches. Nodes can be
1310 marked as "accepting" meaning they represent a complete word. Eg:
1311
1312   /he|she|his|hers/
1313
1314 would convert into the following structure. Numbers represent states, letters
1315 following numbers represent valid transitions on the letter from that state, if
1316 the number is in square brackets it represents an accepting state, otherwise it
1317 will be in parenthesis.
1318
1319       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1320       |    |
1321       |   (2)
1322       |    |
1323      (1)   +-i->(6)-+-s->[7]
1324       |
1325       +-s->(3)-+-h->(4)-+-e->[5]
1326
1327       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1328
1329 This shows that when matching against the string 'hers' we will begin at state 1
1330 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1331 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1332 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1333 single traverse. We store a mapping from accepting to state to which word was
1334 matched, and then when we have multiple possibilities we try to complete the
1335 rest of the regex in the order in which they occured in the alternation.
1336
1337 The only prior NFA like behaviour that would be changed by the TRIE support is
1338 the silent ignoring of duplicate alternations which are of the form:
1339
1340  / (DUPE|DUPE) X? (?{ ... }) Y /x
1341
1342 Thus EVAL blocks following a trie may be called a different number of times with
1343 and without the optimisation. With the optimisations dupes will be silently
1344 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1345 the following demonstrates:
1346
1347  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1348
1349 which prints out 'word' three times, but
1350
1351  'words'=~/(word|word|word)(?{ print $1 })S/
1352
1353 which doesnt print it out at all. This is due to other optimisations kicking in.
1354
1355 Example of what happens on a structural level:
1356
1357 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1358
1359    1: CURLYM[1] {1,32767}(18)
1360    5:   BRANCH(8)
1361    6:     EXACT <ac>(16)
1362    8:   BRANCH(11)
1363    9:     EXACT <ad>(16)
1364   11:   BRANCH(14)
1365   12:     EXACT <ab>(16)
1366   16:   SUCCEED(0)
1367   17:   NOTHING(18)
1368   18: END(0)
1369
1370 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1371 and should turn into:
1372
1373    1: CURLYM[1] {1,32767}(18)
1374    5:   TRIE(16)
1375         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1376           <ac>
1377           <ad>
1378           <ab>
1379   16:   SUCCEED(0)
1380   17:   NOTHING(18)
1381   18: END(0)
1382
1383 Cases where tail != last would be like /(?foo|bar)baz/:
1384
1385    1: BRANCH(4)
1386    2:   EXACT <foo>(8)
1387    4: BRANCH(7)
1388    5:   EXACT <bar>(8)
1389    7: TAIL(8)
1390    8: EXACT <baz>(10)
1391   10: END(0)
1392
1393 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1394 and would end up looking like:
1395
1396     1: TRIE(8)
1397       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1398         <foo>
1399         <bar>
1400    7: TAIL(8)
1401    8: EXACT <baz>(10)
1402   10: END(0)
1403
1404     d = uvuni_to_utf8_flags(d, uv, 0);
1405
1406 is the recommended Unicode-aware way of saying
1407
1408     *(d++) = uv;
1409 */
1410
1411 #define TRIE_STORE_REVCHAR(val)                                            \
1412     STMT_START {                                                           \
1413         if (UTF) {                                                         \
1414             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1415             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1416             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1417             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1418             SvPOK_on(zlopp);                                               \
1419             SvUTF8_on(zlopp);                                              \
1420             av_push(revcharmap, zlopp);                                    \
1421         } else {                                                           \
1422             char ooooff = (char)val;                                           \
1423             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1424         }                                                                  \
1425         } STMT_END
1426
1427 #define TRIE_READ_CHAR STMT_START {                                                     \
1428     wordlen++;                                                                          \
1429     if ( UTF ) {                                                                        \
1430         /* if it is UTF then it is either already folded, or does not need folding */   \
1431         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1432     }                                                                                   \
1433     else if (folder == PL_fold_latin1) {                                                \
1434         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1435         if ( foldlen > 0 ) {                                                            \
1436            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1437            foldlen -= len;                                                              \
1438            scan += len;                                                                 \
1439            len = 0;                                                                     \
1440         } else {                                                                        \
1441             len = 1;                                                                    \
1442             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1443             skiplen = UNISKIP(uvc);                                                     \
1444             foldlen -= skiplen;                                                         \
1445             scan = foldbuf + skiplen;                                                   \
1446         }                                                                               \
1447     } else {                                                                            \
1448         /* raw data, will be folded later if needed */                                  \
1449         uvc = (U32)*uc;                                                                 \
1450         len = 1;                                                                        \
1451     }                                                                                   \
1452 } STMT_END
1453
1454
1455
1456 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1457     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1458         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1459         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1460     }                                                           \
1461     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1462     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1463     TRIE_LIST_CUR( state )++;                                   \
1464 } STMT_END
1465
1466 #define TRIE_LIST_NEW(state) STMT_START {                       \
1467     Newxz( trie->states[ state ].trans.list,               \
1468         4, reg_trie_trans_le );                                 \
1469      TRIE_LIST_CUR( state ) = 1;                                \
1470      TRIE_LIST_LEN( state ) = 4;                                \
1471 } STMT_END
1472
1473 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1474     U16 dupe= trie->states[ state ].wordnum;                    \
1475     regnode * const noper_next = regnext( noper );              \
1476                                                                 \
1477     DEBUG_r({                                                   \
1478         /* store the word for dumping */                        \
1479         SV* tmp;                                                \
1480         if (OP(noper) != NOTHING)                               \
1481             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1482         else                                                    \
1483             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1484         av_push( trie_words, tmp );                             \
1485     });                                                         \
1486                                                                 \
1487     curword++;                                                  \
1488     trie->wordinfo[curword].prev   = 0;                         \
1489     trie->wordinfo[curword].len    = wordlen;                   \
1490     trie->wordinfo[curword].accept = state;                     \
1491                                                                 \
1492     if ( noper_next < tail ) {                                  \
1493         if (!trie->jump)                                        \
1494             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1495         trie->jump[curword] = (U16)(noper_next - convert);      \
1496         if (!jumper)                                            \
1497             jumper = noper_next;                                \
1498         if (!nextbranch)                                        \
1499             nextbranch= regnext(cur);                           \
1500     }                                                           \
1501                                                                 \
1502     if ( dupe ) {                                               \
1503         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1504         /* chain, so that when the bits of chain are later    */\
1505         /* linked together, the dups appear in the chain      */\
1506         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1507         trie->wordinfo[dupe].prev = curword;                    \
1508     } else {                                                    \
1509         /* we haven't inserted this word yet.                */ \
1510         trie->states[ state ].wordnum = curword;                \
1511     }                                                           \
1512 } STMT_END
1513
1514
1515 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1516      ( ( base + charid >=  ucharcount                                   \
1517          && base + charid < ubound                                      \
1518          && state == trie->trans[ base - ucharcount + charid ].check    \
1519          && trie->trans[ base - ucharcount + charid ].next )            \
1520            ? trie->trans[ base - ucharcount + charid ].next             \
1521            : ( state==1 ? special : 0 )                                 \
1522       )
1523
1524 #define MADE_TRIE       1
1525 #define MADE_JUMP_TRIE  2
1526 #define MADE_EXACT_TRIE 4
1527
1528 STATIC I32
1529 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1530 {
1531     dVAR;
1532     /* first pass, loop through and scan words */
1533     reg_trie_data *trie;
1534     HV *widecharmap = NULL;
1535     AV *revcharmap = newAV();
1536     regnode *cur;
1537     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1538     STRLEN len = 0;
1539     UV uvc = 0;
1540     U16 curword = 0;
1541     U32 next_alloc = 0;
1542     regnode *jumper = NULL;
1543     regnode *nextbranch = NULL;
1544     regnode *convert = NULL;
1545     U32 *prev_states; /* temp array mapping each state to previous one */
1546     /* we just use folder as a flag in utf8 */
1547     const U8 * folder = NULL;
1548
1549 #ifdef DEBUGGING
1550     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1551     AV *trie_words = NULL;
1552     /* along with revcharmap, this only used during construction but both are
1553      * useful during debugging so we store them in the struct when debugging.
1554      */
1555 #else
1556     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1557     STRLEN trie_charcount=0;
1558 #endif
1559     SV *re_trie_maxbuff;
1560     GET_RE_DEBUG_FLAGS_DECL;
1561
1562     PERL_ARGS_ASSERT_MAKE_TRIE;
1563 #ifndef DEBUGGING
1564     PERL_UNUSED_ARG(depth);
1565 #endif
1566
1567     switch (flags) {
1568         case EXACT: break;
1569         case EXACTFA:
1570         case EXACTFU_SS:
1571         case EXACTFU_TRICKYFOLD:
1572         case EXACTFU: folder = PL_fold_latin1; break;
1573         case EXACTF:  folder = PL_fold; break;
1574         case EXACTFL: folder = PL_fold_locale; break;
1575         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1576     }
1577
1578     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1579     trie->refcount = 1;
1580     trie->startstate = 1;
1581     trie->wordcount = word_count;
1582     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1583     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1584     if (flags == EXACT)
1585         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1586     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1587                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1588
1589     DEBUG_r({
1590         trie_words = newAV();
1591     });
1592
1593     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1594     if (!SvIOK(re_trie_maxbuff)) {
1595         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1596     }
1597     DEBUG_TRIE_COMPILE_r({
1598                 PerlIO_printf( Perl_debug_log,
1599                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1600                   (int)depth * 2 + 2, "",
1601                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1602                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1603                   (int)depth);
1604     });
1605
1606    /* Find the node we are going to overwrite */
1607     if ( first == startbranch && OP( last ) != BRANCH ) {
1608         /* whole branch chain */
1609         convert = first;
1610     } else {
1611         /* branch sub-chain */
1612         convert = NEXTOPER( first );
1613     }
1614
1615     /*  -- First loop and Setup --
1616
1617        We first traverse the branches and scan each word to determine if it
1618        contains widechars, and how many unique chars there are, this is
1619        important as we have to build a table with at least as many columns as we
1620        have unique chars.
1621
1622        We use an array of integers to represent the character codes 0..255
1623        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1624        native representation of the character value as the key and IV's for the
1625        coded index.
1626
1627        *TODO* If we keep track of how many times each character is used we can
1628        remap the columns so that the table compression later on is more
1629        efficient in terms of memory by ensuring the most common value is in the
1630        middle and the least common are on the outside.  IMO this would be better
1631        than a most to least common mapping as theres a decent chance the most
1632        common letter will share a node with the least common, meaning the node
1633        will not be compressible. With a middle is most common approach the worst
1634        case is when we have the least common nodes twice.
1635
1636      */
1637
1638     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1639         regnode *noper = NEXTOPER( cur );
1640         const U8 *uc = (U8*)STRING( noper );
1641         const U8 *e  = uc + STR_LEN( noper );
1642         STRLEN foldlen = 0;
1643         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1644         STRLEN skiplen = 0;
1645         const U8 *scan = (U8*)NULL;
1646         U32 wordlen      = 0;         /* required init */
1647         STRLEN chars = 0;
1648         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1649
1650         if (OP(noper) == NOTHING) {
1651             regnode *noper_next= regnext(noper);
1652             if (noper_next != tail && OP(noper_next) == flags) {
1653                 noper = noper_next;
1654                 uc= (U8*)STRING(noper);
1655                 e= uc + STR_LEN(noper);
1656                 trie->minlen= STR_LEN(noper);
1657             } else {
1658                 trie->minlen= 0;
1659                 continue;
1660             }
1661         }
1662
1663         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1664             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1665                                           regardless of encoding */
1666             if (OP( noper ) == EXACTFU_SS) {
1667                 /* false positives are ok, so just set this */
1668                 TRIE_BITMAP_SET(trie,0xDF);
1669             }
1670         }
1671         for ( ; uc < e ; uc += len ) {
1672             TRIE_CHARCOUNT(trie)++;
1673             TRIE_READ_CHAR;
1674             chars++;
1675             if ( uvc < 256 ) {
1676                 if ( folder ) {
1677                     U8 folded= folder[ (U8) uvc ];
1678                     if ( !trie->charmap[ folded ] ) {
1679                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1680                         TRIE_STORE_REVCHAR( folded );
1681                     }
1682                 }
1683                 if ( !trie->charmap[ uvc ] ) {
1684                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1685                     TRIE_STORE_REVCHAR( uvc );
1686                 }
1687                 if ( set_bit ) {
1688                     /* store the codepoint in the bitmap, and its folded
1689                      * equivalent. */
1690                     TRIE_BITMAP_SET(trie, uvc);
1691
1692                     /* store the folded codepoint */
1693                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1694
1695                     if ( !UTF ) {
1696                         /* store first byte of utf8 representation of
1697                            variant codepoints */
1698                         if (! UNI_IS_INVARIANT(uvc)) {
1699                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1700                         }
1701                     }
1702                     set_bit = 0; /* We've done our bit :-) */
1703                 }
1704             } else {
1705                 SV** svpp;
1706                 if ( !widecharmap )
1707                     widecharmap = newHV();
1708
1709                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1710
1711                 if ( !svpp )
1712                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1713
1714                 if ( !SvTRUE( *svpp ) ) {
1715                     sv_setiv( *svpp, ++trie->uniquecharcount );
1716                     TRIE_STORE_REVCHAR(uvc);
1717                 }
1718             }
1719         }
1720         if( cur == first ) {
1721             trie->minlen = chars;
1722             trie->maxlen = chars;
1723         } else if (chars < trie->minlen) {
1724             trie->minlen = chars;
1725         } else if (chars > trie->maxlen) {
1726             trie->maxlen = chars;
1727         }
1728         if (OP( noper ) == EXACTFU_SS) {
1729             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1730             if (trie->minlen > 1)
1731                 trie->minlen= 1;
1732         }
1733         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1734             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1735              *                - We assume that any such sequence might match a 2 byte string */
1736             if (trie->minlen > 2 )
1737                 trie->minlen= 2;
1738         }
1739
1740     } /* end first pass */
1741     DEBUG_TRIE_COMPILE_r(
1742         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1743                 (int)depth * 2 + 2,"",
1744                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1745                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1746                 (int)trie->minlen, (int)trie->maxlen )
1747     );
1748
1749     /*
1750         We now know what we are dealing with in terms of unique chars and
1751         string sizes so we can calculate how much memory a naive
1752         representation using a flat table  will take. If it's over a reasonable
1753         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1754         conservative but potentially much slower representation using an array
1755         of lists.
1756
1757         At the end we convert both representations into the same compressed
1758         form that will be used in regexec.c for matching with. The latter
1759         is a form that cannot be used to construct with but has memory
1760         properties similar to the list form and access properties similar
1761         to the table form making it both suitable for fast searches and
1762         small enough that its feasable to store for the duration of a program.
1763
1764         See the comment in the code where the compressed table is produced
1765         inplace from the flat tabe representation for an explanation of how
1766         the compression works.
1767
1768     */
1769
1770
1771     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1772     prev_states[1] = 0;
1773
1774     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1775         /*
1776             Second Pass -- Array Of Lists Representation
1777
1778             Each state will be represented by a list of charid:state records
1779             (reg_trie_trans_le) the first such element holds the CUR and LEN
1780             points of the allocated array. (See defines above).
1781
1782             We build the initial structure using the lists, and then convert
1783             it into the compressed table form which allows faster lookups
1784             (but cant be modified once converted).
1785         */
1786
1787         STRLEN transcount = 1;
1788
1789         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1790             "%*sCompiling trie using list compiler\n",
1791             (int)depth * 2 + 2, ""));
1792
1793         trie->states = (reg_trie_state *)
1794             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1795                                   sizeof(reg_trie_state) );
1796         TRIE_LIST_NEW(1);
1797         next_alloc = 2;
1798
1799         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1800
1801             regnode *noper   = NEXTOPER( cur );
1802             U8 *uc           = (U8*)STRING( noper );
1803             const U8 *e      = uc + STR_LEN( noper );
1804             U32 state        = 1;         /* required init */
1805             U16 charid       = 0;         /* sanity init */
1806             U8 *scan         = (U8*)NULL; /* sanity init */
1807             STRLEN foldlen   = 0;         /* required init */
1808             U32 wordlen      = 0;         /* required init */
1809             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1810             STRLEN skiplen   = 0;
1811
1812             if (OP(noper) == NOTHING) {
1813                 regnode *noper_next= regnext(noper);
1814                 if (noper_next != tail && OP(noper_next) == flags) {
1815                     noper = noper_next;
1816                     uc= (U8*)STRING(noper);
1817                     e= uc + STR_LEN(noper);
1818                 }
1819             }
1820
1821             if (OP(noper) != NOTHING) {
1822                 for ( ; uc < e ; uc += len ) {
1823
1824                     TRIE_READ_CHAR;
1825
1826                     if ( uvc < 256 ) {
1827                         charid = trie->charmap[ uvc ];
1828                     } else {
1829                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1830                         if ( !svpp ) {
1831                             charid = 0;
1832                         } else {
1833                             charid=(U16)SvIV( *svpp );
1834                         }
1835                     }
1836                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1837                     if ( charid ) {
1838
1839                         U16 check;
1840                         U32 newstate = 0;
1841
1842                         charid--;
1843                         if ( !trie->states[ state ].trans.list ) {
1844                             TRIE_LIST_NEW( state );
1845                         }
1846                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1847                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1848                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1849                                 break;
1850                             }
1851                         }
1852                         if ( ! newstate ) {
1853                             newstate = next_alloc++;
1854                             prev_states[newstate] = state;
1855                             TRIE_LIST_PUSH( state, charid, newstate );
1856                             transcount++;
1857                         }
1858                         state = newstate;
1859                     } else {
1860                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1861                     }
1862                 }
1863             }
1864             TRIE_HANDLE_WORD(state);
1865
1866         } /* end second pass */
1867
1868         /* next alloc is the NEXT state to be allocated */
1869         trie->statecount = next_alloc;
1870         trie->states = (reg_trie_state *)
1871             PerlMemShared_realloc( trie->states,
1872                                    next_alloc
1873                                    * sizeof(reg_trie_state) );
1874
1875         /* and now dump it out before we compress it */
1876         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1877                                                          revcharmap, next_alloc,
1878                                                          depth+1)
1879         );
1880
1881         trie->trans = (reg_trie_trans *)
1882             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1883         {
1884             U32 state;
1885             U32 tp = 0;
1886             U32 zp = 0;
1887
1888
1889             for( state=1 ; state < next_alloc ; state ++ ) {
1890                 U32 base=0;
1891
1892                 /*
1893                 DEBUG_TRIE_COMPILE_MORE_r(
1894                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1895                 );
1896                 */
1897
1898                 if (trie->states[state].trans.list) {
1899                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1900                     U16 maxid=minid;
1901                     U16 idx;
1902
1903                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1904                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1905                         if ( forid < minid ) {
1906                             minid=forid;
1907                         } else if ( forid > maxid ) {
1908                             maxid=forid;
1909                         }
1910                     }
1911                     if ( transcount < tp + maxid - minid + 1) {
1912                         transcount *= 2;
1913                         trie->trans = (reg_trie_trans *)
1914                             PerlMemShared_realloc( trie->trans,
1915                                                      transcount
1916                                                      * sizeof(reg_trie_trans) );
1917                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1918                     }
1919                     base = trie->uniquecharcount + tp - minid;
1920                     if ( maxid == minid ) {
1921                         U32 set = 0;
1922                         for ( ; zp < tp ; zp++ ) {
1923                             if ( ! trie->trans[ zp ].next ) {
1924                                 base = trie->uniquecharcount + zp - minid;
1925                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1926                                 trie->trans[ zp ].check = state;
1927                                 set = 1;
1928                                 break;
1929                             }
1930                         }
1931                         if ( !set ) {
1932                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1933                             trie->trans[ tp ].check = state;
1934                             tp++;
1935                             zp = tp;
1936                         }
1937                     } else {
1938                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1939                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1940                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1941                             trie->trans[ tid ].check = state;
1942                         }
1943                         tp += ( maxid - minid + 1 );
1944                     }
1945                     Safefree(trie->states[ state ].trans.list);
1946                 }
1947                 /*
1948                 DEBUG_TRIE_COMPILE_MORE_r(
1949                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1950                 );
1951                 */
1952                 trie->states[ state ].trans.base=base;
1953             }
1954             trie->lasttrans = tp + 1;
1955         }
1956     } else {
1957         /*
1958            Second Pass -- Flat Table Representation.
1959
1960            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1961            We know that we will need Charcount+1 trans at most to store the data
1962            (one row per char at worst case) So we preallocate both structures
1963            assuming worst case.
1964
1965            We then construct the trie using only the .next slots of the entry
1966            structs.
1967
1968            We use the .check field of the first entry of the node temporarily to
1969            make compression both faster and easier by keeping track of how many non
1970            zero fields are in the node.
1971
1972            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1973            transition.
1974
1975            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1976            number representing the first entry of the node, and state as a
1977            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1978            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1979            are 2 entrys per node. eg:
1980
1981              A B       A B
1982           1. 2 4    1. 3 7
1983           2. 0 3    3. 0 5
1984           3. 0 0    5. 0 0
1985           4. 0 0    7. 0 0
1986
1987            The table is internally in the right hand, idx form. However as we also
1988            have to deal with the states array which is indexed by nodenum we have to
1989            use TRIE_NODENUM() to convert.
1990
1991         */
1992         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1993             "%*sCompiling trie using table compiler\n",
1994             (int)depth * 2 + 2, ""));
1995
1996         trie->trans = (reg_trie_trans *)
1997             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1998                                   * trie->uniquecharcount + 1,
1999                                   sizeof(reg_trie_trans) );
2000         trie->states = (reg_trie_state *)
2001             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2002                                   sizeof(reg_trie_state) );
2003         next_alloc = trie->uniquecharcount + 1;
2004
2005
2006         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2007
2008             regnode *noper   = NEXTOPER( cur );
2009             const U8 *uc     = (U8*)STRING( noper );
2010             const U8 *e      = uc + STR_LEN( noper );
2011
2012             U32 state        = 1;         /* required init */
2013
2014             U16 charid       = 0;         /* sanity init */
2015             U32 accept_state = 0;         /* sanity init */
2016             U8 *scan         = (U8*)NULL; /* sanity init */
2017
2018             STRLEN foldlen   = 0;         /* required init */
2019             U32 wordlen      = 0;         /* required init */
2020             STRLEN skiplen   = 0;
2021             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2022
2023             if (OP(noper) == NOTHING) {
2024                 regnode *noper_next= regnext(noper);
2025                 if (noper_next != tail && OP(noper_next) == flags) {
2026                     noper = noper_next;
2027                     uc= (U8*)STRING(noper);
2028                     e= uc + STR_LEN(noper);
2029                 }
2030             }
2031
2032             if ( OP(noper) != NOTHING ) {
2033                 for ( ; uc < e ; uc += len ) {
2034
2035                     TRIE_READ_CHAR;
2036
2037                     if ( uvc < 256 ) {
2038                         charid = trie->charmap[ uvc ];
2039                     } else {
2040                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2041                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2042                     }
2043                     if ( charid ) {
2044                         charid--;
2045                         if ( !trie->trans[ state + charid ].next ) {
2046                             trie->trans[ state + charid ].next = next_alloc;
2047                             trie->trans[ state ].check++;
2048                             prev_states[TRIE_NODENUM(next_alloc)]
2049                                     = TRIE_NODENUM(state);
2050                             next_alloc += trie->uniquecharcount;
2051                         }
2052                         state = trie->trans[ state + charid ].next;
2053                     } else {
2054                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2055                     }
2056                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2057                 }
2058             }
2059             accept_state = TRIE_NODENUM( state );
2060             TRIE_HANDLE_WORD(accept_state);
2061
2062         } /* end second pass */
2063
2064         /* and now dump it out before we compress it */
2065         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2066                                                           revcharmap,
2067                                                           next_alloc, depth+1));
2068
2069         {
2070         /*
2071            * Inplace compress the table.*
2072
2073            For sparse data sets the table constructed by the trie algorithm will
2074            be mostly 0/FAIL transitions or to put it another way mostly empty.
2075            (Note that leaf nodes will not contain any transitions.)
2076
2077            This algorithm compresses the tables by eliminating most such
2078            transitions, at the cost of a modest bit of extra work during lookup:
2079
2080            - Each states[] entry contains a .base field which indicates the
2081            index in the state[] array wheres its transition data is stored.
2082
2083            - If .base is 0 there are no valid transitions from that node.
2084
2085            - If .base is nonzero then charid is added to it to find an entry in
2086            the trans array.
2087
2088            -If trans[states[state].base+charid].check!=state then the
2089            transition is taken to be a 0/Fail transition. Thus if there are fail
2090            transitions at the front of the node then the .base offset will point
2091            somewhere inside the previous nodes data (or maybe even into a node
2092            even earlier), but the .check field determines if the transition is
2093            valid.
2094
2095            XXX - wrong maybe?
2096            The following process inplace converts the table to the compressed
2097            table: We first do not compress the root node 1,and mark all its
2098            .check pointers as 1 and set its .base pointer as 1 as well. This
2099            allows us to do a DFA construction from the compressed table later,
2100            and ensures that any .base pointers we calculate later are greater
2101            than 0.
2102
2103            - We set 'pos' to indicate the first entry of the second node.
2104
2105            - We then iterate over the columns of the node, finding the first and
2106            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2107            and set the .check pointers accordingly, and advance pos
2108            appropriately and repreat for the next node. Note that when we copy
2109            the next pointers we have to convert them from the original
2110            NODEIDX form to NODENUM form as the former is not valid post
2111            compression.
2112
2113            - If a node has no transitions used we mark its base as 0 and do not
2114            advance the pos pointer.
2115
2116            - If a node only has one transition we use a second pointer into the
2117            structure to fill in allocated fail transitions from other states.
2118            This pointer is independent of the main pointer and scans forward
2119            looking for null transitions that are allocated to a state. When it
2120            finds one it writes the single transition into the "hole".  If the
2121            pointer doesnt find one the single transition is appended as normal.
2122
2123            - Once compressed we can Renew/realloc the structures to release the
2124            excess space.
2125
2126            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2127            specifically Fig 3.47 and the associated pseudocode.
2128
2129            demq
2130         */
2131         const U32 laststate = TRIE_NODENUM( next_alloc );
2132         U32 state, charid;
2133         U32 pos = 0, zp=0;
2134         trie->statecount = laststate;
2135
2136         for ( state = 1 ; state < laststate ; state++ ) {
2137             U8 flag = 0;
2138             const U32 stateidx = TRIE_NODEIDX( state );
2139             const U32 o_used = trie->trans[ stateidx ].check;
2140             U32 used = trie->trans[ stateidx ].check;
2141             trie->trans[ stateidx ].check = 0;
2142
2143             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2144                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2145                     if ( trie->trans[ stateidx + charid ].next ) {
2146                         if (o_used == 1) {
2147                             for ( ; zp < pos ; zp++ ) {
2148                                 if ( ! trie->trans[ zp ].next ) {
2149                                     break;
2150                                 }
2151                             }
2152                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2153                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2154                             trie->trans[ zp ].check = state;
2155                             if ( ++zp > pos ) pos = zp;
2156                             break;
2157                         }
2158                         used--;
2159                     }
2160                     if ( !flag ) {
2161                         flag = 1;
2162                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2163                     }
2164                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2165                     trie->trans[ pos ].check = state;
2166                     pos++;
2167                 }
2168             }
2169         }
2170         trie->lasttrans = pos + 1;
2171         trie->states = (reg_trie_state *)
2172             PerlMemShared_realloc( trie->states, laststate
2173                                    * sizeof(reg_trie_state) );
2174         DEBUG_TRIE_COMPILE_MORE_r(
2175                 PerlIO_printf( Perl_debug_log,
2176                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2177                     (int)depth * 2 + 2,"",
2178                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2179                     (IV)next_alloc,
2180                     (IV)pos,
2181                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2182             );
2183
2184         } /* end table compress */
2185     }
2186     DEBUG_TRIE_COMPILE_MORE_r(
2187             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2188                 (int)depth * 2 + 2, "",
2189                 (UV)trie->statecount,
2190                 (UV)trie->lasttrans)
2191     );
2192     /* resize the trans array to remove unused space */
2193     trie->trans = (reg_trie_trans *)
2194         PerlMemShared_realloc( trie->trans, trie->lasttrans
2195                                * sizeof(reg_trie_trans) );
2196
2197     {   /* Modify the program and insert the new TRIE node */
2198         U8 nodetype =(U8)(flags & 0xFF);
2199         char *str=NULL;
2200
2201 #ifdef DEBUGGING
2202         regnode *optimize = NULL;
2203 #ifdef RE_TRACK_PATTERN_OFFSETS
2204
2205         U32 mjd_offset = 0;
2206         U32 mjd_nodelen = 0;
2207 #endif /* RE_TRACK_PATTERN_OFFSETS */
2208 #endif /* DEBUGGING */
2209         /*
2210            This means we convert either the first branch or the first Exact,
2211            depending on whether the thing following (in 'last') is a branch
2212            or not and whther first is the startbranch (ie is it a sub part of
2213            the alternation or is it the whole thing.)
2214            Assuming its a sub part we convert the EXACT otherwise we convert
2215            the whole branch sequence, including the first.
2216          */
2217         /* Find the node we are going to overwrite */
2218         if ( first != startbranch || OP( last ) == BRANCH ) {
2219             /* branch sub-chain */
2220             NEXT_OFF( first ) = (U16)(last - first);
2221 #ifdef RE_TRACK_PATTERN_OFFSETS
2222             DEBUG_r({
2223                 mjd_offset= Node_Offset((convert));
2224                 mjd_nodelen= Node_Length((convert));
2225             });
2226 #endif
2227             /* whole branch chain */
2228         }
2229 #ifdef RE_TRACK_PATTERN_OFFSETS
2230         else {
2231             DEBUG_r({
2232                 const  regnode *nop = NEXTOPER( convert );
2233                 mjd_offset= Node_Offset((nop));
2234                 mjd_nodelen= Node_Length((nop));
2235             });
2236         }
2237         DEBUG_OPTIMISE_r(
2238             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2239                 (int)depth * 2 + 2, "",
2240                 (UV)mjd_offset, (UV)mjd_nodelen)
2241         );
2242 #endif
2243         /* But first we check to see if there is a common prefix we can
2244            split out as an EXACT and put in front of the TRIE node.  */
2245         trie->startstate= 1;
2246         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2247             U32 state;
2248             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2249                 U32 ofs = 0;
2250                 I32 idx = -1;
2251                 U32 count = 0;
2252                 const U32 base = trie->states[ state ].trans.base;
2253
2254                 if ( trie->states[state].wordnum )
2255                         count = 1;
2256
2257                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2258                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2259                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2260                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2261                     {
2262                         if ( ++count > 1 ) {
2263                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2264                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2265                             if ( state == 1 ) break;
2266                             if ( count == 2 ) {
2267                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2268                                 DEBUG_OPTIMISE_r(
2269                                     PerlIO_printf(Perl_debug_log,
2270                                         "%*sNew Start State=%"UVuf" Class: [",
2271                                         (int)depth * 2 + 2, "",
2272                                         (UV)state));
2273                                 if (idx >= 0) {
2274                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2275                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2276
2277                                     TRIE_BITMAP_SET(trie,*ch);
2278                                     if ( folder )
2279                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2280                                     DEBUG_OPTIMISE_r(
2281                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2282                                     );
2283                                 }
2284                             }
2285                             TRIE_BITMAP_SET(trie,*ch);
2286                             if ( folder )
2287                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2288                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2289                         }
2290                         idx = ofs;
2291                     }
2292                 }
2293                 if ( count == 1 ) {
2294                     SV **tmp = av_fetch( revcharmap, idx, 0);
2295                     STRLEN len;
2296                     char *ch = SvPV( *tmp, len );
2297                     DEBUG_OPTIMISE_r({
2298                         SV *sv=sv_newmortal();
2299                         PerlIO_printf( Perl_debug_log,
2300                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2301                             (int)depth * 2 + 2, "",
2302                             (UV)state, (UV)idx,
2303                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2304                                 PL_colors[0], PL_colors[1],
2305                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2306                                 PERL_PV_ESCAPE_FIRSTCHAR
2307                             )
2308                         );
2309                     });
2310                     if ( state==1 ) {
2311                         OP( convert ) = nodetype;
2312                         str=STRING(convert);
2313                         STR_LEN(convert)=0;
2314                     }
2315                     STR_LEN(convert) += len;
2316                     while (len--)
2317                         *str++ = *ch++;
2318                 } else {
2319 #ifdef DEBUGGING
2320                     if (state>1)
2321                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2322 #endif
2323                     break;
2324                 }
2325             }
2326             trie->prefixlen = (state-1);
2327             if (str) {
2328                 regnode *n = convert+NODE_SZ_STR(convert);
2329                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2330                 trie->startstate = state;
2331                 trie->minlen -= (state - 1);
2332                 trie->maxlen -= (state - 1);
2333 #ifdef DEBUGGING
2334                /* At least the UNICOS C compiler choked on this
2335                 * being argument to DEBUG_r(), so let's just have
2336                 * it right here. */
2337                if (
2338 #ifdef PERL_EXT_RE_BUILD
2339                    1
2340 #else
2341                    DEBUG_r_TEST
2342 #endif
2343                    ) {
2344                    regnode *fix = convert;
2345                    U32 word = trie->wordcount;
2346                    mjd_nodelen++;
2347                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2348                    while( ++fix < n ) {
2349                        Set_Node_Offset_Length(fix, 0, 0);
2350                    }
2351                    while (word--) {
2352                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2353                        if (tmp) {
2354                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2355                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2356                            else
2357                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2358                        }
2359                    }
2360                }
2361 #endif
2362                 if (trie->maxlen) {
2363                     convert = n;
2364                 } else {
2365                     NEXT_OFF(convert) = (U16)(tail - convert);
2366                     DEBUG_r(optimize= n);
2367                 }
2368             }
2369         }
2370         if (!jumper)
2371             jumper = last;
2372         if ( trie->maxlen ) {
2373             NEXT_OFF( convert ) = (U16)(tail - convert);
2374             ARG_SET( convert, data_slot );
2375             /* Store the offset to the first unabsorbed branch in
2376                jump[0], which is otherwise unused by the jump logic.
2377                We use this when dumping a trie and during optimisation. */
2378             if (trie->jump)
2379                 trie->jump[0] = (U16)(nextbranch - convert);
2380
2381             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2382              *   and there is a bitmap
2383              *   and the first "jump target" node we found leaves enough room
2384              * then convert the TRIE node into a TRIEC node, with the bitmap
2385              * embedded inline in the opcode - this is hypothetically faster.
2386              */
2387             if ( !trie->states[trie->startstate].wordnum
2388                  && trie->bitmap
2389                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2390             {
2391                 OP( convert ) = TRIEC;
2392                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2393                 PerlMemShared_free(trie->bitmap);
2394                 trie->bitmap= NULL;
2395             } else
2396                 OP( convert ) = TRIE;
2397
2398             /* store the type in the flags */
2399             convert->flags = nodetype;
2400             DEBUG_r({
2401             optimize = convert
2402                       + NODE_STEP_REGNODE
2403                       + regarglen[ OP( convert ) ];
2404             });
2405             /* XXX We really should free up the resource in trie now,
2406                    as we won't use them - (which resources?) dmq */
2407         }
2408         /* needed for dumping*/
2409         DEBUG_r(if (optimize) {
2410             regnode *opt = convert;
2411
2412             while ( ++opt < optimize) {
2413                 Set_Node_Offset_Length(opt,0,0);
2414             }
2415             /*
2416                 Try to clean up some of the debris left after the
2417                 optimisation.
2418              */
2419             while( optimize < jumper ) {
2420                 mjd_nodelen += Node_Length((optimize));
2421                 OP( optimize ) = OPTIMIZED;
2422                 Set_Node_Offset_Length(optimize,0,0);
2423                 optimize++;
2424             }
2425             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2426         });
2427     } /* end node insert */
2428
2429     /*  Finish populating the prev field of the wordinfo array.  Walk back
2430      *  from each accept state until we find another accept state, and if
2431      *  so, point the first word's .prev field at the second word. If the
2432      *  second already has a .prev field set, stop now. This will be the
2433      *  case either if we've already processed that word's accept state,
2434      *  or that state had multiple words, and the overspill words were
2435      *  already linked up earlier.
2436      */
2437     {
2438         U16 word;
2439         U32 state;
2440         U16 prev;
2441
2442         for (word=1; word <= trie->wordcount; word++) {
2443             prev = 0;
2444             if (trie->wordinfo[word].prev)
2445                 continue;
2446             state = trie->wordinfo[word].accept;
2447             while (state) {
2448                 state = prev_states[state];
2449                 if (!state)
2450                     break;
2451                 prev = trie->states[state].wordnum;
2452                 if (prev)
2453                     break;
2454             }
2455             trie->wordinfo[word].prev = prev;
2456         }
2457         Safefree(prev_states);
2458     }
2459
2460
2461     /* and now dump out the compressed format */
2462     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2463
2464     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2465 #ifdef DEBUGGING
2466     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2467     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2468 #else
2469     SvREFCNT_dec_NN(revcharmap);
2470 #endif
2471     return trie->jump
2472            ? MADE_JUMP_TRIE
2473            : trie->startstate>1
2474              ? MADE_EXACT_TRIE
2475              : MADE_TRIE;
2476 }
2477
2478 STATIC void
2479 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2480 {
2481 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2482
2483    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2484    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2485    ISBN 0-201-10088-6
2486
2487    We find the fail state for each state in the trie, this state is the longest proper
2488    suffix of the current state's 'word' that is also a proper prefix of another word in our
2489    trie. State 1 represents the word '' and is thus the default fail state. This allows
2490    the DFA not to have to restart after its tried and failed a word at a given point, it
2491    simply continues as though it had been matching the other word in the first place.
2492    Consider
2493       'abcdgu'=~/abcdefg|cdgu/
2494    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2495    fail, which would bring us to the state representing 'd' in the second word where we would
2496    try 'g' and succeed, proceeding to match 'cdgu'.
2497  */
2498  /* add a fail transition */
2499     const U32 trie_offset = ARG(source);
2500     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2501     U32 *q;
2502     const U32 ucharcount = trie->uniquecharcount;
2503     const U32 numstates = trie->statecount;
2504     const U32 ubound = trie->lasttrans + ucharcount;
2505     U32 q_read = 0;
2506     U32 q_write = 0;
2507     U32 charid;
2508     U32 base = trie->states[ 1 ].trans.base;
2509     U32 *fail;
2510     reg_ac_data *aho;
2511     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2512     GET_RE_DEBUG_FLAGS_DECL;
2513
2514     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2515 #ifndef DEBUGGING
2516     PERL_UNUSED_ARG(depth);
2517 #endif
2518
2519
2520     ARG_SET( stclass, data_slot );
2521     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2522     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2523     aho->trie=trie_offset;
2524     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2525     Copy( trie->states, aho->states, numstates, reg_trie_state );
2526     Newxz( q, numstates, U32);
2527     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2528     aho->refcount = 1;
2529     fail = aho->fail;
2530     /* initialize fail[0..1] to be 1 so that we always have
2531        a valid final fail state */
2532     fail[ 0 ] = fail[ 1 ] = 1;
2533
2534     for ( charid = 0; charid < ucharcount ; charid++ ) {
2535         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2536         if ( newstate ) {
2537             q[ q_write ] = newstate;
2538             /* set to point at the root */
2539             fail[ q[ q_write++ ] ]=1;
2540         }
2541     }
2542     while ( q_read < q_write) {
2543         const U32 cur = q[ q_read++ % numstates ];
2544         base = trie->states[ cur ].trans.base;
2545
2546         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2547             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2548             if (ch_state) {
2549                 U32 fail_state = cur;
2550                 U32 fail_base;
2551                 do {
2552                     fail_state = fail[ fail_state ];
2553                     fail_base = aho->states[ fail_state ].trans.base;
2554                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2555
2556                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2557                 fail[ ch_state ] = fail_state;
2558                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2559                 {
2560                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2561                 }
2562                 q[ q_write++ % numstates] = ch_state;
2563             }
2564         }
2565     }
2566     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2567        when we fail in state 1, this allows us to use the
2568        charclass scan to find a valid start char. This is based on the principle
2569        that theres a good chance the string being searched contains lots of stuff
2570        that cant be a start char.
2571      */
2572     fail[ 0 ] = fail[ 1 ] = 0;
2573     DEBUG_TRIE_COMPILE_r({
2574         PerlIO_printf(Perl_debug_log,
2575                       "%*sStclass Failtable (%"UVuf" states): 0",
2576                       (int)(depth * 2), "", (UV)numstates
2577         );
2578         for( q_read=1; q_read<numstates; q_read++ ) {
2579             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2580         }
2581         PerlIO_printf(Perl_debug_log, "\n");
2582     });
2583     Safefree(q);
2584     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2585 }
2586
2587
2588 /*
2589  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2590  * These need to be revisited when a newer toolchain becomes available.
2591  */
2592 #if defined(__sparc64__) && defined(__GNUC__)
2593 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2594 #       undef  SPARC64_GCC_WORKAROUND
2595 #       define SPARC64_GCC_WORKAROUND 1
2596 #   endif
2597 #endif
2598
2599 #define DEBUG_PEEP(str,scan,depth) \
2600     DEBUG_OPTIMISE_r({if (scan){ \
2601        SV * const mysv=sv_newmortal(); \
2602        regnode *Next = regnext(scan); \
2603        regprop(RExC_rx, mysv, scan); \
2604        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2605        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2606        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2607    }});
2608
2609
2610 /* The below joins as many adjacent EXACTish nodes as possible into a single
2611  * one.  The regop may be changed if the node(s) contain certain sequences that
2612  * require special handling.  The joining is only done if:
2613  * 1) there is room in the current conglomerated node to entirely contain the
2614  *    next one.
2615  * 2) they are the exact same node type
2616  *
2617  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2618  * these get optimized out
2619  *
2620  * If a node is to match under /i (folded), the number of characters it matches
2621  * can be different than its character length if it contains a multi-character
2622  * fold.  *min_subtract is set to the total delta of the input nodes.
2623  *
2624  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2625  * and contains LATIN SMALL LETTER SHARP S
2626  *
2627  * This is as good a place as any to discuss the design of handling these
2628  * multi-character fold sequences.  It's been wrong in Perl for a very long
2629  * time.  There are three code points in Unicode whose multi-character folds
2630  * were long ago discovered to mess things up.  The previous designs for
2631  * dealing with these involved assigning a special node for them.  This
2632  * approach doesn't work, as evidenced by this example:
2633  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2634  * Both these fold to "sss", but if the pattern is parsed to create a node that
2635  * would match just the \xDF, it won't be able to handle the case where a
2636  * successful match would have to cross the node's boundary.  The new approach
2637  * that hopefully generally solves the problem generates an EXACTFU_SS node
2638  * that is "sss".
2639  *
2640  * It turns out that there are problems with all multi-character folds, and not
2641  * just these three.  Now the code is general, for all such cases, but the
2642  * three still have some special handling.  The approach taken is:
2643  * 1)   This routine examines each EXACTFish node that could contain multi-
2644  *      character fold sequences.  It returns in *min_subtract how much to
2645  *      subtract from the the actual length of the string to get a real minimum
2646  *      match length; it is 0 if there are no multi-char folds.  This delta is
2647  *      used by the caller to adjust the min length of the match, and the delta
2648  *      between min and max, so that the optimizer doesn't reject these
2649  *      possibilities based on size constraints.
2650  * 2)   Certain of these sequences require special handling by the trie code,
2651  *      so, if found, this code changes the joined node type to special ops:
2652  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2653  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2654  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2655  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2656  *      there is a possible fold length change.  That means that a regular
2657  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2658  *      with length changes, and so can be processed faster.  regexec.c takes
2659  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2660  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2661  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2662  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2663  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2664  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2665  *      possibilities for the non-UTF8 patterns are quite simple, except for
2666  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2667  *      members of a fold-pair, and arrays are set up for all of them so that
2668  *      the other member of the pair can be found quickly.  Code elsewhere in
2669  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2670  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2671  *      described in the next item.
2672  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2673  *      'ss' or not is not knowable at compile time.  It will match iff the
2674  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2675  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2676  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2677  *      described in item 3).  An assumption that the optimizer part of
2678  *      regexec.c (probably unwittingly) makes is that a character in the
2679  *      pattern corresponds to at most a single character in the target string.
2680  *      (And I do mean character, and not byte here, unlike other parts of the
2681  *      documentation that have never been updated to account for multibyte
2682  *      Unicode.)  This assumption is wrong only in this case, as all other
2683  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2684  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2685  *      reluctant to try to change this assumption, so instead the code punts.
2686  *      This routine examines EXACTF nodes for the sharp s, and returns a
2687  *      boolean indicating whether or not the node is an EXACTF node that
2688  *      contains a sharp s.  When it is true, the caller sets a flag that later
2689  *      causes the optimizer in this file to not set values for the floating
2690  *      and fixed string lengths, and thus avoids the optimizer code in
2691  *      regexec.c that makes the invalid assumption.  Thus, there is no
2692  *      optimization based on string lengths for EXACTF nodes that contain the
2693  *      sharp s.  This only happens for /id rules (which means the pattern
2694  *      isn't in UTF-8).
2695  */
2696
2697 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2698     if (PL_regkind[OP(scan)] == EXACT) \
2699         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2700
2701 STATIC U32
2702 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2703     /* Merge several consecutive EXACTish nodes into one. */
2704     regnode *n = regnext(scan);
2705     U32 stringok = 1;
2706     regnode *next = scan + NODE_SZ_STR(scan);
2707     U32 merged = 0;
2708     U32 stopnow = 0;
2709 #ifdef DEBUGGING
2710     regnode *stop = scan;
2711     GET_RE_DEBUG_FLAGS_DECL;
2712 #else
2713     PERL_UNUSED_ARG(depth);
2714 #endif
2715
2716     PERL_ARGS_ASSERT_JOIN_EXACT;
2717 #ifndef EXPERIMENTAL_INPLACESCAN
2718     PERL_UNUSED_ARG(flags);
2719     PERL_UNUSED_ARG(val);
2720 #endif
2721     DEBUG_PEEP("join",scan,depth);
2722
2723     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2724      * EXACT ones that are mergeable to the current one. */
2725     while (n
2726            && (PL_regkind[OP(n)] == NOTHING
2727                || (stringok && OP(n) == OP(scan)))
2728            && NEXT_OFF(n)
2729            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2730     {
2731
2732         if (OP(n) == TAIL || n > next)
2733             stringok = 0;
2734         if (PL_regkind[OP(n)] == NOTHING) {
2735             DEBUG_PEEP("skip:",n,depth);
2736             NEXT_OFF(scan) += NEXT_OFF(n);
2737             next = n + NODE_STEP_REGNODE;
2738 #ifdef DEBUGGING
2739             if (stringok)
2740                 stop = n;
2741 #endif
2742             n = regnext(n);
2743         }
2744         else if (stringok) {
2745             const unsigned int oldl = STR_LEN(scan);
2746             regnode * const nnext = regnext(n);
2747
2748             /* XXX I (khw) kind of doubt that this works on platforms where
2749              * U8_MAX is above 255 because of lots of other assumptions */
2750             /* Don't join if the sum can't fit into a single node */
2751             if (oldl + STR_LEN(n) > U8_MAX)
2752                 break;
2753
2754             DEBUG_PEEP("merg",n,depth);
2755             merged++;
2756
2757             NEXT_OFF(scan) += NEXT_OFF(n);
2758             STR_LEN(scan) += STR_LEN(n);
2759             next = n + NODE_SZ_STR(n);
2760             /* Now we can overwrite *n : */
2761             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2762 #ifdef DEBUGGING
2763             stop = next - 1;
2764 #endif
2765             n = nnext;
2766             if (stopnow) break;
2767         }
2768
2769 #ifdef EXPERIMENTAL_INPLACESCAN
2770         if (flags && !NEXT_OFF(n)) {
2771             DEBUG_PEEP("atch", val, depth);
2772             if (reg_off_by_arg[OP(n)]) {
2773                 ARG_SET(n, val - n);
2774             }
2775             else {
2776                 NEXT_OFF(n) = val - n;
2777             }
2778             stopnow = 1;
2779         }
2780 #endif
2781     }
2782
2783     *min_subtract = 0;
2784     *has_exactf_sharp_s = FALSE;
2785
2786     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2787      * can now analyze for sequences of problematic code points.  (Prior to
2788      * this final joining, sequences could have been split over boundaries, and
2789      * hence missed).  The sequences only happen in folding, hence for any
2790      * non-EXACT EXACTish node */
2791     if (OP(scan) != EXACT) {
2792         const U8 * const s0 = (U8*) STRING(scan);
2793         const U8 * s = s0;
2794         const U8 * const s_end = s0 + STR_LEN(scan);
2795
2796         /* One pass is made over the node's string looking for all the
2797          * possibilities.  to avoid some tests in the loop, there are two main
2798          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2799          * non-UTF-8 */
2800         if (UTF) {
2801
2802             /* Examine the string for a multi-character fold sequence.  UTF-8
2803              * patterns have all characters pre-folded by the time this code is
2804              * executed */
2805             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2806                                      length sequence we are looking for is 2 */
2807             {
2808                 int count = 0;
2809                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2810                 if (! len) {    /* Not a multi-char fold: get next char */
2811                     s += UTF8SKIP(s);
2812                     continue;
2813                 }
2814
2815                 /* Nodes with 'ss' require special handling, except for EXACTFL
2816                  * and EXACTFA for which there is no multi-char fold to this */
2817                 if (len == 2 && *s == 's' && *(s+1) == 's'
2818                     && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2819                 {
2820                     count = 2;
2821                     OP(scan) = EXACTFU_SS;
2822                     s += 2;
2823                 }
2824                 else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2825                          && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2826                                       COMBINING_DIAERESIS_UTF8
2827                                       COMBINING_ACUTE_ACCENT_UTF8,
2828                                    6)
2829                              || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2830                                          COMBINING_DIAERESIS_UTF8
2831                                          COMBINING_ACUTE_ACCENT_UTF8,
2832                                      6)))
2833                 {
2834                     count = 3;
2835
2836                     /* These two folds require special handling by trie's, so
2837                      * change the node type to indicate this.  If EXACTFA and
2838                      * EXACTFL were ever to be handled by trie's, this would
2839                      * have to be changed.  If this node has already been
2840                      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2841                      * (khw) think it doesn't matter in regexec.c for UTF
2842                      * patterns, but no need to change it */
2843                     if (OP(scan) == EXACTFU) {
2844                         OP(scan) = EXACTFU_TRICKYFOLD;
2845                     }
2846                     s += 6;
2847                 }
2848                 else { /* Here is a generic multi-char fold. */
2849                     const U8* multi_end  = s + len;
2850
2851                     /* Count how many characters in it.  In the case of /l and
2852                      * /aa, no folds which contain ASCII code points are
2853                      * allowed, so check for those, and skip if found.  (In
2854                      * EXACTFL, no folds are allowed to any Latin1 code point,
2855                      * not just ASCII.  But there aren't any of these
2856                      * currently, nor ever likely, so don't take the time to
2857                      * test for them.  The code that generates the
2858                      * is_MULTI_foo() macros croaks should one actually get put
2859                      * into Unicode .) */
2860                     if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2861                         count = utf8_length(s, multi_end);
2862                         s = multi_end;
2863                     }
2864                     else {
2865                         while (s < multi_end) {
2866                             if (isASCII(*s)) {
2867                                 s++;
2868                                 goto next_iteration;
2869                             }
2870                             else {
2871                                 s += UTF8SKIP(s);
2872                             }
2873                             count++;
2874                         }
2875                     }
2876                 }
2877
2878                 /* The delta is how long the sequence is minus 1 (1 is how long
2879                  * the character that folds to the sequence is) */
2880                 *min_subtract += count - 1;
2881             next_iteration: ;
2882             }
2883         }
2884         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2885
2886             /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2887              * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2888              * nodes can't have multi-char folds to this range (and there are
2889              * no existing ones in the upper latin1 range).  In the EXACTF
2890              * case we look also for the sharp s, which can be in the final
2891              * position.  Otherwise we can stop looking 1 byte earlier because
2892              * have to find at least two characters for a multi-fold */
2893             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2894
2895             /* The below is perhaps overboard, but this allows us to save a
2896              * test each time through the loop at the expense of a mask.  This
2897              * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2898              * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2899              * are 64.  This uses an exclusive 'or' to find that bit and then
2900              * inverts it to form a mask, with just a single 0, in the bit
2901              * position where 'S' and 's' differ. */
2902             const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2903             const U8 s_masked = 's' & S_or_s_mask;
2904
2905             while (s < upper) {
2906                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2907                 if (! len) {    /* Not a multi-char fold. */
2908                     if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2909                     {
2910                         *has_exactf_sharp_s = TRUE;
2911                     }
2912                     s++;
2913                     continue;
2914                 }
2915
2916                 if (len == 2
2917                     && ((*s & S_or_s_mask) == s_masked)
2918                     && ((*(s+1) & S_or_s_mask) == s_masked))
2919                 {
2920
2921                     /* EXACTF nodes need to know that the minimum length
2922                      * changed so that a sharp s in the string can match this
2923                      * ss in the pattern, but they remain EXACTF nodes, as they
2924                      * won't match this unless the target string is is UTF-8,
2925                      * which we don't know until runtime */
2926                     if (OP(scan) != EXACTF) {
2927                         OP(scan) = EXACTFU_SS;
2928                     }
2929                 }
2930
2931                 *min_subtract += len - 1;
2932                 s += len;
2933             }
2934         }
2935     }
2936
2937 #ifdef DEBUGGING
2938     /* Allow dumping but overwriting the collection of skipped
2939      * ops and/or strings with fake optimized ops */
2940     n = scan + NODE_SZ_STR(scan);
2941     while (n <= stop) {
2942         OP(n) = OPTIMIZED;
2943         FLAGS(n) = 0;
2944         NEXT_OFF(n) = 0;
2945         n++;
2946     }
2947 #endif
2948     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2949     return stopnow;
2950 }
2951
2952 /* REx optimizer.  Converts nodes into quicker variants "in place".
2953    Finds fixed substrings.  */
2954
2955 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2956    to the position after last scanned or to NULL. */
2957
2958 #define INIT_AND_WITHP \
2959     assert(!and_withp); \
2960     Newx(and_withp,1,struct regnode_charclass_class); \
2961     SAVEFREEPV(and_withp)
2962
2963 /* this is a chain of data about sub patterns we are processing that
2964    need to be handled separately/specially in study_chunk. Its so
2965    we can simulate recursion without losing state.  */
2966 struct scan_frame;
2967 typedef struct scan_frame {
2968     regnode *last;  /* last node to process in this frame */
2969     regnode *next;  /* next node to process when last is reached */
2970     struct scan_frame *prev; /*previous frame*/
2971     I32 stop; /* what stopparen do we use */
2972 } scan_frame;
2973
2974
2975 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2976
2977 STATIC I32
2978 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2979                         I32 *minlenp, I32 *deltap,
2980                         regnode *last,
2981                         scan_data_t *data,
2982                         I32 stopparen,
2983                         U8* recursed,
2984                         struct regnode_charclass_class *and_withp,
2985                         U32 flags, U32 depth)
2986                         /* scanp: Start here (read-write). */
2987                         /* deltap: Write maxlen-minlen here. */
2988                         /* last: Stop before this one. */
2989                         /* data: string data about the pattern */
2990                         /* stopparen: treat close N as END */
2991                         /* recursed: which subroutines have we recursed into */
2992                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2993 {
2994     dVAR;
2995     I32 min = 0;    /* There must be at least this number of characters to match */
2996     I32 pars = 0, code;
2997     regnode *scan = *scanp, *next;
2998     I32 delta = 0;
2999     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3000     int is_inf_internal = 0;            /* The studied chunk is infinite */
3001     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3002     scan_data_t data_fake;
3003     SV *re_trie_maxbuff = NULL;
3004     regnode *first_non_open = scan;
3005     I32 stopmin = I32_MAX;
3006     scan_frame *frame = NULL;
3007     GET_RE_DEBUG_FLAGS_DECL;
3008
3009     PERL_ARGS_ASSERT_STUDY_CHUNK;
3010
3011 #ifdef DEBUGGING
3012     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3013 #endif
3014
3015     if ( depth == 0 ) {
3016         while (first_non_open && OP(first_non_open) == OPEN)
3017             first_non_open=regnext(first_non_open);
3018     }
3019
3020
3021   fake_study_recurse:
3022     while ( scan && OP(scan) != END && scan < last ){
3023         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3024                                    node length to get a real minimum (because
3025                                    the folded version may be shorter) */
3026         bool has_exactf_sharp_s = FALSE;
3027         /* Peephole optimizer: */
3028         DEBUG_STUDYDATA("Peep:", data,depth);
3029         DEBUG_PEEP("Peep",scan,depth);
3030
3031         /* Its not clear to khw or hv why this is done here, and not in the
3032          * clauses that deal with EXACT nodes.  khw's guess is that it's
3033          * because of a previous design */
3034         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3035
3036         /* Follow the next-chain of the current node and optimize
3037            away all the NOTHINGs from it.  */
3038         if (OP(scan) != CURLYX) {
3039             const int max = (reg_off_by_arg[OP(scan)]
3040                        ? I32_MAX
3041                        /* I32 may be smaller than U16 on CRAYs! */
3042                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3043             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3044             int noff;
3045             regnode *n = scan;
3046
3047             /* Skip NOTHING and LONGJMP. */
3048             while ((n = regnext(n))
3049                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3050                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3051                    && off + noff < max)
3052                 off += noff;
3053             if (reg_off_by_arg[OP(scan)])
3054                 ARG(scan) = off;
3055             else
3056                 NEXT_OFF(scan) = off;
3057         }
3058
3059
3060
3061         /* The principal pseudo-switch.  Cannot be a switch, since we
3062            look into several different things.  */
3063         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3064                    || OP(scan) == IFTHEN) {
3065             next = regnext(scan);
3066             code = OP(scan);
3067             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3068
3069             if (OP(next) == code || code == IFTHEN) {
3070                 /* NOTE - There is similar code to this block below for handling
3071                    TRIE nodes on a re-study.  If you change stuff here check there
3072                    too. */
3073                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3074                 struct regnode_charclass_class accum;
3075                 regnode * const startbranch=scan;
3076
3077                 if (flags & SCF_DO_SUBSTR)
3078                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3079                 if (flags & SCF_DO_STCLASS)
3080                     cl_init_zero(pRExC_state, &accum);
3081
3082                 while (OP(scan) == code) {
3083                     I32 deltanext, minnext, f = 0, fake;
3084                     struct regnode_charclass_class this_class;
3085
3086                     num++;
3087                     data_fake.flags = 0;
3088                     if (data) {
3089                         data_fake.whilem_c = data->whilem_c;
3090                         data_fake.last_closep = data->last_closep;
3091                     }
3092                     else
3093                         data_fake.last_closep = &fake;
3094
3095                     data_fake.pos_delta = delta;
3096                     next = regnext(scan);
3097                     scan = NEXTOPER(scan);
3098                     if (code != BRANCH)
3099                         scan = NEXTOPER(scan);
3100                     if (flags & SCF_DO_STCLASS) {
3101                         cl_init(pRExC_state, &this_class);
3102                         data_fake.start_class = &this_class;
3103                         f = SCF_DO_STCLASS_AND;
3104                     }
3105                     if (flags & SCF_WHILEM_VISITED_POS)
3106                         f |= SCF_WHILEM_VISITED_POS;
3107
3108                     /* we suppose the run is continuous, last=next...*/
3109                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3110                                           next, &data_fake,
3111                                           stopparen, recursed, NULL, f,depth+1);
3112                     if (min1 > minnext)
3113                         min1 = minnext;
3114                     if (max1 < minnext + deltanext)
3115                         max1 = minnext + deltanext;
3116                     if (deltanext == I32_MAX)
3117                         is_inf = is_inf_internal = 1;
3118                     scan = next;
3119                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3120                         pars++;
3121                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3122                         if ( stopmin > minnext)
3123                             stopmin = min + min1;
3124                         flags &= ~SCF_DO_SUBSTR;
3125                         if (data)
3126                             data->flags |= SCF_SEEN_ACCEPT;
3127                     }
3128                     if (data) {
3129                         if (data_fake.flags & SF_HAS_EVAL)
3130                             data->flags |= SF_HAS_EVAL;
3131                         data->whilem_c = data_fake.whilem_c;
3132                     }
3133                     if (flags & SCF_DO_STCLASS)
3134                         cl_or(pRExC_state, &accum, &this_class);
3135                 }
3136                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3137                     min1 = 0;
3138                 if (flags & SCF_DO_SUBSTR) {
3139                     data->pos_min += min1;
3140                     data->pos_delta += max1 - min1;
3141                     if (max1 != min1 || is_inf)
3142                         data->longest = &(data->longest_float);
3143                 }
3144                 min += min1;
3145                 delta += max1 - min1;
3146                 if (flags & SCF_DO_STCLASS_OR) {
3147                     cl_or(pRExC_state, data->start_class, &accum);
3148                     if (min1) {
3149                         cl_and(data->start_class, and_withp);
3150                         flags &= ~SCF_DO_STCLASS;
3151                     }
3152                 }
3153                 else if (flags & SCF_DO_STCLASS_AND) {
3154                     if (min1) {
3155                         cl_and(data->start_class, &accum);
3156                         flags &= ~SCF_DO_STCLASS;
3157                     }
3158                     else {
3159                         /* Switch to OR mode: cache the old value of
3160                          * data->start_class */
3161                         INIT_AND_WITHP;
3162                         StructCopy(data->start_class, and_withp,
3163                                    struct regnode_charclass_class);
3164                         flags &= ~SCF_DO_STCLASS_AND;
3165                         StructCopy(&accum, data->start_class,
3166                                    struct regnode_charclass_class);
3167                         flags |= SCF_DO_STCLASS_OR;
3168                         SET_SSC_EOS(data->start_class);
3169                     }
3170                 }
3171
3172                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3173                 /* demq.
3174
3175                    Assuming this was/is a branch we are dealing with: 'scan' now
3176                    points at the item that follows the branch sequence, whatever
3177                    it is. We now start at the beginning of the sequence and look
3178                    for subsequences of
3179
3180                    BRANCH->EXACT=>x1
3181                    BRANCH->EXACT=>x2
3182                    tail
3183
3184                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3185
3186                    If we can find such a subsequence we need to turn the first
3187                    element into a trie and then add the subsequent branch exact
3188                    strings to the trie.
3189
3190                    We have two cases
3191
3192                      1. patterns where the whole set of branches can be converted.
3193
3194                      2. patterns where only a subset can be converted.
3195
3196                    In case 1 we can replace the whole set with a single regop
3197                    for the trie. In case 2 we need to keep the start and end
3198                    branches so
3199
3200                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3201                      becomes BRANCH TRIE; BRANCH X;
3202
3203                   There is an additional case, that being where there is a
3204                   common prefix, which gets split out into an EXACT like node
3205                   preceding the TRIE node.
3206
3207                   If x(1..n)==tail then we can do a simple trie, if not we make
3208                   a "jump" trie, such that when we match the appropriate word
3209                   we "jump" to the appropriate tail node. Essentially we turn
3210                   a nested if into a case structure of sorts.
3211
3212                 */
3213
3214                     int made=0;
3215                     if (!re_trie_maxbuff) {
3216                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3217                         if (!SvIOK(re_trie_maxbuff))
3218                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3219                     }
3220                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3221                         regnode *cur;
3222                         regnode *first = (regnode *)NULL;
3223                         regnode *last = (regnode *)NULL;
3224                         regnode *tail = scan;
3225                         U8 trietype = 0;
3226                         U32 count=0;
3227
3228 #ifdef DEBUGGING
3229                         SV * const mysv = sv_newmortal();       /* for dumping */
3230 #endif
3231                         /* var tail is used because there may be a TAIL
3232                            regop in the way. Ie, the exacts will point to the
3233                            thing following the TAIL, but the last branch will
3234                            point at the TAIL. So we advance tail. If we
3235                            have nested (?:) we may have to move through several
3236                            tails.
3237                          */
3238
3239                         while ( OP( tail ) == TAIL ) {
3240                             /* this is the TAIL generated by (?:) */
3241                             tail = regnext( tail );
3242                         }
3243
3244
3245                         DEBUG_TRIE_COMPILE_r({
3246                             regprop(RExC_rx, mysv, tail );
3247                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3248                                 (int)depth * 2 + 2, "",
3249                                 "Looking for TRIE'able sequences. Tail node is: ",
3250                                 SvPV_nolen_const( mysv )
3251                             );
3252                         });
3253
3254                         /*
3255
3256                             Step through the branches
3257                                 cur represents each branch,
3258                                 noper is the first thing to be matched as part of that branch
3259                                 noper_next is the regnext() of that node.
3260
3261                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3262                             via a "jump trie" but we also support building with NOJUMPTRIE,
3263                             which restricts the trie logic to structures like /FOO|BAR/.
3264
3265                             If noper is a trieable nodetype then the branch is a possible optimization
3266                             target. If we are building under NOJUMPTRIE then we require that noper_next
3267                             is the same as scan (our current position in the regex program).
3268
3269                             Once we have two or more consecutive such branches we can create a
3270                             trie of the EXACT's contents and stitch it in place into the program.
3271
3272                             If the sequence represents all of the branches in the alternation we
3273                             replace the entire thing with a single TRIE node.
3274
3275                             Otherwise when it is a subsequence we need to stitch it in place and
3276                             replace only the relevant branches. This means the first branch has
3277                             to remain as it is used by the alternation logic, and its next pointer,
3278                             and needs to be repointed at the item on the branch chain following
3279                             the last branch we have optimized away.
3280
3281                             This could be either a BRANCH, in which case the subsequence is internal,
3282                             or it could be the item following the branch sequence in which case the
3283                             subsequence is at the end (which does not necessarily mean the first node
3284                             is the start of the alternation).
3285
3286                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3287
3288                                 optype          |  trietype
3289                                 ----------------+-----------
3290                                 NOTHING         | NOTHING
3291                                 EXACT           | EXACT
3292                                 EXACTFU         | EXACTFU
3293                                 EXACTFU_SS      | EXACTFU
3294                                 EXACTFU_TRICKYFOLD | EXACTFU
3295                                 EXACTFA         | 0
3296
3297
3298                         */
3299 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3300                        ( EXACT == (X) )   ? EXACT :        \
3301                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3302                        0 )
3303
3304                         /* dont use tail as the end marker for this traverse */
3305                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3306                             regnode * const noper = NEXTOPER( cur );
3307                             U8 noper_type = OP( noper );
3308                             U8 noper_trietype = TRIE_TYPE( noper_type );
3309 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3310                             regnode * const noper_next = regnext( noper );
3311                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3312                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3313 #endif
3314
3315                             DEBUG_TRIE_COMPILE_r({
3316                                 regprop(RExC_rx, mysv, cur);
3317                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3318                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3319
3320                                 regprop(RExC_rx, mysv, noper);
3321                                 PerlIO_printf( Perl_debug_log, " -> %s",
3322                                     SvPV_nolen_const(mysv));
3323
3324                                 if ( noper_next ) {
3325                                   regprop(RExC_rx, mysv, noper_next );
3326                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3327                                     SvPV_nolen_const(mysv));
3328                                 }
3329                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3330                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3331                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3332                                 );
3333                             });
3334
3335                             /* Is noper a trieable nodetype that can be merged with the
3336                              * current trie (if there is one)? */
3337                             if ( noper_trietype
3338                                   &&
3339                                   (
3340                                         ( noper_trietype == NOTHING)
3341                                         || ( trietype == NOTHING )
3342                                         || ( trietype == noper_trietype )
3343                                   )
3344 #ifdef NOJUMPTRIE
3345                                   && noper_next == tail
3346 #endif
3347                                   && count < U16_MAX)
3348                             {
3349                                 /* Handle mergable triable node
3350                                  * Either we are the first node in a new trieable sequence,
3351                                  * in which case we do some bookkeeping, otherwise we update
3352                                  * the end pointer. */
3353                                 if ( !first ) {
3354                                     first = cur;
3355                                     if ( noper_trietype == NOTHING ) {
3356 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3357                                         regnode * const noper_next = regnext( noper );
3358                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3359                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3360 #endif
3361
3362                                         if ( noper_next_trietype ) {
3363                                             trietype = noper_next_trietype;
3364                                         } else if (noper_next_type)  {
3365                                             /* a NOTHING regop is 1 regop wide. We need at least two
3366                                              * for a trie so we can't merge this in */
3367                                             first = NULL;
3368                                         }
3369                                     } else {
3370                                         trietype = noper_trietype;
3371                                     }
3372                                 } else {
3373                                     if ( trietype == NOTHING )
3374                                         trietype = noper_trietype;
3375                                     last = cur;
3376                                 }
3377                                 if (first)
3378                                     count++;
3379                             } /* end handle mergable triable node */
3380                             else {
3381                                 /* handle unmergable node -
3382                                  * noper may either be a triable node which can not be tried
3383                                  * together with the current trie, or a non triable node */
3384                                 if ( last ) {
3385                                     /* If last is set and trietype is not NOTHING then we have found
3386                                      * at least two triable branch sequences in a row of a similar
3387                                      * trietype so we can turn them into a trie. If/when we
3388                                      * allow NOTHING to start a trie sequence this condition will be
3389                                      * required, and it isn't expensive so we leave it in for now. */
3390                                     if ( trietype && trietype != NOTHING )
3391                                         make_trie( pRExC_state,
3392                                                 startbranch, first, cur, tail, count,
3393                                                 trietype, depth+1 );
3394                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3395                                 }
3396                                 if ( noper_trietype
3397 #ifdef NOJUMPTRIE
3398                                      && noper_next == tail
3399 #endif
3400                                 ){
3401                                     /* noper is triable, so we can start a new trie sequence */
3402                                     count = 1;
3403                                     first = cur;
3404                                     trietype = noper_trietype;
3405                                 } else if (first) {
3406                                     /* if we already saw a first but the current node is not triable then we have
3407                                      * to reset the first information. */
3408                                     count = 0;
3409                                     first = NULL;
3410                                     trietype = 0;
3411                                 }
3412                             } /* end handle unmergable node */
3413                         } /* loop over branches */
3414                         DEBUG_TRIE_COMPILE_r({
3415                             regprop(RExC_rx, mysv, cur);
3416                             PerlIO_printf( Perl_debug_log,
3417                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3418                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3419
3420                         });
3421                         if ( last && trietype ) {
3422                             if ( trietype != NOTHING ) {
3423                                 /* the last branch of the sequence was part of a trie,
3424                                  * so we have to construct it here outside of the loop
3425                                  */
3426                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3427 #ifdef TRIE_STUDY_OPT
3428                                 if ( ((made == MADE_EXACT_TRIE &&
3429                                      startbranch == first)
3430                                      || ( first_non_open == first )) &&
3431                                      depth==0 ) {
3432                                     flags |= SCF_TRIE_RESTUDY;
3433                                     if ( startbranch == first
3434                                          && scan == tail )
3435                                     {
3436                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3437                                     }
3438                                 }
3439 #endif
3440                             } else {
3441                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3442                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3443                                  */
3444                                 if ( startbranch == first ) {
3445                                     regnode *opt;
3446                                     /* the entire thing is a NOTHING sequence, something like this:
3447                                      * (?:|) So we can turn it into a plain NOTHING op. */
3448                                     DEBUG_TRIE_COMPILE_r({
3449                                         regprop(RExC_rx, mysv, cur);
3450                                         PerlIO_printf( Perl_debug_log,
3451                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3452                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3453
3454                                     });
3455                                     OP(startbranch)= NOTHING;
3456                                     NEXT_OFF(startbranch)= tail - startbranch;
3457                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3458                                         OP(opt)= OPTIMIZED;
3459                                 }
3460                             }
3461                         } /* end if ( last) */
3462                     } /* TRIE_MAXBUF is non zero */
3463
3464                 } /* do trie */
3465
3466             }
3467             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3468                 scan = NEXTOPER(NEXTOPER(scan));
3469             } else                      /* single branch is optimized. */
3470                 scan = NEXTOPER(scan);
3471             continue;
3472         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3473             scan_frame *newframe = NULL;
3474             I32 paren;
3475             regnode *start;
3476             regnode *end;
3477
3478             if (OP(scan) != SUSPEND) {
3479             /* set the pointer */
3480                 if (OP(scan) == GOSUB) {
3481                     paren = ARG(scan);
3482                     RExC_recurse[ARG2L(scan)] = scan;
3483                     start = RExC_open_parens[paren-1];
3484                     end   = RExC_close_parens[paren-1];
3485                 } else {
3486                     paren = 0;
3487                     start = RExC_rxi->program + 1;
3488                     end   = RExC_opend;
3489                 }
3490                 if (!recursed) {
3491                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3492                     SAVEFREEPV(recursed);
3493                 }
3494                 if (!PAREN_TEST(recursed,paren+1)) {
3495                     PAREN_SET(recursed,paren+1);
3496                     Newx(newframe,1,scan_frame);
3497                 } else {
3498                     if (flags & SCF_DO_SUBSTR) {
3499                         SCAN_COMMIT(pRExC_state,data,minlenp);
3500                         data->longest = &(data->longest_float);
3501                     }
3502                     is_inf = is_inf_internal = 1;
3503                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3504                         cl_anything(pRExC_state, data->start_class);
3505                     flags &= ~SCF_DO_STCLASS;
3506                 }
3507             } else {
3508                 Newx(newframe,1,scan_frame);
3509                 paren = stopparen;
3510                 start = scan+2;
3511                 end = regnext(scan);
3512             }
3513             if (newframe) {
3514                 assert(start);
3515                 assert(end);
3516                 SAVEFREEPV(newframe);
3517                 newframe->next = regnext(scan);
3518                 newframe->last = last;
3519                 newframe->stop = stopparen;
3520                 newframe->prev = frame;
3521
3522                 frame = newframe;
3523                 scan =  start;
3524                 stopparen = paren;
3525                 last = end;
3526
3527                 continue;
3528             }
3529         }
3530         else if (OP(scan) == EXACT) {
3531             I32 l = STR_LEN(scan);
3532             UV uc;
3533             if (UTF) {
3534                 const U8 * const s = (U8*)STRING(scan);
3535                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3536                 l = utf8_length(s, s + l);
3537             } else {
3538                 uc = *((U8*)STRING(scan));
3539             }
3540             min += l;
3541             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3542                 /* The code below prefers earlier match for fixed
3543                    offset, later match for variable offset.  */
3544                 if (data->last_end == -1) { /* Update the start info. */
3545                     data->last_start_min = data->pos_min;
3546                     data->last_start_max = is_inf
3547                         ? I32_MAX : data->pos_min + data->pos_delta;
3548                 }
3549                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3550                 if (UTF)
3551                     SvUTF8_on(data->last_found);
3552                 {
3553                     SV * const sv = data->last_found;
3554                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3555                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3556                     if (mg && mg->mg_len >= 0)
3557                         mg->mg_len += utf8_length((U8*)STRING(scan),
3558                                                   (U8*)STRING(scan)+STR_LEN(scan));
3559                 }
3560                 data->last_end = data->pos_min + l;
3561                 data->pos_min += l; /* As in the first entry. */
3562                 data->flags &= ~SF_BEFORE_EOL;
3563             }
3564             if (flags & SCF_DO_STCLASS_AND) {
3565                 /* Check whether it is compatible with what we know already! */
3566                 int compat = 1;
3567
3568
3569                 /* If compatible, we or it in below.  It is compatible if is
3570                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3571                  * it's for a locale.  Even if there isn't unicode semantics
3572                  * here, at runtime there may be because of matching against a
3573                  * utf8 string, so accept a possible false positive for
3574                  * latin1-range folds */
3575                 if (uc >= 0x100 ||
3576                     (!(data->start_class->flags & ANYOF_LOCALE)
3577                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3578                     && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3579                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3580                     )
3581                 {
3582                     compat = 0;
3583                 }
3584                 ANYOF_CLASS_ZERO(data->start_class);
3585                 ANYOF_BITMAP_ZERO(data->start_class);
3586                 if (compat)
3587                     ANYOF_BITMAP_SET(data->start_class, uc);
3588                 else if (uc >= 0x100) {
3589                     int i;
3590
3591                     /* Some Unicode code points fold to the Latin1 range; as
3592                      * XXX temporary code, instead of figuring out if this is
3593                      * one, just assume it is and set all the start class bits
3594                      * that could be some such above 255 code point's fold
3595                      * which will generate fals positives.  As the code
3596                      * elsewhere that does compute the fold settles down, it
3597                      * can be extracted out and re-used here */
3598                     for (i = 0; i < 256; i++){
3599                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3600                             ANYOF_BITMAP_SET(data->start_class, i);
3601                         }
3602                     }
3603                 }
3604                 CLEAR_SSC_EOS(data->start_class);
3605                 if (uc < 0x100)
3606                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3607             }
3608             else if (flags & SCF_DO_STCLASS_OR) {
3609                 /* false positive possible if the class is case-folded */
3610                 if (uc < 0x100)
3611                     ANYOF_BITMAP_SET(data->start_class, uc);
3612                 else
3613                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3614                 CLEAR_SSC_EOS(data->start_class);
3615                 cl_and(data->start_class, and_withp);
3616             }
3617             flags &= ~SCF_DO_STCLASS;
3618         }
3619         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3620             I32 l = STR_LEN(scan);
3621             UV uc = *((U8*)STRING(scan));
3622
3623             /* Search for fixed substrings supports EXACT only. */
3624             if (flags & SCF_DO_SUBSTR) {
3625                 assert(data);
3626                 SCAN_COMMIT(pRExC_state, data, minlenp);
3627             }
3628             if (UTF) {
3629                 const U8 * const s = (U8 *)STRING(scan);
3630                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3631                 l = utf8_length(s, s + l);
3632             }
3633             if (has_exactf_sharp_s) {
3634                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3635             }
3636             min += l - min_subtract;
3637             assert (min >= 0);
3638             delta += min_subtract;
3639             if (flags & SCF_DO_SUBSTR) {
3640                 data->pos_min += l - min_subtract;
3641                 if (data->pos_min < 0) {
3642                     data->pos_min = 0;
3643                 }
3644                 data->pos_delta += min_subtract;
3645                 if (min_subtract) {
3646                     data->longest = &(data->longest_float);
3647                 }
3648             }
3649             if (flags & SCF_DO_STCLASS_AND) {
3650                 /* Check whether it is compatible with what we know already! */
3651                 int compat = 1;
3652                 if (uc >= 0x100 ||
3653                  (!(data->start_class->flags & ANYOF_LOCALE)
3654                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3655                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3656                 {
3657                     compat = 0;
3658                 }
3659                 ANYOF_CLASS_ZERO(data->start_class);
3660                 ANYOF_BITMAP_ZERO(data->start_class);
3661                 if (compat) {
3662                     ANYOF_BITMAP_SET(data->start_class, uc);
3663                     CLEAR_SSC_EOS(data->start_class);
3664                     if (OP(scan) == EXACTFL) {
3665                         /* XXX This set is probably no longer necessary, and
3666                          * probably wrong as LOCALE now is on in the initial
3667                          * state */
3668                         data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3669                     }
3670                     else {
3671
3672                         /* Also set the other member of the fold pair.  In case
3673                          * that unicode semantics is called for at runtime, use
3674                          * the full latin1 fold.  (Can't do this for locale,
3675                          * because not known until runtime) */
3676                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3677
3678                         /* All other (EXACTFL handled above) folds except under
3679                          * /iaa that include s, S, and sharp_s also may include
3680                          * the others */
3681                         if (OP(scan) != EXACTFA) {
3682                             if (uc == 's' || uc == 'S') {
3683                                 ANYOF_BITMAP_SET(data->start_class,
3684                                                  LATIN_SMALL_LETTER_SHARP_S);
3685                             }
3686                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3687                                 ANYOF_BITMAP_SET(data->start_class, 's');
3688                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3689                             }
3690                         }
3691                     }
3692                 }
3693                 else if (uc >= 0x100) {
3694                     int i;
3695                     for (i = 0; i < 256; i++){
3696                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3697                             ANYOF_BITMAP_SET(data->start_class, i);
3698                         }
3699                     }
3700                 }
3701             }
3702             else if (flags & SCF_DO_STCLASS_OR) {
3703                 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3704                     /* false positive possible if the class is case-folded.
3705                        Assume that the locale settings are the same... */
3706                     if (uc < 0x100) {
3707                         ANYOF_BITMAP_SET(data->start_class, uc);
3708                         if (OP(scan) != EXACTFL) {
3709
3710                             /* And set the other member of the fold pair, but
3711                              * can't do that in locale because not known until
3712                              * run-time */
3713                             ANYOF_BITMAP_SET(data->start_class,
3714                                              PL_fold_latin1[uc]);
3715
3716                             /* All folds except under /iaa that include s, S,
3717                              * and sharp_s also may include the others */
3718                             if (OP(scan) != EXACTFA) {
3719                                 if (uc == 's' || uc == 'S') {
3720                                     ANYOF_BITMAP_SET(data->start_class,
3721                                                    LATIN_SMALL_LETTER_SHARP_S);
3722                                 }
3723                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3724                                     ANYOF_BITMAP_SET(data->start_class, 's');
3725                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3726                                 }
3727                             }
3728                         }
3729                     }
3730                     CLEAR_SSC_EOS(data->start_class);
3731                 }
3732                 cl_and(data->start_class, and_withp);
3733             }
3734             flags &= ~SCF_DO_STCLASS;
3735         }
3736         else if (REGNODE_VARIES(OP(scan))) {
3737             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3738             I32 f = flags, pos_before = 0;
3739             regnode * const oscan = scan;
3740             struct regnode_charclass_class this_class;
3741             struct regnode_charclass_class *oclass = NULL;
3742             I32 next_is_eval = 0;
3743
3744             switch (PL_regkind[OP(scan)]) {
3745             case WHILEM:                /* End of (?:...)* . */
3746                 scan = NEXTOPER(scan);
3747                 goto finish;
3748             case PLUS:
3749                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3750                     next = NEXTOPER(scan);
3751                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3752                         mincount = 1;
3753                         maxcount = REG_INFTY;
3754                         next = regnext(scan);
3755                         scan = NEXTOPER(scan);
3756                         goto do_curly;
3757                     }
3758                 }
3759                 if (flags & SCF_DO_SUBSTR)
3760                     data->pos_min++;
3761                 min++;
3762                 /* Fall through. */
3763             case STAR:
3764                 if (flags & SCF_DO_STCLASS) {
3765                     mincount = 0;
3766                     maxcount = REG_INFTY;
3767                     next = regnext(scan);
3768                     scan = NEXTOPER(scan);
3769                     goto do_curly;
3770                 }
3771                 is_inf = is_inf_internal = 1;
3772                 scan = regnext(scan);
3773                 if (flags & SCF_DO_SUBSTR) {
3774                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3775                     data->longest = &(data->longest_float);
3776                 }
3777                 goto optimize_curly_tail;
3778             case CURLY:
3779                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3780                     && (scan->flags == stopparen))
3781                 {
3782                     mincount = 1;
3783                     maxcount = 1;
3784                 } else {
3785                     mincount = ARG1(scan);
3786                     maxcount = ARG2(scan);
3787                 }
3788                 next = regnext(scan);
3789                 if (OP(scan) == CURLYX) {
3790                     I32 lp = (data ? *(data->last_closep) : 0);
3791                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3792                 }
3793                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3794                 next_is_eval = (OP(scan) == EVAL);
3795               do_curly:
3796                 if (flags & SCF_DO_SUBSTR) {
3797                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3798                     pos_before = data->pos_min;
3799                 }
3800                 if (data) {
3801                     fl = data->flags;
3802                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3803                     if (is_inf)
3804                         data->flags |= SF_IS_INF;
3805                 }
3806                 if (flags & SCF_DO_STCLASS) {
3807                     cl_init(pRExC_state, &this_class);
3808                     oclass = data->start_class;
3809                     data->start_class = &this_class;
3810                     f |= SCF_DO_STCLASS_AND;
3811                     f &= ~SCF_DO_STCLASS_OR;
3812                 }
3813                 /* Exclude from super-linear cache processing any {n,m}
3814                    regops for which the combination of input pos and regex
3815                    pos is not enough information to determine if a match
3816                    will be possible.
3817
3818                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3819                    regex pos at the \s*, the prospects for a match depend not
3820                    only on the input position but also on how many (bar\s*)
3821                    repeats into the {4,8} we are. */
3822                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3823                     f &= ~SCF_WHILEM_VISITED_POS;
3824
3825                 /* This will finish on WHILEM, setting scan, or on NULL: */
3826                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3827                                       last, data, stopparen, recursed, NULL,
3828                                       (mincount == 0
3829                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3830
3831                 if (flags & SCF_DO_STCLASS)
3832                     data->start_class = oclass;
3833                 if (mincount == 0 || minnext == 0) {
3834                     if (flags & SCF_DO_STCLASS_OR) {
3835                         cl_or(pRExC_state, data->start_class, &this_class);
3836                     }
3837                     else if (flags & SCF_DO_STCLASS_AND) {
3838                         /* Switch to OR mode: cache the old value of
3839                          * data->start_class */
3840                         INIT_AND_WITHP;
3841                         StructCopy(data->start_class, and_withp,
3842                                    struct regnode_charclass_class);
3843                         flags &= ~SCF_DO_STCLASS_AND;
3844                         StructCopy(&this_class, data->start_class,
3845                                    struct regnode_charclass_class);
3846                         flags |= SCF_DO_STCLASS_OR;
3847                         SET_SSC_EOS(data->start_class);
3848                     }
3849                 } else {                /* Non-zero len */
3850                     if (flags & SCF_DO_STCLASS_OR) {
3851                         cl_or(pRExC_state, data->start_class, &this_class);
3852                         cl_and(data->start_class, and_withp);
3853                     }
3854                     else if (flags & SCF_DO_STCLASS_AND)
3855                         cl_and(data->start_class, &this_class);
3856                     flags &= ~SCF_DO_STCLASS;
3857                 }
3858                 if (!scan)              /* It was not CURLYX, but CURLY. */
3859                     scan = next;
3860                 if ( /* ? quantifier ok, except for (?{ ... }) */
3861                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3862                     && (minnext == 0) && (deltanext == 0)
3863                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3864                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3865                 {
3866                     /* Fatal warnings may leak the regexp without this: */
3867                     SAVEFREESV(RExC_rx_sv);
3868                     ckWARNreg(RExC_parse,
3869                               "Quantifier unexpected on zero-length expression");
3870                     (void)ReREFCNT_inc(RExC_rx_sv);
3871                 }
3872
3873                 min += minnext * mincount;
3874                 is_inf_internal |= ((maxcount == REG_INFTY
3875                                      && (minnext + deltanext) > 0)
3876                                     || deltanext == I32_MAX);
3877                 is_inf |= is_inf_internal;
3878                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3879
3880                 /* Try powerful optimization CURLYX => CURLYN. */
3881                 if (  OP(oscan) == CURLYX && data
3882                       && data->flags & SF_IN_PAR
3883                       && !(data->flags & SF_HAS_EVAL)
3884                       && !deltanext && minnext == 1 ) {
3885                     /* Try to optimize to CURLYN.  */
3886                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3887                     regnode * const nxt1 = nxt;
3888 #ifdef DEBUGGING
3889                     regnode *nxt2;
3890 #endif
3891
3892                     /* Skip open. */
3893                     nxt = regnext(nxt);
3894                     if (!REGNODE_SIMPLE(OP(nxt))
3895                         && !(PL_regkind[OP(nxt)] == EXACT
3896                              && STR_LEN(nxt) == 1))
3897                         goto nogo;
3898 #ifdef DEBUGGING
3899                     nxt2 = nxt;
3900 #endif
3901                     nxt = regnext(nxt);
3902                     if (OP(nxt) != CLOSE)
3903                         goto nogo;
3904                     if (RExC_open_parens) {
3905                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3906                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3907                     }
3908                     /* Now we know that nxt2 is the only contents: */
3909                     oscan->flags = (U8)ARG(nxt);
3910                     OP(oscan) = CURLYN;
3911                     OP(nxt1) = NOTHING; /* was OPEN. */
3912
3913 #ifdef DEBUGGING
3914                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3915                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3916                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3917                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3918                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3919                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3920 #endif
3921                 }
3922               nogo:
3923
3924                 /* Try optimization CURLYX => CURLYM. */
3925                 if (  OP(oscan) == CURLYX && data
3926                       && !(data->flags & SF_HAS_PAR)
3927                       && !(data->flags & SF_HAS_EVAL)
3928                       && !deltanext     /* atom is fixed width */
3929                       && minnext != 0   /* CURLYM can't handle zero width */
3930                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3931                 ) {
3932                     /* XXXX How to optimize if data == 0? */
3933                     /* Optimize to a simpler form.  */
3934                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3935                     regnode *nxt2;
3936
3937                     OP(oscan) = CURLYM;
3938                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3939                             && (OP(nxt2) != WHILEM))
3940                         nxt = nxt2;
3941                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3942                     /* Need to optimize away parenths. */
3943                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3944                         /* Set the parenth number.  */
3945                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3946
3947                         oscan->flags = (U8)ARG(nxt);
3948                         if (RExC_open_parens) {
3949                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3950                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3951                         }
3952                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3953                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3954
3955 #ifdef DEBUGGING
3956                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3957                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3958                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3959                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3960 #endif
3961 #if 0
3962                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3963                             regnode *nnxt = regnext(nxt1);
3964                             if (nnxt == nxt) {
3965                                 if (reg_off_by_arg[OP(nxt1)])
3966                                     ARG_SET(nxt1, nxt2 - nxt1);
3967                                 else if (nxt2 - nxt1 < U16_MAX)
3968                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3969                                 else
3970                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3971                             }
3972                             nxt1 = nnxt;
3973                         }
3974 #endif
3975                         /* Optimize again: */
3976                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3977                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3978                     }
3979                     else
3980                         oscan->flags = 0;
3981                 }
3982                 else if ((OP(oscan) == CURLYX)
3983                          && (flags & SCF_WHILEM_VISITED_POS)
3984                          /* See the comment on a similar expression above.
3985                             However, this time it's not a subexpression
3986                             we care about, but the expression itself. */
3987                          && (maxcount == REG_INFTY)
3988                          && data && ++data->whilem_c < 16) {
3989                     /* This stays as CURLYX, we can put the count/of pair. */
3990                     /* Find WHILEM (as in regexec.c) */
3991                     regnode *nxt = oscan + NEXT_OFF(oscan);
3992
3993                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3994                         nxt += ARG(nxt);
3995                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3996                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3997                 }
3998                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3999                     pars++;
4000                 if (flags & SCF_DO_SUBSTR) {
4001                     SV *last_str = NULL;
4002                     int counted = mincount != 0;
4003
4004                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4005 #if defined(SPARC64_GCC_WORKAROUND)
4006                         I32 b = 0;
4007                         STRLEN l = 0;
4008                         const char *s = NULL;
4009                         I32 old = 0;
4010
4011                         if (pos_before >= data->last_start_min)
4012                             b = pos_before;
4013                         else
4014                             b = data->last_start_min;
4015
4016                         l = 0;
4017                         s = SvPV_const(data->last_found, l);
4018                         old = b - data->last_start_min;
4019
4020 #else
4021                         I32 b = pos_before >= data->last_start_min
4022                             ? pos_before : data->last_start_min;
4023                         STRLEN l;
4024                         const char * const s = SvPV_const(data->last_found, l);
4025                         I32 old = b - data->last_start_min;
4026 #endif
4027
4028                         if (UTF)
4029                             old = utf8_hop((U8*)s, old) - (U8*)s;
4030                         l -= old;
4031                         /* Get the added string: */
4032                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4033                         if (deltanext == 0 && pos_before == b) {
4034                             /* What was added is a constant string */
4035                             if (mincount > 1) {
4036                                 SvGROW(last_str, (mincount * l) + 1);
4037                                 repeatcpy(SvPVX(last_str) + l,
4038                                           SvPVX_const(last_str), l, mincount - 1);
4039                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4040                                 /* Add additional parts. */
4041                                 SvCUR_set(data->last_found,
4042                                           SvCUR(data->last_found) - l);
4043                                 sv_catsv(data->last_found, last_str);
4044                                 {
4045                                     SV * sv = data->last_found;
4046                                     MAGIC *mg =
4047                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4048                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4049                                     if (mg && mg->mg_len >= 0)
4050                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4051                                 }
4052                                 data->last_end += l * (mincount - 1);
4053                             }
4054                         } else {
4055                             /* start offset must point into the last copy */
4056                             data->last_start_min += minnext * (mincount - 1);
4057                             data->last_start_max += is_inf ? I32_MAX
4058                                 : (maxcount - 1) * (minnext + data->pos_delta);
4059                         }
4060                     }
4061                     /* It is counted once already... */
4062                     data->pos_min += minnext * (mincount - counted);
4063                     data->pos_delta += - counted * deltanext +
4064                         (minnext + deltanext) * maxcount - minnext * mincount;
4065                     if (mincount != maxcount) {
4066                          /* Cannot extend fixed substrings found inside
4067                             the group.  */
4068                         SCAN_COMMIT(pRExC_state,data,minlenp);
4069                         if (mincount && last_str) {
4070                             SV * const sv = data->last_found;
4071                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4072                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4073
4074                             if (mg)
4075                                 mg->mg_len = -1;
4076                             sv_setsv(sv, last_str);
4077                             data->last_end = data->pos_min;
4078                             data->last_start_min =
4079                                 data->pos_min - CHR_SVLEN(last_str);
4080                             data->last_start_max = is_inf
4081                                 ? I32_MAX
4082                                 : data->pos_min + data->pos_delta
4083                                 - CHR_SVLEN(last_str);
4084                         }
4085                         data->longest = &(data->longest_float);
4086                     }
4087                     SvREFCNT_dec(last_str);
4088                 }
4089                 if (data && (fl & SF_HAS_EVAL))
4090                     data->flags |= SF_HAS_EVAL;
4091               optimize_curly_tail:
4092                 if (OP(oscan) != CURLYX) {
4093                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4094                            && NEXT_OFF(next))
4095                         NEXT_OFF(oscan) += NEXT_OFF(next);
4096                 }
4097                 continue;
4098             default:                    /* REF, ANYOFV, and CLUMP only? */
4099                 if (flags & SCF_DO_SUBSTR) {
4100                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4101                     data->longest = &(data->longest_float);
4102                 }
4103                 is_inf = is_inf_internal = 1;
4104                 if (flags & SCF_DO_STCLASS_OR)
4105                     cl_anything(pRExC_state, data->start_class);
4106                 flags &= ~SCF_DO_STCLASS;
4107                 break;
4108             }
4109         }
4110         else if (OP(scan) == LNBREAK) {
4111             if (flags & SCF_DO_STCLASS) {
4112                 int value = 0;
4113                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4114                 if (flags & SCF_DO_STCLASS_AND) {
4115                     for (value = 0; value < 256; value++)
4116                         if (!is_VERTWS_cp(value))
4117                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4118                 }
4119                 else {
4120                     for (value = 0; value < 256; value++)
4121                         if (is_VERTWS_cp(value))
4122                             ANYOF_BITMAP_SET(data->start_class, value);
4123                 }
4124                 if (flags & SCF_DO_STCLASS_OR)
4125                     cl_and(data->start_class, and_withp);
4126                 flags &= ~SCF_DO_STCLASS;
4127             }
4128             min++;
4129             delta++;    /* Because of the 2 char string cr-lf */
4130             if (flags & SCF_DO_SUBSTR) {
4131                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4132                 data->pos_min += 1;
4133                 data->pos_delta += 1;
4134                 data->longest = &(data->longest_float);
4135             }
4136         }
4137         else if (REGNODE_SIMPLE(OP(scan))) {
4138             int value = 0;
4139
4140             if (flags & SCF_DO_SUBSTR) {
4141                 SCAN_COMMIT(pRExC_state,data,minlenp);
4142                 data->pos_min++;
4143             }
4144             min++;
4145             if (flags & SCF_DO_STCLASS) {
4146                 int loop_max = 256;
4147                 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4148
4149                 /* Some of the logic below assumes that switching
4150                    locale on will only add false positives. */
4151                 switch (PL_regkind[OP(scan)]) {
4152                     U8 classnum;
4153
4154                 case SANY:
4155                 default:
4156 #ifdef DEBUGGING
4157                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4158 #endif
4159                  do_default:
4160                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4161                         cl_anything(pRExC_state, data->start_class);
4162                     break;
4163                 case REG_ANY:
4164                     if (OP(scan) == SANY)
4165                         goto do_default;
4166                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4167                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4168                                 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4169                         cl_anything(pRExC_state, data->start_class);
4170                     }
4171                     if (flags & SCF_DO_STCLASS_AND || !value)
4172                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4173                     break;
4174                 case ANYOF:
4175                     if (flags & SCF_DO_STCLASS_AND)
4176                         cl_and(data->start_class,
4177                                (struct regnode_charclass_class*)scan);
4178                     else
4179                         cl_or(pRExC_state, data->start_class,
4180                               (struct regnode_charclass_class*)scan);
4181                     break;
4182                 case POSIXA:
4183                     loop_max = 128;
4184                     /* FALL THROUGH */
4185                 case POSIXL:
4186                 case POSIXD:
4187                 case POSIXU:
4188                     classnum = FLAGS(scan);
4189                     if (flags & SCF_DO_STCLASS_AND) {
4190                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4191                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4192                             for (value = 0; value < loop_max; value++) {
4193                                 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4194                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4195                                 }
4196                             }
4197                         }
4198                     }
4199                     else {
4200                         if (data->start_class->flags & ANYOF_LOCALE) {
4201                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4202                         }
4203                         else {
4204
4205                         /* Even if under locale, set the bits for non-locale
4206                          * in case it isn't a true locale-node.  This will
4207                          * create false positives if it truly is locale */
4208                         for (value = 0; value < loop_max; value++) {
4209                             if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4210                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4211                             }
4212                         }
4213                         }
4214                     }
4215                     break;
4216                 case NPOSIXA:
4217                     loop_max = 128;
4218                     /* FALL THROUGH */
4219                 case NPOSIXL:
4220                 case NPOSIXU:
4221                 case NPOSIXD:
4222                     classnum = FLAGS(scan);
4223                     if (flags & SCF_DO_STCLASS_AND) {
4224                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4225                             ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4226                             for (value = 0; value < loop_max; value++) {
4227                                 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4228                                     ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4229                                 }
4230                             }
4231                         }
4232                     }
4233                     else {
4234                         if (data->start_class->flags & ANYOF_LOCALE) {
4235                             ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4236                         }
4237                         else {
4238
4239                         /* Even if under locale, set the bits for non-locale in
4240                          * case it isn't a true locale-node.  This will create
4241                          * false positives if it truly is locale */
4242                         for (value = 0; value < loop_max; value++) {
4243                             if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4244                                 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4245                             }
4246                         }
4247                         if (PL_regkind[OP(scan)] == NPOSIXD) {
4248                             data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4249                         }
4250                         }
4251                     }
4252                     break;
4253                 }
4254                 if (flags & SCF_DO_STCLASS_OR)
4255                     cl_and(data->start_class, and_withp);
4256                 flags &= ~SCF_DO_STCLASS;
4257             }
4258         }
4259         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4260             data->flags |= (OP(scan) == MEOL
4261                             ? SF_BEFORE_MEOL
4262                             : SF_BEFORE_SEOL);
4263             SCAN_COMMIT(pRExC_state, data, minlenp);
4264
4265         }
4266         else if (  PL_regkind[OP(scan)] == BRANCHJ
4267                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4268                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4269                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4270             if ( OP(scan) == UNLESSM &&
4271                  scan->flags == 0 &&
4272                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4273                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4274             ) {
4275                 regnode *opt;
4276                 regnode *upto= regnext(scan);
4277                 DEBUG_PARSE_r({
4278                     SV * const mysv_val=sv_newmortal();
4279                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4280
4281                     /*DEBUG_PARSE_MSG("opfail");*/
4282                     regprop(RExC_rx, mysv_val, upto);
4283                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4284                                   SvPV_nolen_const(mysv_val),
4285                                   (IV)REG_NODE_NUM(upto),
4286                                   (IV)(upto - scan)
4287                     );
4288                 });
4289                 OP(scan) = OPFAIL;
4290                 NEXT_OFF(scan) = upto - scan;
4291                 for (opt= scan + 1; opt < upto ; opt++)
4292                     OP(opt) = OPTIMIZED;
4293                 scan= upto;
4294                 continue;
4295             }
4296             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4297                 || OP(scan) == UNLESSM )
4298             {
4299                 /* Negative Lookahead/lookbehind
4300                    In this case we can't do fixed string optimisation.
4301                 */
4302
4303                 I32 deltanext, minnext, fake = 0;
4304                 regnode *nscan;
4305                 struct regnode_charclass_class intrnl;
4306                 int f = 0;
4307
4308                 data_fake.flags = 0;
4309                 if (data) {
4310                     data_fake.whilem_c = data->whilem_c;
4311                     data_fake.last_closep = data->last_closep;
4312                 }
4313                 else
4314                     data_fake.last_closep = &fake;
4315                 data_fake.pos_delta = delta;
4316                 if ( flags & SCF_DO_STCLASS && !scan->flags
4317                      && OP(scan) == IFMATCH ) { /* Lookahead */
4318                     cl_init(pRExC_state, &intrnl);
4319                     data_fake.start_class = &intrnl;
4320                     f |= SCF_DO_STCLASS_AND;
4321                 }
4322                 if (flags & SCF_WHILEM_VISITED_POS)
4323                     f |= SCF_WHILEM_VISITED_POS;
4324                 next = regnext(scan);
4325                 nscan = NEXTOPER(NEXTOPER(scan));
4326                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4327                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4328                 if (scan->flags) {
4329                     if (deltanext) {
4330                         FAIL("Variable length lookbehind not implemented");
4331                     }
4332                     else if (minnext > (I32)U8_MAX) {
4333                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4334                     }
4335                     scan->flags = (U8)minnext;
4336                 }
4337                 if (data) {
4338                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4339                         pars++;
4340                     if (data_fake.flags & SF_HAS_EVAL)
4341                         data->flags |= SF_HAS_EVAL;
4342                     data->whilem_c = data_fake.whilem_c;
4343                 }
4344                 if (f & SCF_DO_STCLASS_AND) {
4345                     if (flags & SCF_DO_STCLASS_OR) {
4346                         /* OR before, AND after: ideally we would recurse with
4347                          * data_fake to get the AND applied by study of the
4348                          * remainder of the pattern, and then derecurse;
4349                          * *** HACK *** for now just treat as "no information".
4350                          * See [perl #56690].
4351                          */
4352                         cl_init(pRExC_state, data->start_class);
4353                     }  else {
4354                         /* AND before and after: combine and continue */
4355                         const int was = TEST_SSC_EOS(data->start_class);
4356
4357                         cl_and(data->start_class, &intrnl);
4358                         if (was)
4359                             SET_SSC_EOS(data->start_class);
4360                     }
4361                 }
4362             }
4363 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4364             else {
4365                 /* Positive Lookahead/lookbehind
4366                    In this case we can do fixed string optimisation,
4367                    but we must be careful about it. Note in the case of
4368                    lookbehind the positions will be offset by the minimum
4369                    length of the pattern, something we won't know about
4370                    until after the recurse.
4371                 */
4372                 I32 deltanext, fake = 0;
4373                 regnode *nscan;
4374                 struct regnode_charclass_class intrnl;
4375                 int f = 0;
4376                 /* We use SAVEFREEPV so that when the full compile
4377                     is finished perl will clean up the allocated
4378                     minlens when it's all done. This way we don't
4379                     have to worry about freeing them when we know
4380                     they wont be used, which would be a pain.
4381                  */
4382                 I32 *minnextp;
4383                 Newx( minnextp, 1, I32 );
4384                 SAVEFREEPV(minnextp);
4385
4386                 if (data) {
4387                     StructCopy(data, &data_fake, scan_data_t);
4388                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4389                         f |= SCF_DO_SUBSTR;
4390                         if (scan->flags)
4391                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4392                         data_fake.last_found=newSVsv(data->last_found);
4393                     }
4394                 }
4395                 else
4396                     data_fake.last_closep = &fake;
4397                 data_fake.flags = 0;
4398                 data_fake.pos_delta = delta;
4399                 if (is_inf)
4400                     data_fake.flags |= SF_IS_INF;
4401                 if ( flags & SCF_DO_STCLASS && !scan->flags
4402                      && OP(scan) == IFMATCH ) { /* Lookahead */
4403                     cl_init(pRExC_state, &intrnl);
4404                     data_fake.start_class = &intrnl;
4405                     f |= SCF_DO_STCLASS_AND;
4406                 }
4407                 if (flags & SCF_WHILEM_VISITED_POS)
4408                     f |= SCF_WHILEM_VISITED_POS;
4409                 next = regnext(scan);
4410                 nscan = NEXTOPER(NEXTOPER(scan));
4411
4412                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4413                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4414                 if (scan->flags) {
4415                     if (deltanext) {
4416                         FAIL("Variable length lookbehind not implemented");
4417                     }
4418                     else if (*minnextp > (I32)U8_MAX) {
4419                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4420                     }
4421                     scan->flags = (U8)*minnextp;
4422                 }
4423
4424                 *minnextp += min;
4425
4426                 if (f & SCF_DO_STCLASS_AND) {
4427                     const int was = TEST_SSC_EOS(data.start_class);
4428
4429                     cl_and(data->start_class, &intrnl);
4430                     if (was)
4431                         SET_SSC_EOS(data->start_class);
4432                 }
4433                 if (data) {
4434                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4435                         pars++;
4436                     if (data_fake.flags & SF_HAS_EVAL)
4437                         data->flags |= SF_HAS_EVAL;
4438                     data->whilem_c = data_fake.whilem_c;
4439                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4440                         if (RExC_rx->minlen<*minnextp)
4441                             RExC_rx->minlen=*minnextp;
4442                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4443                         SvREFCNT_dec_NN(data_fake.last_found);
4444
4445                         if ( data_fake.minlen_fixed != minlenp )
4446                         {
4447                             data->offset_fixed= data_fake.offset_fixed;
4448                             data->minlen_fixed= data_fake.minlen_fixed;
4449                             data->lookbehind_fixed+= scan->flags;
4450                         }
4451                         if ( data_fake.minlen_float != minlenp )
4452                         {
4453                             data->minlen_float= data_fake.minlen_float;
4454                             data->offset_float_min=data_fake.offset_float_min;
4455                             data->offset_float_max=data_fake.offset_float_max;
4456                             data->lookbehind_float+= scan->flags;
4457                         }
4458                     }
4459                 }
4460             }
4461 #endif
4462         }
4463         else if (OP(scan) == OPEN) {
4464             if (stopparen != (I32)ARG(scan))
4465                 pars++;
4466         }
4467         else if (OP(scan) == CLOSE) {
4468             if (stopparen == (I32)ARG(scan)) {
4469                 break;
4470             }
4471             if ((I32)ARG(scan) == is_par) {
4472                 next = regnext(scan);
4473
4474                 if ( next && (OP(next) != WHILEM) && next < last)
4475                     is_par = 0;         /* Disable optimization */
4476             }
4477             if (data)
4478                 *(data->last_closep) = ARG(scan);
4479         }
4480         else if (OP(scan) == EVAL) {
4481                 if (data)
4482                     data->flags |= SF_HAS_EVAL;
4483         }
4484         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4485             if (flags & SCF_DO_SUBSTR) {
4486                 SCAN_COMMIT(pRExC_state,data,minlenp);
4487                 flags &= ~SCF_DO_SUBSTR;
4488             }
4489             if (data && OP(scan)==ACCEPT) {
4490                 data->flags |= SCF_SEEN_ACCEPT;
4491                 if (stopmin > min)
4492                     stopmin = min;
4493             }
4494         }
4495         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4496         {
4497                 if (flags & SCF_DO_SUBSTR) {
4498                     SCAN_COMMIT(pRExC_state,data,minlenp);
4499                     data->longest = &(data->longest_float);
4500                 }
4501                 is_inf = is_inf_internal = 1;
4502                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4503                     cl_anything(pRExC_state, data->start_class);
4504                 flags &= ~SCF_DO_STCLASS;
4505         }
4506         else if (OP(scan) == GPOS) {
4507             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4508                 !(delta || is_inf || (data && data->pos_delta)))
4509             {
4510                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4511                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4512                 if (RExC_rx->gofs < (U32)min)
4513                     RExC_rx->gofs = min;
4514             } else {
4515                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4516                 RExC_rx->gofs = 0;
4517             }
4518         }
4519 #ifdef TRIE_STUDY_OPT
4520 #ifdef FULL_TRIE_STUDY
4521         else if (PL_regkind[OP(scan)] == TRIE) {
4522             /* NOTE - There is similar code to this block above for handling
4523                BRANCH nodes on the initial study.  If you change stuff here
4524                check there too. */
4525             regnode *trie_node= scan;
4526             regnode *tail= regnext(scan);
4527             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4528             I32 max1 = 0, min1 = I32_MAX;
4529             struct regnode_charclass_class accum;
4530
4531             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4532                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4533             if (flags & SCF_DO_STCLASS)
4534                 cl_init_zero(pRExC_state, &accum);
4535
4536             if (!trie->jump) {
4537                 min1= trie->minlen;
4538                 max1= trie->maxlen;
4539             } else {
4540                 const regnode *nextbranch= NULL;
4541                 U32 word;
4542
4543                 for ( word=1 ; word <= trie->wordcount ; word++)
4544                 {
4545                     I32 deltanext=0, minnext=0, f = 0, fake;
4546                     struct regnode_charclass_class this_class;
4547
4548                     data_fake.flags = 0;
4549                     if (data) {
4550                         data_fake.whilem_c = data->whilem_c;
4551                         data_fake.last_closep = data->last_closep;
4552                     }
4553                     else
4554                         data_fake.last_closep = &fake;
4555                     data_fake.pos_delta = delta;
4556                     if (flags & SCF_DO_STCLASS) {
4557                         cl_init(pRExC_state, &this_class);
4558                         data_fake.start_class = &this_class;
4559                         f = SCF_DO_STCLASS_AND;
4560                     }
4561                     if (flags & SCF_WHILEM_VISITED_POS)
4562                         f |= SCF_WHILEM_VISITED_POS;
4563
4564                     if (trie->jump[word]) {
4565                         if (!nextbranch)
4566                             nextbranch = trie_node + trie->jump[0];
4567                         scan= trie_node + trie->jump[word];
4568                         /* We go from the jump point to the branch that follows
4569                            it. Note this means we need the vestigal unused branches
4570                            even though they arent otherwise used.
4571                          */
4572                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4573                             &deltanext, (regnode *)nextbranch, &data_fake,
4574                             stopparen, recursed, NULL, f,depth+1);
4575                     }
4576                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4577                         nextbranch= regnext((regnode*)nextbranch);
4578
4579                     if (min1 > (I32)(minnext + trie->minlen))
4580                         min1 = minnext + trie->minlen;
4581                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4582                         max1 = minnext + deltanext + trie->maxlen;
4583                     if (deltanext == I32_MAX)
4584                         is_inf = is_inf_internal = 1;
4585
4586                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4587                         pars++;
4588                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4589                         if ( stopmin > min + min1)
4590                             stopmin = min + min1;
4591                         flags &= ~SCF_DO_SUBSTR;
4592                         if (data)
4593                             data->flags |= SCF_SEEN_ACCEPT;
4594                     }
4595                     if (data) {
4596                         if (data_fake.flags & SF_HAS_EVAL)
4597                             data->flags |= SF_HAS_EVAL;
4598                         data->whilem_c = data_fake.whilem_c;
4599                     }
4600                     if (flags & SCF_DO_STCLASS)
4601                         cl_or(pRExC_state, &accum, &this_class);
4602                 }
4603             }
4604             if (flags & SCF_DO_SUBSTR) {
4605                 data->pos_min += min1;
4606                 data->pos_delta += max1 - min1;
4607                 if (max1 != min1 || is_inf)
4608                     data->longest = &(data->longest_float);
4609             }
4610             min += min1;
4611             delta += max1 - min1;
4612             if (flags & SCF_DO_STCLASS_OR) {
4613                 cl_or(pRExC_state, data->start_class, &accum);
4614                 if (min1) {
4615                     cl_and(data->start_class, and_withp);
4616                     flags &= ~SCF_DO_STCLASS;
4617                 }
4618             }
4619             else if (flags & SCF_DO_STCLASS_AND) {
4620                 if (min1) {
4621                     cl_and(data->start_class, &accum);
4622                     flags &= ~SCF_DO_STCLASS;
4623                 }
4624                 else {
4625                     /* Switch to OR mode: cache the old value of
4626                      * data->start_class */
4627                     INIT_AND_WITHP;
4628                     StructCopy(data->start_class, and_withp,
4629                                struct regnode_charclass_class);
4630                     flags &= ~SCF_DO_STCLASS_AND;
4631                     StructCopy(&accum, data->start_class,
4632                                struct regnode_charclass_class);
4633                     flags |= SCF_DO_STCLASS_OR;
4634                     SET_SSC_EOS(data->start_class);
4635                 }
4636             }
4637             scan= tail;
4638             continue;
4639         }
4640 #else
4641         else if (PL_regkind[OP(scan)] == TRIE) {
4642             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4643             U8*bang=NULL;
4644
4645             min += trie->minlen;
4646             delta += (trie->maxlen - trie->minlen);
4647             flags &= ~SCF_DO_STCLASS; /* xxx */
4648             if (flags & SCF_DO_SUBSTR) {
4649                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4650                 data->pos_min += trie->minlen;
4651                 data->pos_delta += (trie->maxlen - trie->minlen);
4652                 if (trie->maxlen != trie->minlen)
4653                     data->longest = &(data->longest_float);
4654             }
4655             if (trie->jump) /* no more substrings -- for now /grr*/
4656                 flags &= ~SCF_DO_SUBSTR;
4657         }
4658 #endif /* old or new */
4659 #endif /* TRIE_STUDY_OPT */
4660
4661         /* Else: zero-length, ignore. */
4662         scan = regnext(scan);
4663     }
4664     if (frame) {
4665         last = frame->last;
4666         scan = frame->next;
4667         stopparen = frame->stop;
4668         frame = frame->prev;
4669         goto fake_study_recurse;
4670     }
4671
4672   finish:
4673     assert(!frame);
4674     DEBUG_STUDYDATA("pre-fin:",data,depth);
4675
4676     *scanp = scan;
4677     *deltap = is_inf_internal ? I32_MAX : delta;
4678     if (flags & SCF_DO_SUBSTR && is_inf)
4679         data->pos_delta = I32_MAX - data->pos_min;
4680     if (is_par > (I32)U8_MAX)
4681         is_par = 0;
4682     if (is_par && pars==1 && data) {
4683         data->flags |= SF_IN_PAR;
4684         data->flags &= ~SF_HAS_PAR;
4685     }
4686     else if (pars && data) {
4687         data->flags |= SF_HAS_PAR;
4688         data->flags &= ~SF_IN_PAR;
4689     }
4690     if (flags & SCF_DO_STCLASS_OR)
4691         cl_and(data->start_class, and_withp);
4692     if (flags & SCF_TRIE_RESTUDY)
4693         data->flags |=  SCF_TRIE_RESTUDY;
4694
4695     DEBUG_STUDYDATA("post-fin:",data,depth);
4696
4697     return min < stopmin ? min : stopmin;
4698 }
4699
4700 STATIC U32
4701 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4702 {
4703     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4704
4705     PERL_ARGS_ASSERT_ADD_DATA;
4706
4707     Renewc(RExC_rxi->data,
4708            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4709            char, struct reg_data);
4710     if(count)
4711         Renew(RExC_rxi->data->what, count + n, U8);
4712     else
4713         Newx(RExC_rxi->data->what, n, U8);
4714     RExC_rxi->data->count = count + n;
4715     Copy(s, RExC_rxi->data->what + count, n, U8);
4716     return count;
4717 }
4718
4719 /*XXX: todo make this not included in a non debugging perl */
4720 #ifndef PERL_IN_XSUB_RE
4721 void
4722 Perl_reginitcolors(pTHX)
4723 {
4724     dVAR;
4725     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4726     if (s) {
4727         char *t = savepv(s);
4728         int i = 0;
4729         PL_colors[0] = t;
4730         while (++i < 6) {
4731             t = strchr(t, '\t');
4732             if (t) {
4733                 *t = '\0';
4734                 PL_colors[i] = ++t;
4735             }
4736             else
4737                 PL_colors[i] = t = (char *)"";
4738         }
4739     } else {
4740         int i = 0;
4741         while (i < 6)
4742             PL_colors[i++] = (char *)"";
4743     }
4744     PL_colorset = 1;
4745 }
4746 #endif
4747
4748
4749 #ifdef TRIE_STUDY_OPT
4750 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4751     STMT_START {                                            \
4752         if (                                                \
4753               (data.flags & SCF_TRIE_RESTUDY)               \
4754               && ! restudied++                              \
4755         ) {                                                 \
4756             dOsomething;                                    \
4757             goto reStudy;                                   \
4758         }                                                   \
4759     } STMT_END
4760 #else
4761 #define CHECK_RESTUDY_GOTO_butfirst
4762 #endif
4763
4764 /*
4765  * pregcomp - compile a regular expression into internal code
4766  *
4767  * Decides which engine's compiler to call based on the hint currently in
4768  * scope
4769  */
4770
4771 #ifndef PERL_IN_XSUB_RE
4772
4773 /* return the currently in-scope regex engine (or the default if none)  */
4774
4775 regexp_engine const *
4776 Perl_current_re_engine(pTHX)
4777 {
4778     dVAR;
4779
4780     if (IN_PERL_COMPILETIME) {
4781         HV * const table = GvHV(PL_hintgv);
4782         SV **ptr;
4783
4784         if (!table)
4785             return &PL_core_reg_engine;
4786         ptr = hv_fetchs(table, "regcomp", FALSE);
4787         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4788             return &PL_core_reg_engine;
4789         return INT2PTR(regexp_engine*,SvIV(*ptr));
4790     }
4791     else {
4792         SV *ptr;
4793         if (!PL_curcop->cop_hints_hash)
4794             return &PL_core_reg_engine;
4795         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4796         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4797             return &PL_core_reg_engine;
4798         return INT2PTR(regexp_engine*,SvIV(ptr));
4799     }
4800 }
4801
4802
4803 REGEXP *
4804 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4805 {
4806     dVAR;
4807     regexp_engine const *eng = current_re_engine();
4808     GET_RE_DEBUG_FLAGS_DECL;
4809
4810     PERL_ARGS_ASSERT_PREGCOMP;
4811
4812     /* Dispatch a request to compile a regexp to correct regexp engine. */
4813     DEBUG_COMPILE_r({
4814         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4815                         PTR2UV(eng));
4816     });
4817     return CALLREGCOMP_ENG(eng, pattern, flags);
4818 }
4819 #endif
4820
4821 /* public(ish) entry point for the perl core's own regex compiling code.
4822  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4823  * pattern rather than a list of OPs, and uses the internal engine rather
4824  * than the current one */
4825
4826 REGEXP *
4827 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4828 {
4829     SV *pat = pattern; /* defeat constness! */
4830     PERL_ARGS_ASSERT_RE_COMPILE;
4831     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4832 #ifdef PERL_IN_XSUB_RE
4833                                 &my_reg_engine,
4834 #else
4835                                 &PL_core_reg_engine,
4836 #endif
4837                                 NULL, NULL, rx_flags, 0);
4838 }
4839
4840 /* see if there are any run-time code blocks in the pattern.
4841  * False positives are allowed */
4842
4843 static bool
4844 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4845                     U32 pm_flags, char *pat, STRLEN plen)
4846 {
4847     int n = 0;
4848     STRLEN s;
4849
4850     /* avoid infinitely recursing when we recompile the pattern parcelled up
4851      * as qr'...'. A single constant qr// string can't have have any
4852      * run-time component in it, and thus, no runtime code. (A non-qr
4853      * string, however, can, e.g. $x =~ '(?{})') */
4854     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4855         return 0;
4856
4857     for (s = 0; s < plen; s++) {
4858         if (n < pRExC_state->num_code_blocks
4859             && s == pRExC_state->code_blocks[n].start)
4860         {
4861             s = pRExC_state->code_blocks[n].end;
4862             n++;
4863             continue;
4864         }
4865         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4866          * positives here */
4867         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
4868             (pat[s+2] == '{'
4869                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
4870         )
4871             return 1;
4872     }
4873     return 0;
4874 }
4875
4876 /* Handle run-time code blocks. We will already have compiled any direct
4877  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4878  * copy of it, but with any literal code blocks blanked out and
4879  * appropriate chars escaped; then feed it into
4880  *
4881  *    eval "qr'modified_pattern'"
4882  *
4883  * For example,
4884  *
4885  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4886  *
4887  * becomes
4888  *
4889  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4890  *
4891  * After eval_sv()-ing that, grab any new code blocks from the returned qr
4892  * and merge them with any code blocks of the original regexp.
4893  *
4894  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4895  * instead, just save the qr and return FALSE; this tells our caller that
4896  * the original pattern needs upgrading to utf8.
4897  */
4898
4899 static bool
4900 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4901     char *pat, STRLEN plen)
4902 {
4903     SV *qr;
4904
4905     GET_RE_DEBUG_FLAGS_DECL;
4906
4907     if (pRExC_state->runtime_code_qr) {
4908         /* this is the second time we've been called; this should
4909          * only happen if the main pattern got upgraded to utf8
4910          * during compilation; re-use the qr we compiled first time
4911          * round (which should be utf8 too)
4912          */
4913         qr = pRExC_state->runtime_code_qr;
4914         pRExC_state->runtime_code_qr = NULL;
4915         assert(RExC_utf8 && SvUTF8(qr));
4916     }
4917     else {
4918         int n = 0;
4919         STRLEN s;
4920         char *p, *newpat;
4921         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4922         SV *sv, *qr_ref;
4923         dSP;
4924
4925         /* determine how many extra chars we need for ' and \ escaping */
4926         for (s = 0; s < plen; s++) {
4927             if (pat[s] == '\'' || pat[s] == '\\')
4928                 newlen++;
4929         }
4930
4931         Newx(newpat, newlen, char);
4932         p = newpat;
4933         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4934
4935         for (s = 0; s < plen; s++) {
4936             if (n < pRExC_state->num_code_blocks
4937                 && s == pRExC_state->code_blocks[n].start)
4938             {
4939                 /* blank out literal code block */
4940                 assert(pat[s] == '(');
4941                 while (s <= pRExC_state->code_blocks[n].end) {
4942                     *p++ = '_';
4943                     s++;
4944                 }
4945                 s--;
4946                 n++;
4947                 continue;
4948             }
4949             if (pat[s] == '\'' || pat[s] == '\\')
4950                 *p++ = '\\';
4951             *p++ = pat[s];
4952         }
4953         *p++ = '\'';
4954         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4955             *p++ = 'x';
4956         *p++ = '\0';
4957         DEBUG_COMPILE_r({
4958             PerlIO_printf(Perl_debug_log,
4959                 "%sre-parsing pattern for runtime code:%s %s\n",
4960                 PL_colors[4],PL_colors[5],newpat);
4961         });
4962
4963         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
4964         Safefree(newpat);
4965
4966         ENTER;
4967         SAVETMPS;
4968         save_re_context();
4969         PUSHSTACKi(PERLSI_REQUIRE);
4970         /* this causes the toker to collapse \\ into \ when parsing
4971          * qr''; normally only q'' does this. It also alters hints
4972          * handling */
4973         PL_reg_state.re_reparsing = TRUE;
4974         eval_sv(sv, G_SCALAR);
4975         SvREFCNT_dec_NN(sv);
4976         SPAGAIN;
4977         qr_ref = POPs;
4978         PUTBACK;
4979         {
4980             SV * const errsv = ERRSV;
4981             if (SvTRUE_NN(errsv))
4982             {
4983                 Safefree(pRExC_state->code_blocks);
4984                 /* use croak_sv ? */
4985                 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
4986             }
4987         }
4988         assert(SvROK(qr_ref));
4989         qr = SvRV(qr_ref);
4990         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
4991         /* the leaving below frees the tmp qr_ref.
4992          * Give qr a life of its own */
4993         SvREFCNT_inc(qr);
4994         POPSTACK;
4995         FREETMPS;
4996         LEAVE;
4997
4998     }
4999
5000     if (!RExC_utf8 && SvUTF8(qr)) {
5001         /* first time through; the pattern got upgraded; save the
5002          * qr for the next time through */
5003         assert(!pRExC_state->runtime_code_qr);
5004         pRExC_state->runtime_code_qr = qr;
5005         return 0;
5006     }
5007
5008
5009     /* extract any code blocks within the returned qr//  */
5010
5011
5012     /* merge the main (r1) and run-time (r2) code blocks into one */
5013     {
5014         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5015         struct reg_code_block *new_block, *dst;
5016         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5017         int i1 = 0, i2 = 0;
5018
5019         if (!r2->num_code_blocks) /* we guessed wrong */
5020         {
5021             SvREFCNT_dec_NN(qr);
5022             return 1;
5023         }
5024
5025         Newx(new_block,
5026             r1->num_code_blocks + r2->num_code_blocks,
5027             struct reg_code_block);
5028         dst = new_block;
5029
5030         while (    i1 < r1->num_code_blocks
5031                 || i2 < r2->num_code_blocks)
5032         {
5033             struct reg_code_block *src;
5034             bool is_qr = 0;
5035
5036             if (i1 == r1->num_code_blocks) {
5037                 src = &r2->code_blocks[i2++];
5038                 is_qr = 1;
5039             }
5040             else if (i2 == r2->num_code_blocks)
5041                 src = &r1->code_blocks[i1++];
5042             else if (  r1->code_blocks[i1].start
5043                      < r2->code_blocks[i2].start)
5044             {
5045                 src = &r1->code_blocks[i1++];
5046                 assert(src->end < r2->code_blocks[i2].start);
5047             }
5048             else {
5049                 assert(  r1->code_blocks[i1].start
5050                        > r2->code_blocks[i2].start);
5051                 src = &r2->code_blocks[i2++];
5052                 is_qr = 1;
5053                 assert(src->end < r1->code_blocks[i1].start);
5054             }
5055
5056             assert(pat[src->start] == '(');
5057             assert(pat[src->end]   == ')');
5058             dst->start      = src->start;
5059             dst->end        = src->end;
5060             dst->block      = src->block;
5061             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5062                                     : src->src_regex;
5063             dst++;
5064         }
5065         r1->num_code_blocks += r2->num_code_blocks;
5066         Safefree(r1->code_blocks);
5067         r1->code_blocks = new_block;
5068     }
5069
5070     SvREFCNT_dec_NN(qr);
5071     return 1;
5072 }
5073
5074
5075 STATIC bool
5076 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5077 {
5078     /* This is the common code for setting up the floating and fixed length
5079      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5080      * as to whether succeeded or not */
5081
5082     I32 t,ml;
5083
5084     if (! (longest_length
5085            || (eol /* Can't have SEOL and MULTI */
5086                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5087           )
5088             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5089         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5090     {
5091         return FALSE;
5092     }
5093
5094     /* copy the information about the longest from the reg_scan_data
5095         over to the program. */
5096     if (SvUTF8(sv_longest)) {
5097         *rx_utf8 = sv_longest;
5098         *rx_substr = NULL;
5099     } else {
5100         *rx_substr = sv_longest;
5101         *rx_utf8 = NULL;
5102     }
5103     /* end_shift is how many chars that must be matched that
5104         follow this item. We calculate it ahead of time as once the
5105         lookbehind offset is added in we lose the ability to correctly
5106         calculate it.*/
5107     ml = minlen ? *(minlen) : (I32)longest_length;
5108     *rx_end_shift = ml - offset
5109         - longest_length + (SvTAIL(sv_longest) != 0)
5110         + lookbehind;
5111
5112     t = (eol/* Can't have SEOL and MULTI */
5113          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5114     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5115
5116     return TRUE;
5117 }
5118
5119 /*
5120  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5121  * regular expression into internal code.
5122  * The pattern may be passed either as:
5123  *    a list of SVs (patternp plus pat_count)
5124  *    a list of OPs (expr)
5125  * If both are passed, the SV list is used, but the OP list indicates
5126  * which SVs are actually pre-compiled code blocks
5127  *
5128  * The SVs in the list have magic and qr overloading applied to them (and
5129  * the list may be modified in-place with replacement SVs in the latter
5130  * case).
5131  *
5132  * If the pattern hasn't changed from old_re, then old_re will be
5133  * returned.
5134  *
5135  * eng is the current engine. If that engine has an op_comp method, then
5136  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5137  * do the initial concatenation of arguments and pass on to the external
5138  * engine.
5139  *
5140  * If is_bare_re is not null, set it to a boolean indicating whether the
5141  * arg list reduced (after overloading) to a single bare regex which has
5142  * been returned (i.e. /$qr/).
5143  *
5144  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5145  *
5146  * pm_flags contains the PMf_* flags, typically based on those from the
5147  * pm_flags field of the related PMOP. Currently we're only interested in
5148  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5149  *
5150  * We can't allocate space until we know how big the compiled form will be,
5151  * but we can't compile it (and thus know how big it is) until we've got a
5152  * place to put the code.  So we cheat:  we compile it twice, once with code
5153  * generation turned off and size counting turned on, and once "for real".
5154  * This also means that we don't allocate space until we are sure that the
5155  * thing really will compile successfully, and we never have to move the
5156  * code and thus invalidate pointers into it.  (Note that it has to be in
5157  * one piece because free() must be able to free it all.) [NB: not true in perl]
5158  *
5159  * Beware that the optimization-preparation code in here knows about some
5160  * of the structure of the compiled regexp.  [I'll say.]
5161  */
5162
5163 REGEXP *
5164 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5165                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5166                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5167 {
5168     dVAR;
5169     REGEXP *rx;
5170     struct regexp *r;
5171     regexp_internal *ri;
5172     STRLEN plen;
5173     char  * VOL exp;
5174     char* xend;
5175     regnode *scan;
5176     I32 flags;
5177     I32 minlen = 0;
5178     U32 rx_flags;
5179     SV * VOL pat;
5180     SV * VOL code_blocksv = NULL;
5181
5182     /* these are all flags - maybe they should be turned
5183      * into a single int with different bit masks */
5184     I32 sawlookahead = 0;
5185     I32 sawplus = 0;
5186     I32 sawopen = 0;
5187     bool used_setjump = FALSE;
5188     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5189     bool code_is_utf8 = 0;
5190     bool VOL recompile = 0;
5191     bool runtime_code = 0;
5192     U8 jump_ret = 0;
5193     dJMPENV;
5194     scan_data_t data;
5195     RExC_state_t RExC_state;
5196     RExC_state_t * const pRExC_state = &RExC_state;
5197 #ifdef TRIE_STUDY_OPT
5198     int restudied;
5199     RExC_state_t copyRExC_state;
5200 #endif
5201     GET_RE_DEBUG_FLAGS_DECL;
5202
5203     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5204
5205     DEBUG_r(if (!PL_colorset) reginitcolors());
5206
5207 #ifndef PERL_IN_XSUB_RE
5208     /* Initialize these here instead of as-needed, as is quick and avoids
5209      * having to test them each time otherwise */
5210     if (! PL_AboveLatin1) {
5211         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5212         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5213         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5214
5215         PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5216                                 = _new_invlist_C_array(L1PosixAlnum_invlist);
5217         PL_Posix_ptrs[_CC_ALPHANUMERIC]
5218                                 = _new_invlist_C_array(PosixAlnum_invlist);
5219
5220         PL_L1Posix_ptrs[_CC_ALPHA]
5221                                 = _new_invlist_C_array(L1PosixAlpha_invlist);
5222         PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5223
5224         PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5225         PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5226
5227         /* Cased is the same as Alpha in the ASCII range */
5228         PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5229         PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5230
5231         PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5232         PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5233
5234         PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5235         PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5236
5237         PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5238         PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5239
5240         PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5241         PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5242
5243         PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5244         PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5245
5246         PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5247         PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5248
5249         PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5250         PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5251         PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5252         PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5253
5254         PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5255         PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5256
5257         PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5258
5259         PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5260         PL_L1Posix_ptrs[_CC_WORDCHAR]
5261                                 = _new_invlist_C_array(L1PosixWord_invlist);
5262
5263         PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5264         PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5265
5266         PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5267     }
5268 #endif
5269
5270     pRExC_state->code_blocks = NULL;
5271     pRExC_state->num_code_blocks = 0;
5272
5273     if (is_bare_re)
5274         *is_bare_re = FALSE;
5275
5276     if (expr && (expr->op_type == OP_LIST ||
5277                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5278
5279         /* is the source UTF8, and how many code blocks are there? */
5280         OP *o;
5281         int ncode = 0;
5282
5283         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5284             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5285                 code_is_utf8 = 1;
5286             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5287                 /* count of DO blocks */
5288                 ncode++;
5289         }
5290         if (ncode) {
5291             pRExC_state->num_code_blocks = ncode;
5292             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5293         }
5294     }
5295
5296     if (pat_count) {
5297         /* handle a list of SVs */
5298
5299         SV **svp;
5300
5301         /* apply magic and RE overloading to each arg */
5302         for (svp = patternp; svp < patternp + pat_count; svp++) {
5303             SV *rx = *svp;
5304             SvGETMAGIC(rx);
5305             if (SvROK(rx) && SvAMAGIC(rx)) {
5306                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5307                 if (sv) {
5308                     if (SvROK(sv))
5309                         sv = SvRV(sv);
5310                     if (SvTYPE(sv) != SVt_REGEXP)
5311                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5312                     *svp = sv;
5313                 }
5314             }
5315         }
5316
5317         if (pat_count > 1) {
5318             /* concat multiple args and find any code block indexes */
5319
5320             OP *o = NULL;
5321             int n = 0;
5322             bool utf8 = 0;
5323             STRLEN orig_patlen = 0;
5324
5325             if (pRExC_state->num_code_blocks) {
5326                 o = cLISTOPx(expr)->op_first;
5327                 assert(   o->op_type == OP_PUSHMARK
5328                        || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5329                        || o->op_type == OP_PADRANGE);
5330                 o = o->op_sibling;
5331             }
5332
5333             pat = newSVpvn("", 0);
5334             SAVEFREESV(pat);
5335
5336             /* determine if the pattern is going to be utf8 (needed
5337              * in advance to align code block indices correctly).
5338              * XXX This could fail to be detected for an arg with
5339              * overloading but not concat overloading; but the main effect
5340              * in this obscure case is to need a 'use re eval' for a
5341              * literal code block */
5342             for (svp = patternp; svp < patternp + pat_count; svp++) {
5343                 if (SvUTF8(*svp))
5344                     utf8 = 1;
5345             }
5346             if (utf8)
5347                 SvUTF8_on(pat);
5348
5349             for (svp = patternp; svp < patternp + pat_count; svp++) {
5350                 SV *sv, *msv = *svp;
5351                 SV *rx;
5352                 bool code = 0;
5353                 /* we make the assumption here that each op in the list of
5354                  * op_siblings maps to one SV pushed onto the stack,
5355                  * except for code blocks, with have both an OP_NULL and
5356                  * and OP_CONST.
5357                  * This allows us to match up the list of SVs against the
5358                  * list of OPs to find the next code block.
5359                  *
5360                  * Note that       PUSHMARK PADSV PADSV ..
5361                  * is optimised to
5362                  *                 PADRANGE NULL  NULL  ..
5363                  * so the alignment still works. */
5364                 if (o) {
5365                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5366                         assert(n < pRExC_state->num_code_blocks);
5367                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5368                         pRExC_state->code_blocks[n].block = o;
5369                         pRExC_state->code_blocks[n].src_regex = NULL;
5370                         n++;
5371                         code = 1;
5372                         o = o->op_sibling; /* skip CONST */
5373                         assert(o);
5374                     }
5375                     o = o->op_sibling;;
5376                 }
5377
5378                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5379                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5380                 {
5381                     sv_setsv(pat, sv);
5382                     /* overloading involved: all bets are off over literal
5383                      * code. Pretend we haven't seen it */
5384                     pRExC_state->num_code_blocks -= n;
5385                     n = 0;
5386                     rx = NULL;
5387
5388                 }
5389                 else  {
5390                     while (SvAMAGIC(msv)
5391                             && (sv = AMG_CALLunary(msv, string_amg))
5392                             && sv != msv
5393                             &&  !(   SvROK(msv)
5394                                   && SvROK(sv)
5395                                   && SvRV(msv) == SvRV(sv))
5396                     ) {
5397                         msv = sv;
5398                         SvGETMAGIC(msv);
5399                     }
5400                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5401                         msv = SvRV(msv);
5402                     orig_patlen = SvCUR(pat);
5403                     sv_catsv_nomg(pat, msv);
5404                     rx = msv;
5405                     if (code)
5406                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5407                 }
5408
5409                 /* extract any code blocks within any embedded qr//'s */
5410                 if (rx && SvTYPE(rx) == SVt_REGEXP
5411                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5412                 {
5413
5414                     RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5415                     if (ri->num_code_blocks) {
5416                         int i;
5417                         /* the presence of an embedded qr// with code means
5418                          * we should always recompile: the text of the
5419                          * qr// may not have changed, but it may be a
5420                          * different closure than last time */
5421                         recompile = 1;
5422                         Renew(pRExC_state->code_blocks,
5423                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5424                             struct reg_code_block);
5425                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5426                         for (i=0; i < ri->num_code_blocks; i++) {
5427                             struct reg_code_block *src, *dst;
5428                             STRLEN offset =  orig_patlen
5429                                 + ReANY((REGEXP *)rx)->pre_prefix;
5430                             assert(n < pRExC_state->num_code_blocks);
5431                             src = &ri->code_blocks[i];
5432                             dst = &pRExC_state->code_blocks[n];
5433                             dst->start      = src->start + offset;
5434                             dst->end        = src->end   + offset;
5435                             dst->block      = src->block;
5436                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5437                                                     src->src_regex
5438                                                         ? src->src_regex
5439                                                         : (REGEXP*)rx);
5440                             n++;
5441                         }
5442                     }
5443                 }
5444             }
5445             SvSETMAGIC(pat);
5446         }
5447         else {
5448             SV *sv;
5449             pat = *patternp;
5450             while (SvAMAGIC(pat)
5451                     && (sv = AMG_CALLunary(pat, string_amg))
5452                     && sv != pat)
5453             {
5454                 pat = sv;
5455                 SvGETMAGIC(pat);
5456             }
5457         }
5458
5459         /* handle bare regex: foo =~ $re */
5460         {
5461             SV *re = pat;
5462             if (SvROK(re))
5463                 re = SvRV(re);
5464             if (SvTYPE(re) == SVt_REGEXP) {
5465                 if (is_bare_re)
5466                     *is_bare_re = TRUE;
5467                 SvREFCNT_inc(re);
5468                 Safefree(pRExC_state->code_blocks);
5469                 return (REGEXP*)re;
5470             }
5471         }
5472     }
5473     else {
5474         /* not a list of SVs, so must be a list of OPs */
5475         assert(expr);
5476         if (expr->op_type == OP_LIST) {
5477             int i = -1;
5478             bool is_code = 0;
5479             OP *o;
5480
5481             pat = newSVpvn("", 0);
5482             SAVEFREESV(pat);
5483             if (code_is_utf8)
5484                 SvUTF8_on(pat);
5485
5486             /* given a list of CONSTs and DO blocks in expr, append all
5487              * the CONSTs to pat, and record the start and end of each
5488              * code block in code_blocks[] (each DO{} op is followed by an
5489              * OP_CONST containing the corresponding literal '(?{...})
5490              * text)
5491              */
5492             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5493                 if (o->op_type == OP_CONST) {
5494                     sv_catsv(pat, cSVOPo_sv);
5495                     if (is_code) {
5496                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5497                         is_code = 0;
5498                     }
5499                 }
5500                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5501                     assert(i+1 < pRExC_state->num_code_blocks);
5502                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5503                     pRExC_state->code_blocks[i].block = o;
5504                     pRExC_state->code_blocks[i].src_regex = NULL;
5505                     is_code = 1;
5506                 }
5507             }
5508         }
5509         else {
5510             assert(expr->op_type == OP_CONST);
5511             pat = cSVOPx_sv(expr);
5512         }
5513     }
5514
5515     exp = SvPV_nomg(pat, plen);
5516
5517     if (!eng->op_comp) {
5518         if ((SvUTF8(pat) && IN_BYTES)
5519                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5520         {
5521             /* make a temporary copy; either to convert to bytes,
5522              * or to avoid repeating get-magic / overloaded stringify */
5523             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5524                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5525         }
5526         Safefree(pRExC_state->code_blocks);
5527         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5528     }
5529
5530     /* ignore the utf8ness if the pattern is 0 length */
5531     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5532     RExC_uni_semantics = 0;
5533     RExC_contains_locale = 0;
5534     pRExC_state->runtime_code_qr = NULL;
5535
5536     /****************** LONG JUMP TARGET HERE***********************/
5537     /* Longjmp back to here if have to switch in midstream to utf8 */
5538     if (! RExC_orig_utf8) {
5539         JMPENV_PUSH(jump_ret);
5540         used_setjump = TRUE;
5541     }
5542
5543     if (jump_ret == 0) {    /* First time through */
5544         xend = exp + plen;
5545
5546         DEBUG_COMPILE_r({
5547             SV *dsv= sv_newmortal();
5548             RE_PV_QUOTED_DECL(s, RExC_utf8,
5549                 dsv, exp, plen, 60);
5550             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5551                            PL_colors[4],PL_colors[5],s);
5552         });
5553     }
5554     else {  /* longjumped back */
5555         U8 *src, *dst;
5556         int n=0;
5557         STRLEN s = 0, d = 0;
5558         bool do_end = 0;
5559
5560         /* If the cause for the longjmp was other than changing to utf8, pop
5561          * our own setjmp, and longjmp to the correct handler */
5562         if (jump_ret != UTF8_LONGJMP) {
5563             JMPENV_POP;
5564             JMPENV_JUMP(jump_ret);
5565         }
5566
5567         GET_RE_DEBUG_FLAGS;
5568
5569         /* It's possible to write a regexp in ascii that represents Unicode
5570         codepoints outside of the byte range, such as via \x{100}. If we
5571         detect such a sequence we have to convert the entire pattern to utf8
5572         and then recompile, as our sizing calculation will have been based
5573         on 1 byte == 1 character, but we will need to use utf8 to encode
5574         at least some part of the pattern, and therefore must convert the whole
5575         thing.
5576         -- dmq */
5577         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5578             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5579
5580         /* upgrade pattern to UTF8, and if there are code blocks,
5581          * recalculate the indices.
5582          * This is essentially an unrolled Perl_bytes_to_utf8() */
5583
5584         src = (U8*)SvPV_nomg(pat, plen);
5585         Newx(dst, plen * 2 + 1, U8);
5586
5587         while (s < plen) {
5588             const UV uv = NATIVE_TO_ASCII(src[s]);
5589             if (UNI_IS_INVARIANT(uv))
5590                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5591             else {
5592                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5593                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5594             }
5595             if (n < pRExC_state->num_code_blocks) {
5596                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5597                     pRExC_state->code_blocks[n].start = d;
5598                     assert(dst[d] == '(');
5599                     do_end = 1;
5600                 }
5601                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5602                     pRExC_state->code_blocks[n].end = d;
5603                     assert(dst[d] == ')');
5604                     do_end = 0;
5605                     n++;
5606                 }
5607             }
5608             s++;
5609             d++;
5610         }
5611         dst[d] = '\0';
5612         plen = d;
5613         exp = (char*) dst;
5614         xend = exp + plen;
5615         SAVEFREEPV(exp);
5616         RExC_orig_utf8 = RExC_utf8 = 1;
5617     }
5618
5619     /* return old regex if pattern hasn't changed */
5620
5621     if (   old_re
5622         && !recompile
5623         && !!RX_UTF8(old_re) == !!RExC_utf8
5624         && RX_PRECOMP(old_re)
5625         && RX_PRELEN(old_re) == plen
5626         && memEQ(RX_PRECOMP(old_re), exp, plen))
5627     {
5628         /* with runtime code, always recompile */
5629         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5630                                             exp, plen);
5631         if (!runtime_code) {
5632             if (used_setjump) {
5633                 JMPENV_POP;
5634             }
5635             Safefree(pRExC_state->code_blocks);
5636             return old_re;
5637         }
5638     }
5639     else if ((pm_flags & PMf_USE_RE_EVAL)
5640                 /* this second condition covers the non-regex literal case,
5641                  * i.e.  $foo =~ '(?{})'. */
5642                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5643                     && (PL_hints & HINT_RE_EVAL))
5644     )
5645         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5646                             exp, plen);
5647
5648 #ifdef TRIE_STUDY_OPT
5649     restudied = 0;
5650 #endif
5651
5652     rx_flags = orig_rx_flags;
5653
5654     if (initial_charset == REGEX_LOCALE_CHARSET) {
5655         RExC_contains_locale = 1;
5656     }
5657     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5658
5659         /* Set to use unicode semantics if the pattern is in utf8 and has the
5660          * 'depends' charset specified, as it means unicode when utf8  */
5661         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5662     }
5663
5664     RExC_precomp = exp;
5665     RExC_flags = rx_flags;
5666     RExC_pm_flags = pm_flags;
5667
5668     if (runtime_code) {
5669         if (TAINTING_get && TAINT_get)
5670             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5671
5672         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5673             /* whoops, we have a non-utf8 pattern, whilst run-time code
5674              * got compiled as utf8. Try again with a utf8 pattern */
5675              JMPENV_JUMP(UTF8_LONGJMP);
5676         }
5677     }
5678     assert(!pRExC_state->runtime_code_qr);
5679
5680     RExC_sawback = 0;
5681
5682     RExC_seen = 0;
5683     RExC_in_lookbehind = 0;
5684     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5685     RExC_extralen = 0;
5686     RExC_override_recoding = 0;
5687     RExC_in_multi_char_class = 0;
5688
5689     /* First pass: determine size, legality. */
5690     RExC_parse = exp;
5691     RExC_start = exp;
5692     RExC_end = xend;
5693     RExC_naughty = 0;
5694     RExC_npar = 1;
5695     RExC_nestroot = 0;
5696     RExC_size = 0L;
5697     RExC_emit = &PL_regdummy;
5698     RExC_whilem_seen = 0;
5699     RExC_open_parens = NULL;
5700     RExC_close_parens = NULL;
5701     RExC_opend = NULL;
5702     RExC_paren_names = NULL;
5703 #ifdef DEBUGGING
5704     RExC_paren_name_list = NULL;
5705 #endif
5706     RExC_recurse = NULL;
5707     RExC_recurse_count = 0;
5708     pRExC_state->code_index = 0;
5709
5710 #if 0 /* REGC() is (currently) a NOP at the first pass.
5711        * Clever compilers notice this and complain. --jhi */
5712     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5713 #endif
5714     DEBUG_PARSE_r(
5715         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5716         RExC_lastnum=0;
5717         RExC_lastparse=NULL;
5718     );
5719     /* reg may croak on us, not giving us a chance to free
5720        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5721        need it to survive as long as the regexp (qr/(?{})/).
5722        We must check that code_blocksv is not already set, because we may
5723        have longjmped back. */
5724     if (pRExC_state->code_blocks && !code_blocksv) {
5725         code_blocksv = newSV_type(SVt_PV);
5726         SAVEFREESV(code_blocksv);
5727         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5728         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5729     }
5730     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5731         RExC_precomp = NULL;
5732         return(NULL);
5733     }
5734     if (code_blocksv)
5735         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5736
5737     /* Here, finished first pass.  Get rid of any added setjmp */
5738     if (used_setjump) {
5739         JMPENV_POP;
5740     }
5741
5742     DEBUG_PARSE_r({
5743         PerlIO_printf(Perl_debug_log,
5744             "Required size %"IVdf" nodes\n"
5745             "Starting second pass (creation)\n",
5746             (IV)RExC_size);
5747         RExC_lastnum=0;
5748         RExC_lastparse=NULL;
5749     });
5750
5751     /* The first pass could have found things that force Unicode semantics */
5752     if ((RExC_utf8 || RExC_uni_semantics)
5753          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5754     {
5755         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5756     }
5757
5758     /* Small enough for pointer-storage convention?
5759        If extralen==0, this means that we will not need long jumps. */
5760     if (RExC_size >= 0x10000L && RExC_extralen)
5761         RExC_size += RExC_extralen;
5762     else
5763         RExC_extralen = 0;
5764     if (RExC_whilem_seen > 15)
5765         RExC_whilem_seen = 15;
5766
5767     /* Allocate space and zero-initialize. Note, the two step process
5768        of zeroing when in debug mode, thus anything assigned has to
5769        happen after that */
5770     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5771     r = ReANY(rx);
5772     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5773          char, regexp_internal);
5774     if ( r == NULL || ri == NULL )
5775         FAIL("Regexp out of space");
5776 #ifdef DEBUGGING
5777     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5778     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5779 #else
5780     /* bulk initialize base fields with 0. */
5781     Zero(ri, sizeof(regexp_internal), char);
5782 #endif
5783
5784     /* non-zero initialization begins here */
5785     RXi_SET( r, ri );
5786     r->engine= eng;
5787     r->extflags = rx_flags;
5788     if (pm_flags & PMf_IS_QR) {
5789         ri->code_blocks = pRExC_state->code_blocks;
5790         ri->num_code_blocks = pRExC_state->num_code_blocks;
5791     }
5792     else
5793     {
5794         int n;
5795         for (n = 0; n < pRExC_state->num_code_blocks; n++)
5796             if (pRExC_state->code_blocks[n].src_regex)
5797                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5798         SAVEFREEPV(pRExC_state->code_blocks);
5799     }
5800
5801     {
5802         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5803         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5804
5805         /* The caret is output if there are any defaults: if not all the STD
5806          * flags are set, or if no character set specifier is needed */
5807         bool has_default =
5808                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5809                     || ! has_charset);
5810         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5811         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5812                             >> RXf_PMf_STD_PMMOD_SHIFT);
5813         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5814         char *p;
5815         /* Allocate for the worst case, which is all the std flags are turned
5816          * on.  If more precision is desired, we could do a population count of
5817          * the flags set.  This could be done with a small lookup table, or by
5818          * shifting, masking and adding, or even, when available, assembly
5819          * language for a machine-language population count.
5820          * We never output a minus, as all those are defaults, so are
5821          * covered by the caret */
5822         const STRLEN wraplen = plen + has_p + has_runon
5823             + has_default       /* If needs a caret */
5824
5825                 /* If needs a character set specifier */
5826             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5827             + (sizeof(STD_PAT_MODS) - 1)
5828             + (sizeof("(?:)") - 1);
5829
5830         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5831         r->xpv_len_u.xpvlenu_pv = p;
5832         if (RExC_utf8)
5833             SvFLAGS(rx) |= SVf_UTF8;
5834         *p++='('; *p++='?';
5835
5836         /* If a default, cover it using the caret */
5837         if (has_default) {
5838             *p++= DEFAULT_PAT_MOD;
5839         }
5840         if (has_charset) {
5841             STRLEN len;
5842             const char* const name = get_regex_charset_name(r->extflags, &len);
5843             Copy(name, p, len, char);
5844             p += len;
5845         }
5846         if (has_p)
5847             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5848         {
5849             char ch;
5850             while((ch = *fptr++)) {
5851                 if(reganch & 1)
5852                     *p++ = ch;
5853                 reganch >>= 1;
5854             }
5855         }
5856
5857         *p++ = ':';
5858         Copy(RExC_precomp, p, plen, char);
5859         assert ((RX_WRAPPED(rx) - p) < 16);
5860         r->pre_prefix = p - RX_WRAPPED(rx);
5861         p += plen;
5862         if (has_runon)
5863             *p++ = '\n';
5864         *p++ = ')';
5865         *p = 0;
5866         SvCUR_set(rx, p - RX_WRAPPED(rx));
5867     }
5868
5869     r->intflags = 0;
5870     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5871
5872     if (RExC_seen & REG_SEEN_RECURSE) {
5873         Newxz(RExC_open_parens, RExC_npar,regnode *);
5874         SAVEFREEPV(RExC_open_parens);
5875         Newxz(RExC_close_parens,RExC_npar,regnode *);
5876         SAVEFREEPV(RExC_close_parens);
5877     }
5878
5879     /* Useful during FAIL. */
5880 #ifdef RE_TRACK_PATTERN_OFFSETS
5881     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5882     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5883                           "%s %"UVuf" bytes for offset annotations.\n",
5884                           ri->u.offsets ? "Got" : "Couldn't get",
5885                           (UV)((2*RExC_size+1) * sizeof(U32))));
5886 #endif
5887     SetProgLen(ri,RExC_size);
5888     RExC_rx_sv = rx;
5889     RExC_rx = r;
5890     RExC_rxi = ri;
5891
5892     /* Second pass: emit code. */
5893     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5894     RExC_pm_flags = pm_flags;
5895     RExC_parse = exp;
5896     RExC_end = xend;
5897     RExC_naughty = 0;
5898     RExC_npar = 1;
5899     RExC_emit_start = ri->program;
5900     RExC_emit = ri->program;
5901     RExC_emit_bound = ri->program + RExC_size + 1;
5902     pRExC_state->code_index = 0;
5903
5904     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5905     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5906         ReREFCNT_dec(rx);
5907         return(NULL);
5908     }
5909     /* XXXX To minimize changes to RE engine we always allocate
5910        3-units-long substrs field. */
5911     Newx(r->substrs, 1, struct reg_substr_data);
5912     if (RExC_recurse_count) {
5913         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5914         SAVEFREEPV(RExC_recurse);
5915     }
5916
5917 reStudy:
5918     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5919     Zero(r->substrs, 1, struct reg_substr_data);
5920
5921 #ifdef TRIE_STUDY_OPT
5922     if (!restudied) {
5923         StructCopy(&zero_scan_data, &data, scan_data_t);
5924         copyRExC_state = RExC_state;
5925     } else {
5926         U32 seen=RExC_seen;
5927         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5928
5929         RExC_state = copyRExC_state;
5930         if (seen & REG_TOP_LEVEL_BRANCHES)
5931             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5932         else
5933             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5934         StructCopy(&zero_scan_data, &data, scan_data_t);
5935     }
5936 #else
5937     StructCopy(&zero_scan_data, &data, scan_data_t);
5938 #endif
5939
5940     /* Dig out information for optimizations. */
5941     r->extflags = RExC_flags; /* was pm_op */
5942     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5943
5944     if (UTF)
5945         SvUTF8_on(rx);  /* Unicode in it? */
5946     ri->regstclass = NULL;
5947     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5948         r->intflags |= PREGf_NAUGHTY;
5949     scan = ri->program + 1;             /* First BRANCH. */
5950
5951     /* testing for BRANCH here tells us whether there is "must appear"
5952        data in the pattern. If there is then we can use it for optimisations */
5953     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5954         I32 fake;
5955         STRLEN longest_float_length, longest_fixed_length;
5956         struct regnode_charclass_class ch_class; /* pointed to by data */
5957         int stclass_flag;
5958         I32 last_close = 0; /* pointed to by data */
5959         regnode *first= scan;
5960         regnode *first_next= regnext(first);
5961         /*
5962          * Skip introductions and multiplicators >= 1
5963          * so that we can extract the 'meat' of the pattern that must
5964          * match in the large if() sequence following.
5965          * NOTE that EXACT is NOT covered here, as it is normally
5966          * picked up by the optimiser separately.
5967          *
5968          * This is unfortunate as the optimiser isnt handling lookahead
5969          * properly currently.
5970          *
5971          */
5972         while ((OP(first) == OPEN && (sawopen = 1)) ||
5973                /* An OR of *one* alternative - should not happen now. */
5974             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5975             /* for now we can't handle lookbehind IFMATCH*/
5976             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5977             (OP(first) == PLUS) ||
5978             (OP(first) == MINMOD) ||
5979                /* An {n,m} with n>0 */
5980             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5981             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5982         {
5983                 /*
5984                  * the only op that could be a regnode is PLUS, all the rest
5985                  * will be regnode_1 or regnode_2.
5986                  *
5987                  */
5988                 if (OP(first) == PLUS)
5989                     sawplus = 1;
5990                 else
5991                     first += regarglen[OP(first)];
5992
5993                 first = NEXTOPER(first);
5994                 first_next= regnext(first);
5995         }
5996
5997         /* Starting-point info. */
5998       again:
5999         DEBUG_PEEP("first:",first,0);
6000         /* Ignore EXACT as we deal with it later. */
6001         if (PL_regkind[OP(first)] == EXACT) {
6002             if (OP(first) == EXACT)
6003                 NOOP;   /* Empty, get anchored substr later. */
6004             else
6005                 ri->regstclass = first;
6006         }
6007 #ifdef TRIE_STCLASS
6008         else if (PL_regkind[OP(first)] == TRIE &&
6009                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6010         {
6011             regnode *trie_op;
6012             /* this can happen only on restudy */
6013             if ( OP(first) == TRIE ) {
6014                 struct regnode_1 *trieop = (struct regnode_1 *)
6015                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6016                 StructCopy(first,trieop,struct regnode_1);
6017                 trie_op=(regnode *)trieop;
6018             } else {
6019                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6020                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6021                 StructCopy(first,trieop,struct regnode_charclass);
6022                 trie_op=(regnode *)trieop;
6023             }
6024             OP(trie_op)+=2;
6025             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6026             ri->regstclass = trie_op;
6027         }
6028 #endif
6029         else if (REGNODE_SIMPLE(OP(first)))
6030             ri->regstclass = first;
6031         else if (PL_regkind[OP(first)] == BOUND ||
6032                  PL_regkind[OP(first)] == NBOUND)
6033             ri->regstclass = first;
6034         else if (PL_regkind[OP(first)] == BOL) {
6035             r->extflags |= (OP(first) == MBOL
6036                            ? RXf_ANCH_MBOL
6037                            : (OP(first) == SBOL
6038                               ? RXf_ANCH_SBOL
6039                               : RXf_ANCH_BOL));
6040             first = NEXTOPER(first);
6041             goto again;
6042         }
6043         else if (OP(first) == GPOS) {
6044             r->extflags |= RXf_ANCH_GPOS;
6045             first = NEXTOPER(first);
6046             goto again;
6047         }
6048         else if ((!sawopen || !RExC_sawback) &&
6049             (OP(first) == STAR &&
6050             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6051             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6052         {
6053             /* turn .* into ^.* with an implied $*=1 */
6054             const int type =
6055                 (OP(NEXTOPER(first)) == REG_ANY)
6056                     ? RXf_ANCH_MBOL
6057                     : RXf_ANCH_SBOL;
6058             r->extflags |= type;
6059             r->intflags |= PREGf_IMPLICIT;
6060             first = NEXTOPER(first);
6061             goto again;
6062         }
6063         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6064             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6065             /* x+ must match at the 1st pos of run of x's */
6066             r->intflags |= PREGf_SKIP;
6067
6068         /* Scan is after the zeroth branch, first is atomic matcher. */
6069 #ifdef TRIE_STUDY_OPT
6070         DEBUG_PARSE_r(
6071             if (!restudied)
6072                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6073                               (IV)(first - scan + 1))
6074         );
6075 #else
6076         DEBUG_PARSE_r(
6077             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6078                 (IV)(first - scan + 1))
6079         );
6080 #endif
6081
6082
6083         /*
6084         * If there's something expensive in the r.e., find the
6085         * longest literal string that must appear and make it the
6086         * regmust.  Resolve ties in favor of later strings, since
6087         * the regstart check works with the beginning of the r.e.
6088         * and avoiding duplication strengthens checking.  Not a
6089         * strong reason, but sufficient in the absence of others.
6090         * [Now we resolve ties in favor of the earlier string if
6091         * it happens that c_offset_min has been invalidated, since the
6092         * earlier string may buy us something the later one won't.]
6093         */
6094
6095         data.longest_fixed = newSVpvs("");
6096         data.longest_float = newSVpvs("");
6097         data.last_found = newSVpvs("");
6098         data.longest = &(data.longest_fixed);
6099         ENTER_with_name("study_chunk");
6100         SAVEFREESV(data.longest_fixed);
6101         SAVEFREESV(data.longest_float);
6102         SAVEFREESV(data.last_found);
6103         first = scan;
6104         if (!ri->regstclass) {
6105             cl_init(pRExC_state, &ch_class);
6106             data.start_class = &ch_class;
6107             stclass_flag = SCF_DO_STCLASS_AND;
6108         } else                          /* XXXX Check for BOUND? */
6109             stclass_flag = 0;
6110         data.last_closep = &last_close;
6111
6112         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6113             &data, -1, NULL, NULL,
6114             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6115
6116
6117         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6118
6119
6120         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6121              && data.last_start_min == 0 && data.last_end > 0
6122              && !RExC_seen_zerolen
6123              && !(RExC_seen & REG_SEEN_VERBARG)
6124              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6125             r->extflags |= RXf_CHECK_ALL;
6126         scan_commit(pRExC_state, &data,&minlen,0);
6127
6128         longest_float_length = CHR_SVLEN(data.longest_float);
6129
6130         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6131                    && data.offset_fixed == data.offset_float_min
6132                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6133             && S_setup_longest (aTHX_ pRExC_state,
6134                                     data.longest_float,
6135                                     &(r->float_utf8),
6136                                     &(r->float_substr),
6137                                     &(r->float_end_shift),
6138                                     data.lookbehind_float,
6139                                     data.offset_float_min,
6140                                     data.minlen_float,
6141                                     longest_float_length,
6142                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6143                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6144         {
6145             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6146             r->float_max_offset = data.offset_float_max;
6147             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6148                 r->float_max_offset -= data.lookbehind_float;
6149             SvREFCNT_inc_simple_void_NN(data.longest_float);
6150         }
6151         else {
6152             r->float_substr = r->float_utf8 = NULL;
6153             longest_float_length = 0;
6154         }
6155
6156         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6157
6158         if (S_setup_longest (aTHX_ pRExC_state,
6159                                 data.longest_fixed,
6160                                 &(r->anchored_utf8),
6161                                 &(r->anchored_substr),
6162                                 &(r->anchored_end_shift),
6163                                 data.lookbehind_fixed,
6164                                 data.offset_fixed,
6165                                 data.minlen_fixed,
6166                                 longest_fixed_length,
6167                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6168                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6169         {
6170             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6171             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6172         }
6173         else {
6174             r->anchored_substr = r->anchored_utf8 = NULL;
6175             longest_fixed_length = 0;
6176         }
6177         LEAVE_with_name("study_chunk");
6178
6179         if (ri->regstclass
6180             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6181             ri->regstclass = NULL;
6182
6183         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6184             && stclass_flag
6185             && ! TEST_SSC_EOS(data.start_class)
6186             && !cl_is_anything(data.start_class))
6187         {
6188             const U32 n = add_data(pRExC_state, 1, "f");
6189             OP(data.start_class) = ANYOF_SYNTHETIC;
6190
6191             Newx(RExC_rxi->data->data[n], 1,
6192                 struct regnode_charclass_class);
6193             StructCopy(data.start_class,
6194                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6195                        struct regnode_charclass_class);
6196             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6197             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6198             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6199                       regprop(r, sv, (regnode*)data.start_class);
6200                       PerlIO_printf(Perl_debug_log,
6201                                     "synthetic stclass \"%s\".\n",
6202                                     SvPVX_const(sv));});
6203         }
6204
6205         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6206         if (longest_fixed_length > longest_float_length) {
6207             r->check_end_shift = r->anchored_end_shift;
6208             r->check_substr = r->anchored_substr;
6209             r->check_utf8 = r->anchored_utf8;
6210             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6211             if (r->extflags & RXf_ANCH_SINGLE)
6212                 r->extflags |= RXf_NOSCAN;
6213         }
6214         else {
6215             r->check_end_shift = r->float_end_shift;
6216             r->check_substr = r->float_substr;
6217             r->check_utf8 = r->float_utf8;
6218             r->check_offset_min = r->float_min_offset;
6219             r->check_offset_max = r->float_max_offset;
6220         }
6221         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6222            This should be changed ASAP!  */
6223         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6224             r->extflags |= RXf_USE_INTUIT;
6225             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6226                 r->extflags |= RXf_INTUIT_TAIL;
6227         }
6228         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6229         if ( (STRLEN)minlen < longest_float_length )
6230             minlen= longest_float_length;
6231         if ( (STRLEN)minlen < longest_fixed_length )
6232             minlen= longest_fixed_length;
6233         */
6234     }
6235     else {
6236         /* Several toplevels. Best we can is to set minlen. */
6237         I32 fake;
6238         struct regnode_charclass_class ch_class;
6239         I32 last_close = 0;
6240
6241         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6242
6243         scan = ri->program + 1;
6244         cl_init(pRExC_state, &ch_class);
6245         data.start_class = &ch_class;
6246         data.last_closep = &last_close;
6247
6248
6249         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6250             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6251
6252         CHECK_RESTUDY_GOTO_butfirst(NOOP);
6253
6254         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6255                 = r->float_substr = r->float_utf8 = NULL;
6256
6257         if (! TEST_SSC_EOS(data.start_class)
6258             && !cl_is_anything(data.start_class))
6259         {
6260             const U32 n = add_data(pRExC_state, 1, "f");
6261             OP(data.start_class) = ANYOF_SYNTHETIC;
6262
6263             Newx(RExC_rxi->data->data[n], 1,
6264                 struct regnode_charclass_class);
6265             StructCopy(data.start_class,
6266                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6267                        struct regnode_charclass_class);
6268             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6269             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6270             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6271                       regprop(r, sv, (regnode*)data.start_class);
6272                       PerlIO_printf(Perl_debug_log,
6273                                     "synthetic stclass \"%s\".\n",
6274                                     SvPVX_const(sv));});
6275         }
6276     }
6277
6278     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6279        the "real" pattern. */
6280     DEBUG_OPTIMISE_r({
6281         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6282                       (IV)minlen, (IV)r->minlen);
6283     });
6284     r->minlenret = minlen;
6285     if (r->minlen < minlen)
6286         r->minlen = minlen;
6287
6288     if (RExC_seen & REG_SEEN_GPOS)
6289         r->extflags |= RXf_GPOS_SEEN;
6290     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6291         r->extflags |= RXf_LOOKBEHIND_SEEN;
6292     if (pRExC_state->num_code_blocks)
6293         r->extflags |= RXf_EVAL_SEEN;
6294     if (RExC_seen & REG_SEEN_CANY)
6295         r->extflags |= RXf_CANY_SEEN;
6296     if (RExC_seen & REG_SEEN_VERBARG)
6297     {
6298         r->intflags |= PREGf_VERBARG_SEEN;
6299         r->extflags |= RXf_MODIFIES_VARS;
6300     }
6301     if (RExC_seen & REG_SEEN_CUTGROUP)
6302         r->intflags |= PREGf_CUTGROUP_SEEN;
6303     if (pm_flags & PMf_USE_RE_EVAL)
6304         r->intflags |= PREGf_USE_RE_EVAL;
6305     if (RExC_paren_names)
6306         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6307     else
6308         RXp_PAREN_NAMES(r) = NULL;
6309
6310 #ifdef STUPID_PATTERN_CHECKS
6311     if (RX_PRELEN(rx) == 0)
6312         r->extflags |= RXf_NULL;
6313     if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6314         r->extflags |= RXf_WHITE;
6315     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6316         r->extflags |= RXf_START_ONLY;
6317 #else
6318     {
6319         regnode *first = ri->program + 1;
6320         U8 fop = OP(first);
6321
6322         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6323             r->extflags |= RXf_NULL;
6324         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6325             r->extflags |= RXf_START_ONLY;
6326         else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6327                              && OP(regnext(first)) == END)
6328             r->extflags |= RXf_WHITE;
6329     }
6330 #endif
6331 #ifdef DEBUGGING
6332     if (RExC_paren_names) {
6333         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6334         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6335     } else
6336 #endif
6337         ri->name_list_idx = 0;
6338
6339     if (RExC_recurse_count) {
6340         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6341             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6342             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6343         }
6344     }
6345     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6346     /* assume we don't need to swap parens around before we match */
6347
6348     DEBUG_DUMP_r({
6349         PerlIO_printf(Perl_debug_log,"Final program:\n");
6350         regdump(r);
6351     });
6352 #ifdef RE_TRACK_PATTERN_OFFSETS
6353     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6354         const U32 len = ri->u.offsets[0];
6355         U32 i;
6356         GET_RE_DEBUG_FLAGS_DECL;
6357         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6358         for (i = 1; i <= len; i++) {
6359             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6360                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6361                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6362             }
6363         PerlIO_printf(Perl_debug_log, "\n");
6364     });
6365 #endif
6366
6367 #ifdef USE_ITHREADS
6368     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6369      * by setting the regexp SV to readonly-only instead. If the
6370      * pattern's been recompiled, the USEDness should remain. */
6371     if (old_re && SvREADONLY(old_re))
6372         SvREADONLY_on(rx);
6373 #endif
6374     return rx;
6375 }
6376
6377
6378 SV*
6379 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6380                     const U32 flags)
6381 {
6382     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6383
6384     PERL_UNUSED_ARG(value);
6385
6386     if (flags & RXapif_FETCH) {
6387         return reg_named_buff_fetch(rx, key, flags);
6388     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6389         Perl_croak_no_modify();
6390         return NULL;
6391     } else if (flags & RXapif_EXISTS) {
6392         return reg_named_buff_exists(rx, key, flags)
6393             ? &PL_sv_yes
6394             : &PL_sv_no;
6395     } else if (flags & RXapif_REGNAMES) {
6396         return reg_named_buff_all(rx, flags);
6397     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6398         return reg_named_buff_scalar(rx, flags);
6399     } else {
6400         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6401         return NULL;
6402     }
6403 }
6404
6405 SV*
6406 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6407                          const U32 flags)
6408 {
6409     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6410     PERL_UNUSED_ARG(lastkey);
6411
6412     if (flags & RXapif_FIRSTKEY)
6413         return reg_named_buff_firstkey(rx, flags);
6414     else if (flags & RXapif_NEXTKEY)
6415         return reg_named_buff_nextkey(rx, flags);
6416     else {
6417         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6418         return NULL;
6419     }
6420 }
6421
6422 SV*
6423 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6424                           const U32 flags)
6425 {
6426     AV *retarray = NULL;
6427     SV *ret;
6428     struct regexp *const rx = ReANY(r);
6429
6430     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6431
6432     if (flags & RXapif_ALL)
6433         retarray=newAV();
6434
6435     if (rx && RXp_PAREN_NAMES(rx)) {
6436         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6437         if (he_str) {
6438             IV i;
6439             SV* sv_dat=HeVAL(he_str);
6440             I32 *nums=(I32*)SvPVX(sv_dat);
6441             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6442                 if ((I32)(rx->nparens) >= nums[i]
6443                     && rx->offs[nums[i]].start != -1
6444                     && rx->offs[nums[i]].end != -1)
6445                 {
6446                     ret = newSVpvs("");
6447                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6448                     if (!retarray)
6449                         return ret;
6450                 } else {
6451                     if (retarray)
6452                         ret = newSVsv(&PL_sv_undef);
6453                 }
6454                 if (retarray)
6455                     av_push(retarray, ret);
6456             }
6457             if (retarray)
6458                 return newRV_noinc(MUTABLE_SV(retarray));
6459         }
6460     }
6461     return NULL;
6462 }
6463
6464 bool
6465 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6466                            const U32 flags)
6467 {
6468     struct regexp *const rx = ReANY(r);
6469
6470     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6471
6472     if (rx && RXp_PAREN_NAMES(rx)) {
6473         if (flags & RXapif_ALL) {
6474             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6475         } else {
6476             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6477             if (sv) {
6478                 SvREFCNT_dec_NN(sv);
6479                 return TRUE;
6480             } else {
6481                 return FALSE;
6482             }
6483         }
6484     } else {
6485         return FALSE;
6486     }
6487 }
6488
6489 SV*
6490 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6491 {
6492     struct regexp *const rx = ReANY(r);
6493
6494     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6495
6496     if ( rx && RXp_PAREN_NAMES(rx) ) {
6497         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6498
6499         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6500     } else {
6501         return FALSE;
6502     }
6503 }
6504
6505 SV*
6506 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6507 {
6508     struct regexp *const rx = ReANY(r);
6509     GET_RE_DEBUG_FLAGS_DECL;
6510
6511     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6512
6513     if (rx && RXp_PAREN_NAMES(rx)) {
6514         HV *hv = RXp_PAREN_NAMES(rx);
6515         HE *temphe;
6516         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6517             IV i;
6518             IV parno = 0;
6519             SV* sv_dat = HeVAL(temphe);
6520             I32 *nums = (I32*)SvPVX(sv_dat);
6521             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6522                 if ((I32)(rx->lastparen) >= nums[i] &&
6523                     rx->offs[nums[i]].start != -1 &&
6524                     rx->offs[nums[i]].end != -1)
6525                 {
6526                     parno = nums[i];
6527                     break;
6528                 }
6529             }
6530             if (parno || flags & RXapif_ALL) {
6531                 return newSVhek(HeKEY_hek(temphe));
6532             }
6533         }
6534     }
6535     return NULL;
6536 }
6537
6538 SV*
6539 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6540 {
6541     SV *ret;
6542     AV *av;
6543     I32 length;
6544     struct regexp *const rx = ReANY(r);
6545
6546     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6547
6548     if (rx && RXp_PAREN_NAMES(rx)) {
6549         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6550             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6551         } else if (flags & RXapif_ONE) {
6552             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6553             av = MUTABLE_AV(SvRV(ret));
6554             length = av_len(av);
6555             SvREFCNT_dec_NN(ret);
6556             return newSViv(length + 1);
6557         } else {
6558             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6559             return NULL;
6560         }
6561     }
6562     return &PL_sv_undef;
6563 }
6564
6565 SV*
6566 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6567 {
6568     struct regexp *const rx = ReANY(r);
6569     AV *av = newAV();
6570
6571     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6572
6573     if (rx && RXp_PAREN_NAMES(rx)) {
6574         HV *hv= RXp_PAREN_NAMES(rx);
6575         HE *temphe;
6576         (void)hv_iterinit(hv);
6577         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6578             IV i;
6579             IV parno = 0;
6580             SV* sv_dat = HeVAL(temphe);
6581             I32 *nums = (I32*)SvPVX(sv_dat);
6582             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6583                 if ((I32)(rx->lastparen) >= nums[i] &&
6584                     rx->offs[nums[i]].start != -1 &&
6585                     rx->offs[nums[i]].end != -1)
6586                 {
6587                     parno = nums[i];
6588                     break;
6589                 }
6590             }
6591             if (parno || flags & RXapif_ALL) {
6592                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6593             }
6594         }
6595     }
6596
6597     return newRV_noinc(MUTABLE_SV(av));
6598 }
6599
6600 void
6601 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6602                              SV * const sv)
6603 {
6604     struct regexp *const rx = ReANY(r);
6605     char *s = NULL;
6606     I32 i = 0;
6607     I32 s1, t1;
6608     I32 n = paren;
6609
6610     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6611
6612     if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6613            || n == RX_BUFF_IDX_CARET_FULLMATCH
6614            || n == RX_BUFF_IDX_CARET_POSTMATCH
6615          )
6616          && !(rx->extflags & RXf_PMf_KEEPCOPY)
6617     )
6618         goto ret_undef;
6619
6620     if (!rx->subbeg)
6621         goto ret_undef;
6622
6623     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6624         /* no need to distinguish between them any more */
6625         n = RX_BUFF_IDX_FULLMATCH;
6626
6627     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6628         && rx->offs[0].start != -1)
6629     {
6630         /* $`, ${^PREMATCH} */
6631         i = rx->offs[0].start;
6632         s = rx->subbeg;
6633     }
6634     else
6635     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6636         && rx->offs[0].end != -1)
6637     {
6638         /* $', ${^POSTMATCH} */
6639         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6640         i = rx->sublen + rx->suboffset - rx->offs[0].end;
6641     }
6642     else
6643     if ( 0 <= n && n <= (I32)rx->nparens &&
6644         (s1 = rx->offs[n].start) != -1 &&
6645         (t1 = rx->offs[n].end) != -1)
6646     {
6647         /* $&, ${^MATCH},  $1 ... */
6648         i = t1 - s1;
6649         s = rx->subbeg + s1 - rx->suboffset;
6650     } else {
6651         goto ret_undef;
6652     }
6653
6654     assert(s >= rx->subbeg);
6655     assert(rx->sublen >= (s - rx->subbeg) + i );
6656     if (i >= 0) {
6657 #if NO_TAINT_SUPPORT
6658         sv_setpvn(sv, s, i);
6659 #else
6660         const int oldtainted = TAINT_get;
6661         TAINT_NOT;
6662         sv_setpvn(sv, s, i);
6663         TAINT_set(oldtainted);
6664 #endif
6665         if ( (rx->extflags & RXf_CANY_SEEN)
6666             ? (RXp_MATCH_UTF8(rx)
6667                         && (!i || is_utf8_string((U8*)s, i)))
6668             : (RXp_MATCH_UTF8(rx)) )
6669         {
6670             SvUTF8_on(sv);
6671         }
6672         else
6673             SvUTF8_off(sv);
6674         if (TAINTING_get) {
6675             if (RXp_MATCH_TAINTED(rx)) {
6676                 if (SvTYPE(sv) >= SVt_PVMG) {
6677                     MAGIC* const mg = SvMAGIC(sv);
6678                     MAGIC* mgt;
6679                     TAINT;
6680                     SvMAGIC_set(sv, mg->mg_moremagic);
6681                     SvTAINT(sv);
6682                     if ((mgt = SvMAGIC(sv))) {
6683                         mg->mg_moremagic = mgt;
6684                         SvMAGIC_set(sv, mg);
6685                     }
6686                 } else {
6687                     TAINT;
6688                     SvTAINT(sv);
6689                 }
6690             } else
6691                 SvTAINTED_off(sv);
6692         }
6693     } else {
6694       ret_undef:
6695         sv_setsv(sv,&PL_sv_undef);
6696         return;
6697     }
6698 }
6699
6700 void
6701 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6702                                                          SV const * const value)
6703 {
6704     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6705
6706     PERL_UNUSED_ARG(rx);
6707     PERL_UNUSED_ARG(paren);
6708     PERL_UNUSED_ARG(value);
6709
6710     if (!PL_localizing)
6711         Perl_croak_no_modify();
6712 }
6713
6714 I32
6715 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6716                               const I32 paren)
6717 {
6718     struct regexp *const rx = ReANY(r);
6719     I32 i;
6720     I32 s1, t1;
6721
6722     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6723
6724     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6725     switch (paren) {
6726       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6727          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6728             goto warn_undef;
6729         /*FALLTHROUGH*/
6730
6731       case RX_BUFF_IDX_PREMATCH:       /* $` */
6732         if (rx->offs[0].start != -1) {
6733                         i = rx->offs[0].start;
6734                         if (i > 0) {
6735                                 s1 = 0;
6736                                 t1 = i;
6737                                 goto getlen;
6738                         }
6739             }
6740         return 0;
6741
6742       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6743          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6744             goto warn_undef;
6745       case RX_BUFF_IDX_POSTMATCH:       /* $' */
6746             if (rx->offs[0].end != -1) {
6747                         i = rx->sublen - rx->offs[0].end;
6748                         if (i > 0) {
6749                                 s1 = rx->offs[0].end;
6750                                 t1 = rx->sublen;
6751                                 goto getlen;
6752                         }
6753             }
6754         return 0;
6755
6756       case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6757          if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6758             goto warn_undef;
6759         /*FALLTHROUGH*/
6760
6761       /* $& / ${^MATCH}, $1, $2, ... */
6762       default:
6763             if (paren <= (I32)rx->nparens &&
6764             (s1 = rx->offs[paren].start) != -1 &&
6765             (t1 = rx->offs[paren].end) != -1)
6766             {
6767             i = t1 - s1;
6768             goto getlen;
6769         } else {
6770           warn_undef:
6771             if (ckWARN(WARN_UNINITIALIZED))
6772                 report_uninit((const SV *)sv);
6773             return 0;
6774         }
6775     }
6776   getlen:
6777     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6778         const char * const s = rx->subbeg - rx->suboffset + s1;
6779         const U8 *ep;
6780         STRLEN el;
6781
6782         i = t1 - s1;
6783         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6784                         i = el;
6785     }
6786     return i;
6787 }
6788
6789 SV*
6790 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6791 {
6792     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6793         PERL_UNUSED_ARG(rx);
6794         if (0)
6795             return NULL;
6796         else
6797             return newSVpvs("Regexp");
6798 }
6799
6800 /* Scans the name of a named buffer from the pattern.
6801  * If flags is REG_RSN_RETURN_NULL returns null.
6802  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6803  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6804  * to the parsed name as looked up in the RExC_paren_names hash.
6805  * If there is an error throws a vFAIL().. type exception.
6806  */
6807
6808 #define REG_RSN_RETURN_NULL    0
6809 #define REG_RSN_RETURN_NAME    1
6810 #define REG_RSN_RETURN_DATA    2
6811
6812 STATIC SV*
6813 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6814 {
6815     char *name_start = RExC_parse;
6816
6817     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6818
6819     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6820          /* skip IDFIRST by using do...while */
6821         if (UTF)
6822             do {
6823                 RExC_parse += UTF8SKIP(RExC_parse);
6824             } while (isWORDCHAR_utf8((U8*)RExC_parse));
6825         else
6826             do {
6827                 RExC_parse++;
6828             } while (isWORDCHAR(*RExC_parse));
6829     } else {
6830         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6831         vFAIL("Group name must start with a non-digit word character");
6832     }
6833     if ( flags ) {
6834         SV* sv_name
6835             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6836                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6837         if ( flags == REG_RSN_RETURN_NAME)
6838             return sv_name;
6839         else if (flags==REG_RSN_RETURN_DATA) {
6840             HE *he_str = NULL;
6841             SV *sv_dat = NULL;
6842             if ( ! sv_name )      /* should not happen*/
6843                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6844             if (RExC_paren_names)
6845                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6846             if ( he_str )
6847                 sv_dat = HeVAL(he_str);
6848             if ( ! sv_dat )
6849                 vFAIL("Reference to nonexistent named group");
6850             return sv_dat;
6851         }
6852         else {
6853             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6854                        (unsigned long) flags);
6855         }
6856         assert(0); /* NOT REACHED */
6857     }
6858     return NULL;
6859 }
6860
6861 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6862     int rem=(int)(RExC_end - RExC_parse);                       \
6863     int cut;                                                    \
6864     int num;                                                    \
6865     int iscut=0;                                                \
6866     if (rem>10) {                                               \
6867         rem=10;                                                 \
6868         iscut=1;                                                \
6869     }                                                           \
6870     cut=10-rem;                                                 \
6871     if (RExC_lastparse!=RExC_parse)                             \
6872         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6873             rem, RExC_parse,                                    \
6874             cut + 4,                                            \
6875             iscut ? "..." : "<"                                 \
6876         );                                                      \
6877     else                                                        \
6878         PerlIO_printf(Perl_debug_log,"%16s","");                \
6879                                                                 \
6880     if (SIZE_ONLY)                                              \
6881        num = RExC_size + 1;                                     \
6882     else                                                        \
6883        num=REG_NODE_NUM(RExC_emit);                             \
6884     if (RExC_lastnum!=num)                                      \
6885        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6886     else                                                        \
6887        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6888     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6889         (int)((depth*2)), "",                                   \
6890         (funcname)                                              \
6891     );                                                          \
6892     RExC_lastnum=num;                                           \
6893     RExC_lastparse=RExC_parse;                                  \
6894 })
6895
6896
6897
6898 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6899     DEBUG_PARSE_MSG((funcname));                            \
6900     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6901 })
6902 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6903     DEBUG_PARSE_MSG((funcname));                            \
6904     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6905 })
6906
6907 /* This section of code defines the inversion list object and its methods.  The
6908  * interfaces are highly subject to change, so as much as possible is static to
6909  * this file.  An inversion list is here implemented as a malloc'd C UV array
6910  * with some added info that is placed as UVs at the beginning in a header
6911  * portion.  An inversion list for Unicode is an array of code points, sorted
6912  * by ordinal number.  The zeroth element is the first code point in the list.
6913  * The 1th element is the first element beyond that not in the list.  In other
6914  * words, the first range is
6915  *  invlist[0]..(invlist[1]-1)
6916  * The other ranges follow.  Thus every element whose index is divisible by two
6917  * marks the beginning of a range that is in the list, and every element not
6918  * divisible by two marks the beginning of a range not in the list.  A single
6919  * element inversion list that contains the single code point N generally
6920  * consists of two elements
6921  *  invlist[0] == N
6922  *  invlist[1] == N+1
6923  * (The exception is when N is the highest representable value on the
6924  * machine, in which case the list containing just it would be a single
6925  * element, itself.  By extension, if the last range in the list extends to
6926  * infinity, then the first element of that range will be in the inversion list
6927  * at a position that is divisible by two, and is the final element in the
6928  * list.)
6929  * Taking the complement (inverting) an inversion list is quite simple, if the
6930  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6931  * This implementation reserves an element at the beginning of each inversion
6932  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
6933  * actual beginning of the list is either that element if 0, or the next one if
6934  * 1.
6935  *
6936  * More about inversion lists can be found in "Unicode Demystified"
6937  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6938  * More will be coming when functionality is added later.
6939  *
6940  * The inversion list data structure is currently implemented as an SV pointing
6941  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6942  * array of UV whose memory management is automatically handled by the existing
6943  * facilities for SV's.
6944  *
6945  * Some of the methods should always be private to the implementation, and some
6946  * should eventually be made public */
6947
6948 /* The header definitions are in F<inline_invlist.c> */
6949 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6950 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6951
6952 #define INVLIST_INITIAL_LEN 10
6953
6954 PERL_STATIC_INLINE UV*
6955 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6956 {
6957     /* Returns a pointer to the first element in the inversion list's array.
6958      * This is called upon initialization of an inversion list.  Where the
6959      * array begins depends on whether the list has the code point U+0000
6960      * in it or not.  The other parameter tells it whether the code that
6961      * follows this call is about to put a 0 in the inversion list or not.
6962      * The first element is either the element with 0, if 0, or the next one,
6963      * if 1 */
6964
6965     UV* zero = get_invlist_zero_addr(invlist);
6966
6967     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6968
6969     /* Must be empty */
6970     assert(! *_get_invlist_len_addr(invlist));
6971
6972     /* 1^1 = 0; 1^0 = 1 */
6973     *zero = 1 ^ will_have_0;
6974     return zero + *zero;
6975 }
6976
6977 PERL_STATIC_INLINE UV*
6978 S_invlist_array(pTHX_ SV* const invlist)
6979 {
6980     /* Returns the pointer to the inversion list's array.  Every time the
6981      * length changes, this needs to be called in case malloc or realloc moved
6982      * it */
6983
6984     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6985
6986     /* Must not be empty.  If these fail, you probably didn't check for <len>
6987      * being non-zero before trying to get the array */
6988     assert(*_get_invlist_len_addr(invlist));
6989     assert(*get_invlist_zero_addr(invlist) == 0
6990            || *get_invlist_zero_addr(invlist) == 1);
6991
6992     /* The array begins either at the element reserved for zero if the
6993      * list contains 0 (that element will be set to 0), or otherwise the next
6994      * element (in which case the reserved element will be set to 1). */
6995     return (UV *) (get_invlist_zero_addr(invlist)
6996                    + *get_invlist_zero_addr(invlist));
6997 }
6998
6999 PERL_STATIC_INLINE void
7000 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7001 {
7002     /* Sets the current number of elements stored in the inversion list */
7003
7004     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7005
7006     *_get_invlist_len_addr(invlist) = len;
7007
7008     assert(len <= SvLEN(invlist));
7009
7010     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7011     /* If the list contains U+0000, that element is part of the header,
7012      * and should not be counted as part of the array.  It will contain
7013      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7014      * subtract:
7015      *  SvCUR_set(invlist,
7016      *            TO_INTERNAL_SIZE(len
7017      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7018      * But, this is only valid if len is not 0.  The consequences of not doing
7019      * this is that the memory allocation code may think that 1 more UV is
7020      * being used than actually is, and so might do an unnecessary grow.  That
7021      * seems worth not bothering to make this the precise amount.
7022      *
7023      * Note that when inverting, SvCUR shouldn't change */
7024 }
7025
7026 PERL_STATIC_INLINE IV*
7027 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7028 {
7029     /* Return the address of the UV that is reserved to hold the cached index
7030      * */
7031
7032     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7033
7034     return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7035 }
7036
7037 PERL_STATIC_INLINE IV
7038 S_invlist_previous_index(pTHX_ SV* const invlist)
7039 {
7040     /* Returns cached index of previous search */
7041
7042     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7043
7044     return *get_invlist_previous_index_addr(invlist);
7045 }
7046
7047 PERL_STATIC_INLINE void
7048 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7049 {
7050     /* Caches <index> for later retrieval */
7051
7052     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7053
7054     assert(index == 0 || index < (int) _invlist_len(invlist));
7055
7056     *get_invlist_previous_index_addr(invlist) = index;
7057 }
7058
7059 PERL_STATIC_INLINE UV
7060 S_invlist_max(pTHX_ SV* const invlist)
7061 {
7062     /* Returns the maximum number of elements storable in the inversion list's
7063      * array, without having to realloc() */
7064
7065     PERL_ARGS_ASSERT_INVLIST_MAX;
7066
7067     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7068            ? _invlist_len(invlist)
7069            : FROM_INTERNAL_SIZE(SvLEN(invlist));
7070 }
7071
7072 PERL_STATIC_INLINE UV*
7073 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7074 {
7075     /* Return the address of the UV that is reserved to hold 0 if the inversion
7076      * list contains 0.  This has to be the last element of the heading, as the
7077      * list proper starts with either it if 0, or the next element if not.
7078      * (But we force it to contain either 0 or 1) */
7079
7080     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7081
7082     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7083 }
7084
7085 #ifndef PERL_IN_XSUB_RE
7086 SV*
7087 Perl__new_invlist(pTHX_ IV initial_size)
7088 {
7089
7090     /* Return a pointer to a newly constructed inversion list, with enough
7091      * space to store 'initial_size' elements.  If that number is negative, a
7092      * system default is used instead */
7093
7094     SV* new_list;
7095
7096     if (initial_size < 0) {
7097         initial_size = INVLIST_INITIAL_LEN;
7098     }
7099
7100     /* Allocate the initial space */
7101     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7102     invlist_set_len(new_list, 0);
7103
7104     /* Force iterinit() to be used to get iteration to work */
7105     *get_invlist_iter_addr(new_list) = UV_MAX;
7106
7107     /* This should force a segfault if a method doesn't initialize this
7108      * properly */
7109     *get_invlist_zero_addr(new_list) = UV_MAX;
7110
7111     *get_invlist_previous_index_addr(new_list) = 0;
7112     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7113 #if HEADER_LENGTH != 5
7114 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7115 #endif
7116
7117     return new_list;
7118 }
7119 #endif
7120
7121 STATIC SV*
7122 S__new_invlist_C_array(pTHX_ UV* list)
7123 {
7124     /* Return a pointer to a newly constructed inversion list, initialized to
7125      * point to <list>, which has to be in the exact correct inversion list
7126      * form, including internal fields.  Thus this is a dangerous routine that
7127      * should not be used in the wrong hands */
7128
7129     SV* invlist = newSV_type(SVt_PV);
7130
7131     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7132
7133     SvPV_set(invlist, (char *) list);
7134     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7135                                shouldn't touch it */
7136     SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7137
7138     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7139         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7140     }
7141
7142     /* Initialize the iteration pointer.
7143      * XXX This could be done at compile time in charclass_invlists.h, but I
7144      * (khw) am not confident that the suffixes for specifying the C constant
7145      * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7146      * to use 64 bits; might need a Configure probe */
7147     invlist_iterfinish(invlist);
7148
7149     return invlist;
7150 }
7151
7152 STATIC void
7153 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7154 {
7155     /* Grow the maximum size of an inversion list */
7156
7157     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7158
7159     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7160 }
7161
7162 PERL_STATIC_INLINE void
7163 S_invlist_trim(pTHX_ SV* const invlist)
7164 {
7165     PERL_ARGS_ASSERT_INVLIST_TRIM;
7166
7167     /* Change the length of the inversion list to how many entries it currently
7168      * has */
7169
7170     SvPV_shrink_to_cur((SV *) invlist);
7171 }
7172
7173 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7174
7175 STATIC void
7176 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7177 {
7178    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7179     * the end of the inversion list.  The range must be above any existing
7180     * ones. */
7181
7182     UV* array;
7183     UV max = invlist_max(invlist);
7184     UV len = _invlist_len(invlist);
7185
7186     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7187
7188     if (len == 0) { /* Empty lists must be initialized */
7189         array = _invlist_array_init(invlist, start == 0);
7190     }
7191     else {
7192         /* Here, the existing list is non-empty. The current max entry in the
7193          * list is generally the first value not in the set, except when the
7194          * set extends to the end of permissible values, in which case it is
7195          * the first entry in that final set, and so this call is an attempt to
7196          * append out-of-order */
7197
7198         UV final_element = len - 1;
7199         array = invlist_array(invlist);
7200         if (array[final_element] > start
7201             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7202         {
7203             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7204                        array[final_element], start,
7205                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7206         }
7207
7208         /* Here, it is a legal append.  If the new range begins with the first
7209          * value not in the set, it is extending the set, so the new first
7210          * value not in the set is one greater than the newly extended range.
7211          * */
7212         if (array[final_element] == start) {
7213             if (end != UV_MAX) {
7214                 array[final_element] = end + 1;
7215             }
7216             else {
7217                 /* But if the end is the maximum representable on the machine,
7218                  * just let the range that this would extend to have no end */
7219                 invlist_set_len(invlist, len - 1);
7220             }
7221             return;
7222         }
7223     }
7224
7225     /* Here the new range doesn't extend any existing set.  Add it */
7226
7227     len += 2;   /* Includes an element each for the start and end of range */
7228
7229     /* If overflows the existing space, extend, which may cause the array to be
7230      * moved */
7231     if (max < len) {
7232         invlist_extend(invlist, len);
7233         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7234                                            failure in invlist_array() */
7235         array = invlist_array(invlist);
7236     }
7237     else {
7238         invlist_set_len(invlist, len);
7239     }
7240
7241     /* The next item on the list starts the range, the one after that is
7242      * one past the new range.  */
7243     array[len - 2] = start;
7244     if (end != UV_MAX) {
7245         array[len - 1] = end + 1;
7246     }
7247     else {
7248         /* But if the end is the maximum representable on the machine, just let
7249          * the range have no end */
7250         invlist_set_len(invlist, len - 1);
7251     }
7252 }
7253
7254 #ifndef PERL_IN_XSUB_RE
7255
7256 IV
7257 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7258 {
7259     /* Searches the inversion list for the entry that contains the input code
7260      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7261      * return value is the index into the list's array of the range that
7262      * contains <cp> */
7263
7264     IV low = 0;
7265     IV mid;
7266     IV high = _invlist_len(invlist);
7267     const IV highest_element = high - 1;
7268     const UV* array;
7269
7270     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7271
7272     /* If list is empty, return failure. */
7273     if (high == 0) {
7274         return -1;
7275     }
7276
7277     /* (We can't get the array unless we know the list is non-empty) */
7278     array = invlist_array(invlist);
7279
7280     mid = invlist_previous_index(invlist);
7281     assert(mid >=0 && mid <= highest_element);
7282
7283     /* <mid> contains the cache of the result of the previous call to this
7284      * function (0 the first time).  See if this call is for the same result,
7285      * or if it is for mid-1.  This is under the theory that calls to this
7286      * function will often be for related code points that are near each other.
7287      * And benchmarks show that caching gives better results.  We also test
7288      * here if the code point is within the bounds of the list.  These tests
7289      * replace others that would have had to be made anyway to make sure that
7290      * the array bounds were not exceeded, and these give us extra information
7291      * at the same time */
7292     if (cp >= array[mid]) {
7293         if (cp >= array[highest_element]) {
7294             return highest_element;
7295         }
7296
7297         /* Here, array[mid] <= cp < array[highest_element].  This means that
7298          * the final element is not the answer, so can exclude it; it also
7299          * means that <mid> is not the final element, so can refer to 'mid + 1'
7300          * safely */
7301         if (cp < array[mid + 1]) {
7302             return mid;
7303         }
7304         high--;
7305         low = mid + 1;
7306     }
7307     else { /* cp < aray[mid] */
7308         if (cp < array[0]) { /* Fail if outside the array */
7309             return -1;
7310         }
7311         high = mid;
7312         if (cp >= array[mid - 1]) {
7313             goto found_entry;
7314         }
7315     }
7316
7317     /* Binary search.  What we are looking for is <i> such that
7318      *  array[i] <= cp < array[i+1]
7319      * The loop below converges on the i+1.  Note that there may not be an
7320      * (i+1)th element in the array, and things work nonetheless */
7321     while (low < high) {
7322         mid = (low + high) / 2;
7323         assert(mid <= highest_element);
7324         if (array[mid] <= cp) { /* cp >= array[mid] */
7325             low = mid + 1;
7326
7327             /* We could do this extra test to exit the loop early.
7328             if (cp < array[low]) {
7329                 return mid;
7330             }
7331             */
7332         }
7333         else { /* cp < array[mid] */
7334             high = mid;
7335         }
7336     }
7337
7338   found_entry:
7339     high--;
7340     invlist_set_previous_index(invlist, high);
7341     return high;
7342 }
7343
7344 void
7345 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7346 {
7347     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7348      * but is used when the swash has an inversion list.  This makes this much
7349      * faster, as it uses a binary search instead of a linear one.  This is
7350      * intimately tied to that function, and perhaps should be in utf8.c,
7351      * except it is intimately tied to inversion lists as well.  It assumes
7352      * that <swatch> is all 0's on input */
7353
7354     UV current = start;
7355     const IV len = _invlist_len(invlist);
7356     IV i;
7357     const UV * array;
7358
7359     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7360
7361     if (len == 0) { /* Empty inversion list */
7362         return;
7363     }
7364
7365     array = invlist_array(invlist);
7366
7367     /* Find which element it is */
7368     i = _invlist_search(invlist, start);
7369
7370     /* We populate from <start> to <end> */
7371     while (current < end) {
7372         UV upper;
7373
7374         /* The inversion list gives the results for every possible code point
7375          * after the first one in the list.  Only those ranges whose index is
7376          * even are ones that the inversion list matches.  For the odd ones,
7377          * and if the initial code point is not in the list, we have to skip
7378          * forward to the next element */
7379         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7380             i++;
7381             if (i >= len) { /* Finished if beyond the end of the array */
7382                 return;
7383             }
7384             current = array[i];
7385             if (current >= end) {   /* Finished if beyond the end of what we
7386                                        are populating */
7387                 if (LIKELY(end < UV_MAX)) {
7388                     return;
7389                 }
7390
7391                 /* We get here when the upper bound is the maximum
7392                  * representable on the machine, and we are looking for just
7393                  * that code point.  Have to special case it */
7394                 i = len;
7395                 goto join_end_of_list;
7396             }
7397         }
7398         assert(current >= start);
7399
7400         /* The current range ends one below the next one, except don't go past
7401          * <end> */
7402         i++;
7403         upper = (i < len && array[i] < end) ? array[i] : end;
7404
7405         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7406          * for each code point in it */
7407         for (; current < upper; current++) {
7408             const STRLEN offset = (STRLEN)(current - start);
7409             swatch[offset >> 3] |= 1 << (offset & 7);
7410         }
7411
7412     join_end_of_list:
7413
7414         /* Quit if at the end of the list */
7415         if (i >= len) {
7416
7417             /* But first, have to deal with the highest possible code point on
7418              * the platform.  The previous code assumes that <end> is one
7419              * beyond where we want to populate, but that is impossible at the
7420              * platform's infinity, so have to handle it specially */
7421             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7422             {
7423                 const STRLEN offset = (STRLEN)(end - start);
7424                 swatch[offset >> 3] |= 1 << (offset & 7);
7425             }
7426             return;
7427         }
7428
7429         /* Advance to the next range, which will be for code points not in the
7430          * inversion list */
7431         current = array[i];
7432     }
7433
7434     return;
7435 }
7436
7437 void
7438 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7439 {
7440     /* Take the union of two inversion lists and point <output> to it.  *output
7441      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7442      * the reference count to that list will be decremented.  The first list,
7443      * <a>, may be NULL, in which case a copy of the second list is returned.
7444      * If <complement_b> is TRUE, the union is taken of the complement
7445      * (inversion) of <b> instead of b itself.
7446      *
7447      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7448      * Richard Gillam, published by Addison-Wesley, and explained at some
7449      * length there.  The preface says to incorporate its examples into your
7450      * code at your own risk.
7451      *
7452      * The algorithm is like a merge sort.
7453      *
7454      * XXX A potential performance improvement is to keep track as we go along
7455      * if only one of the inputs contributes to the result, meaning the other
7456      * is a subset of that one.  In that case, we can skip the final copy and
7457      * return the larger of the input lists, but then outside code might need
7458      * to keep track of whether to free the input list or not */
7459
7460     UV* array_a;    /* a's array */
7461     UV* array_b;
7462     UV len_a;       /* length of a's array */
7463     UV len_b;
7464
7465     SV* u;                      /* the resulting union */
7466     UV* array_u;
7467     UV len_u;
7468
7469     UV i_a = 0;             /* current index into a's array */
7470     UV i_b = 0;
7471     UV i_u = 0;
7472
7473     /* running count, as explained in the algorithm source book; items are
7474      * stopped accumulating and are output when the count changes to/from 0.
7475      * The count is incremented when we start a range that's in the set, and
7476      * decremented when we start a range that's not in the set.  So its range
7477      * is 0 to 2.  Only when the count is zero is something not in the set.
7478      */
7479     UV count = 0;
7480
7481     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7482     assert(a != b);
7483
7484     /* If either one is empty, the union is the other one */
7485     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7486         if (*output == a) {
7487             if (a != NULL) {
7488                 SvREFCNT_dec_NN(a);
7489             }
7490         }
7491         if (*output != b) {
7492             *output = invlist_clone(b);
7493             if (complement_b) {
7494                 _invlist_invert(*output);
7495             }
7496         } /* else *output already = b; */
7497         return;
7498     }
7499     else if ((len_b = _invlist_len(b)) == 0) {
7500         if (*output == b) {
7501             SvREFCNT_dec_NN(b);
7502         }
7503
7504         /* The complement of an empty list is a list that has everything in it,
7505          * so the union with <a> includes everything too */
7506         if (complement_b) {
7507             if (a == *output) {
7508                 SvREFCNT_dec_NN(a);
7509             }
7510             *output = _new_invlist(1);
7511             _append_range_to_invlist(*output, 0, UV_MAX);
7512         }
7513         else if (*output != a) {
7514             *output = invlist_clone(a);
7515         }
7516         /* else *output already = a; */
7517         return;
7518     }
7519
7520     /* Here both lists exist and are non-empty */
7521     array_a = invlist_array(a);
7522     array_b = invlist_array(b);
7523
7524     /* If are to take the union of 'a' with the complement of b, set it
7525      * up so are looking at b's complement. */
7526     if (complement_b) {
7527
7528         /* To complement, we invert: if the first element is 0, remove it.  To
7529          * do this, we just pretend the array starts one later, and clear the
7530          * flag as we don't have to do anything else later */
7531         if (array_b[0] == 0) {
7532             array_b++;
7533             len_b--;
7534             complement_b = FALSE;
7535         }
7536         else {
7537
7538             /* But if the first element is not zero, we unshift a 0 before the
7539              * array.  The data structure reserves a space for that 0 (which
7540              * should be a '1' right now), so physical shifting is unneeded,
7541              * but temporarily change that element to 0.  Before exiting the
7542              * routine, we must restore the element to '1' */
7543             array_b--;
7544             len_b++;
7545             array_b[0] = 0;
7546         }
7547     }
7548
7549     /* Size the union for the worst case: that the sets are completely
7550      * disjoint */
7551     u = _new_invlist(len_a + len_b);
7552
7553     /* Will contain U+0000 if either component does */
7554     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7555                                       || (len_b > 0 && array_b[0] == 0));
7556
7557     /* Go through each list item by item, stopping when exhausted one of
7558      * them */
7559     while (i_a < len_a && i_b < len_b) {
7560         UV cp;      /* The element to potentially add to the union's array */
7561         bool cp_in_set;   /* is it in the the input list's set or not */
7562
7563         /* We need to take one or the other of the two inputs for the union.
7564          * Since we are merging two sorted lists, we take the smaller of the
7565          * next items.  In case of a tie, we take the one that is in its set
7566          * first.  If we took one not in the set first, it would decrement the
7567          * count, possibly to 0 which would cause it to be output as ending the
7568          * range, and the next time through we would take the same number, and
7569          * output it again as beginning the next range.  By doing it the
7570          * opposite way, there is no possibility that the count will be
7571          * momentarily decremented to 0, and thus the two adjoining ranges will
7572          * be seamlessly merged.  (In a tie and both are in the set or both not
7573          * in the set, it doesn't matter which we take first.) */
7574         if (array_a[i_a] < array_b[i_b]
7575             || (array_a[i_a] == array_b[i_b]
7576                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7577         {
7578             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7579             cp= array_a[i_a++];
7580         }
7581         else {
7582             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7583             cp= array_b[i_b++];
7584         }
7585
7586         /* Here, have chosen which of the two inputs to look at.  Only output
7587          * if the running count changes to/from 0, which marks the
7588          * beginning/end of a range in that's in the set */
7589         if (cp_in_set) {
7590             if (count == 0) {
7591                 array_u[i_u++] = cp;
7592             }
7593             count++;
7594         }
7595         else {
7596             count--;
7597             if (count == 0) {
7598                 array_u[i_u++] = cp;
7599             }
7600         }
7601     }
7602
7603     /* Here, we are finished going through at least one of the lists, which
7604      * means there is something remaining in at most one.  We check if the list
7605      * that hasn't been exhausted is positioned such that we are in the middle
7606      * of a range in its set or not.  (i_a and i_b point to the element beyond
7607      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7608      * is potentially more to output.
7609      * There are four cases:
7610      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7611      *     in the union is entirely from the non-exhausted set.
7612      *  2) Both were in their sets, count is 2.  Nothing further should
7613      *     be output, as everything that remains will be in the exhausted
7614      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7615      *     that
7616      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7617      *     Nothing further should be output because the union includes
7618      *     everything from the exhausted set.  Not decrementing ensures that.
7619      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7620      *     decrementing to 0 insures that we look at the remainder of the
7621      *     non-exhausted set */
7622     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7623         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7624     {
7625         count--;
7626     }
7627
7628     /* The final length is what we've output so far, plus what else is about to
7629      * be output.  (If 'count' is non-zero, then the input list we exhausted
7630      * has everything remaining up to the machine's limit in its set, and hence
7631      * in the union, so there will be no further output. */
7632     len_u = i_u;
7633     if (count == 0) {
7634         /* At most one of the subexpressions will be non-zero */
7635         len_u += (len_a - i_a) + (len_b - i_b);
7636     }
7637
7638     /* Set result to final length, which can change the pointer to array_u, so
7639      * re-find it */
7640     if (len_u != _invlist_len(u)) {
7641         invlist_set_len(u, len_u);
7642         invlist_trim(u);
7643         array_u = invlist_array(u);
7644     }
7645
7646     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7647      * the other) ended with everything above it not in its set.  That means
7648      * that the remaining part of the union is precisely the same as the
7649      * non-exhausted list, so can just copy it unchanged.  (If both list were
7650      * exhausted at the same time, then the operations below will be both 0.)
7651      */
7652     if (count == 0) {
7653         IV copy_count; /* At most one will have a non-zero copy count */
7654         if ((copy_count = len_a - i_a) > 0) {
7655             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7656         }
7657         else if ((copy_count = len_b - i_b) > 0) {
7658             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7659         }
7660     }
7661
7662     /* If we've changed b, restore it */
7663     if (complement_b) {
7664         array_b[0] = 1;
7665     }
7666
7667     /*  We may be removing a reference to one of the inputs */
7668     if (a == *output || b == *output) {
7669         assert(! invlist_is_iterating(*output));
7670         SvREFCNT_dec_NN(*output);
7671     }
7672
7673     *output = u;
7674     return;
7675 }
7676
7677 void
7678 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7679 {
7680     /* Take the intersection of two inversion lists and point <i> to it.  *i
7681      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7682      * the reference count to that list will be decremented.
7683      * If <complement_b> is TRUE, the result will be the intersection of <a>
7684      * and the complement (or inversion) of <b> instead of <b> directly.
7685      *
7686      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7687      * Richard Gillam, published by Addison-Wesley, and explained at some
7688      * length there.  The preface says to incorporate its examples into your
7689      * code at your own risk.  In fact, it had bugs
7690      *
7691      * The algorithm is like a merge sort, and is essentially the same as the
7692      * union above
7693      */
7694
7695     UV* array_a;                /* a's array */
7696     UV* array_b;
7697     UV len_a;   /* length of a's array */
7698     UV len_b;
7699
7700     SV* r;                   /* the resulting intersection */
7701     UV* array_r;
7702     UV len_r;
7703
7704     UV i_a = 0;             /* current index into a's array */
7705     UV i_b = 0;
7706     UV i_r = 0;
7707
7708     /* running count, as explained in the algorithm source book; items are
7709      * stopped accumulating and are output when the count changes to/from 2.
7710      * The count is incremented when we start a range that's in the set, and
7711      * decremented when we start a range that's not in the set.  So its range
7712      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7713      */
7714     UV count = 0;
7715
7716     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7717     assert(a != b);
7718
7719     /* Special case if either one is empty */
7720     len_a = _invlist_len(a);
7721     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7722
7723         if (len_a != 0 && complement_b) {
7724
7725             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7726              * be empty.  Here, also we are using 'b's complement, which hence
7727              * must be every possible code point.  Thus the intersection is
7728              * simply 'a'. */
7729             if (*i != a) {
7730                 *i = invlist_clone(a);
7731
7732                 if (*i == b) {
7733                     SvREFCNT_dec_NN(b);
7734                 }
7735             }
7736             /* else *i is already 'a' */
7737             return;
7738         }
7739
7740         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7741          * intersection must be empty */
7742         if (*i == a) {
7743             SvREFCNT_dec_NN(a);
7744         }
7745         else if (*i == b) {
7746             SvREFCNT_dec_NN(b);
7747         }
7748         *i = _new_invlist(0);
7749         return;
7750     }
7751
7752     /* Here both lists exist and are non-empty */
7753     array_a = invlist_array(a);
7754     array_b = invlist_array(b);
7755
7756     /* If are to take the intersection of 'a' with the complement of b, set it
7757      * up so are looking at b's complement. */
7758     if (complement_b) {
7759
7760         /* To complement, we invert: if the first element is 0, remove it.  To
7761          * do this, we just pretend the array starts one later, and clear the
7762          * flag as we don't have to do anything else later */
7763         if (array_b[0] == 0) {
7764             array_b++;
7765             len_b--;
7766             complement_b = FALSE;
7767         }
7768         else {
7769
7770             /* But if the first element is not zero, we unshift a 0 before the
7771              * array.  The data structure reserves a space for that 0 (which
7772              * should be a '1' right now), so physical shifting is unneeded,
7773              * but temporarily change that element to 0.  Before exiting the
7774              * routine, we must restore the element to '1' */
7775             array_b--;
7776             len_b++;
7777             array_b[0] = 0;
7778         }
7779     }
7780
7781     /* Size the intersection for the worst case: that the intersection ends up
7782      * fragmenting everything to be completely disjoint */
7783     r= _new_invlist(len_a + len_b);
7784
7785     /* Will contain U+0000 iff both components do */
7786     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7787                                      && len_b > 0 && array_b[0] == 0);
7788
7789     /* Go through each list item by item, stopping when exhausted one of
7790      * them */
7791     while (i_a < len_a && i_b < len_b) {
7792         UV cp;      /* The element to potentially add to the intersection's
7793                        array */
7794         bool cp_in_set; /* Is it in the input list's set or not */
7795
7796         /* We need to take one or the other of the two inputs for the
7797          * intersection.  Since we are merging two sorted lists, we take the
7798          * smaller of the next items.  In case of a tie, we take the one that
7799          * is not in its set first (a difference from the union algorithm).  If
7800          * we took one in the set first, it would increment the count, possibly
7801          * to 2 which would cause it to be output as starting a range in the
7802          * intersection, and the next time through we would take that same
7803          * number, and output it again as ending the set.  By doing it the
7804          * opposite of this, there is no possibility that the count will be
7805          * momentarily incremented to 2.  (In a tie and both are in the set or
7806          * both not in the set, it doesn't matter which we take first.) */
7807         if (array_a[i_a] < array_b[i_b]
7808             || (array_a[i_a] == array_b[i_b]
7809                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7810         {
7811             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7812             cp= array_a[i_a++];
7813         }
7814         else {
7815             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7816             cp= array_b[i_b++];
7817         }
7818
7819         /* Here, have chosen which of the two inputs to look at.  Only output
7820          * if the running count changes to/from 2, which marks the
7821          * beginning/end of a range that's in the intersection */
7822         if (cp_in_set) {
7823             count++;
7824             if (count == 2) {
7825                 array_r[i_r++] = cp;
7826             }
7827         }
7828         else {
7829             if (count == 2) {
7830                 array_r[i_r++] = cp;
7831             }
7832             count--;
7833         }
7834     }
7835
7836     /* Here, we are finished going through at least one of the lists, which
7837      * means there is something remaining in at most one.  We check if the list
7838      * that has been exhausted is positioned such that we are in the middle
7839      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7840      * the ones we care about.)  There are four cases:
7841      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7842      *     nothing left in the intersection.
7843      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7844      *     above 2.  What should be output is exactly that which is in the
7845      *     non-exhausted set, as everything it has is also in the intersection
7846      *     set, and everything it doesn't have can't be in the intersection
7847      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7848      *     gets incremented to 2.  Like the previous case, the intersection is
7849      *     everything that remains in the non-exhausted set.
7850      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7851      *     remains 1.  And the intersection has nothing more. */
7852     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7853         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7854     {
7855         count++;
7856     }
7857
7858     /* The final length is what we've output so far plus what else is in the
7859      * intersection.  At most one of the subexpressions below will be non-zero */
7860     len_r = i_r;
7861     if (count >= 2) {
7862         len_r += (len_a - i_a) + (len_b - i_b);
7863     }
7864
7865     /* Set result to final length, which can change the pointer to array_r, so
7866      * re-find it */
7867     if (len_r != _invlist_len(r)) {
7868         invlist_set_len(r, len_r);
7869         invlist_trim(r);
7870         array_r = invlist_array(r);
7871     }
7872
7873     /* Finish outputting any remaining */
7874     if (count >= 2) { /* At most one will have a non-zero copy count */
7875         IV copy_count;
7876         if ((copy_count = len_a - i_a) > 0) {
7877             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7878         }
7879         else if ((copy_count = len_b - i_b) > 0) {
7880             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7881         }
7882     }
7883
7884     /* If we've changed b, restore it */
7885     if (complement_b) {
7886         array_b[0] = 1;
7887     }
7888
7889     /*  We may be removing a reference to one of the inputs */
7890     if (a == *i || b == *i) {
7891         assert(! invlist_is_iterating(*i));
7892         SvREFCNT_dec_NN(*i);
7893     }
7894
7895     *i = r;
7896     return;
7897 }
7898
7899 SV*
7900 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7901 {
7902     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7903      * set.  A pointer to the inversion list is returned.  This may actually be
7904      * a new list, in which case the passed in one has been destroyed.  The
7905      * passed in inversion list can be NULL, in which case a new one is created
7906      * with just the one range in it */
7907
7908     SV* range_invlist;
7909     UV len;
7910
7911     if (invlist == NULL) {
7912         invlist = _new_invlist(2);
7913         len = 0;
7914     }
7915     else {
7916         len = _invlist_len(invlist);
7917     }
7918
7919     /* If comes after the final entry actually in the list, can just append it
7920      * to the end, */
7921     if (len == 0
7922         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
7923             && start >= invlist_array(invlist)[len - 1]))
7924     {
7925         _append_range_to_invlist(invlist, start, end);
7926         return invlist;
7927     }
7928
7929     /* Here, can't just append things, create and return a new inversion list
7930      * which is the union of this range and the existing inversion list */
7931     range_invlist = _new_invlist(2);
7932     _append_range_to_invlist(range_invlist, start, end);
7933
7934     _invlist_union(invlist, range_invlist, &invlist);
7935
7936     /* The temporary can be freed */
7937     SvREFCNT_dec_NN(range_invlist);
7938
7939     return invlist;
7940 }
7941
7942 #endif
7943
7944 PERL_STATIC_INLINE SV*
7945 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7946     return _add_range_to_invlist(invlist, cp, cp);
7947 }
7948
7949 #ifndef PERL_IN_XSUB_RE
7950 void
7951 Perl__invlist_invert(pTHX_ SV* const invlist)
7952 {
7953     /* Complement the input inversion list.  This adds a 0 if the list didn't
7954      * have a zero; removes it otherwise.  As described above, the data
7955      * structure is set up so that this is very efficient */
7956
7957     UV* len_pos = _get_invlist_len_addr(invlist);
7958
7959     PERL_ARGS_ASSERT__INVLIST_INVERT;
7960
7961     assert(! invlist_is_iterating(invlist));
7962
7963     /* The inverse of matching nothing is matching everything */
7964     if (*len_pos == 0) {
7965         _append_range_to_invlist(invlist, 0, UV_MAX);
7966         return;
7967     }
7968
7969     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7970      * zero element was a 0, so it is being removed, so the length decrements
7971      * by 1; and vice-versa.  SvCUR is unaffected */
7972     if (*get_invlist_zero_addr(invlist) ^= 1) {
7973         (*len_pos)--;
7974     }
7975     else {
7976         (*len_pos)++;
7977     }
7978 }
7979
7980 void
7981 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7982 {
7983     /* Complement the input inversion list (which must be a Unicode property,
7984      * all of which don't match above the Unicode maximum code point.)  And
7985      * Perl has chosen to not have the inversion match above that either.  This
7986      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7987      */
7988
7989     UV len;
7990     UV* array;
7991
7992     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7993
7994     _invlist_invert(invlist);
7995
7996     len = _invlist_len(invlist);
7997
7998     if (len != 0) { /* If empty do nothing */
7999         array = invlist_array(invlist);
8000         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8001             /* Add 0x110000.  First, grow if necessary */
8002             len++;
8003             if (invlist_max(invlist) < len) {
8004                 invlist_extend(invlist, len);
8005                 array = invlist_array(invlist);
8006             }
8007             invlist_set_len(invlist, len);
8008             array[len - 1] = PERL_UNICODE_MAX + 1;
8009         }
8010         else {  /* Remove the 0x110000 */
8011             invlist_set_len(invlist, len - 1);
8012         }
8013     }
8014
8015     return;
8016 }
8017 #endif
8018
8019 PERL_STATIC_INLINE SV*
8020 S_invlist_clone(pTHX_ SV* const invlist)
8021 {
8022
8023     /* Return a new inversion list that is a copy of the input one, which is
8024      * unchanged */
8025
8026     /* Need to allocate extra space to accommodate Perl's addition of a
8027      * trailing NUL to SvPV's, since it thinks they are always strings */
8028     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8029     STRLEN length = SvCUR(invlist);
8030
8031     PERL_ARGS_ASSERT_INVLIST_CLONE;
8032
8033     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8034     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8035
8036     return new_invlist;
8037 }
8038
8039 PERL_STATIC_INLINE UV*
8040 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8041 {
8042     /* Return the address of the UV that contains the current iteration
8043      * position */
8044
8045     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8046
8047     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8048 }
8049
8050 PERL_STATIC_INLINE UV*
8051 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8052 {
8053     /* Return the address of the UV that contains the version id. */
8054
8055     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8056
8057     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8058 }
8059
8060 PERL_STATIC_INLINE void
8061 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8062 {
8063     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8064
8065     *get_invlist_iter_addr(invlist) = 0;
8066 }
8067
8068 PERL_STATIC_INLINE void
8069 S_invlist_iterfinish(pTHX_ SV* invlist)
8070 {
8071     /* Terminate iterator for invlist.  This is to catch development errors.
8072      * Any iteration that is interrupted before completed should call this
8073      * function.  Functions that add code points anywhere else but to the end
8074      * of an inversion list assert that they are not in the middle of an
8075      * iteration.  If they were, the addition would make the iteration
8076      * problematical: if the iteration hadn't reached the place where things
8077      * were being added, it would be ok */
8078
8079     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8080
8081     *get_invlist_iter_addr(invlist) = UV_MAX;
8082 }
8083
8084 STATIC bool
8085 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8086 {
8087     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8088      * This call sets in <*start> and <*end>, the next range in <invlist>.
8089      * Returns <TRUE> if successful and the next call will return the next
8090      * range; <FALSE> if was already at the end of the list.  If the latter,
8091      * <*start> and <*end> are unchanged, and the next call to this function
8092      * will start over at the beginning of the list */
8093
8094     UV* pos = get_invlist_iter_addr(invlist);
8095     UV len = _invlist_len(invlist);
8096     UV *array;
8097
8098     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8099
8100     if (*pos >= len) {
8101         *pos = UV_MAX;  /* Force iterinit() to be required next time */
8102         return FALSE;
8103     }
8104
8105     array = invlist_array(invlist);
8106
8107     *start = array[(*pos)++];
8108
8109     if (*pos >= len) {
8110         *end = UV_MAX;
8111     }
8112     else {
8113         *end = array[(*pos)++] - 1;
8114     }
8115
8116     return TRUE;
8117 }
8118
8119 PERL_STATIC_INLINE bool
8120 S_invlist_is_iterating(pTHX_ SV* const invlist)
8121 {
8122     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8123
8124     return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8125 }
8126
8127 PERL_STATIC_INLINE UV
8128 S_invlist_highest(pTHX_ SV* const invlist)
8129 {
8130     /* Returns the highest code point that matches an inversion list.  This API
8131      * has an ambiguity, as it returns 0 under either the highest is actually
8132      * 0, or if the list is empty.  If this distinction matters to you, check
8133      * for emptiness before calling this function */
8134
8135     UV len = _invlist_len(invlist);
8136     UV *array;
8137
8138     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8139
8140     if (len == 0) {
8141         return 0;
8142     }
8143
8144     array = invlist_array(invlist);
8145
8146     /* The last element in the array in the inversion list always starts a
8147      * range that goes to infinity.  That range may be for code points that are
8148      * matched in the inversion list, or it may be for ones that aren't
8149      * matched.  In the latter case, the highest code point in the set is one
8150      * less than the beginning of this range; otherwise it is the final element
8151      * of this range: infinity */
8152     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8153            ? UV_MAX
8154            : array[len - 1] - 1;
8155 }
8156
8157 #ifndef PERL_IN_XSUB_RE
8158 SV *
8159 Perl__invlist_contents(pTHX_ SV* const invlist)
8160 {
8161     /* Get the contents of an inversion list into a string SV so that they can
8162      * be printed out.  It uses the format traditionally done for debug tracing
8163      */
8164
8165     UV start, end;
8166     SV* output = newSVpvs("\n");
8167
8168     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8169
8170     assert(! invlist_is_iterating(invlist));
8171
8172     invlist_iterinit(invlist);
8173     while (invlist_iternext(invlist, &start, &end)) {
8174         if (end == UV_MAX) {
8175             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8176         }
8177         else if (end != start) {
8178             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8179                     start,       end);
8180         }
8181         else {
8182             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8183         }
8184     }
8185
8186     return output;
8187 }
8188 #endif
8189
8190 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8191 void
8192 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8193 {
8194     /* Dumps out the ranges in an inversion list.  The string 'header'
8195      * if present is output on a line before the first range */
8196
8197     UV start, end;
8198
8199     PERL_ARGS_ASSERT__INVLIST_DUMP;
8200
8201     if (header && strlen(header)) {
8202         PerlIO_printf(Perl_debug_log, "%s\n", header);
8203     }
8204     if (invlist_is_iterating(invlist)) {
8205         PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8206         return;
8207     }
8208
8209     invlist_iterinit(invlist);
8210     while (invlist_iternext(invlist, &start, &end)) {
8211         if (end == UV_MAX) {
8212             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8213         }
8214         else if (end != start) {
8215             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8216                                                  start,         end);
8217         }
8218         else {
8219             PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8220         }
8221     }
8222 }
8223 #endif
8224
8225 #if 0
8226 bool
8227 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8228 {
8229     /* Return a boolean as to if the two passed in inversion lists are
8230      * identical.  The final argument, if TRUE, says to take the complement of
8231      * the second inversion list before doing the comparison */
8232
8233     UV* array_a = invlist_array(a);
8234     UV* array_b = invlist_array(b);
8235     UV len_a = _invlist_len(a);
8236     UV len_b = _invlist_len(b);
8237
8238     UV i = 0;               /* current index into the arrays */
8239     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8240
8241     PERL_ARGS_ASSERT__INVLISTEQ;
8242
8243     /* If are to compare 'a' with the complement of b, set it
8244      * up so are looking at b's complement. */
8245     if (complement_b) {
8246
8247         /* The complement of nothing is everything, so <a> would have to have
8248          * just one element, starting at zero (ending at infinity) */
8249         if (len_b == 0) {
8250             return (len_a == 1 && array_a[0] == 0);
8251         }
8252         else if (array_b[0] == 0) {
8253
8254             /* Otherwise, to complement, we invert.  Here, the first element is
8255              * 0, just remove it.  To do this, we just pretend the array starts
8256              * one later, and clear the flag as we don't have to do anything
8257              * else later */
8258
8259             array_b++;
8260             len_b--;
8261             complement_b = FALSE;
8262         }
8263         else {
8264
8265             /* But if the first element is not zero, we unshift a 0 before the
8266              * array.  The data structure reserves a space for that 0 (which
8267              * should be a '1' right now), so physical shifting is unneeded,
8268              * but temporarily change that element to 0.  Before exiting the
8269              * routine, we must restore the element to '1' */
8270             array_b--;
8271             len_b++;
8272             array_b[0] = 0;
8273         }
8274     }
8275
8276     /* Make sure that the lengths are the same, as well as the final element
8277      * before looping through the remainder.  (Thus we test the length, final,
8278      * and first elements right off the bat) */
8279     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8280         retval = FALSE;
8281     }
8282     else for (i = 0; i < len_a - 1; i++) {
8283         if (array_a[i] != array_b[i]) {
8284             retval = FALSE;
8285             break;
8286         }
8287     }
8288
8289     if (complement_b) {
8290         array_b[0] = 1;
8291     }
8292     return retval;
8293 }
8294 #endif
8295
8296 #undef HEADER_LENGTH
8297 #undef INVLIST_INITIAL_LENGTH
8298 #undef TO_INTERNAL_SIZE
8299 #undef FROM_INTERNAL_SIZE
8300 #undef INVLIST_LEN_OFFSET
8301 #undef INVLIST_ZERO_OFFSET
8302 #undef INVLIST_ITER_OFFSET
8303 #undef INVLIST_VERSION_ID
8304 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8305
8306 /* End of inversion list object */
8307
8308 /*
8309  - reg - regular expression, i.e. main body or parenthesized thing
8310  *
8311  * Caller must absorb opening parenthesis.
8312  *
8313  * Combining parenthesis handling with the base level of regular expression
8314  * is a trifle forced, but the need to tie the tails of the branches to what
8315  * follows makes it hard to avoid.
8316  */
8317 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8318 #ifdef DEBUGGING
8319 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8320 #else
8321 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8322 #endif
8323
8324 STATIC regnode *
8325 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8326     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8327 {
8328     dVAR;
8329     regnode *ret;               /* Will be the head of the group. */
8330     regnode *br;
8331     regnode *lastbr;
8332     regnode *ender = NULL;
8333     I32 parno = 0;
8334     I32 flags;
8335     U32 oregflags = RExC_flags;
8336     bool have_branch = 0;
8337     bool is_open = 0;
8338     I32 freeze_paren = 0;
8339     I32 after_freeze = 0;
8340
8341     /* for (?g), (?gc), and (?o) warnings; warning
8342        about (?c) will warn about (?g) -- japhy    */
8343
8344 #define WASTED_O  0x01
8345 #define WASTED_G  0x02
8346 #define WASTED_C  0x04
8347 #define WASTED_GC (0x02|0x04)
8348     I32 wastedflags = 0x00;
8349
8350     char * parse_start = RExC_parse; /* MJD */
8351     char * const oregcomp_parse = RExC_parse;
8352
8353     GET_RE_DEBUG_FLAGS_DECL;
8354
8355     PERL_ARGS_ASSERT_REG;
8356     DEBUG_PARSE("reg ");
8357
8358     *flagp = 0;                         /* Tentatively. */
8359
8360
8361     /* Make an OPEN node, if parenthesized. */
8362     if (paren) {
8363         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8364             char *start_verb = RExC_parse;
8365             STRLEN verb_len = 0;
8366             char *start_arg = NULL;
8367             unsigned char op = 0;
8368             int argok = 1;
8369             int internal_argval = 0; /* internal_argval is only useful if !argok */
8370             while ( *RExC_parse && *RExC_parse != ')' ) {
8371                 if ( *RExC_parse == ':' ) {
8372                     start_arg = RExC_parse + 1;
8373                     break;
8374                 }
8375                 RExC_parse++;
8376             }
8377             ++start_verb;
8378             verb_len = RExC_parse - start_verb;
8379             if ( start_arg ) {
8380                 RExC_parse++;
8381                 while ( *RExC_parse && *RExC_parse != ')' )
8382                     RExC_parse++;
8383                 if ( *RExC_parse != ')' )
8384                     vFAIL("Unterminated verb pattern argument");
8385                 if ( RExC_parse == start_arg )
8386                     start_arg = NULL;
8387             } else {
8388                 if ( *RExC_parse != ')' )
8389                     vFAIL("Unterminated verb pattern");
8390             }
8391
8392             switch ( *start_verb ) {
8393             case 'A':  /* (*ACCEPT) */
8394                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8395                     op = ACCEPT;
8396                     internal_argval = RExC_nestroot;
8397                 }
8398                 break;
8399             case 'C':  /* (*COMMIT) */
8400                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8401                     op = COMMIT;
8402                 break;
8403             case 'F':  /* (*FAIL) */
8404                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8405                     op = OPFAIL;
8406                     argok = 0;
8407                 }
8408                 break;
8409             case ':':  /* (*:NAME) */
8410             case 'M':  /* (*MARK:NAME) */
8411                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8412                     op = MARKPOINT;
8413                     argok = -1;
8414                 }
8415                 break;
8416             case 'P':  /* (*PRUNE) */
8417                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8418                     op = PRUNE;
8419                 break;
8420             case 'S':   /* (*SKIP) */
8421                 if ( memEQs(start_verb,verb_len,"SKIP") )
8422                     op = SKIP;
8423                 break;
8424             case 'T':  /* (*THEN) */
8425                 /* [19:06] <TimToady> :: is then */
8426                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8427                     op = CUTGROUP;
8428                     RExC_seen |= REG_SEEN_CUTGROUP;
8429                 }
8430                 break;
8431             }
8432             if ( ! op ) {
8433                 RExC_parse++;
8434                 vFAIL3("Unknown verb pattern '%.*s'",
8435                     verb_len, start_verb);
8436             }
8437             if ( argok ) {
8438                 if ( start_arg && internal_argval ) {
8439                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8440                         verb_len, start_verb);
8441                 } else if ( argok < 0 && !start_arg ) {
8442                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8443                         verb_len, start_verb);
8444                 } else {
8445                     ret = reganode(pRExC_state, op, internal_argval);
8446                     if ( ! internal_argval && ! SIZE_ONLY ) {
8447                         if (start_arg) {
8448                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8449                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8450                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8451                             ret->flags = 0;
8452                         } else {
8453                             ret->flags = 1;
8454                         }
8455                     }
8456                 }
8457                 if (!internal_argval)
8458                     RExC_seen |= REG_SEEN_VERBARG;
8459             } else if ( start_arg ) {
8460                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8461                         verb_len, start_verb);
8462             } else {
8463                 ret = reg_node(pRExC_state, op);
8464             }
8465             nextchar(pRExC_state);
8466             return ret;
8467         } else
8468         if (*RExC_parse == '?') { /* (?...) */
8469             bool is_logical = 0;
8470             const char * const seqstart = RExC_parse;
8471             bool has_use_defaults = FALSE;
8472
8473             RExC_parse++;
8474             paren = *RExC_parse++;
8475             ret = NULL;                 /* For look-ahead/behind. */
8476             switch (paren) {
8477
8478             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8479                 paren = *RExC_parse++;
8480                 if ( paren == '<')         /* (?P<...>) named capture */
8481                     goto named_capture;
8482                 else if (paren == '>') {   /* (?P>name) named recursion */
8483                     goto named_recursion;
8484                 }
8485                 else if (paren == '=') {   /* (?P=...)  named backref */
8486                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8487                        you change this make sure you change that */
8488                     char* name_start = RExC_parse;
8489                     U32 num = 0;
8490                     SV *sv_dat = reg_scan_name(pRExC_state,
8491                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8492                     if (RExC_parse == name_start || *RExC_parse != ')')
8493                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8494
8495                     if (!SIZE_ONLY) {
8496                         num = add_data( pRExC_state, 1, "S" );
8497                         RExC_rxi->data->data[num]=(void*)sv_dat;
8498                         SvREFCNT_inc_simple_void(sv_dat);
8499                     }
8500                     RExC_sawback = 1;
8501                     ret = reganode(pRExC_state,
8502                                    ((! FOLD)
8503                                      ? NREF
8504                                      : (ASCII_FOLD_RESTRICTED)
8505                                        ? NREFFA
8506                                        : (AT_LEAST_UNI_SEMANTICS)
8507                                          ? NREFFU
8508                                          : (LOC)
8509                                            ? NREFFL
8510                                            : NREFF),
8511                                     num);
8512                     *flagp |= HASWIDTH;
8513
8514                     Set_Node_Offset(ret, parse_start+1);
8515                     Set_Node_Cur_Length(ret); /* MJD */
8516
8517                     nextchar(pRExC_state);
8518                     return ret;
8519                 }
8520                 RExC_parse++;
8521                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8522                 /*NOTREACHED*/
8523             case '<':           /* (?<...) */
8524                 if (*RExC_parse == '!')
8525                     paren = ',';
8526                 else if (*RExC_parse != '=')
8527               named_capture:
8528                 {               /* (?<...>) */
8529                     char *name_start;
8530                     SV *svname;
8531                     paren= '>';
8532             case '\'':          /* (?'...') */
8533                     name_start= RExC_parse;
8534                     svname = reg_scan_name(pRExC_state,
8535                         SIZE_ONLY ?  /* reverse test from the others */
8536                         REG_RSN_RETURN_NAME :
8537                         REG_RSN_RETURN_NULL);
8538                     if (RExC_parse == name_start) {
8539                         RExC_parse++;
8540                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8541                         /*NOTREACHED*/
8542                     }
8543                     if (*RExC_parse != paren)
8544                         vFAIL2("Sequence (?%c... not terminated",
8545                             paren=='>' ? '<' : paren);
8546                     if (SIZE_ONLY) {
8547                         HE *he_str;
8548                         SV *sv_dat = NULL;
8549                         if (!svname) /* shouldn't happen */
8550                             Perl_croak(aTHX_
8551                                 "panic: reg_scan_name returned NULL");
8552                         if (!RExC_paren_names) {
8553                             RExC_paren_names= newHV();
8554                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8555 #ifdef DEBUGGING
8556                             RExC_paren_name_list= newAV();
8557                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8558 #endif
8559                         }
8560                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8561                         if ( he_str )
8562                             sv_dat = HeVAL(he_str);
8563                         if ( ! sv_dat ) {
8564                             /* croak baby croak */
8565                             Perl_croak(aTHX_
8566                                 "panic: paren_name hash element allocation failed");
8567                         } else if ( SvPOK(sv_dat) ) {
8568                             /* (?|...) can mean we have dupes so scan to check
8569                                its already been stored. Maybe a flag indicating
8570                                we are inside such a construct would be useful,
8571                                but the arrays are likely to be quite small, so
8572                                for now we punt -- dmq */
8573                             IV count = SvIV(sv_dat);
8574                             I32 *pv = (I32*)SvPVX(sv_dat);
8575                             IV i;
8576                             for ( i = 0 ; i < count ; i++ ) {
8577                                 if ( pv[i] == RExC_npar ) {
8578                                     count = 0;
8579                                     break;
8580                                 }
8581                             }
8582                             if ( count ) {
8583                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8584                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8585                                 pv[count] = RExC_npar;
8586                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8587                             }
8588                         } else {
8589                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8590                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8591                             SvIOK_on(sv_dat);
8592                             SvIV_set(sv_dat, 1);
8593                         }
8594 #ifdef DEBUGGING
8595                         /* Yes this does cause a memory leak in debugging Perls */
8596                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8597                             SvREFCNT_dec_NN(svname);
8598 #endif
8599
8600                         /*sv_dump(sv_dat);*/
8601                     }
8602                     nextchar(pRExC_state);
8603                     paren = 1;
8604                     goto capturing_parens;
8605                 }
8606                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8607                 RExC_in_lookbehind++;
8608                 RExC_parse++;
8609             case '=':           /* (?=...) */
8610                 RExC_seen_zerolen++;
8611                 break;
8612             case '!':           /* (?!...) */
8613                 RExC_seen_zerolen++;
8614                 if (*RExC_parse == ')') {
8615                     ret=reg_node(pRExC_state, OPFAIL);
8616                     nextchar(pRExC_state);
8617                     return ret;
8618                 }
8619                 break;
8620             case '|':           /* (?|...) */
8621                 /* branch reset, behave like a (?:...) except that
8622                    buffers in alternations share the same numbers */
8623                 paren = ':';
8624                 after_freeze = freeze_paren = RExC_npar;
8625                 break;
8626             case ':':           /* (?:...) */
8627             case '>':           /* (?>...) */
8628                 break;
8629             case '$':           /* (?$...) */
8630             case '@':           /* (?@...) */
8631                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8632                 break;
8633             case '#':           /* (?#...) */
8634                 while (*RExC_parse && *RExC_parse != ')')
8635                     RExC_parse++;
8636                 if (*RExC_parse != ')')
8637                     FAIL("Sequence (?#... not terminated");
8638                 nextchar(pRExC_state);
8639                 *flagp = TRYAGAIN;
8640                 return NULL;
8641             case '0' :           /* (?0) */
8642             case 'R' :           /* (?R) */
8643                 if (*RExC_parse != ')')
8644                     FAIL("Sequence (?R) not terminated");
8645                 ret = reg_node(pRExC_state, GOSTART);
8646                 *flagp |= POSTPONED;
8647                 nextchar(pRExC_state);
8648                 return ret;
8649                 /*notreached*/
8650             { /* named and numeric backreferences */
8651                 I32 num;
8652             case '&':            /* (?&NAME) */
8653                 parse_start = RExC_parse - 1;
8654               named_recursion:
8655                 {
8656                     SV *sv_dat = reg_scan_name(pRExC_state,
8657                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8658                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8659                 }
8660                 goto gen_recurse_regop;
8661                 assert(0); /* NOT REACHED */
8662             case '+':
8663                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8664                     RExC_parse++;
8665                     vFAIL("Illegal pattern");
8666                 }
8667                 goto parse_recursion;
8668                 /* NOT REACHED*/
8669             case '-': /* (?-1) */
8670                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8671                     RExC_parse--; /* rewind to let it be handled later */
8672                     goto parse_flags;
8673                 }
8674                 /*FALLTHROUGH */
8675             case '1': case '2': case '3': case '4': /* (?1) */
8676             case '5': case '6': case '7': case '8': case '9':
8677                 RExC_parse--;
8678               parse_recursion:
8679                 num = atoi(RExC_parse);
8680                 parse_start = RExC_parse - 1; /* MJD */
8681                 if (*RExC_parse == '-')
8682                     RExC_parse++;
8683                 while (isDIGIT(*RExC_parse))
8684                         RExC_parse++;
8685                 if (*RExC_parse!=')')
8686                     vFAIL("Expecting close bracket");
8687
8688               gen_recurse_regop:
8689                 if ( paren == '-' ) {
8690                     /*
8691                     Diagram of capture buffer numbering.
8692                     Top line is the normal capture buffer numbers
8693                     Bottom line is the negative indexing as from
8694                     the X (the (?-2))
8695
8696                     +   1 2    3 4 5 X          6 7
8697                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8698                     -   5 4    3 2 1 X          x x
8699
8700                     */
8701                     num = RExC_npar + num;
8702                     if (num < 1)  {
8703                         RExC_parse++;
8704                         vFAIL("Reference to nonexistent group");
8705                     }
8706                 } else if ( paren == '+' ) {
8707                     num = RExC_npar + num - 1;
8708                 }
8709
8710                 ret = reganode(pRExC_state, GOSUB, num);
8711                 if (!SIZE_ONLY) {
8712                     if (num > (I32)RExC_rx->nparens) {
8713                         RExC_parse++;
8714                         vFAIL("Reference to nonexistent group");
8715                     }
8716                     ARG2L_SET( ret, RExC_recurse_count++);
8717                     RExC_emit++;
8718                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8719                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8720                 } else {
8721                     RExC_size++;
8722                 }
8723                 RExC_seen |= REG_SEEN_RECURSE;
8724                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8725                 Set_Node_Offset(ret, parse_start); /* MJD */
8726
8727                 *flagp |= POSTPONED;
8728                 nextchar(pRExC_state);
8729                 return ret;
8730             } /* named and numeric backreferences */
8731             assert(0); /* NOT REACHED */
8732
8733             case '?':           /* (??...) */
8734                 is_logical = 1;
8735                 if (*RExC_parse != '{') {
8736                     RExC_parse++;
8737                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8738                     /*NOTREACHED*/
8739                 }
8740                 *flagp |= POSTPONED;
8741                 paren = *RExC_parse++;
8742                 /* FALL THROUGH */
8743             case '{':           /* (?{...}) */
8744             {
8745                 U32 n = 0;
8746                 struct reg_code_block *cb;
8747
8748                 RExC_seen_zerolen++;
8749
8750                 if (   !pRExC_state->num_code_blocks
8751                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8752                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8753                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8754                             - RExC_start)
8755                 ) {
8756                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8757                         FAIL("panic: Sequence (?{...}): no code block found\n");
8758                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8759                 }
8760                 /* this is a pre-compiled code block (?{...}) */
8761                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8762                 RExC_parse = RExC_start + cb->end;
8763                 if (!SIZE_ONLY) {
8764                     OP *o = cb->block;
8765                     if (cb->src_regex) {
8766                         n = add_data(pRExC_state, 2, "rl");
8767                         RExC_rxi->data->data[n] =
8768                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8769                         RExC_rxi->data->data[n+1] = (void*)o;
8770                     }
8771                     else {
8772                         n = add_data(pRExC_state, 1,
8773                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8774                         RExC_rxi->data->data[n] = (void*)o;
8775                     }
8776                 }
8777                 pRExC_state->code_index++;
8778                 nextchar(pRExC_state);
8779
8780                 if (is_logical) {
8781                     regnode *eval;
8782                     ret = reg_node(pRExC_state, LOGICAL);
8783                     eval = reganode(pRExC_state, EVAL, n);
8784                     if (!SIZE_ONLY) {
8785                         ret->flags = 2;
8786                         /* for later propagation into (??{}) return value */
8787                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8788                     }
8789                     REGTAIL(pRExC_state, ret, eval);
8790                     /* deal with the length of this later - MJD */
8791                     return ret;
8792                 }
8793                 ret = reganode(pRExC_state, EVAL, n);
8794                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8795                 Set_Node_Offset(ret, parse_start);
8796                 return ret;
8797             }
8798             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8799             {
8800                 int is_define= 0;
8801                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8802                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8803                         || RExC_parse[1] == '<'
8804                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8805                         I32 flag;
8806
8807                         ret = reg_node(pRExC_state, LOGICAL);
8808                         if (!SIZE_ONLY)
8809                             ret->flags = 1;
8810                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8811                         goto insert_if;
8812                     }
8813                 }
8814                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8815                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8816                 {
8817                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8818                     char *name_start= RExC_parse++;
8819                     U32 num = 0;
8820                     SV *sv_dat=reg_scan_name(pRExC_state,
8821                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8822                     if (RExC_parse == name_start || *RExC_parse != ch)
8823                         vFAIL2("Sequence (?(%c... not terminated",
8824                             (ch == '>' ? '<' : ch));
8825                     RExC_parse++;
8826                     if (!SIZE_ONLY) {
8827                         num = add_data( pRExC_state, 1, "S" );
8828                         RExC_rxi->data->data[num]=(void*)sv_dat;
8829                         SvREFCNT_inc_simple_void(sv_dat);
8830                     }
8831                     ret = reganode(pRExC_state,NGROUPP,num);
8832                     goto insert_if_check_paren;
8833                 }
8834                 else if (RExC_parse[0] == 'D' &&
8835                          RExC_parse[1] == 'E' &&
8836                          RExC_parse[2] == 'F' &&
8837                          RExC_parse[3] == 'I' &&
8838                          RExC_parse[4] == 'N' &&
8839                          RExC_parse[5] == 'E')
8840                 {
8841                     ret = reganode(pRExC_state,DEFINEP,0);
8842                     RExC_parse +=6 ;
8843                     is_define = 1;
8844                     goto insert_if_check_paren;
8845                 }
8846                 else if (RExC_parse[0] == 'R') {
8847                     RExC_parse++;
8848                     parno = 0;
8849                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8850                         parno = atoi(RExC_parse++);
8851                         while (isDIGIT(*RExC_parse))
8852                             RExC_parse++;
8853                     } else if (RExC_parse[0] == '&') {
8854                         SV *sv_dat;
8855                         RExC_parse++;
8856                         sv_dat = reg_scan_name(pRExC_state,
8857                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8858                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8859                     }
8860                     ret = reganode(pRExC_state,INSUBP,parno);
8861                     goto insert_if_check_paren;
8862                 }
8863                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8864                     /* (?(1)...) */
8865                     char c;
8866                     parno = atoi(RExC_parse++);
8867
8868                     while (isDIGIT(*RExC_parse))
8869                         RExC_parse++;
8870                     ret = reganode(pRExC_state, GROUPP, parno);
8871
8872                  insert_if_check_paren:
8873                     if ((c = *nextchar(pRExC_state)) != ')')
8874                         vFAIL("Switch condition not recognized");
8875                   insert_if:
8876                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8877                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8878                     if (br == NULL)
8879                         br = reganode(pRExC_state, LONGJMP, 0);
8880                     else
8881                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8882                     c = *nextchar(pRExC_state);
8883                     if (flags&HASWIDTH)
8884                         *flagp |= HASWIDTH;
8885                     if (c == '|') {
8886                         if (is_define)
8887                             vFAIL("(?(DEFINE)....) does not allow branches");
8888                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8889                         regbranch(pRExC_state, &flags, 1,depth+1);
8890                         REGTAIL(pRExC_state, ret, lastbr);
8891                         if (flags&HASWIDTH)
8892                             *flagp |= HASWIDTH;
8893                         c = *nextchar(pRExC_state);
8894                     }
8895                     else
8896                         lastbr = NULL;
8897                     if (c != ')')
8898                         vFAIL("Switch (?(condition)... contains too many branches");
8899                     ender = reg_node(pRExC_state, TAIL);
8900                     REGTAIL(pRExC_state, br, ender);
8901                     if (lastbr) {
8902                         REGTAIL(pRExC_state, lastbr, ender);
8903                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8904                     }
8905                     else
8906                         REGTAIL(pRExC_state, ret, ender);
8907                     RExC_size++; /* XXX WHY do we need this?!!
8908                                     For large programs it seems to be required
8909                                     but I can't figure out why. -- dmq*/
8910                     return ret;
8911                 }
8912                 else {
8913                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8914                 }
8915             }
8916             case '[':           /* (?[ ... ]) */
8917                 return handle_sets(pRExC_state, flagp, depth, oregcomp_parse);
8918             case 0:
8919                 RExC_parse--; /* for vFAIL to print correctly */
8920                 vFAIL("Sequence (? incomplete");
8921                 break;
8922             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8923                                        that follow */
8924                 has_use_defaults = TRUE;
8925                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8926                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8927                                                 ? REGEX_UNICODE_CHARSET
8928                                                 : REGEX_DEPENDS_CHARSET);
8929                 goto parse_flags;
8930             default:
8931                 --RExC_parse;
8932                 parse_flags:      /* (?i) */
8933             {
8934                 U32 posflags = 0, negflags = 0;
8935                 U32 *flagsp = &posflags;
8936                 char has_charset_modifier = '\0';
8937                 regex_charset cs = get_regex_charset(RExC_flags);
8938                 if (cs == REGEX_DEPENDS_CHARSET
8939                     && (RExC_utf8 || RExC_uni_semantics))
8940                 {
8941                     cs = REGEX_UNICODE_CHARSET;
8942                 }
8943
8944                 while (*RExC_parse) {
8945                     /* && strchr("iogcmsx", *RExC_parse) */
8946                     /* (?g), (?gc) and (?o) are useless here
8947                        and must be globally applied -- japhy */
8948                     switch (*RExC_parse) {
8949                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8950                     case LOCALE_PAT_MOD:
8951                         if (has_charset_modifier) {
8952                             goto excess_modifier;
8953                         }
8954                         else if (flagsp == &negflags) {
8955                             goto neg_modifier;
8956                         }
8957                         cs = REGEX_LOCALE_CHARSET;
8958                         has_charset_modifier = LOCALE_PAT_MOD;
8959                         RExC_contains_locale = 1;
8960                         break;
8961                     case UNICODE_PAT_MOD:
8962                         if (has_charset_modifier) {
8963                             goto excess_modifier;
8964                         }
8965                         else if (flagsp == &negflags) {
8966                             goto neg_modifier;
8967                         }
8968                         cs = REGEX_UNICODE_CHARSET;
8969                         has_charset_modifier = UNICODE_PAT_MOD;
8970                         break;
8971                     case ASCII_RESTRICT_PAT_MOD:
8972                         if (flagsp == &negflags) {
8973                             goto neg_modifier;
8974                         }
8975                         if (has_charset_modifier) {
8976                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8977                                 goto excess_modifier;
8978                             }
8979                             /* Doubled modifier implies more restricted */
8980                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8981                         }
8982                         else {
8983                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8984                         }
8985                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8986                         break;
8987                     case DEPENDS_PAT_MOD:
8988                         if (has_use_defaults) {
8989                             goto fail_modifiers;
8990                         }
8991                         else if (flagsp == &negflags) {
8992                             goto neg_modifier;
8993                         }
8994                         else if (has_charset_modifier) {
8995                             goto excess_modifier;
8996                         }
8997
8998                         /* The dual charset means unicode semantics if the
8999                          * pattern (or target, not known until runtime) are
9000                          * utf8, or something in the pattern indicates unicode
9001                          * semantics */
9002                         cs = (RExC_utf8 || RExC_uni_semantics)
9003                              ? REGEX_UNICODE_CHARSET
9004                              : REGEX_DEPENDS_CHARSET;
9005                         has_charset_modifier = DEPENDS_PAT_MOD;
9006                         break;
9007                     excess_modifier:
9008                         RExC_parse++;
9009                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9010                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9011                         }
9012                         else if (has_charset_modifier == *(RExC_parse - 1)) {
9013                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9014                         }
9015                         else {
9016                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9017                         }
9018                         /*NOTREACHED*/
9019                     neg_modifier:
9020                         RExC_parse++;
9021                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9022                         /*NOTREACHED*/
9023                     case ONCE_PAT_MOD: /* 'o' */
9024                     case GLOBAL_PAT_MOD: /* 'g' */
9025                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9026                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9027                             if (! (wastedflags & wflagbit) ) {
9028                                 wastedflags |= wflagbit;
9029                                 vWARN5(
9030                                     RExC_parse + 1,
9031                                     "Useless (%s%c) - %suse /%c modifier",
9032                                     flagsp == &negflags ? "?-" : "?",
9033                                     *RExC_parse,
9034                                     flagsp == &negflags ? "don't " : "",
9035                                     *RExC_parse
9036                                 );
9037                             }
9038                         }
9039                         break;
9040
9041                     case CONTINUE_PAT_MOD: /* 'c' */
9042                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9043                             if (! (wastedflags & WASTED_C) ) {
9044                                 wastedflags |= WASTED_GC;
9045                                 vWARN3(
9046                                     RExC_parse + 1,
9047                                     "Useless (%sc) - %suse /gc modifier",
9048                                     flagsp == &negflags ? "?-" : "?",
9049                                     flagsp == &negflags ? "don't " : ""
9050                                 );
9051                             }
9052                         }
9053                         break;
9054                     case KEEPCOPY_PAT_MOD: /* 'p' */
9055                         if (flagsp == &negflags) {
9056                             if (SIZE_ONLY)
9057                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9058                         } else {
9059                             *flagsp |= RXf_PMf_KEEPCOPY;
9060                         }
9061                         break;
9062                     case '-':
9063                         /* A flag is a default iff it is following a minus, so
9064                          * if there is a minus, it means will be trying to
9065                          * re-specify a default which is an error */
9066                         if (has_use_defaults || flagsp == &negflags) {
9067             fail_modifiers:
9068                             RExC_parse++;
9069                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9070                             /*NOTREACHED*/
9071                         }
9072                         flagsp = &negflags;
9073                         wastedflags = 0;  /* reset so (?g-c) warns twice */
9074                         break;
9075                     case ':':
9076                         paren = ':';
9077                         /*FALLTHROUGH*/
9078                     case ')':
9079                         RExC_flags |= posflags;
9080                         RExC_flags &= ~negflags;
9081                         set_regex_charset(&RExC_flags, cs);
9082                         if (paren != ':') {
9083                             oregflags |= posflags;
9084                             oregflags &= ~negflags;
9085                             set_regex_charset(&oregflags, cs);
9086                         }
9087                         nextchar(pRExC_state);
9088                         if (paren != ':') {
9089                             *flagp = TRYAGAIN;
9090                             return NULL;
9091                         } else {
9092                             ret = NULL;
9093                             goto parse_rest;
9094                         }
9095                         /*NOTREACHED*/
9096                     default:
9097                         RExC_parse++;
9098                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9099                         /*NOTREACHED*/
9100                     }
9101                     ++RExC_parse;
9102                 }
9103             }} /* one for the default block, one for the switch */
9104         }
9105         else {                  /* (...) */
9106           capturing_parens:
9107             parno = RExC_npar;
9108             RExC_npar++;
9109
9110             ret = reganode(pRExC_state, OPEN, parno);
9111             if (!SIZE_ONLY ){
9112                 if (!RExC_nestroot)
9113                     RExC_nestroot = parno;
9114                 if (RExC_seen & REG_SEEN_RECURSE
9115                     && !RExC_open_parens[parno-1])
9116                 {
9117                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9118                         "Setting open paren #%"IVdf" to %d\n",
9119                         (IV)parno, REG_NODE_NUM(ret)));
9120                     RExC_open_parens[parno-1]= ret;
9121                 }
9122             }
9123             Set_Node_Length(ret, 1); /* MJD */
9124             Set_Node_Offset(ret, RExC_parse); /* MJD */
9125             is_open = 1;
9126         }
9127     }
9128     else                        /* ! paren */
9129         ret = NULL;
9130
9131    parse_rest:
9132     /* Pick up the branches, linking them together. */
9133     parse_start = RExC_parse;   /* MJD */
9134     br = regbranch(pRExC_state, &flags, 1,depth+1);
9135
9136     /*     branch_len = (paren != 0); */
9137
9138     if (br == NULL)
9139         return(NULL);
9140     if (*RExC_parse == '|') {
9141         if (!SIZE_ONLY && RExC_extralen) {
9142             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9143         }
9144         else {                  /* MJD */
9145             reginsert(pRExC_state, BRANCH, br, depth+1);
9146             Set_Node_Length(br, paren != 0);
9147             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9148         }
9149         have_branch = 1;
9150         if (SIZE_ONLY)
9151             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9152     }
9153     else if (paren == ':') {
9154         *flagp |= flags&SIMPLE;
9155     }
9156     if (is_open) {                              /* Starts with OPEN. */
9157         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9158     }
9159     else if (paren != '?')              /* Not Conditional */
9160         ret = br;
9161     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9162     lastbr = br;
9163     while (*RExC_parse == '|') {
9164         if (!SIZE_ONLY && RExC_extralen) {
9165             ender = reganode(pRExC_state, LONGJMP,0);
9166             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9167         }
9168         if (SIZE_ONLY)
9169             RExC_extralen += 2;         /* Account for LONGJMP. */
9170         nextchar(pRExC_state);
9171         if (freeze_paren) {
9172             if (RExC_npar > after_freeze)
9173                 after_freeze = RExC_npar;
9174             RExC_npar = freeze_paren;
9175         }
9176         br = regbranch(pRExC_state, &flags, 0, depth+1);
9177
9178         if (br == NULL)
9179             return(NULL);
9180         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9181         lastbr = br;
9182         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9183     }
9184
9185     if (have_branch || paren != ':') {
9186         /* Make a closing node, and hook it on the end. */
9187         switch (paren) {
9188         case ':':
9189             ender = reg_node(pRExC_state, TAIL);
9190             break;
9191         case 1:
9192             ender = reganode(pRExC_state, CLOSE, parno);
9193             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9194                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9195                         "Setting close paren #%"IVdf" to %d\n",
9196                         (IV)parno, REG_NODE_NUM(ender)));
9197                 RExC_close_parens[parno-1]= ender;
9198                 if (RExC_nestroot == parno)
9199                     RExC_nestroot = 0;
9200             }
9201             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9202             Set_Node_Length(ender,1); /* MJD */
9203             break;
9204         case '<':
9205         case ',':
9206         case '=':
9207         case '!':
9208             *flagp &= ~HASWIDTH;
9209             /* FALL THROUGH */
9210         case '>':
9211             ender = reg_node(pRExC_state, SUCCEED);
9212             break;
9213         case 0:
9214             ender = reg_node(pRExC_state, END);
9215             if (!SIZE_ONLY) {
9216                 assert(!RExC_opend); /* there can only be one! */
9217                 RExC_opend = ender;
9218             }
9219             break;
9220         }
9221         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9222             SV * const mysv_val1=sv_newmortal();
9223             SV * const mysv_val2=sv_newmortal();
9224             DEBUG_PARSE_MSG("lsbr");
9225             regprop(RExC_rx, mysv_val1, lastbr);
9226             regprop(RExC_rx, mysv_val2, ender);
9227             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9228                           SvPV_nolen_const(mysv_val1),
9229                           (IV)REG_NODE_NUM(lastbr),
9230                           SvPV_nolen_const(mysv_val2),
9231                           (IV)REG_NODE_NUM(ender),
9232                           (IV)(ender - lastbr)
9233             );
9234         });
9235         REGTAIL(pRExC_state, lastbr, ender);
9236
9237         if (have_branch && !SIZE_ONLY) {
9238             char is_nothing= 1;
9239             if (depth==1)
9240                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9241
9242             /* Hook the tails of the branches to the closing node. */
9243             for (br = ret; br; br = regnext(br)) {
9244                 const U8 op = PL_regkind[OP(br)];
9245                 if (op == BRANCH) {
9246                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9247                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9248                         is_nothing= 0;
9249                 }
9250                 else if (op == BRANCHJ) {
9251                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9252                     /* for now we always disable this optimisation * /
9253                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9254                     */
9255                         is_nothing= 0;
9256                 }
9257             }
9258             if (is_nothing) {
9259                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9260                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9261                     SV * const mysv_val1=sv_newmortal();
9262                     SV * const mysv_val2=sv_newmortal();
9263                     DEBUG_PARSE_MSG("NADA");
9264                     regprop(RExC_rx, mysv_val1, ret);
9265                     regprop(RExC_rx, mysv_val2, ender);
9266                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9267                                   SvPV_nolen_const(mysv_val1),
9268                                   (IV)REG_NODE_NUM(ret),
9269                                   SvPV_nolen_const(mysv_val2),
9270                                   (IV)REG_NODE_NUM(ender),
9271                                   (IV)(ender - ret)
9272                     );
9273                 });
9274                 OP(br)= NOTHING;
9275                 if (OP(ender) == TAIL) {
9276                     NEXT_OFF(br)= 0;
9277                     RExC_emit= br + 1;
9278                 } else {
9279                     regnode *opt;
9280                     for ( opt= br + 1; opt < ender ; opt++ )
9281                         OP(opt)= OPTIMIZED;
9282                     NEXT_OFF(br)= ender - br;
9283                 }
9284             }
9285         }
9286     }
9287
9288     {
9289         const char *p;
9290         static const char parens[] = "=!<,>";
9291
9292         if (paren && (p = strchr(parens, paren))) {
9293             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9294             int flag = (p - parens) > 1;
9295
9296             if (paren == '>')
9297                 node = SUSPEND, flag = 0;
9298             reginsert(pRExC_state, node,ret, depth+1);
9299             Set_Node_Cur_Length(ret);
9300             Set_Node_Offset(ret, parse_start + 1);
9301             ret->flags = flag;
9302             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9303         }
9304     }
9305
9306     /* Check for proper termination. */
9307     if (paren) {
9308         RExC_flags = oregflags;
9309         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9310             RExC_parse = oregcomp_parse;
9311             vFAIL("Unmatched (");
9312         }
9313     }
9314     else if (!paren && RExC_parse < RExC_end) {
9315         if (*RExC_parse == ')') {
9316             RExC_parse++;
9317             vFAIL("Unmatched )");
9318         }
9319         else
9320             FAIL("Junk on end of regexp");      /* "Can't happen". */
9321         assert(0); /* NOTREACHED */
9322     }
9323
9324     if (RExC_in_lookbehind) {
9325         RExC_in_lookbehind--;
9326     }
9327     if (after_freeze > RExC_npar)
9328         RExC_npar = after_freeze;
9329     return(ret);
9330 }
9331
9332 /*
9333  - regbranch - one alternative of an | operator
9334  *
9335  * Implements the concatenation operator.
9336  */
9337 STATIC regnode *
9338 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9339 {
9340     dVAR;
9341     regnode *ret;
9342     regnode *chain = NULL;
9343     regnode *latest;
9344     I32 flags = 0, c = 0;
9345     GET_RE_DEBUG_FLAGS_DECL;
9346
9347     PERL_ARGS_ASSERT_REGBRANCH;
9348
9349     DEBUG_PARSE("brnc");
9350
9351     if (first)
9352         ret = NULL;
9353     else {
9354         if (!SIZE_ONLY && RExC_extralen)
9355             ret = reganode(pRExC_state, BRANCHJ,0);
9356         else {
9357             ret = reg_node(pRExC_state, BRANCH);
9358             Set_Node_Length(ret, 1);
9359         }
9360     }
9361
9362     if (!first && SIZE_ONLY)
9363         RExC_extralen += 1;                     /* BRANCHJ */
9364
9365     *flagp = WORST;                     /* Tentatively. */
9366
9367     RExC_parse--;
9368     nextchar(pRExC_state);
9369     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9370         flags &= ~TRYAGAIN;
9371         latest = regpiece(pRExC_state, &flags,depth+1);
9372         if (latest == NULL) {
9373             if (flags & TRYAGAIN)
9374                 continue;
9375             return(NULL);
9376         }
9377         else if (ret == NULL)
9378             ret = latest;
9379         *flagp |= flags&(HASWIDTH|POSTPONED);
9380         if (chain == NULL)      /* First piece. */
9381             *flagp |= flags&SPSTART;
9382         else {
9383             RExC_naughty++;
9384             REGTAIL(pRExC_state, chain, latest);
9385         }
9386         chain = latest;
9387         c++;
9388     }
9389     if (chain == NULL) {        /* Loop ran zero times. */
9390         chain = reg_node(pRExC_state, NOTHING);
9391         if (ret == NULL)
9392             ret = chain;
9393     }
9394     if (c == 1) {
9395         *flagp |= flags&SIMPLE;
9396     }
9397
9398     return ret;
9399 }
9400
9401 /*
9402  - regpiece - something followed by possible [*+?]
9403  *
9404  * Note that the branching code sequences used for ? and the general cases
9405  * of * and + are somewhat optimized:  they use the same NOTHING node as
9406  * both the endmarker for their branch list and the body of the last branch.
9407  * It might seem that this node could be dispensed with entirely, but the
9408  * endmarker role is not redundant.
9409  */
9410 STATIC regnode *
9411 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9412 {
9413     dVAR;
9414     regnode *ret;
9415     char op;
9416     char *next;
9417     I32 flags;
9418     const char * const origparse = RExC_parse;
9419     I32 min;
9420     I32 max = REG_INFTY;
9421 #ifdef RE_TRACK_PATTERN_OFFSETS
9422     char *parse_start;
9423 #endif
9424     const char *maxpos = NULL;
9425
9426     /* Save the original in case we change the emitted regop to a FAIL. */
9427     regnode * const orig_emit = RExC_emit;
9428
9429     GET_RE_DEBUG_FLAGS_DECL;
9430
9431     PERL_ARGS_ASSERT_REGPIECE;
9432
9433     DEBUG_PARSE("piec");
9434
9435     ret = regatom(pRExC_state, &flags,depth+1);
9436     if (ret == NULL) {
9437         if (flags & TRYAGAIN)
9438             *flagp |= TRYAGAIN;
9439         return(NULL);
9440     }
9441
9442     op = *RExC_parse;
9443
9444     if (op == '{' && regcurly(RExC_parse, FALSE)) {
9445         maxpos = NULL;
9446 #ifdef RE_TRACK_PATTERN_OFFSETS
9447         parse_start = RExC_parse; /* MJD */
9448 #endif
9449         next = RExC_parse + 1;
9450         while (isDIGIT(*next) || *next == ',') {
9451             if (*next == ',') {
9452                 if (maxpos)
9453                     break;
9454                 else
9455                     maxpos = next;
9456             }
9457             next++;
9458         }
9459         if (*next == '}') {             /* got one */
9460             if (!maxpos)
9461                 maxpos = next;
9462             RExC_parse++;
9463             min = atoi(RExC_parse);
9464             if (*maxpos == ',')
9465                 maxpos++;
9466             else
9467                 maxpos = RExC_parse;
9468             max = atoi(maxpos);
9469             if (!max && *maxpos != '0')
9470                 max = REG_INFTY;                /* meaning "infinity" */
9471             else if (max >= REG_INFTY)
9472                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9473             RExC_parse = next;
9474             nextchar(pRExC_state);
9475             if (max < min) {    /* If can't match, warn and optimize to fail
9476                                    unconditionally */
9477                 if (SIZE_ONLY) {
9478                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9479
9480                     /* We can't back off the size because we have to reserve
9481                      * enough space for all the things we are about to throw
9482                      * away, but we can shrink it by the ammount we are about
9483                      * to re-use here */
9484                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9485                 }
9486                 else {
9487                     RExC_emit = orig_emit;
9488                 }
9489                 ret = reg_node(pRExC_state, OPFAIL);
9490                 return ret;
9491             }
9492             else if (max == 0) {    /* replace {0} with a nothing node */
9493                 if (SIZE_ONLY) {
9494                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9495                 }
9496                 else {
9497                     RExC_emit = orig_emit;
9498                 }
9499                 ret = reg_node(pRExC_state, NOTHING);
9500                 return ret;
9501             }
9502
9503         do_curly:
9504             if ((flags&SIMPLE)) {
9505                 RExC_naughty += 2 + RExC_naughty / 2;
9506                 reginsert(pRExC_state, CURLY, ret, depth+1);
9507                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9508                 Set_Node_Cur_Length(ret);
9509             }
9510             else {
9511                 regnode * const w = reg_node(pRExC_state, WHILEM);
9512
9513                 w->flags = 0;
9514                 REGTAIL(pRExC_state, ret, w);
9515                 if (!SIZE_ONLY && RExC_extralen) {
9516                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9517                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9518                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9519                 }
9520                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9521                                 /* MJD hk */
9522                 Set_Node_Offset(ret, parse_start+1);
9523                 Set_Node_Length(ret,
9524                                 op == '{' ? (RExC_parse - parse_start) : 1);
9525
9526                 if (!SIZE_ONLY && RExC_extralen)
9527                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9528                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9529                 if (SIZE_ONLY)
9530                     RExC_whilem_seen++, RExC_extralen += 3;
9531                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9532             }
9533             ret->flags = 0;
9534
9535             if (min > 0)
9536                 *flagp = WORST;
9537             if (max > 0)
9538                 *flagp |= HASWIDTH;
9539             if (!SIZE_ONLY) {
9540                 ARG1_SET(ret, (U16)min);
9541                 ARG2_SET(ret, (U16)max);
9542             }
9543
9544             goto nest_check;
9545         }
9546     }
9547
9548     if (!ISMULT1(op)) {
9549         *flagp = flags;
9550         return(ret);
9551     }
9552
9553 #if 0                           /* Now runtime fix should be reliable. */
9554
9555     /* if this is reinstated, don't forget to put this back into perldiag:
9556
9557             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9558
9559            (F) The part of the regexp subject to either the * or + quantifier
9560            could match an empty string. The {#} shows in the regular
9561            expression about where the problem was discovered.
9562
9563     */
9564
9565     if (!(flags&HASWIDTH) && op != '?')
9566       vFAIL("Regexp *+ operand could be empty");
9567 #endif
9568
9569 #ifdef RE_TRACK_PATTERN_OFFSETS
9570     parse_start = RExC_parse;
9571 #endif
9572     nextchar(pRExC_state);
9573
9574     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9575
9576     if (op == '*' && (flags&SIMPLE)) {
9577         reginsert(pRExC_state, STAR, ret, depth+1);
9578         ret->flags = 0;
9579         RExC_naughty += 4;
9580     }
9581     else if (op == '*') {
9582         min = 0;
9583         goto do_curly;
9584     }
9585     else if (op == '+' && (flags&SIMPLE)) {
9586         reginsert(pRExC_state, PLUS, ret, depth+1);
9587         ret->flags = 0;
9588         RExC_naughty += 3;
9589     }
9590     else if (op == '+') {
9591         min = 1;
9592         goto do_curly;
9593     }
9594     else if (op == '?') {
9595         min = 0; max = 1;
9596         goto do_curly;
9597     }
9598   nest_check:
9599     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9600         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9601         ckWARN3reg(RExC_parse,
9602                    "%.*s matches null string many times",
9603                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9604                    origparse);
9605         (void)ReREFCNT_inc(RExC_rx_sv);
9606     }
9607
9608     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9609         nextchar(pRExC_state);
9610         reginsert(pRExC_state, MINMOD, ret, depth+1);
9611         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9612     }
9613 #ifndef REG_ALLOW_MINMOD_SUSPEND
9614     else
9615 #endif
9616     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9617         regnode *ender;
9618         nextchar(pRExC_state);
9619         ender = reg_node(pRExC_state, SUCCEED);
9620         REGTAIL(pRExC_state, ret, ender);
9621         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9622         ret->flags = 0;
9623         ender = reg_node(pRExC_state, TAIL);
9624         REGTAIL(pRExC_state, ret, ender);
9625         /*ret= ender;*/
9626     }
9627
9628     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9629         RExC_parse++;
9630         vFAIL("Nested quantifiers");
9631     }
9632
9633     return(ret);
9634 }
9635
9636 STATIC bool
9637 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9638         const bool strict   /* Apply stricter parsing rules? */
9639     )
9640 {
9641
9642  /* This is expected to be called by a parser routine that has recognized '\N'
9643    and needs to handle the rest. RExC_parse is expected to point at the first
9644    char following the N at the time of the call.  On successful return,
9645    RExC_parse has been updated to point to just after the sequence identified
9646    by this routine, and <*flagp> has been updated.
9647
9648    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9649    character class.
9650
9651    \N may begin either a named sequence, or if outside a character class, mean
9652    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9653    attempted to decide which, and in the case of a named sequence, converted it
9654    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9655    where c1... are the characters in the sequence.  For single-quoted regexes,
9656    the tokenizer passes the \N sequence through unchanged; this code will not
9657    attempt to determine this nor expand those, instead raising a syntax error.
9658    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9659    or there is no '}', it signals that this \N occurrence means to match a
9660    non-newline.
9661
9662    Only the \N{U+...} form should occur in a character class, for the same
9663    reason that '.' inside a character class means to just match a period: it
9664    just doesn't make sense.
9665
9666    The function raises an error (via vFAIL), and doesn't return for various
9667    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9668    success; it returns FALSE otherwise.
9669
9670    If <valuep> is non-null, it means the caller can accept an input sequence
9671    consisting of a just a single code point; <*valuep> is set to that value
9672    if the input is such.
9673
9674    If <node_p> is non-null it signifies that the caller can accept any other
9675    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9676    is set as follows:
9677     1) \N means not-a-NL: points to a newly created REG_ANY node;
9678     2) \N{}:              points to a new NOTHING node;
9679     3) otherwise:         points to a new EXACT node containing the resolved
9680                           string.
9681    Note that FALSE is returned for single code point sequences if <valuep> is
9682    null.
9683  */
9684
9685     char * endbrace;    /* '}' following the name */
9686     char* p;
9687     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9688                            stream */
9689     bool has_multiple_chars; /* true if the input stream contains a sequence of
9690                                 more than one character */
9691
9692     GET_RE_DEBUG_FLAGS_DECL;
9693
9694     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9695
9696     GET_RE_DEBUG_FLAGS;
9697
9698     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9699
9700     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9701      * modifier.  The other meaning does not */
9702     p = (RExC_flags & RXf_PMf_EXTENDED)
9703         ? regwhite( pRExC_state, RExC_parse )
9704         : RExC_parse;
9705
9706     /* Disambiguate between \N meaning a named character versus \N meaning
9707      * [^\n].  The former is assumed when it can't be the latter. */
9708     if (*p != '{' || regcurly(p, FALSE)) {
9709         RExC_parse = p;
9710         if (! node_p) {
9711             /* no bare \N in a charclass */
9712             if (in_char_class) {
9713                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9714             }
9715             return FALSE;
9716         }
9717         nextchar(pRExC_state);
9718         *node_p = reg_node(pRExC_state, REG_ANY);
9719         *flagp |= HASWIDTH|SIMPLE;
9720         RExC_naughty++;
9721         RExC_parse--;
9722         Set_Node_Length(*node_p, 1); /* MJD */
9723         return TRUE;
9724     }
9725
9726     /* Here, we have decided it should be a named character or sequence */
9727
9728     /* The test above made sure that the next real character is a '{', but
9729      * under the /x modifier, it could be separated by space (or a comment and
9730      * \n) and this is not allowed (for consistency with \x{...} and the
9731      * tokenizer handling of \N{NAME}). */
9732     if (*RExC_parse != '{') {
9733         vFAIL("Missing braces on \\N{}");
9734     }
9735
9736     RExC_parse++;       /* Skip past the '{' */
9737
9738     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9739         || ! (endbrace == RExC_parse            /* nothing between the {} */
9740               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9741                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9742     {
9743         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9744         vFAIL("\\N{NAME} must be resolved by the lexer");
9745     }
9746
9747     if (endbrace == RExC_parse) {   /* empty: \N{} */
9748         bool ret = TRUE;
9749         if (node_p) {
9750             *node_p = reg_node(pRExC_state,NOTHING);
9751         }
9752         else if (in_char_class) {
9753             if (SIZE_ONLY && in_char_class) {
9754                 if (strict) {
9755                     RExC_parse++;   /* Position after the "}" */
9756                     vFAIL("Zero length \\N{}");
9757                 }
9758                 else {
9759                     ckWARNreg(RExC_parse,
9760                               "Ignoring zero length \\N{} in character class");
9761                 }
9762             }
9763             ret = FALSE;
9764         }
9765         else {
9766             return FALSE;
9767         }
9768         nextchar(pRExC_state);
9769         return ret;
9770     }
9771
9772     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9773     RExC_parse += 2;    /* Skip past the 'U+' */
9774
9775     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9776
9777     /* Code points are separated by dots.  If none, there is only one code
9778      * point, and is terminated by the brace */
9779     has_multiple_chars = (endchar < endbrace);
9780
9781     if (valuep && (! has_multiple_chars || in_char_class)) {
9782         /* We only pay attention to the first char of
9783         multichar strings being returned in char classes. I kinda wonder
9784         if this makes sense as it does change the behaviour
9785         from earlier versions, OTOH that behaviour was broken
9786         as well. XXX Solution is to recharacterize as
9787         [rest-of-class]|multi1|multi2... */
9788
9789         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9790         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9791             | PERL_SCAN_DISALLOW_PREFIX
9792             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9793
9794         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9795
9796         /* The tokenizer should have guaranteed validity, but it's possible to
9797          * bypass it by using single quoting, so check */
9798         if (length_of_hex == 0
9799             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9800         {
9801             RExC_parse += length_of_hex;        /* Includes all the valid */
9802             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9803                             ? UTF8SKIP(RExC_parse)
9804                             : 1;
9805             /* Guard against malformed utf8 */
9806             if (RExC_parse >= endchar) {
9807                 RExC_parse = endchar;
9808             }
9809             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9810         }
9811
9812         if (in_char_class && has_multiple_chars) {
9813             if (strict) {
9814                 RExC_parse = endbrace;
9815                 vFAIL("\\N{} in character class restricted to one character");
9816             }
9817             else {
9818                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9819             }
9820         }
9821
9822         RExC_parse = endbrace + 1;
9823     }
9824     else if (! node_p || ! has_multiple_chars) {
9825
9826         /* Here, the input is legal, but not according to the caller's
9827          * options.  We fail without advancing the parse, so that the
9828          * caller can try again */
9829         RExC_parse = p;
9830         return FALSE;
9831     }
9832     else {
9833
9834         /* What is done here is to convert this to a sub-pattern of the form
9835          * (?:\x{char1}\x{char2}...)
9836          * and then call reg recursively.  That way, it retains its atomicness,
9837          * while not having to worry about special handling that some code
9838          * points may have.  toke.c has converted the original Unicode values
9839          * to native, so that we can just pass on the hex values unchanged.  We
9840          * do have to set a flag to keep recoding from happening in the
9841          * recursion */
9842
9843         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9844         STRLEN len;
9845         char *orig_end = RExC_end;
9846         I32 flags;
9847
9848         while (RExC_parse < endbrace) {
9849
9850             /* Convert to notation the rest of the code understands */
9851             sv_catpv(substitute_parse, "\\x{");
9852             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9853             sv_catpv(substitute_parse, "}");
9854
9855             /* Point to the beginning of the next character in the sequence. */
9856             RExC_parse = endchar + 1;
9857             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9858         }
9859         sv_catpv(substitute_parse, ")");
9860
9861         RExC_parse = SvPV(substitute_parse, len);
9862
9863         /* Don't allow empty number */
9864         if (len < 8) {
9865             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9866         }
9867         RExC_end = RExC_parse + len;
9868
9869         /* The values are Unicode, and therefore not subject to recoding */
9870         RExC_override_recoding = 1;
9871
9872         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9873         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9874
9875         RExC_parse = endbrace;
9876         RExC_end = orig_end;
9877         RExC_override_recoding = 0;
9878
9879         nextchar(pRExC_state);
9880     }
9881
9882     return TRUE;
9883 }
9884
9885
9886 /*
9887  * reg_recode
9888  *
9889  * It returns the code point in utf8 for the value in *encp.
9890  *    value: a code value in the source encoding
9891  *    encp:  a pointer to an Encode object
9892  *
9893  * If the result from Encode is not a single character,
9894  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9895  */
9896 STATIC UV
9897 S_reg_recode(pTHX_ const char value, SV **encp)
9898 {
9899     STRLEN numlen = 1;
9900     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9901     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9902     const STRLEN newlen = SvCUR(sv);
9903     UV uv = UNICODE_REPLACEMENT;
9904
9905     PERL_ARGS_ASSERT_REG_RECODE;
9906
9907     if (newlen)
9908         uv = SvUTF8(sv)
9909              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9910              : *(U8*)s;
9911
9912     if (!newlen || numlen != newlen) {
9913         uv = UNICODE_REPLACEMENT;
9914         *encp = NULL;
9915     }
9916     return uv;
9917 }
9918
9919 PERL_STATIC_INLINE U8
9920 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9921 {
9922     U8 op;
9923
9924     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9925
9926     if (! FOLD) {
9927         return EXACT;
9928     }
9929
9930     op = get_regex_charset(RExC_flags);
9931     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9932         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9933                  been, so there is no hole */
9934     }
9935
9936     return op + EXACTF;
9937 }
9938
9939 PERL_STATIC_INLINE void
9940 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9941 {
9942     /* This knows the details about sizing an EXACTish node, setting flags for
9943      * it (by setting <*flagp>, and potentially populating it with a single
9944      * character.
9945      *
9946      * If <len> (the length in bytes) is non-zero, this function assumes that
9947      * the node has already been populated, and just does the sizing.  In this
9948      * case <code_point> should be the final code point that has already been
9949      * placed into the node.  This value will be ignored except that under some
9950      * circumstances <*flagp> is set based on it.
9951      *
9952      * If <len> is zero, the function assumes that the node is to contain only
9953      * the single character given by <code_point> and calculates what <len>
9954      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9955      * additionally will populate the node's STRING with <code_point>, if <len>
9956      * is 0.  In both cases <*flagp> is appropriately set
9957      *
9958      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9959      * folded (the latter only when the rules indicate it can match 'ss') */
9960
9961     bool len_passed_in = cBOOL(len != 0);
9962     U8 character[UTF8_MAXBYTES_CASE+1];
9963
9964     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9965
9966     if (! len_passed_in) {
9967         if (UTF) {
9968             if (FOLD) {
9969                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9970             }
9971             else {
9972                 uvchr_to_utf8( character, code_point);
9973                 len = UTF8SKIP(character);
9974             }
9975         }
9976         else if (! FOLD
9977                  || code_point != LATIN_SMALL_LETTER_SHARP_S
9978                  || ASCII_FOLD_RESTRICTED
9979                  || ! AT_LEAST_UNI_SEMANTICS)
9980         {
9981             *character = (U8) code_point;
9982             len = 1;
9983         }
9984         else {
9985             *character = 's';
9986             *(character + 1) = 's';
9987             len = 2;
9988         }
9989     }
9990
9991     if (SIZE_ONLY) {
9992         RExC_size += STR_SZ(len);
9993     }
9994     else {
9995         RExC_emit += STR_SZ(len);
9996         STR_LEN(node) = len;
9997         if (! len_passed_in) {
9998             Copy((char *) character, STRING(node), len, char);
9999         }
10000     }
10001
10002     *flagp |= HASWIDTH;
10003
10004     /* A single character node is SIMPLE, except for the special-cased SHARP S
10005      * under /di. */
10006     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10007         && (code_point != LATIN_SMALL_LETTER_SHARP_S
10008             || ! FOLD || ! DEPENDS_SEMANTICS))
10009     {
10010         *flagp |= SIMPLE;
10011     }
10012 }
10013
10014 /*
10015  - regatom - the lowest level
10016
10017    Try to identify anything special at the start of the pattern. If there
10018    is, then handle it as required. This may involve generating a single regop,
10019    such as for an assertion; or it may involve recursing, such as to
10020    handle a () structure.
10021
10022    If the string doesn't start with something special then we gobble up
10023    as much literal text as we can.
10024
10025    Once we have been able to handle whatever type of thing started the
10026    sequence, we return.
10027
10028    Note: we have to be careful with escapes, as they can be both literal
10029    and special, and in the case of \10 and friends, context determines which.
10030
10031    A summary of the code structure is:
10032
10033    switch (first_byte) {
10034         cases for each special:
10035             handle this special;
10036             break;
10037         case '\\':
10038             switch (2nd byte) {
10039                 cases for each unambiguous special:
10040                     handle this special;
10041                     break;
10042                 cases for each ambigous special/literal:
10043                     disambiguate;
10044                     if (special)  handle here
10045                     else goto defchar;
10046                 default: // unambiguously literal:
10047                     goto defchar;
10048             }
10049         default:  // is a literal char
10050             // FALL THROUGH
10051         defchar:
10052             create EXACTish node for literal;
10053             while (more input and node isn't full) {
10054                 switch (input_byte) {
10055                    cases for each special;
10056                        make sure parse pointer is set so that the next call to
10057                            regatom will see this special first
10058                        goto loopdone; // EXACTish node terminated by prev. char
10059                    default:
10060                        append char to EXACTISH node;
10061                 }
10062                 get next input byte;
10063             }
10064         loopdone:
10065    }
10066    return the generated node;
10067
10068    Specifically there are two separate switches for handling
10069    escape sequences, with the one for handling literal escapes requiring
10070    a dummy entry for all of the special escapes that are actually handled
10071    by the other.
10072 */
10073
10074 STATIC regnode *
10075 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10076 {
10077     dVAR;
10078     regnode *ret = NULL;
10079     I32 flags = 0;
10080     char *parse_start = RExC_parse;
10081     U8 op;
10082     int invert = 0;
10083
10084     GET_RE_DEBUG_FLAGS_DECL;
10085
10086     *flagp = WORST;             /* Tentatively. */
10087
10088     DEBUG_PARSE("atom");
10089
10090     PERL_ARGS_ASSERT_REGATOM;
10091
10092 tryagain:
10093     switch ((U8)*RExC_parse) {
10094     case '^':
10095         RExC_seen_zerolen++;
10096         nextchar(pRExC_state);
10097         if (RExC_flags & RXf_PMf_MULTILINE)
10098             ret = reg_node(pRExC_state, MBOL);
10099         else if (RExC_flags & RXf_PMf_SINGLELINE)
10100             ret = reg_node(pRExC_state, SBOL);
10101         else
10102             ret = reg_node(pRExC_state, BOL);
10103         Set_Node_Length(ret, 1); /* MJD */
10104         break;
10105     case '$':
10106         nextchar(pRExC_state);
10107         if (*RExC_parse)
10108             RExC_seen_zerolen++;
10109         if (RExC_flags & RXf_PMf_MULTILINE)
10110             ret = reg_node(pRExC_state, MEOL);
10111         else if (RExC_flags & RXf_PMf_SINGLELINE)
10112             ret = reg_node(pRExC_state, SEOL);
10113         else
10114             ret = reg_node(pRExC_state, EOL);
10115         Set_Node_Length(ret, 1); /* MJD */
10116         break;
10117     case '.':
10118         nextchar(pRExC_state);
10119         if (RExC_flags & RXf_PMf_SINGLELINE)
10120             ret = reg_node(pRExC_state, SANY);
10121         else
10122             ret = reg_node(pRExC_state, REG_ANY);
10123         *flagp |= HASWIDTH|SIMPLE;
10124         RExC_naughty++;
10125         Set_Node_Length(ret, 1); /* MJD */
10126         break;
10127     case '[':
10128     {
10129         char * const oregcomp_parse = ++RExC_parse;
10130         ret = regclass(pRExC_state, flagp,depth+1,
10131                        FALSE, /* means parse the whole char class */
10132                        TRUE, /* allow multi-char folds */
10133                        FALSE, /* don't silence non-portable warnings. */
10134                        NULL);
10135         if (*RExC_parse != ']') {
10136             RExC_parse = oregcomp_parse;
10137             vFAIL("Unmatched [");
10138         }
10139         nextchar(pRExC_state);
10140         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10141         break;
10142     }
10143     case '(':
10144         nextchar(pRExC_state);
10145         ret = reg(pRExC_state, 1, &flags,depth+1);
10146         if (ret == NULL) {
10147                 if (flags & TRYAGAIN) {
10148                     if (RExC_parse == RExC_end) {
10149                          /* Make parent create an empty node if needed. */
10150                         *flagp |= TRYAGAIN;
10151                         return(NULL);
10152                     }
10153                     goto tryagain;
10154                 }
10155                 return(NULL);
10156         }
10157         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10158         break;
10159     case '|':
10160     case ')':
10161         if (flags & TRYAGAIN) {
10162             *flagp |= TRYAGAIN;
10163             return NULL;
10164         }
10165         vFAIL("Internal urp");
10166                                 /* Supposed to be caught earlier. */
10167         break;
10168     case '{':
10169         if (!regcurly(RExC_parse, FALSE)) {
10170             RExC_parse++;
10171             goto defchar;
10172         }
10173         /* FALL THROUGH */
10174     case '?':
10175     case '+':
10176     case '*':
10177         RExC_parse++;
10178         vFAIL("Quantifier follows nothing");
10179         break;
10180     case '\\':
10181         /* Special Escapes
10182
10183            This switch handles escape sequences that resolve to some kind
10184            of special regop and not to literal text. Escape sequnces that
10185            resolve to literal text are handled below in the switch marked
10186            "Literal Escapes".
10187
10188            Every entry in this switch *must* have a corresponding entry
10189            in the literal escape switch. However, the opposite is not
10190            required, as the default for this switch is to jump to the
10191            literal text handling code.
10192         */
10193         switch ((U8)*++RExC_parse) {
10194             U8 arg;
10195         /* Special Escapes */
10196         case 'A':
10197             RExC_seen_zerolen++;
10198             ret = reg_node(pRExC_state, SBOL);
10199             *flagp |= SIMPLE;
10200             goto finish_meta_pat;
10201         case 'G':
10202             ret = reg_node(pRExC_state, GPOS);
10203             RExC_seen |= REG_SEEN_GPOS;
10204             *flagp |= SIMPLE;
10205             goto finish_meta_pat;
10206         case 'K':
10207             RExC_seen_zerolen++;
10208             ret = reg_node(pRExC_state, KEEPS);
10209             *flagp |= SIMPLE;
10210             /* XXX:dmq : disabling in-place substitution seems to
10211              * be necessary here to avoid cases of memory corruption, as
10212              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10213              */
10214             RExC_seen |= REG_SEEN_LOOKBEHIND;
10215             goto finish_meta_pat;
10216         case 'Z':
10217             ret = reg_node(pRExC_state, SEOL);
10218             *flagp |= SIMPLE;
10219             RExC_seen_zerolen++;                /* Do not optimize RE away */
10220             goto finish_meta_pat;
10221         case 'z':
10222             ret = reg_node(pRExC_state, EOS);
10223             *flagp |= SIMPLE;
10224             RExC_seen_zerolen++;                /* Do not optimize RE away */
10225             goto finish_meta_pat;
10226         case 'C':
10227             ret = reg_node(pRExC_state, CANY);
10228             RExC_seen |= REG_SEEN_CANY;
10229             *flagp |= HASWIDTH|SIMPLE;
10230             goto finish_meta_pat;
10231         case 'X':
10232             ret = reg_node(pRExC_state, CLUMP);
10233             *flagp |= HASWIDTH;
10234             goto finish_meta_pat;
10235
10236         case 'W':
10237             invert = 1;
10238             /* FALLTHROUGH */
10239         case 'w':
10240             arg = ANYOF_WORDCHAR;
10241             goto join_posix;
10242
10243         case 'b':
10244             RExC_seen_zerolen++;
10245             RExC_seen |= REG_SEEN_LOOKBEHIND;
10246             op = BOUND + get_regex_charset(RExC_flags);
10247             if (op > BOUNDA) {  /* /aa is same as /a */
10248                 op = BOUNDA;
10249             }
10250             ret = reg_node(pRExC_state, op);
10251             FLAGS(ret) = get_regex_charset(RExC_flags);
10252             *flagp |= SIMPLE;
10253             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10254                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
10255             }
10256             goto finish_meta_pat;
10257         case 'B':
10258             RExC_seen_zerolen++;
10259             RExC_seen |= REG_SEEN_LOOKBEHIND;
10260             op = NBOUND + get_regex_charset(RExC_flags);
10261             if (op > NBOUNDA) { /* /aa is same as /a */
10262                 op = NBOUNDA;
10263             }
10264             ret = reg_node(pRExC_state, op);
10265             FLAGS(ret) = get_regex_charset(RExC_flags);
10266             *flagp |= SIMPLE;
10267             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10268                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
10269             }
10270             goto finish_meta_pat;
10271
10272         case 'D':
10273             invert = 1;
10274             /* FALLTHROUGH */
10275         case 'd':
10276             arg = ANYOF_DIGIT;
10277             goto join_posix;
10278
10279         case 'R':
10280             ret = reg_node(pRExC_state, LNBREAK);
10281             *flagp |= HASWIDTH|SIMPLE;
10282             goto finish_meta_pat;
10283
10284         case 'H':
10285             invert = 1;
10286             /* FALLTHROUGH */
10287         case 'h':
10288             arg = ANYOF_BLANK;
10289             op = POSIXU;
10290             goto join_posix_op_known;
10291
10292         case 'V':
10293             invert = 1;
10294             /* FALLTHROUGH */
10295         case 'v':
10296             arg = ANYOF_VERTWS;
10297             op = POSIXU;
10298             goto join_posix_op_known;
10299
10300         case 'S':
10301             invert = 1;
10302             /* FALLTHROUGH */
10303         case 's':
10304             arg = ANYOF_SPACE;
10305
10306         join_posix:
10307
10308             op = POSIXD + get_regex_charset(RExC_flags);
10309             if (op > POSIXA) {  /* /aa is same as /a */
10310                 op = POSIXA;
10311             }
10312
10313         join_posix_op_known:
10314
10315             if (invert) {
10316                 op += NPOSIXD - POSIXD;
10317             }
10318
10319             ret = reg_node(pRExC_state, op);
10320             if (! SIZE_ONLY) {
10321                 FLAGS(ret) = namedclass_to_classnum(arg);
10322             }
10323
10324             *flagp |= HASWIDTH|SIMPLE;
10325             /* FALL THROUGH */
10326
10327          finish_meta_pat:
10328             nextchar(pRExC_state);
10329             Set_Node_Length(ret, 2); /* MJD */
10330             break;
10331         case 'p':
10332         case 'P':
10333             {
10334 #ifdef DEBUGGING
10335                 char* parse_start = RExC_parse - 2;
10336 #endif
10337
10338                 RExC_parse--;
10339
10340                 ret = regclass(pRExC_state, flagp,depth+1,
10341                                TRUE, /* means just parse this element */
10342                                FALSE, /* don't allow multi-char folds */
10343                                FALSE, /* don't silence non-portable warnings.
10344                                          It would be a bug if these returned
10345                                          non-portables */
10346                                NULL);
10347
10348                 RExC_parse--;
10349
10350                 Set_Node_Offset(ret, parse_start + 2);
10351                 Set_Node_Cur_Length(ret);
10352                 nextchar(pRExC_state);
10353             }
10354             break;
10355         case 'N':
10356             /* Handle \N and \N{NAME} with multiple code points here and not
10357              * below because it can be multicharacter. join_exact() will join
10358              * them up later on.  Also this makes sure that things like
10359              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10360              * The options to the grok function call causes it to fail if the
10361              * sequence is just a single code point.  We then go treat it as
10362              * just another character in the current EXACT node, and hence it
10363              * gets uniform treatment with all the other characters.  The
10364              * special treatment for quantifiers is not needed for such single
10365              * character sequences */
10366             ++RExC_parse;
10367             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10368                                 FALSE /* not strict */ )) {
10369                 RExC_parse--;
10370                 goto defchar;
10371             }
10372             break;
10373         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10374         parse_named_seq:
10375         {
10376             char ch= RExC_parse[1];
10377             if (ch != '<' && ch != '\'' && ch != '{') {
10378                 RExC_parse++;
10379                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10380             } else {
10381                 /* this pretty much dupes the code for (?P=...) in reg(), if
10382                    you change this make sure you change that */
10383                 char* name_start = (RExC_parse += 2);
10384                 U32 num = 0;
10385                 SV *sv_dat = reg_scan_name(pRExC_state,
10386                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10387                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10388                 if (RExC_parse == name_start || *RExC_parse != ch)
10389                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10390
10391                 if (!SIZE_ONLY) {
10392                     num = add_data( pRExC_state, 1, "S" );
10393                     RExC_rxi->data->data[num]=(void*)sv_dat;
10394                     SvREFCNT_inc_simple_void(sv_dat);
10395                 }
10396
10397                 RExC_sawback = 1;
10398                 ret = reganode(pRExC_state,
10399                                ((! FOLD)
10400                                  ? NREF
10401                                  : (ASCII_FOLD_RESTRICTED)
10402                                    ? NREFFA
10403                                    : (AT_LEAST_UNI_SEMANTICS)
10404                                      ? NREFFU
10405                                      : (LOC)
10406                                        ? NREFFL
10407                                        : NREFF),
10408                                 num);
10409                 *flagp |= HASWIDTH;
10410
10411                 /* override incorrect value set in reganode MJD */
10412                 Set_Node_Offset(ret, parse_start+1);
10413                 Set_Node_Cur_Length(ret); /* MJD */
10414                 nextchar(pRExC_state);
10415
10416             }
10417             break;
10418         }
10419         case 'g':
10420         case '1': case '2': case '3': case '4':
10421         case '5': case '6': case '7': case '8': case '9':
10422             {
10423                 I32 num;
10424                 bool isg = *RExC_parse == 'g';
10425                 bool isrel = 0;
10426                 bool hasbrace = 0;
10427                 if (isg) {
10428                     RExC_parse++;
10429                     if (*RExC_parse == '{') {
10430                         RExC_parse++;
10431                         hasbrace = 1;
10432                     }
10433                     if (*RExC_parse == '-') {
10434                         RExC_parse++;
10435                         isrel = 1;
10436                     }
10437                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10438                         if (isrel) RExC_parse--;
10439                         RExC_parse -= 2;
10440                         goto parse_named_seq;
10441                 }   }
10442                 num = atoi(RExC_parse);
10443                 if (isg && num == 0)
10444                     vFAIL("Reference to invalid group 0");
10445                 if (isrel) {
10446                     num = RExC_npar - num;
10447                     if (num < 1)
10448                         vFAIL("Reference to nonexistent or unclosed group");
10449                 }
10450                 if (!isg && num > 9 && num >= RExC_npar)
10451                     /* Probably a character specified in octal, e.g. \35 */
10452                     goto defchar;
10453                 else {
10454                     char * const parse_start = RExC_parse - 1; /* MJD */
10455                     while (isDIGIT(*RExC_parse))
10456                         RExC_parse++;
10457                     if (parse_start == RExC_parse - 1)
10458                         vFAIL("Unterminated \\g... pattern");
10459                     if (hasbrace) {
10460                         if (*RExC_parse != '}')
10461                             vFAIL("Unterminated \\g{...} pattern");
10462                         RExC_parse++;
10463                     }
10464                     if (!SIZE_ONLY) {
10465                         if (num > (I32)RExC_rx->nparens)
10466                             vFAIL("Reference to nonexistent group");
10467                     }
10468                     RExC_sawback = 1;
10469                     ret = reganode(pRExC_state,
10470                                    ((! FOLD)
10471                                      ? REF
10472                                      : (ASCII_FOLD_RESTRICTED)
10473                                        ? REFFA
10474                                        : (AT_LEAST_UNI_SEMANTICS)
10475                                          ? REFFU
10476                                          : (LOC)
10477                                            ? REFFL
10478                                            : REFF),
10479                                     num);
10480                     *flagp |= HASWIDTH;
10481
10482                     /* override incorrect value set in reganode MJD */
10483                     Set_Node_Offset(ret, parse_start+1);
10484                     Set_Node_Cur_Length(ret); /* MJD */
10485                     RExC_parse--;
10486                     nextchar(pRExC_state);
10487                 }
10488             }
10489             break;
10490         case '\0':
10491             if (RExC_parse >= RExC_end)
10492                 FAIL("Trailing \\");
10493             /* FALL THROUGH */
10494         default:
10495             /* Do not generate "unrecognized" warnings here, we fall
10496                back into the quick-grab loop below */
10497             parse_start--;
10498             goto defchar;
10499         }
10500         break;
10501
10502     case '#':
10503         if (RExC_flags & RXf_PMf_EXTENDED) {
10504             if ( reg_skipcomment( pRExC_state ) )
10505                 goto tryagain;
10506         }
10507         /* FALL THROUGH */
10508
10509     default:
10510
10511             parse_start = RExC_parse - 1;
10512
10513             RExC_parse++;
10514
10515         defchar: {
10516             STRLEN len = 0;
10517             UV ender;
10518             char *p;
10519             char *s;
10520 #define MAX_NODE_STRING_SIZE 127
10521             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10522             char *s0;
10523             U8 upper_parse = MAX_NODE_STRING_SIZE;
10524             STRLEN foldlen;
10525             U8 node_type;
10526             bool next_is_quantifier;
10527             char * oldp = NULL;
10528
10529             /* If a folding node contains only code points that don't
10530              * participate in folds, it can be changed into an EXACT node,
10531              * which allows the optimizer more things to look for */
10532             bool maybe_exact;
10533
10534             ender = 0;
10535             node_type = compute_EXACTish(pRExC_state);
10536             ret = reg_node(pRExC_state, node_type);
10537
10538             /* In pass1, folded, we use a temporary buffer instead of the
10539              * actual node, as the node doesn't exist yet */
10540             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10541
10542             s0 = s;
10543
10544         reparse:
10545
10546             /* We do the EXACTFish to EXACT node only if folding, and not if in
10547              * locale, as whether a character folds or not isn't known until
10548              * runtime */
10549             maybe_exact = FOLD && ! LOC;
10550
10551             /* XXX The node can hold up to 255 bytes, yet this only goes to
10552              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10553              * 255 allows us to not have to worry about overflow due to
10554              * converting to utf8 and fold expansion, but that value is
10555              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10556              * split up by this limit into a single one using the real max of
10557              * 255.  Even at 127, this breaks under rare circumstances.  If
10558              * folding, we do not want to split a node at a character that is a
10559              * non-final in a multi-char fold, as an input string could just
10560              * happen to want to match across the node boundary.  The join
10561              * would solve that problem if the join actually happens.  But a
10562              * series of more than two nodes in a row each of 127 would cause
10563              * the first join to succeed to get to 254, but then there wouldn't
10564              * be room for the next one, which could at be one of those split
10565              * multi-char folds.  I don't know of any fool-proof solution.  One
10566              * could back off to end with only a code point that isn't such a
10567              * non-final, but it is possible for there not to be any in the
10568              * entire node. */
10569             for (p = RExC_parse - 1;
10570                  len < upper_parse && p < RExC_end;
10571                  len++)
10572             {
10573                 oldp = p;
10574
10575                 if (RExC_flags & RXf_PMf_EXTENDED)
10576                     p = regwhite( pRExC_state, p );
10577                 switch ((U8)*p) {
10578                 case '^':
10579                 case '$':
10580                 case '.':
10581                 case '[':
10582                 case '(':
10583                 case ')':
10584                 case '|':
10585                     goto loopdone;
10586                 case '\\':
10587                     /* Literal Escapes Switch
10588
10589                        This switch is meant to handle escape sequences that
10590                        resolve to a literal character.
10591
10592                        Every escape sequence that represents something
10593                        else, like an assertion or a char class, is handled
10594                        in the switch marked 'Special Escapes' above in this
10595                        routine, but also has an entry here as anything that
10596                        isn't explicitly mentioned here will be treated as
10597                        an unescaped equivalent literal.
10598                     */
10599
10600                     switch ((U8)*++p) {
10601                     /* These are all the special escapes. */
10602                     case 'A':             /* Start assertion */
10603                     case 'b': case 'B':   /* Word-boundary assertion*/
10604                     case 'C':             /* Single char !DANGEROUS! */
10605                     case 'd': case 'D':   /* digit class */
10606                     case 'g': case 'G':   /* generic-backref, pos assertion */
10607                     case 'h': case 'H':   /* HORIZWS */
10608                     case 'k': case 'K':   /* named backref, keep marker */
10609                     case 'p': case 'P':   /* Unicode property */
10610                               case 'R':   /* LNBREAK */
10611                     case 's': case 'S':   /* space class */
10612                     case 'v': case 'V':   /* VERTWS */
10613                     case 'w': case 'W':   /* word class */
10614                     case 'X':             /* eXtended Unicode "combining character sequence" */
10615                     case 'z': case 'Z':   /* End of line/string assertion */
10616                         --p;
10617                         goto loopdone;
10618
10619                     /* Anything after here is an escape that resolves to a
10620                        literal. (Except digits, which may or may not)
10621                      */
10622                     case 'n':
10623                         ender = '\n';
10624                         p++;
10625                         break;
10626                     case 'N': /* Handle a single-code point named character. */
10627                         /* The options cause it to fail if a multiple code
10628                          * point sequence.  Handle those in the switch() above
10629                          * */
10630                         RExC_parse = p + 1;
10631                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10632                                             flagp, depth, FALSE,
10633                                             FALSE /* not strict */ ))
10634                         {
10635                             RExC_parse = p = oldp;
10636                             goto loopdone;
10637                         }
10638                         p = RExC_parse;
10639                         if (ender > 0xff) {
10640                             REQUIRE_UTF8;
10641                         }
10642                         break;
10643                     case 'r':
10644                         ender = '\r';
10645                         p++;
10646                         break;
10647                     case 't':
10648                         ender = '\t';
10649                         p++;
10650                         break;
10651                     case 'f':
10652                         ender = '\f';
10653                         p++;
10654                         break;
10655                     case 'e':
10656                           ender = ASCII_TO_NATIVE('\033');
10657                         p++;
10658                         break;
10659                     case 'a':
10660                           ender = ASCII_TO_NATIVE('\007');
10661                         p++;
10662                         break;
10663                     case 'o':
10664                         {
10665                             UV result;
10666                             const char* error_msg;
10667
10668                             bool valid = grok_bslash_o(&p,
10669                                                        &result,
10670                                                        &error_msg,
10671                                                        TRUE, /* out warnings */
10672                                                        FALSE, /* not strict */
10673                                                        TRUE, /* Output warnings
10674                                                                 for non-
10675                                                                 portables */
10676                                                        UTF);
10677                             if (! valid) {
10678                                 RExC_parse = p; /* going to die anyway; point
10679                                                    to exact spot of failure */
10680                                 vFAIL(error_msg);
10681                             }
10682                             ender = result;
10683                             if (PL_encoding && ender < 0x100) {
10684                                 goto recode_encoding;
10685                             }
10686                             if (ender > 0xff) {
10687                                 REQUIRE_UTF8;
10688                             }
10689                             break;
10690                         }
10691                     case 'x':
10692                         {
10693                             UV result = UV_MAX; /* initialize to erroneous
10694                                                    value */
10695                             const char* error_msg;
10696
10697                             bool valid = grok_bslash_x(&p,
10698                                                        &result,
10699                                                        &error_msg,
10700                                                        TRUE, /* out warnings */
10701                                                        FALSE, /* not strict */
10702                                                        TRUE, /* Output warnings
10703                                                                 for non-
10704                                                                 portables */
10705                                                        UTF);
10706                             if (! valid) {
10707                                 RExC_parse = p; /* going to die anyway; point
10708                                                    to exact spot of failure */
10709                                 vFAIL(error_msg);
10710                             }
10711                             ender = result;
10712
10713                             if (PL_encoding && ender < 0x100) {
10714                                 goto recode_encoding;
10715                             }
10716                             if (ender > 0xff) {
10717                                 REQUIRE_UTF8;
10718                             }
10719                             break;
10720                         }
10721                     case 'c':
10722                         p++;
10723                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10724                         break;
10725                     case '0': case '1': case '2': case '3':case '4':
10726                     case '5': case '6': case '7':
10727                         if (*p == '0' ||
10728                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10729                         {
10730                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10731                             STRLEN numlen = 3;
10732                             ender = grok_oct(p, &numlen, &flags, NULL);
10733                             if (ender > 0xff) {
10734                                 REQUIRE_UTF8;
10735                             }
10736                             p += numlen;
10737                             if (SIZE_ONLY   /* like \08, \178 */
10738                                 && numlen < 3
10739                                 && p < RExC_end
10740                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10741                             {
10742                                 reg_warn_non_literal_string(
10743                                          p + 1,
10744                                          form_short_octal_warning(p, numlen));
10745                             }
10746                         }
10747                         else {  /* Not to be treated as an octal constant, go
10748                                    find backref */
10749                             --p;
10750                             goto loopdone;
10751                         }
10752                         if (PL_encoding && ender < 0x100)
10753                             goto recode_encoding;
10754                         break;
10755                     recode_encoding:
10756                         if (! RExC_override_recoding) {
10757                             SV* enc = PL_encoding;
10758                             ender = reg_recode((const char)(U8)ender, &enc);
10759                             if (!enc && SIZE_ONLY)
10760                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10761                             REQUIRE_UTF8;
10762                         }
10763                         break;
10764                     case '\0':
10765                         if (p >= RExC_end)
10766                             FAIL("Trailing \\");
10767                         /* FALL THROUGH */
10768                     default:
10769                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10770                             /* Include any { following the alpha to emphasize
10771                              * that it could be part of an escape at some point
10772                              * in the future */
10773                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10774                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10775                         }
10776                         goto normal_default;
10777                     }
10778                     break;
10779                 default:
10780                   normal_default:
10781                     if (UTF8_IS_START(*p) && UTF) {
10782                         STRLEN numlen;
10783                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10784                                                &numlen, UTF8_ALLOW_DEFAULT);
10785                         p += numlen;
10786                     }
10787                     else
10788                         ender = (U8) *p++;
10789                     break;
10790                 } /* End of switch on the literal */
10791
10792                 /* Here, have looked at the literal character and <ender>
10793                  * contains its ordinal, <p> points to the character after it
10794                  */
10795
10796                 if ( RExC_flags & RXf_PMf_EXTENDED)
10797                     p = regwhite( pRExC_state, p );
10798
10799                 /* If the next thing is a quantifier, it applies to this
10800                  * character only, which means that this character has to be in
10801                  * its own node and can't just be appended to the string in an
10802                  * existing node, so if there are already other characters in
10803                  * the node, close the node with just them, and set up to do
10804                  * this character again next time through, when it will be the
10805                  * only thing in its new node */
10806                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10807                 {
10808                     p = oldp;
10809                     goto loopdone;
10810                 }
10811
10812                 if (FOLD) {
10813                     if (UTF
10814                             /* See comments for join_exact() as to why we fold
10815                              * this non-UTF at compile time */
10816                         || (node_type == EXACTFU
10817                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10818                     {
10819
10820
10821                         /* Prime the casefolded buffer.  Locale rules, which
10822                          * apply only to code points < 256, aren't known until
10823                          * execution, so for them, just output the original
10824                          * character using utf8.  If we start to fold non-UTF
10825                          * patterns, be sure to update join_exact() */
10826                         if (LOC && ender < 256) {
10827                             if (UNI_IS_INVARIANT(ender)) {
10828                                 *s = (U8) ender;
10829                                 foldlen = 1;
10830                             } else {
10831                                 *s = UTF8_TWO_BYTE_HI(ender);
10832                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10833                                 foldlen = 2;
10834                             }
10835                         }
10836                         else {
10837                             UV folded = _to_uni_fold_flags(
10838                                            ender,
10839                                            (U8 *) s,
10840                                            &foldlen,
10841                                            FOLD_FLAGS_FULL
10842                                            | ((LOC) ?  FOLD_FLAGS_LOCALE
10843                                                     : (ASCII_FOLD_RESTRICTED)
10844                                                       ? FOLD_FLAGS_NOMIX_ASCII
10845                                                       : 0)
10846                                             );
10847
10848                             /* If this node only contains non-folding code
10849                              * points so far, see if this new one is also
10850                              * non-folding */
10851                             if (maybe_exact) {
10852                                 if (folded != ender) {
10853                                     maybe_exact = FALSE;
10854                                 }
10855                                 else {
10856                                     /* Here the fold is the original; we have
10857                                      * to check further to see if anything
10858                                      * folds to it */
10859                                     if (! PL_utf8_foldable) {
10860                                         SV* swash = swash_init("utf8",
10861                                                            "_Perl_Any_Folds",
10862                                                            &PL_sv_undef, 1, 0);
10863                                         PL_utf8_foldable =
10864                                                     _get_swash_invlist(swash);
10865                                         SvREFCNT_dec_NN(swash);
10866                                     }
10867                                     if (_invlist_contains_cp(PL_utf8_foldable,
10868                                                              ender))
10869                                     {
10870                                         maybe_exact = FALSE;
10871                                     }
10872                                 }
10873                             }
10874                             ender = folded;
10875                         }
10876                         s += foldlen;
10877
10878                         /* The loop increments <len> each time, as all but this
10879                          * path (and the one just below for UTF) through it add
10880                          * a single byte to the EXACTish node.  But this one
10881                          * has changed len to be the correct final value, so
10882                          * subtract one to cancel out the increment that
10883                          * follows */
10884                         len += foldlen - 1;
10885                     }
10886                     else {
10887                         *(s++) = (char) ender;
10888                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10889                     }
10890                 }
10891                 else if (UTF) {
10892                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10893                     if (unilen > 0) {
10894                        s   += unilen;
10895                        len += unilen;
10896                     }
10897
10898                     /* See comment just above for - 1 */
10899                     len--;
10900                 }
10901                 else {
10902                     REGC((char)ender, s++);
10903                 }
10904
10905                 if (next_is_quantifier) {
10906
10907                     /* Here, the next input is a quantifier, and to get here,
10908                      * the current character is the only one in the node.
10909                      * Also, here <len> doesn't include the final byte for this
10910                      * character */
10911                     len++;
10912                     goto loopdone;
10913                 }
10914
10915             } /* End of loop through literal characters */
10916
10917             /* Here we have either exhausted the input or ran out of room in
10918              * the node.  (If we encountered a character that can't be in the
10919              * node, transfer is made directly to <loopdone>, and so we
10920              * wouldn't have fallen off the end of the loop.)  In the latter
10921              * case, we artificially have to split the node into two, because
10922              * we just don't have enough space to hold everything.  This
10923              * creates a problem if the final character participates in a
10924              * multi-character fold in the non-final position, as a match that
10925              * should have occurred won't, due to the way nodes are matched,
10926              * and our artificial boundary.  So back off until we find a non-
10927              * problematic character -- one that isn't at the beginning or
10928              * middle of such a fold.  (Either it doesn't participate in any
10929              * folds, or appears only in the final position of all the folds it
10930              * does participate in.)  A better solution with far fewer false
10931              * positives, and that would fill the nodes more completely, would
10932              * be to actually have available all the multi-character folds to
10933              * test against, and to back-off only far enough to be sure that
10934              * this node isn't ending with a partial one.  <upper_parse> is set
10935              * further below (if we need to reparse the node) to include just
10936              * up through that final non-problematic character that this code
10937              * identifies, so when it is set to less than the full node, we can
10938              * skip the rest of this */
10939             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10940
10941                 const STRLEN full_len = len;
10942
10943                 assert(len >= MAX_NODE_STRING_SIZE);
10944
10945                 /* Here, <s> points to the final byte of the final character.
10946                  * Look backwards through the string until find a non-
10947                  * problematic character */
10948
10949                 if (! UTF) {
10950
10951                     /* These two have no multi-char folds to non-UTF characters
10952                      */
10953                     if (ASCII_FOLD_RESTRICTED || LOC) {
10954                         goto loopdone;
10955                     }
10956
10957                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10958                     len = s - s0 + 1;
10959                 }
10960                 else {
10961                     if (!  PL_NonL1NonFinalFold) {
10962                         PL_NonL1NonFinalFold = _new_invlist_C_array(
10963                                         NonL1_Perl_Non_Final_Folds_invlist);
10964                     }
10965
10966                     /* Point to the first byte of the final character */
10967                     s = (char *) utf8_hop((U8 *) s, -1);
10968
10969                     while (s >= s0) {   /* Search backwards until find
10970                                            non-problematic char */
10971                         if (UTF8_IS_INVARIANT(*s)) {
10972
10973                             /* There are no ascii characters that participate
10974                              * in multi-char folds under /aa.  In EBCDIC, the
10975                              * non-ascii invariants are all control characters,
10976                              * so don't ever participate in any folds. */
10977                             if (ASCII_FOLD_RESTRICTED
10978                                 || ! IS_NON_FINAL_FOLD(*s))
10979                             {
10980                                 break;
10981                             }
10982                         }
10983                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10984
10985                             /* No Latin1 characters participate in multi-char
10986                              * folds under /l */
10987                             if (LOC
10988                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10989                                                                 *s, *(s+1))))
10990                             {
10991                                 break;
10992                             }
10993                         }
10994                         else if (! _invlist_contains_cp(
10995                                         PL_NonL1NonFinalFold,
10996                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
10997                         {
10998                             break;
10999                         }
11000
11001                         /* Here, the current character is problematic in that
11002                          * it does occur in the non-final position of some
11003                          * fold, so try the character before it, but have to
11004                          * special case the very first byte in the string, so
11005                          * we don't read outside the string */
11006                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11007                     } /* End of loop backwards through the string */
11008
11009                     /* If there were only problematic characters in the string,
11010                      * <s> will point to before s0, in which case the length
11011                      * should be 0, otherwise include the length of the
11012                      * non-problematic character just found */
11013                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11014                 }
11015
11016                 /* Here, have found the final character, if any, that is
11017                  * non-problematic as far as ending the node without splitting
11018                  * it across a potential multi-char fold.  <len> contains the
11019                  * number of bytes in the node up-to and including that
11020                  * character, or is 0 if there is no such character, meaning
11021                  * the whole node contains only problematic characters.  In
11022                  * this case, give up and just take the node as-is.  We can't
11023                  * do any better */
11024                 if (len == 0) {
11025                     len = full_len;
11026                 } else {
11027
11028                     /* Here, the node does contain some characters that aren't
11029                      * problematic.  If one such is the final character in the
11030                      * node, we are done */
11031                     if (len == full_len) {
11032                         goto loopdone;
11033                     }
11034                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11035
11036                         /* If the final character is problematic, but the
11037                          * penultimate is not, back-off that last character to
11038                          * later start a new node with it */
11039                         p = oldp;
11040                         goto loopdone;
11041                     }
11042
11043                     /* Here, the final non-problematic character is earlier
11044                      * in the input than the penultimate character.  What we do
11045                      * is reparse from the beginning, going up only as far as
11046                      * this final ok one, thus guaranteeing that the node ends
11047                      * in an acceptable character.  The reason we reparse is
11048                      * that we know how far in the character is, but we don't
11049                      * know how to correlate its position with the input parse.
11050                      * An alternate implementation would be to build that
11051                      * correlation as we go along during the original parse,
11052                      * but that would entail extra work for every node, whereas
11053                      * this code gets executed only when the string is too
11054                      * large for the node, and the final two characters are
11055                      * problematic, an infrequent occurrence.  Yet another
11056                      * possible strategy would be to save the tail of the
11057                      * string, and the next time regatom is called, initialize
11058                      * with that.  The problem with this is that unless you
11059                      * back off one more character, you won't be guaranteed
11060                      * regatom will get called again, unless regbranch,
11061                      * regpiece ... are also changed.  If you do back off that
11062                      * extra character, so that there is input guaranteed to
11063                      * force calling regatom, you can't handle the case where
11064                      * just the first character in the node is acceptable.  I
11065                      * (khw) decided to try this method which doesn't have that
11066                      * pitfall; if performance issues are found, we can do a
11067                      * combination of the current approach plus that one */
11068                     upper_parse = len;
11069                     len = 0;
11070                     s = s0;
11071                     goto reparse;
11072                 }
11073             }   /* End of verifying node ends with an appropriate char */
11074
11075         loopdone:   /* Jumped to when encounters something that shouldn't be in
11076                        the node */
11077
11078             /* If 'maybe_exact' is still set here, means there are no
11079              * code points in the node that participate in folds */
11080             if (FOLD && maybe_exact) {
11081                 OP(ret) = EXACT;
11082             }
11083
11084             /* I (khw) don't know if you can get here with zero length, but the
11085              * old code handled this situation by creating a zero-length EXACT
11086              * node.  Might as well be NOTHING instead */
11087             if (len == 0) {
11088                 OP(ret) = NOTHING;
11089             }
11090             else{
11091                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11092             }
11093
11094             RExC_parse = p - 1;
11095             Set_Node_Cur_Length(ret); /* MJD */
11096             nextchar(pRExC_state);
11097             {
11098                 /* len is STRLEN which is unsigned, need to copy to signed */
11099                 IV iv = len;
11100                 if (iv < 0)
11101                     vFAIL("Internal disaster");
11102             }
11103
11104         } /* End of label 'defchar:' */
11105         break;
11106     } /* End of giant switch on input character */
11107
11108     return(ret);
11109 }
11110
11111 STATIC char *
11112 S_regwhite( RExC_state_t *pRExC_state, char *p )
11113 {
11114     const char *e = RExC_end;
11115
11116     PERL_ARGS_ASSERT_REGWHITE;
11117
11118     while (p < e) {
11119         if (isSPACE(*p))
11120             ++p;
11121         else if (*p == '#') {
11122             bool ended = 0;
11123             do {
11124                 if (*p++ == '\n') {
11125                     ended = 1;
11126                     break;
11127                 }
11128             } while (p < e);
11129             if (!ended)
11130                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11131         }
11132         else
11133             break;
11134     }
11135     return p;
11136 }
11137
11138 STATIC char *
11139 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11140 {
11141     /* Returns the next non-pattern-white space, non-comment character (the
11142      * latter only if 'recognize_comment is true) in the string p, which is
11143      * ended by RExC_end.  If there is no line break ending a comment,
11144      * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11145     const char *e = RExC_end;
11146
11147     PERL_ARGS_ASSERT_REGPATWS;
11148
11149     while (p < e) {
11150         STRLEN len;
11151         if ((len = is_PATWS_safe(p, e, UTF))) {
11152             p += len;
11153         }
11154         else if (recognize_comment && *p == '#') {
11155             bool ended = 0;
11156             do {
11157                 p++;
11158                 if (is_LNBREAK_safe(p, e, UTF)) {
11159                     ended = 1;
11160                     break;
11161                 }
11162             } while (p < e);
11163             if (!ended)
11164                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11165         }
11166         else
11167             break;
11168     }
11169     return p;
11170 }
11171
11172 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11173    Character classes ([:foo:]) can also be negated ([:^foo:]).
11174    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11175    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11176    but trigger failures because they are currently unimplemented. */
11177
11178 #define POSIXCC_DONE(c)   ((c) == ':')
11179 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11180 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11181
11182 PERL_STATIC_INLINE I32
11183 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me,
11184                     const bool strict)
11185 {
11186     dVAR;
11187     I32 namedclass = OOB_NAMEDCLASS;
11188
11189     PERL_ARGS_ASSERT_REGPPOSIXCC;
11190
11191     if (value == '[' && RExC_parse + 1 < RExC_end &&
11192         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11193         POSIXCC(UCHARAT(RExC_parse)))
11194     {
11195         const char c = UCHARAT(RExC_parse);
11196         char* const s = RExC_parse++;
11197
11198         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11199             RExC_parse++;
11200         if (RExC_parse == RExC_end) {
11201             if (strict) {
11202
11203                 /* Try to give a better location for the error (than the end of
11204                  * the string) by looking for the matching ']' */
11205                 RExC_parse = s;
11206                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11207                     RExC_parse++;
11208                 }
11209                 vFAIL2("Unmatched '%c' in POSIX class", c);
11210             }
11211             /* Grandfather lone [:, [=, [. */
11212             RExC_parse = s;
11213         }
11214         else {
11215             const char* const t = RExC_parse++; /* skip over the c */
11216             assert(*t == c);
11217
11218             if (UCHARAT(RExC_parse) == ']') {
11219                 const char *posixcc = s + 1;
11220                 RExC_parse++; /* skip over the ending ] */
11221
11222                 if (*s == ':') {
11223                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11224                     const I32 skip = t - posixcc;
11225
11226                     /* Initially switch on the length of the name.  */
11227                     switch (skip) {
11228                     case 4:
11229                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11230                                                           this is the Perl \w
11231                                                         */
11232                             namedclass = ANYOF_WORDCHAR;
11233                         break;
11234                     case 5:
11235                         /* Names all of length 5.  */
11236                         /* alnum alpha ascii blank cntrl digit graph lower
11237                            print punct space upper  */
11238                         /* Offset 4 gives the best switch position.  */
11239                         switch (posixcc[4]) {
11240                         case 'a':
11241                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11242                                 namedclass = ANYOF_ALPHA;
11243                             break;
11244                         case 'e':
11245                             if (memEQ(posixcc, "spac", 4)) /* space */
11246                                 namedclass = ANYOF_PSXSPC;
11247                             break;
11248                         case 'h':
11249                             if (memEQ(posixcc, "grap", 4)) /* graph */
11250                                 namedclass = ANYOF_GRAPH;
11251                             break;
11252                         case 'i':
11253                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11254                                 namedclass = ANYOF_ASCII;
11255                             break;
11256                         case 'k':
11257                             if (memEQ(posixcc, "blan", 4)) /* blank */
11258                                 namedclass = ANYOF_BLANK;
11259                             break;
11260                         case 'l':
11261                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11262                                 namedclass = ANYOF_CNTRL;
11263                             break;
11264                         case 'm':
11265                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11266                                 namedclass = ANYOF_ALPHANUMERIC;
11267                             break;
11268                         case 'r':
11269                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11270                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11271                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11272                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11273                             break;
11274                         case 't':
11275                             if (memEQ(posixcc, "digi", 4)) /* digit */
11276                                 namedclass = ANYOF_DIGIT;
11277                             else if (memEQ(posixcc, "prin", 4)) /* print */
11278                                 namedclass = ANYOF_PRINT;
11279                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11280                                 namedclass = ANYOF_PUNCT;
11281                             break;
11282                         }
11283                         break;
11284                     case 6:
11285                         if (memEQ(posixcc, "xdigit", 6))
11286                             namedclass = ANYOF_XDIGIT;
11287                         break;
11288                     }
11289
11290                     if (namedclass == OOB_NAMEDCLASS)
11291                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11292                                       t - s - 1, s + 1);
11293
11294                     /* The #defines are structured so each complement is +1 to
11295                      * the normal one */
11296                     if (complement) {
11297                         namedclass++;
11298                     }
11299                     assert (posixcc[skip] == ':');
11300                     assert (posixcc[skip+1] == ']');
11301                 } else if (!SIZE_ONLY) {
11302                     /* [[=foo=]] and [[.foo.]] are still future. */
11303
11304                     /* adjust RExC_parse so the warning shows after
11305                        the class closes */
11306                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11307                         RExC_parse++;
11308                     SvREFCNT_dec(free_me);
11309                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11310                 }
11311             } else {
11312                 /* Maternal grandfather:
11313                  * "[:" ending in ":" but not in ":]" */
11314                 if (strict) {
11315                     vFAIL("Unmatched '[' in POSIX class");
11316                 }
11317
11318                 /* Grandfather lone [:, [=, [. */
11319                 RExC_parse = s;
11320             }
11321         }
11322     }
11323
11324     return namedclass;
11325 }
11326
11327 STATIC bool
11328 S_could_it_be_POSIX(pTHX_ RExC_state_t *pRExC_state)
11329 {
11330     /* This applies some heuristics at the current parse position (which should
11331      * be at a '[') to see if what follows might be intended to be a [:posix:]
11332      * class.  It returns true if it really is a posix class, of course, but it
11333      * also can return true if it thinks that what was intended was a posix
11334      * class that didn't quite make it.
11335      *
11336      * It will return true for
11337      *      [:alphanumerics:
11338      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11339      *                         ')' indicating the end of the (?[
11340      *      [:any garbage including %^&$ punctuation:]
11341      *
11342      * This is designed to be called only from S_handle_sets; it could be
11343      * easily adapted to be called from the spot at the beginning of regclass()
11344      * that checks to see in a normal bracketed class if the surrounding []
11345      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11346      * change long-standing behavior, so I (khw) didn't do that */
11347     char* p = RExC_parse + 1;
11348     char first_char = *p;
11349
11350     PERL_ARGS_ASSERT_COULD_IT_BE_POSIX;
11351
11352     assert(*(p - 1) == '[');
11353
11354     if (! POSIXCC(first_char)) {
11355         return FALSE;
11356     }
11357
11358     p++;
11359     while (p < RExC_end && isWORDCHAR(*p)) p++;
11360
11361     if (p >= RExC_end) {
11362         return FALSE;
11363     }
11364
11365     if (p - RExC_parse > 2    /* Got at least 1 word character */
11366         && (*p == first_char
11367             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11368     {
11369         return TRUE;
11370     }
11371
11372     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11373
11374     return (p
11375             && p - RExC_parse > 2 /* [:] evaluates to colon;
11376                                       [::] is a bad posix class. */
11377             && first_char == *(p - 1));
11378 }
11379
11380 STATIC regnode *
11381 S_handle_sets(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11382                    char * const oregcomp_parse)
11383 {
11384     /* Handle the (?[...]) construct to do set operations */
11385
11386     U8 curchar;
11387     UV start, end;      /* End points of code point ranges */
11388     SV* result_string;
11389     char *save_end, *save_parse;
11390     SV* final;
11391     STRLEN len;
11392     regnode* node;
11393     AV* stack;
11394     const bool save_fold = FOLD;
11395
11396     GET_RE_DEBUG_FLAGS_DECL;
11397
11398     PERL_ARGS_ASSERT_HANDLE_SETS;
11399
11400     if (LOC) {
11401         vFAIL("(?[...]) not valid in locale");
11402     }
11403     RExC_uni_semantics = 1;
11404
11405     /* This will return only an ANYOF regnode, or (unlikely) something smaller
11406      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11407      * call regclass to handle '[]' so as to not have to reinvent its parsing
11408      * rules here (throwing away the size it computes each time).  And, we exit
11409      * upon an unescaped ']' that isn't one ending a regclass.  To do both
11410      * these things, we need to realize that something preceded by a backslash
11411      * is escaped, so we have to keep track of backslashes */
11412     if (SIZE_ONLY) {
11413
11414         Perl_ck_warner_d(aTHX_
11415             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11416             "The regex_sets feature is experimental" REPORT_LOCATION,
11417             (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11418
11419         while (RExC_parse < RExC_end) {
11420             SV* current = NULL;
11421             RExC_parse = regpatws(pRExC_state, RExC_parse,
11422                                 TRUE); /* means recognize comments */
11423             switch (*RExC_parse) {
11424                 default:
11425                     break;
11426                 case '\\':
11427                     /* Skip the next byte.  This would have to change to skip
11428                      * the next character if we were to recognize and handle
11429                      * specific non-ASCIIs */
11430                     RExC_parse++;
11431                     break;
11432                 case '[':
11433                 {
11434                     /* If this looks like it is a [:posix:] class, leave the
11435                      * parse pointer at the '[' to fool regclass() into
11436                      * thinking it is part of a '[[:posix]]'.  That function
11437                      * will use strict checking to force a syntax error if it
11438                      * doesn't work out to a legitimate class */
11439                     bool is_posix_class = could_it_be_POSIX(pRExC_state);
11440                     if (! is_posix_class) {
11441                         RExC_parse++;
11442                     }
11443
11444                     (void) regclass(pRExC_state, flagp,depth+1,
11445                                     is_posix_class, /* parse the whole char
11446                                                        class only if not a
11447                                                        posix class */
11448                                     FALSE, /* don't allow multi-char folds */
11449                                     TRUE, /* silence non-portable warnings. */
11450                                     &current);
11451                     /* function call leaves parse pointing to the ']', except
11452                      * if we faked it */
11453                     if (is_posix_class) {
11454                         RExC_parse--;
11455                     }
11456
11457                     SvREFCNT_dec(current);   /* In case it returned something */
11458                     break;
11459                 }
11460
11461                 case ']':
11462                     RExC_parse++;
11463                     if (RExC_parse < RExC_end
11464                         && *RExC_parse == ')')
11465                     {
11466                         node = reganode(pRExC_state, ANYOF, 0);
11467                         RExC_size += ANYOF_SKIP;
11468                         nextchar(pRExC_state);
11469                         Set_Node_Length(node,
11470                                 RExC_parse - oregcomp_parse + 1); /* MJD */
11471                         return node;
11472                     }
11473                     goto no_close;
11474             }
11475             RExC_parse++;
11476         }
11477
11478         no_close:
11479         FAIL("Syntax error in (?[...])");
11480     }
11481
11482     /* Pass 2 only after this.  Everything in this construct is a
11483      * metacharacter.  Operands begin with either a '\' (for an escape
11484      * sequence), or a '[' for a bracketed character class.  Any other
11485      * character should be an operator, or parenthesis for grouping.  Both
11486      * types of operands are handled by calling regclass() to parse them.  It
11487      * is called with a parameter to indicate to return the computed inversion
11488      * list.  The parsing here is implemented via a stack.  Each entry on the
11489      * stack is a single character representing one of the operators, or the
11490      * '('; or else a pointer to an operand inversion list. */
11491
11492 #define IS_OPERAND(a)  (! SvIOK(a))
11493
11494     /* The stack starts empty.  It is a syntax error if the first thing parsed
11495      * is a binary operator; everything else is pushed on the stack.  When an
11496      * operand is parsed, the top of the stack is examined.  If it is a binary
11497      * operator, the item before it should be an operand, and both are replaced
11498      * by the result of doing that operation on the new operand and the one on
11499      * the stack.   Thus a sequence of binary operands is reduced to a single
11500      * one before the next one is parsed.
11501      *
11502      * A unary operator may immediately follow a binary in the input, for
11503      * example
11504      *      [a] + ! [b]
11505      * When an operand is parsed and the top of the stack is a unary operator,
11506      * the operation is performed, and then the stack is rechecked to see if
11507      * this new operand is part of a binary operation; if so, it is handled as
11508      * above.
11509      *
11510      * A '(' is simply pushed on the stack; it is valid only if the stack is
11511      * empty, or the top element of the stack is an operator (for which the
11512      * parenthesized expression will become an operand).  By the time the
11513      * corresponding ')' is parsed everything in between should have been
11514      * parsed and evaluated to a single operand (or else is a syntax error),
11515      * and is handled as a regular operand */
11516
11517     stack = newAV();
11518
11519     while (RExC_parse < RExC_end) {
11520         I32 top_index = av_top(stack);
11521         SV** top_ptr;
11522         SV* current = NULL;
11523
11524         /* Skip white space */
11525         RExC_parse = regpatws(pRExC_state, RExC_parse,
11526                                 TRUE); /* means recognize comments */
11527         if (RExC_parse >= RExC_end
11528             || (curchar = UCHARAT(RExC_parse)) == ']')
11529         {   /* Exit loop at the end */
11530             break;
11531         }
11532
11533         switch (curchar) {
11534
11535             default:
11536                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11537                 vFAIL("Unexpected character");
11538
11539             case '\\':
11540                 (void) regclass(pRExC_state, flagp,depth+1,
11541                                 TRUE, /* means parse just the next thing */
11542                                 FALSE, /* don't allow multi-char folds */
11543                                 FALSE, /* don't silence non-portable warnings.
11544                                         */
11545                                 &current);
11546                 /* regclass() will return with parsing just the \ sequence,
11547                  * leaving the parse pointer at the next thing to parse */
11548                 RExC_parse--;
11549                 goto handle_operand;
11550
11551             case '[':   /* Is a bracketed character class */
11552             {
11553                 bool is_posix_class = could_it_be_POSIX(pRExC_state);
11554
11555                 if (! is_posix_class) {
11556                     RExC_parse++;
11557                 }
11558
11559                 (void) regclass(pRExC_state, flagp,depth+1,
11560                                 is_posix_class, /* parse the whole char class
11561                                                    only if not a posix class */
11562                                 FALSE, /* don't allow multi-char folds */
11563                                 FALSE, /* don't silence non-portable warnings.
11564                                         */
11565                                 &current);
11566                 /* function call leaves parse pointing to the ']', except if we
11567                  * faked it */
11568                 if (is_posix_class) {
11569                     RExC_parse--;
11570                 }
11571
11572                 goto handle_operand;
11573             }
11574
11575             case '&':
11576             case '|':
11577             case '+':
11578             case '-':
11579             case '^':
11580                 if (top_index < 0
11581                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11582                     || ! IS_OPERAND(*top_ptr))
11583                 {
11584                     RExC_parse++;
11585                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11586                 }
11587                 av_push(stack, newSVuv(curchar));
11588                 break;
11589
11590             case '!':
11591                 av_push(stack, newSVuv(curchar));
11592                 break;
11593
11594             case '(':
11595                 if (top_index >= 0) {
11596                     top_ptr = av_fetch(stack, top_index, FALSE);
11597                     assert(top_ptr);
11598                     if (IS_OPERAND(*top_ptr)) {
11599                         RExC_parse++;
11600                         vFAIL("Unexpected '(' with no preceding operator");
11601                     }
11602                 }
11603                 av_push(stack, newSVuv(curchar));
11604                 break;
11605
11606             case ')':
11607             {
11608                 SV* lparen;
11609                 if (top_index < 1
11610                     || ! (current = av_pop(stack))
11611                     || ! IS_OPERAND(current)
11612                     || ! (lparen = av_pop(stack))
11613                     || IS_OPERAND(lparen)
11614                     || SvUV(lparen) != '(')
11615                 {
11616                     RExC_parse++;
11617                     vFAIL("Unexpected ')'");
11618                 }
11619                 top_index -= 2;
11620                 SvREFCNT_dec_NN(lparen);
11621
11622                 /* FALL THROUGH */
11623             }
11624
11625               handle_operand:
11626
11627                 /* Here, we have an operand to process, in 'current' */
11628
11629                 if (top_index < 0) {    /* Just push if stack is empty */
11630                     av_push(stack, current);
11631                 }
11632                 else {
11633                     SV* top = av_pop(stack);
11634                     char current_operator;
11635
11636                     if (IS_OPERAND(top)) {
11637                         vFAIL("Operand with no preceding operator");
11638                     }
11639                     current_operator = (char) SvUV(top);
11640                     switch (current_operator) {
11641                         case '(':   /* Push the '(' back on followed by the new
11642                                        operand */
11643                             av_push(stack, top);
11644                             av_push(stack, current);
11645                             SvREFCNT_inc(top);  /* Counters the '_dec' done
11646                                                    just after the 'break', so
11647                                                    it doesn't get wrongly freed
11648                                                  */
11649                             break;
11650
11651                         case '!':
11652                             _invlist_invert(current);
11653
11654                             /* Unlike binary operators, the top of the stack,
11655                              * now that this unary one has been popped off, may
11656                              * legally be an operator, and we now have operand
11657                              * for it. */
11658                             top_index--;
11659                             SvREFCNT_dec_NN(top);
11660                             goto handle_operand;
11661
11662                         case '&':
11663                             _invlist_intersection(av_pop(stack),
11664                                                    current,
11665                                                    &current);
11666                             av_push(stack, current);
11667                             break;
11668
11669                         case '|':
11670                         case '+':
11671                             _invlist_union(av_pop(stack), current, &current);
11672                             av_push(stack, current);
11673                             break;
11674
11675                         case '-':
11676                             _invlist_subtract(av_pop(stack), current, &current);
11677                             av_push(stack, current);
11678                             break;
11679
11680                         case '^':   /* The union minus the intersection */
11681                         {
11682                             SV* i = NULL;
11683                             SV* u = NULL;
11684                             SV* element;
11685
11686                             element = av_pop(stack);
11687                             _invlist_union(element, current, &u);
11688                             _invlist_intersection(element, current, &i);
11689                             _invlist_subtract(u, i, &current);
11690                             av_push(stack, current);
11691                             SvREFCNT_dec_NN(i);
11692                             SvREFCNT_dec_NN(u);
11693                             SvREFCNT_dec_NN(element);
11694                             break;
11695                         }
11696
11697                         default:
11698                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11699                 }
11700                 SvREFCNT_dec_NN(top);
11701             }
11702         }
11703
11704         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11705     }
11706
11707     if (av_top(stack) < 0   /* Was empty */
11708         || ((final = av_pop(stack)) == NULL)
11709         || ! IS_OPERAND(final)
11710         || av_top(stack) >= 0)  /* More left on stack */
11711     {
11712         vFAIL("Incomplete expression within '(?[ ])'");
11713     }
11714
11715     invlist_iterinit(final);
11716
11717     /* Here, 'final' is the resultant inversion list of evaluating the
11718      * expression.  Feed it to regclass() to generate the real resultant node.
11719      * regclass() is expecting a string of ranges and individual code points */
11720     result_string = newSVpvs("");
11721     while (invlist_iternext(final, &start, &end)) {
11722         if (start == end) {
11723             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
11724         }
11725         else {
11726             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
11727                                                      start,          end);
11728         }
11729     }
11730
11731     save_parse = RExC_parse;
11732     RExC_parse = SvPV(result_string, len);
11733     save_end = RExC_end;
11734     RExC_end = RExC_parse + len;
11735
11736     /* We turn off folding around the call, as the class we have constructed
11737      * already has all folding taken into consideration, and we don't want
11738      * regclass() to add to that */
11739     RExC_flags &= ~RXf_PMf_FOLD;
11740     node = regclass(pRExC_state, flagp,depth+1,
11741                     FALSE, /* means parse the whole char class */
11742                     FALSE, /* don't allow multi-char folds */
11743                     TRUE, /* silence non-portable warnings.  The above may very
11744                              well have generated non-portable code points, but
11745                              they're valid on this machine */
11746                     NULL);
11747     if (save_fold) {
11748         RExC_flags |= RXf_PMf_FOLD;
11749     }
11750     RExC_parse = save_parse + 1;
11751     RExC_end = save_end;
11752     SvREFCNT_dec_NN(final);
11753     SvREFCNT_dec_NN(result_string);
11754     SvREFCNT_dec_NN(stack);
11755
11756     nextchar(pRExC_state);
11757     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
11758     return node;
11759 }
11760 #undef IS_OPERAND
11761
11762 /* The names of properties whose definitions are not known at compile time are
11763  * stored in this SV, after a constant heading.  So if the length has been
11764  * changed since initialization, then there is a run-time definition. */
11765 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11766
11767 STATIC regnode *
11768 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11769                  const bool stop_at_1,  /* Just parse the next thing, don't
11770                                            look for a full character class */
11771                  bool allow_multi_folds,
11772                  const bool silence_non_portable,   /* Don't output warnings
11773                                                        about too large
11774                                                        characters */
11775                  SV** ret_invlist)  /* Return an inversion list, not a node */
11776 {
11777     /* parse a bracketed class specification.  Most of these will produce an
11778      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
11779      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
11780      * under /i with multi-character folds: it will be rewritten following the
11781      * paradigm of this example, where the <multi-fold>s are characters which
11782      * fold to multiple character sequences:
11783      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11784      * gets effectively rewritten as:
11785      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11786      * reg() gets called (recursively) on the rewritten version, and this
11787      * function will return what it constructs.  (Actually the <multi-fold>s
11788      * aren't physically removed from the [abcdefghi], it's just that they are
11789      * ignored in the recursion by means of a flag:
11790      * <RExC_in_multi_char_class>.)
11791      *
11792      * ANYOF nodes contain a bit map for the first 256 characters, with the
11793      * corresponding bit set if that character is in the list.  For characters
11794      * above 255, a range list or swash is used.  There are extra bits for \w,
11795      * etc. in locale ANYOFs, as what these match is not determinable at
11796      * compile time */
11797
11798     dVAR;
11799     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11800     IV range = 0;
11801     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11802     regnode *ret;
11803     STRLEN numlen;
11804     IV namedclass = OOB_NAMEDCLASS;
11805     char *rangebegin = NULL;
11806     bool need_class = 0;
11807     SV *listsv = NULL;
11808     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11809                                       than just initialized.  */
11810     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11811     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11812                                extended beyond the Latin1 range */
11813     UV element_count = 0;   /* Number of distinct elements in the class.
11814                                Optimizations may be possible if this is tiny */
11815     AV * multi_char_matches = NULL; /* Code points that fold to more than one
11816                                        character; used under /i */
11817     UV n;
11818     char * stop_ptr = RExC_end;    /* where to stop parsing */
11819     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
11820                                                    space? */
11821     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
11822
11823     /* Unicode properties are stored in a swash; this holds the current one
11824      * being parsed.  If this swash is the only above-latin1 component of the
11825      * character class, an optimization is to pass it directly on to the
11826      * execution engine.  Otherwise, it is set to NULL to indicate that there
11827      * are other things in the class that have to be dealt with at execution
11828      * time */
11829     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11830
11831     /* Set if a component of this character class is user-defined; just passed
11832      * on to the engine */
11833     bool has_user_defined_property = FALSE;
11834
11835     /* inversion list of code points this node matches only when the target
11836      * string is in UTF-8.  (Because is under /d) */
11837     SV* depends_list = NULL;
11838
11839     /* inversion list of code points this node matches.  For much of the
11840      * function, it includes only those that match regardless of the utf8ness
11841      * of the target string */
11842     SV* cp_list = NULL;
11843
11844 #ifdef EBCDIC
11845     /* In a range, counts how many 0-2 of the ends of it came from literals,
11846      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11847     UV literal_endpoint = 0;
11848 #endif
11849     bool invert = FALSE;    /* Is this class to be complemented */
11850
11851     /* Is there any thing like \W or [:^digit:] that matches above the legal
11852      * Unicode range? */
11853     bool runtime_posix_matches_above_Unicode = FALSE;
11854
11855     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11856         case we need to change the emitted regop to an EXACT. */
11857     const char * orig_parse = RExC_parse;
11858     const I32 orig_size = RExC_size;
11859     GET_RE_DEBUG_FLAGS_DECL;
11860
11861     PERL_ARGS_ASSERT_REGCLASS;
11862 #ifndef DEBUGGING
11863     PERL_UNUSED_ARG(depth);
11864 #endif
11865
11866     DEBUG_PARSE("clas");
11867
11868     /* Assume we are going to generate an ANYOF node. */
11869     ret = reganode(pRExC_state, ANYOF, 0);
11870
11871     if (SIZE_ONLY) {
11872         RExC_size += ANYOF_SKIP;
11873         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11874     }
11875     else {
11876         ANYOF_FLAGS(ret) = 0;
11877
11878         RExC_emit += ANYOF_SKIP;
11879         if (LOC) {
11880             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11881         }
11882         listsv = newSVpvs("# comment\n");
11883         initial_listsv_len = SvCUR(listsv);
11884     }
11885
11886     if (skip_white) {
11887         RExC_parse = regpatws(pRExC_state, RExC_parse,
11888                               FALSE /* means don't recognize comments */);
11889     }
11890
11891     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11892         RExC_parse++;
11893         invert = TRUE;
11894         allow_multi_folds = FALSE;
11895         RExC_naughty++;
11896         if (skip_white) {
11897             RExC_parse = regpatws(pRExC_state, RExC_parse,
11898                                   FALSE /* means don't recognize comments */);
11899         }
11900     }
11901
11902     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
11903     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
11904         const char *s = RExC_parse;
11905         const char  c = *s++;
11906
11907         while (isWORDCHAR(*s))
11908             s++;
11909         if (*s && c == *s && s[1] == ']') {
11910             SAVEFREESV(RExC_rx_sv);
11911             SAVEFREESV(listsv);
11912             ckWARN3reg(s+2,
11913                        "POSIX syntax [%c %c] belongs inside character classes",
11914                        c, c);
11915             (void)ReREFCNT_inc(RExC_rx_sv);
11916             SvREFCNT_inc_simple_void_NN(listsv);
11917         }
11918     }
11919
11920     /* If the caller wants us to just parse a single element, accomplish this
11921      * by faking the loop ending condition */
11922     if (stop_at_1 && RExC_end > RExC_parse) {
11923         stop_ptr = RExC_parse + 1;
11924     }
11925
11926     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
11927     if (UCHARAT(RExC_parse) == ']')
11928         goto charclassloop;
11929
11930 parseit:
11931     while (1) {
11932         if  (RExC_parse >= stop_ptr) {
11933             break;
11934         }
11935
11936         if (skip_white) {
11937             RExC_parse = regpatws(pRExC_state, RExC_parse,
11938                                   FALSE /* means don't recognize comments */);
11939         }
11940
11941         if  (UCHARAT(RExC_parse) == ']') {
11942             break;
11943         }
11944
11945     charclassloop:
11946
11947         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11948         save_value = value;
11949         save_prevvalue = prevvalue;
11950
11951         if (!range) {
11952             rangebegin = RExC_parse;
11953             element_count++;
11954         }
11955         if (UTF) {
11956             value = utf8n_to_uvchr((U8*)RExC_parse,
11957                                    RExC_end - RExC_parse,
11958                                    &numlen, UTF8_ALLOW_DEFAULT);
11959             RExC_parse += numlen;
11960         }
11961         else
11962             value = UCHARAT(RExC_parse++);
11963
11964         if (value == '['
11965             && RExC_parse < RExC_end
11966             && POSIXCC(UCHARAT(RExC_parse)))
11967         {
11968             namedclass = regpposixcc(pRExC_state, value, listsv, strict);
11969         }
11970         else if (value == '\\') {
11971             if (UTF) {
11972                 value = utf8n_to_uvchr((U8*)RExC_parse,
11973                                    RExC_end - RExC_parse,
11974                                    &numlen, UTF8_ALLOW_DEFAULT);
11975                 RExC_parse += numlen;
11976             }
11977             else
11978                 value = UCHARAT(RExC_parse++);
11979
11980             /* Some compilers cannot handle switching on 64-bit integer
11981              * values, therefore value cannot be an UV.  Yes, this will
11982              * be a problem later if we want switch on Unicode.
11983              * A similar issue a little bit later when switching on
11984              * namedclass. --jhi */
11985
11986             /* If the \ is escaping white space when white space is being
11987              * skipped, it means that that white space is wanted literally, and
11988              * is already in 'value'.  Otherwise, need to translate the escape
11989              * into what it signifies. */
11990             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
11991
11992             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
11993             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
11994             case 's':   namedclass = ANYOF_SPACE;       break;
11995             case 'S':   namedclass = ANYOF_NSPACE;      break;
11996             case 'd':   namedclass = ANYOF_DIGIT;       break;
11997             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11998             case 'v':   namedclass = ANYOF_VERTWS;      break;
11999             case 'V':   namedclass = ANYOF_NVERTWS;     break;
12000             case 'h':   namedclass = ANYOF_HORIZWS;     break;
12001             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
12002             case 'N':  /* Handle \N{NAME} in class */
12003                 {
12004                     /* We only pay attention to the first char of
12005                     multichar strings being returned. I kinda wonder
12006                     if this makes sense as it does change the behaviour
12007                     from earlier versions, OTOH that behaviour was broken
12008                     as well. */
12009                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12010                                       TRUE, /* => charclass */
12011                                       strict))
12012                     {
12013                         goto parseit;
12014                     }
12015                 }
12016                 break;
12017             case 'p':
12018             case 'P':
12019                 {
12020                 char *e;
12021
12022                 /* We will handle any undefined properties ourselves */
12023                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12024
12025                 if (RExC_parse >= RExC_end)
12026                     vFAIL2("Empty \\%c{}", (U8)value);
12027                 if (*RExC_parse == '{') {
12028                     const U8 c = (U8)value;
12029                     e = strchr(RExC_parse++, '}');
12030                     if (!e)
12031                         vFAIL2("Missing right brace on \\%c{}", c);
12032                     while (isSPACE(UCHARAT(RExC_parse)))
12033                         RExC_parse++;
12034                     if (e == RExC_parse)
12035                         vFAIL2("Empty \\%c{}", c);
12036                     n = e - RExC_parse;
12037                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12038                         n--;
12039                 }
12040                 else {
12041                     e = RExC_parse;
12042                     n = 1;
12043                 }
12044                 if (!SIZE_ONLY) {
12045                     SV* invlist;
12046                     char* name;
12047
12048                     if (UCHARAT(RExC_parse) == '^') {
12049                          RExC_parse++;
12050                          n--;
12051                          /* toggle.  (The rhs xor gets the single bit that
12052                           * differs between P and p; the other xor inverts just
12053                           * that bit) */
12054                          value ^= 'P' ^ 'p';
12055
12056                          while (isSPACE(UCHARAT(RExC_parse))) {
12057                               RExC_parse++;
12058                               n--;
12059                          }
12060                     }
12061                     /* Try to get the definition of the property into
12062                      * <invlist>.  If /i is in effect, the effective property
12063                      * will have its name be <__NAME_i>.  The design is
12064                      * discussed in commit
12065                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12066                     Newx(name, n + sizeof("_i__\n"), char);
12067
12068                     sprintf(name, "%s%.*s%s\n",
12069                                     (FOLD) ? "__" : "",
12070                                     (int)n,
12071                                     RExC_parse,
12072                                     (FOLD) ? "_i" : ""
12073                     );
12074
12075                     /* Look up the property name, and get its swash and
12076                      * inversion list, if the property is found  */
12077                     if (swash) {
12078                         SvREFCNT_dec_NN(swash);
12079                     }
12080                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
12081                                              1, /* binary */
12082                                              0, /* not tr/// */
12083                                              NULL, /* No inversion list */
12084                                              &swash_init_flags
12085                                             );
12086                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12087                         if (swash) {
12088                             SvREFCNT_dec_NN(swash);
12089                             swash = NULL;
12090                         }
12091
12092                         /* Here didn't find it.  It could be a user-defined
12093                          * property that will be available at run-time.  If we
12094                          * accept only compile-time properties, is an error;
12095                          * otherwise add it to the list for run-time look up */
12096                         if (ret_invlist) {
12097                             RExC_parse = e + 1;
12098                             vFAIL3("Property '%.*s' is unknown", (int) n, name);
12099                         }
12100                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12101                                         (value == 'p' ? '+' : '!'),
12102                                         name);
12103                         has_user_defined_property = TRUE;
12104
12105                         /* We don't know yet, so have to assume that the
12106                          * property could match something in the Latin1 range,
12107                          * hence something that isn't utf8.  Note that this
12108                          * would cause things in <depends_list> to match
12109                          * inappropriately, except that any \p{}, including
12110                          * this one forces Unicode semantics, which means there
12111                          * is <no depends_list> */
12112                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12113                     }
12114                     else {
12115
12116                         /* Here, did get the swash and its inversion list.  If
12117                          * the swash is from a user-defined property, then this
12118                          * whole character class should be regarded as such */
12119                         has_user_defined_property =
12120                                     (swash_init_flags
12121                                      & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12122
12123                         /* Invert if asking for the complement */
12124                         if (value == 'P') {
12125                             _invlist_union_complement_2nd(properties,
12126                                                           invlist,
12127                                                           &properties);
12128
12129                             /* The swash can't be used as-is, because we've
12130                              * inverted things; delay removing it to here after
12131                              * have copied its invlist above */
12132                             SvREFCNT_dec_NN(swash);
12133                             swash = NULL;
12134                         }
12135                         else {
12136                             _invlist_union(properties, invlist, &properties);
12137                         }
12138                     }
12139                     Safefree(name);
12140                 }
12141                 RExC_parse = e + 1;
12142                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12143                                                 named */
12144
12145                 /* \p means they want Unicode semantics */
12146                 RExC_uni_semantics = 1;
12147                 }
12148                 break;
12149             case 'n':   value = '\n';                   break;
12150             case 'r':   value = '\r';                   break;
12151             case 't':   value = '\t';                   break;
12152             case 'f':   value = '\f';                   break;
12153             case 'b':   value = '\b';                   break;
12154             case 'e':   value = ASCII_TO_NATIVE('\033');break;
12155             case 'a':   value = ASCII_TO_NATIVE('\007');break;
12156             case 'o':
12157                 RExC_parse--;   /* function expects to be pointed at the 'o' */
12158                 {
12159                     const char* error_msg;
12160                     bool valid = grok_bslash_o(&RExC_parse,
12161                                                &value,
12162                                                &error_msg,
12163                                                SIZE_ONLY,   /* warnings in pass
12164                                                                1 only */
12165                                                strict,
12166                                                silence_non_portable,
12167                                                UTF);
12168                     if (! valid) {
12169                         vFAIL(error_msg);
12170                     }
12171                 }
12172                 if (PL_encoding && value < 0x100) {
12173                     goto recode_encoding;
12174                 }
12175                 break;
12176             case 'x':
12177                 RExC_parse--;   /* function expects to be pointed at the 'x' */
12178                 {
12179                     const char* error_msg;
12180                     bool valid = grok_bslash_x(&RExC_parse,
12181                                                &value,
12182                                                &error_msg,
12183                                                TRUE, /* Output warnings */
12184                                                strict,
12185                                                silence_non_portable,
12186                                                UTF);
12187                     if (! valid) {
12188                         vFAIL(error_msg);
12189                     }
12190                 }
12191                 if (PL_encoding && value < 0x100)
12192                     goto recode_encoding;
12193                 break;
12194             case 'c':
12195                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12196                 break;
12197             case '0': case '1': case '2': case '3': case '4':
12198             case '5': case '6': case '7':
12199                 {
12200                     /* Take 1-3 octal digits */
12201                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12202                     numlen = (strict) ? 4 : 3;
12203                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12204                     RExC_parse += numlen;
12205                     if (numlen != 3) {
12206                         SAVEFREESV(listsv); /* In case warnings are fatalized */
12207                         if (strict) {
12208                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12209                             vFAIL("Need exactly 3 octal digits");
12210                         }
12211                         else if (! SIZE_ONLY /* like \08, \178 */
12212                                  && numlen < 3
12213                                  && RExC_parse < RExC_end
12214                                  && isDIGIT(*RExC_parse)
12215                                  && ckWARN(WARN_REGEXP))
12216                         {
12217                             SAVEFREESV(RExC_rx_sv);
12218                             reg_warn_non_literal_string(
12219                                  RExC_parse + 1,
12220                                  form_short_octal_warning(RExC_parse, numlen));
12221                             (void)ReREFCNT_inc(RExC_rx_sv);
12222                         }
12223                         SvREFCNT_inc_simple_void_NN(listsv);
12224                     }
12225                     if (PL_encoding && value < 0x100)
12226                         goto recode_encoding;
12227                     break;
12228                 }
12229             recode_encoding:
12230                 if (! RExC_override_recoding) {
12231                     SV* enc = PL_encoding;
12232                     value = reg_recode((const char)(U8)value, &enc);
12233                     if (!enc) {
12234                         if (strict) {
12235                             vFAIL("Invalid escape in the specified encoding");
12236                         }
12237                         else if (SIZE_ONLY) {
12238                             ckWARNreg(RExC_parse,
12239                                   "Invalid escape in the specified encoding");
12240                         }
12241                     }
12242                     break;
12243                 }
12244             default:
12245                 /* Allow \_ to not give an error */
12246                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12247                     SAVEFREESV(listsv);
12248                     if (strict) {
12249                         vFAIL2("Unrecognized escape \\%c in character class",
12250                                (int)value);
12251                     }
12252                     else {
12253                         SAVEFREESV(RExC_rx_sv);
12254                         ckWARN2reg(RExC_parse,
12255                             "Unrecognized escape \\%c in character class passed through",
12256                             (int)value);
12257                         (void)ReREFCNT_inc(RExC_rx_sv);
12258                     }
12259                     SvREFCNT_inc_simple_void_NN(listsv);
12260                 }
12261                 break;
12262             }   /* End of switch on char following backslash */
12263         } /* end of handling backslash escape sequences */
12264 #ifdef EBCDIC
12265         else
12266             literal_endpoint++;
12267 #endif
12268
12269         /* Here, we have the current token in 'value' */
12270
12271         /* What matches in a locale is not known until runtime.  This includes
12272          * what the Posix classes (like \w, [:space:]) match.  Room must be
12273          * reserved (one time per class) to store such classes, either if Perl
12274          * is compiled so that locale nodes always should have this space, or
12275          * if there is such class info to be stored.  The space will contain a
12276          * bit for each named class that is to be matched against.  This isn't
12277          * needed for \p{} and pseudo-classes, as they are not affected by
12278          * locale, and hence are dealt with separately */
12279         if (LOC
12280             && ! need_class
12281             && (ANYOF_LOCALE == ANYOF_CLASS
12282                 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12283         {
12284             need_class = 1;
12285             if (SIZE_ONLY) {
12286                 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12287             }
12288             else {
12289                 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12290                 ANYOF_CLASS_ZERO(ret);
12291             }
12292             ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12293         }
12294
12295         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12296
12297             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12298              * literal, as is the character that began the false range, i.e.
12299              * the 'a' in the examples */
12300             if (range) {
12301                 if (!SIZE_ONLY) {
12302                     const int w = (RExC_parse >= rangebegin)
12303                                   ? RExC_parse - rangebegin
12304                                   : 0;
12305                     SAVEFREESV(listsv); /* in case of fatal warnings */
12306                     if (strict) {
12307                         vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12308                     }
12309                     else {
12310                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12311                         ckWARN4reg(RExC_parse,
12312                                 "False [] range \"%*.*s\"",
12313                                 w, w, rangebegin);
12314                         (void)ReREFCNT_inc(RExC_rx_sv);
12315                         cp_list = add_cp_to_invlist(cp_list, '-');
12316                         cp_list = add_cp_to_invlist(cp_list, prevvalue);
12317                     }
12318                     SvREFCNT_inc_simple_void_NN(listsv);
12319                 }
12320
12321                 range = 0; /* this was not a true range */
12322                 element_count += 2; /* So counts for three values */
12323             }
12324
12325             if (! SIZE_ONLY) {
12326                 U8 classnum = namedclass_to_classnum(namedclass);
12327                 if (namedclass >= ANYOF_MAX) {  /* If a special class */
12328                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12329
12330                         /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12331                          * /l make a difference in what these match.  There
12332                          * would be problems if these characters had folds
12333                          * other than themselves, as cp_list is subject to
12334                          * folding. */
12335                         if (classnum != _CC_VERTSPACE) {
12336                             assert(   namedclass == ANYOF_HORIZWS
12337                                    || namedclass == ANYOF_NHORIZWS);
12338
12339                             /* It turns out that \h is just a synonym for
12340                              * XPosixBlank */
12341                             classnum = _CC_BLANK;
12342                         }
12343
12344                         _invlist_union_maybe_complement_2nd(
12345                                 cp_list,
12346                                 PL_XPosix_ptrs[classnum],
12347                                 cBOOL(namedclass % 2), /* Complement if odd
12348                                                           (NHORIZWS, NVERTWS)
12349                                                         */
12350                                 &cp_list);
12351                     }
12352                 }
12353                 else if (classnum == _CC_ASCII) {
12354 #ifdef HAS_ISASCII
12355                     if (LOC) {
12356                         ANYOF_CLASS_SET(ret, namedclass);
12357                     }
12358                     else
12359 #endif  /* Not isascii(); just use the hard-coded definition for it */
12360                         _invlist_union_maybe_complement_2nd(
12361                                 posixes,
12362                                 PL_ASCII,
12363                                 cBOOL(namedclass % 2), /* Complement if odd
12364                                                           (NASCII) */
12365                                 &posixes);
12366                 }
12367                 else {  /* Garden variety class */
12368
12369                     /* The ascii range inversion list */
12370                     SV* ascii_source = PL_Posix_ptrs[classnum];
12371
12372                     /* The full Latin1 range inversion list */
12373                     SV* l1_source = PL_L1Posix_ptrs[classnum];
12374
12375                     /* This code is structured into two major clauses.  The
12376                      * first is for classes whose complete definitions may not
12377                      * already be known.  It not, the Latin1 definition
12378                      * (guaranteed to already known) is used plus code is
12379                      * generated to load the rest at run-time (only if needed).
12380                      * If the complete definition is known, it drops down to
12381                      * the second clause, where the complete definition is
12382                      * known */
12383
12384                     if (classnum < _FIRST_NON_SWASH_CC) {
12385
12386                         /* Here, the class has a swash, which may or not
12387                          * already be loaded */
12388
12389                         /* The name of the property to use to match the full
12390                          * eXtended Unicode range swash for this character
12391                          * class */
12392                         const char *Xname = swash_property_names[classnum];
12393
12394                         /* If returning the inversion list, we can't defer
12395                          * getting this until runtime */
12396                         if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12397                             PL_utf8_swash_ptrs[classnum] =
12398                                 _core_swash_init("utf8", Xname, &PL_sv_undef,
12399                                              1, /* binary */
12400                                              0, /* not tr/// */
12401                                              NULL, /* No inversion list */
12402                                              NULL  /* No flags */
12403                                             );
12404                             assert(PL_utf8_swash_ptrs[classnum]);
12405                         }
12406                         if ( !  PL_utf8_swash_ptrs[classnum]) {
12407                             if (namedclass % 2 == 0) { /* A non-complemented
12408                                                           class */
12409                                 /* If not /a matching, there are code points we
12410                                  * don't know at compile time.  Arrange for the
12411                                  * unknown matches to be loaded at run-time, if
12412                                  * needed */
12413                                 if (! AT_LEAST_ASCII_RESTRICTED) {
12414                                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12415                                                                  Xname);
12416                                 }
12417                                 if (LOC) {  /* Under locale, set run-time
12418                                                lookup */
12419                                     ANYOF_CLASS_SET(ret, namedclass);
12420                                 }
12421                                 else {
12422                                     /* Add the current class's code points to
12423                                      * the running total */
12424                                     _invlist_union(posixes,
12425                                                    (AT_LEAST_ASCII_RESTRICTED)
12426                                                         ? ascii_source
12427                                                         : l1_source,
12428                                                    &posixes);
12429                                 }
12430                             }
12431                             else {  /* A complemented class */
12432                                 if (AT_LEAST_ASCII_RESTRICTED) {
12433                                     /* Under /a should match everything above
12434                                      * ASCII, plus the complement of the set's
12435                                      * ASCII matches */
12436                                     _invlist_union_complement_2nd(posixes,
12437                                                                   ascii_source,
12438                                                                   &posixes);
12439                                 }
12440                                 else {
12441                                     /* Arrange for the unknown matches to be
12442                                      * loaded at run-time, if needed */
12443                                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12444                                                                  Xname);
12445                                     runtime_posix_matches_above_Unicode = TRUE;
12446                                     if (LOC) {
12447                                         ANYOF_CLASS_SET(ret, namedclass);
12448                                     }
12449                                     else {
12450
12451                                         /* We want to match everything in
12452                                          * Latin1, except those things that
12453                                          * l1_source matches */
12454                                         SV* scratch_list = NULL;
12455                                         _invlist_subtract(PL_Latin1, l1_source,
12456                                                           &scratch_list);
12457
12458                                         /* Add the list from this class to the
12459                                          * running total */
12460                                         if (! posixes) {
12461                                             posixes = scratch_list;
12462                                         }
12463                                         else {
12464                                             _invlist_union(posixes,
12465                                                            scratch_list,
12466                                                            &posixes);
12467                                             SvREFCNT_dec_NN(scratch_list);
12468                                         }
12469                                         if (DEPENDS_SEMANTICS) {
12470                                             ANYOF_FLAGS(ret)
12471                                                   |= ANYOF_NON_UTF8_LATIN1_ALL;
12472                                         }
12473                                     }
12474                                 }
12475                             }
12476                             goto namedclass_done;
12477                         }
12478
12479                         /* Here, there is a swash loaded for the class.  If no
12480                          * inversion list for it yet, get it */
12481                         if (! PL_XPosix_ptrs[classnum]) {
12482                             PL_XPosix_ptrs[classnum]
12483                              = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12484                         }
12485                     }
12486
12487                     /* Here there is an inversion list already loaded for the
12488                      * entire class */
12489
12490                     if (namedclass % 2 == 0) {  /* A non-complemented class,
12491                                                    like ANYOF_PUNCT */
12492                         if (! LOC) {
12493                             /* For non-locale, just add it to any existing list
12494                              * */
12495                             _invlist_union(posixes,
12496                                            (AT_LEAST_ASCII_RESTRICTED)
12497                                                ? ascii_source
12498                                                : PL_XPosix_ptrs[classnum],
12499                                            &posixes);
12500                         }
12501                         else {  /* Locale */
12502                             SV* scratch_list = NULL;
12503
12504                             /* For above Latin1 code points, we use the full
12505                              * Unicode range */
12506                             _invlist_intersection(PL_AboveLatin1,
12507                                                   PL_XPosix_ptrs[classnum],
12508                                                   &scratch_list);
12509                             /* And set the output to it, adding instead if
12510                              * there already is an output.  Checking if
12511                              * 'posixes' is NULL first saves an extra clone.
12512                              * Its reference count will be decremented at the
12513                              * next union, etc, or if this is the only
12514                              * instance, at the end of the routine */
12515                             if (! posixes) {
12516                                 posixes = scratch_list;
12517                             }
12518                             else {
12519                                 _invlist_union(posixes, scratch_list, &posixes);
12520                                 SvREFCNT_dec_NN(scratch_list);
12521                             }
12522
12523 #ifndef HAS_ISBLANK
12524                             if (namedclass != ANYOF_BLANK) {
12525 #endif
12526                                 /* Set this class in the node for runtime
12527                                  * matching */
12528                                 ANYOF_CLASS_SET(ret, namedclass);
12529 #ifndef HAS_ISBLANK
12530                             }
12531                             else {
12532                                 /* No isblank(), use the hard-coded ASCII-range
12533                                  * blanks, adding them to the running total. */
12534
12535                                 _invlist_union(posixes, ascii_source, &posixes);
12536                             }
12537 #endif
12538                         }
12539                     }
12540                     else {  /* A complemented class, like ANYOF_NPUNCT */
12541                         if (! LOC) {
12542                             _invlist_union_complement_2nd(
12543                                                 posixes,
12544                                                 (AT_LEAST_ASCII_RESTRICTED)
12545                                                     ? ascii_source
12546                                                     : PL_XPosix_ptrs[classnum],
12547                                                 &posixes);
12548                             /* Under /d, everything in the upper half of the
12549                              * Latin1 range matches this complement */
12550                             if (DEPENDS_SEMANTICS) {
12551                                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12552                             }
12553                         }
12554                         else {  /* Locale */
12555                             SV* scratch_list = NULL;
12556                             _invlist_subtract(PL_AboveLatin1,
12557                                               PL_XPosix_ptrs[classnum],
12558                                               &scratch_list);
12559                             if (! posixes) {
12560                                 posixes = scratch_list;
12561                             }
12562                             else {
12563                                 _invlist_union(posixes, scratch_list, &posixes);
12564                                 SvREFCNT_dec_NN(scratch_list);
12565                             }
12566 #ifndef HAS_ISBLANK
12567                             if (namedclass != ANYOF_NBLANK) {
12568 #endif
12569                                 ANYOF_CLASS_SET(ret, namedclass);
12570 #ifndef HAS_ISBLANK
12571                             }
12572                             else {
12573                                 /* Get the list of all code points in Latin1
12574                                  * that are not ASCII blanks, and add them to
12575                                  * the running total */
12576                                 _invlist_subtract(PL_Latin1, ascii_source,
12577                                                   &scratch_list);
12578                                 _invlist_union(posixes, scratch_list, &posixes);
12579                                 SvREFCNT_dec_NN(scratch_list);
12580                             }
12581 #endif
12582                         }
12583                     }
12584                 }
12585               namedclass_done:
12586                 continue;   /* Go get next character */
12587             }
12588         } /* end of namedclass \blah */
12589
12590         /* Here, we have a single value.  If 'range' is set, it is the ending
12591          * of a range--check its validity.  Later, we will handle each
12592          * individual code point in the range.  If 'range' isn't set, this
12593          * could be the beginning of a range, so check for that by looking
12594          * ahead to see if the next real character to be processed is the range
12595          * indicator--the minus sign */
12596
12597         if (skip_white) {
12598             RExC_parse = regpatws(pRExC_state, RExC_parse,
12599                                 FALSE /* means don't recognize comments */);
12600         }
12601
12602         if (range) {
12603             if (prevvalue > value) /* b-a */ {
12604                 const int w = RExC_parse - rangebegin;
12605                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12606                 range = 0; /* not a valid range */
12607             }
12608         }
12609         else {
12610             prevvalue = value; /* save the beginning of the potential range */
12611             if (! stop_at_1     /* Can't be a range if parsing just one thing */
12612                 && *RExC_parse == '-')
12613             {
12614                 char* next_char_ptr = RExC_parse + 1;
12615                 if (skip_white) {   /* Get the next real char after the '-' */
12616                     next_char_ptr = regpatws(pRExC_state,
12617                                              RExC_parse + 1,
12618                                              FALSE); /* means don't recognize
12619                                                         comments */
12620                 }
12621
12622                 /* If the '-' is at the end of the class (just before the ']',
12623                  * it is a literal minus; otherwise it is a range */
12624                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12625                     RExC_parse = next_char_ptr;
12626
12627                     /* a bad range like \w-, [:word:]- ? */
12628                     if (namedclass > OOB_NAMEDCLASS) {
12629                         if (strict || ckWARN(WARN_REGEXP)) {
12630                             const int w =
12631                                 RExC_parse >= rangebegin ?
12632                                 RExC_parse - rangebegin : 0;
12633                             if (strict) {
12634                                 vFAIL4("False [] range \"%*.*s\"",
12635                                     w, w, rangebegin);
12636                             }
12637                             else {
12638                                 vWARN4(RExC_parse,
12639                                     "False [] range \"%*.*s\"",
12640                                     w, w, rangebegin);
12641                             }
12642                         }
12643                         if (!SIZE_ONLY) {
12644                             cp_list = add_cp_to_invlist(cp_list, '-');
12645                         }
12646                         element_count++;
12647                     } else
12648                         range = 1;      /* yeah, it's a range! */
12649                     continue;   /* but do it the next time */
12650                 }
12651             }
12652         }
12653
12654         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12655          * if not */
12656
12657         /* non-Latin1 code point implies unicode semantics.  Must be set in
12658          * pass1 so is there for the whole of pass 2 */
12659         if (value > 255) {
12660             RExC_uni_semantics = 1;
12661         }
12662
12663         /* Ready to process either the single value, or the completed range.
12664          * For single-valued non-inverted ranges, we consider the possibility
12665          * of multi-char folds.  (We made a conscious decision to not do this
12666          * for the other cases because it can often lead to non-intuitive
12667          * results.  For example, you have the peculiar case that:
12668          *  "s s" =~ /^[^\xDF]+$/i => Y
12669          *  "ss"  =~ /^[^\xDF]+$/i => N
12670          *
12671          * See [perl #89750] */
12672         if (FOLD && allow_multi_folds && value == prevvalue) {
12673             if (value == LATIN_SMALL_LETTER_SHARP_S
12674                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12675                                                         value)))
12676             {
12677                 /* Here <value> is indeed a multi-char fold.  Get what it is */
12678
12679                 U8 foldbuf[UTF8_MAXBYTES_CASE];
12680                 STRLEN foldlen;
12681
12682                 UV folded = _to_uni_fold_flags(
12683                                 value,
12684                                 foldbuf,
12685                                 &foldlen,
12686                                 FOLD_FLAGS_FULL
12687                                 | ((LOC) ?  FOLD_FLAGS_LOCALE
12688                                             : (ASCII_FOLD_RESTRICTED)
12689                                               ? FOLD_FLAGS_NOMIX_ASCII
12690                                               : 0)
12691                                 );
12692
12693                 /* Here, <folded> should be the first character of the
12694                  * multi-char fold of <value>, with <foldbuf> containing the
12695                  * whole thing.  But, if this fold is not allowed (because of
12696                  * the flags), <fold> will be the same as <value>, and should
12697                  * be processed like any other character, so skip the special
12698                  * handling */
12699                 if (folded != value) {
12700
12701                     /* Skip if we are recursed, currently parsing the class
12702                      * again.  Otherwise add this character to the list of
12703                      * multi-char folds. */
12704                     if (! RExC_in_multi_char_class) {
12705                         AV** this_array_ptr;
12706                         AV* this_array;
12707                         STRLEN cp_count = utf8_length(foldbuf,
12708                                                       foldbuf + foldlen);
12709                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12710
12711                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12712
12713
12714                         if (! multi_char_matches) {
12715                             multi_char_matches = newAV();
12716                         }
12717
12718                         /* <multi_char_matches> is actually an array of arrays.
12719                          * There will be one or two top-level elements: [2],
12720                          * and/or [3].  The [2] element is an array, each
12721                          * element thereof is a character which folds to two
12722                          * characters; likewise for [3].  (Unicode guarantees a
12723                          * maximum of 3 characters in any fold.)  When we
12724                          * rewrite the character class below, we will do so
12725                          * such that the longest folds are written first, so
12726                          * that it prefers the longest matching strings first.
12727                          * This is done even if it turns out that any
12728                          * quantifier is non-greedy, out of programmer
12729                          * laziness.  Tom Christiansen has agreed that this is
12730                          * ok.  This makes the test for the ligature 'ffi' come
12731                          * before the test for 'ff' */
12732                         if (av_exists(multi_char_matches, cp_count)) {
12733                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
12734                                                              cp_count, FALSE);
12735                             this_array = *this_array_ptr;
12736                         }
12737                         else {
12738                             this_array = newAV();
12739                             av_store(multi_char_matches, cp_count,
12740                                      (SV*) this_array);
12741                         }
12742                         av_push(this_array, multi_fold);
12743                     }
12744
12745                     /* This element should not be processed further in this
12746                      * class */
12747                     element_count--;
12748                     value = save_value;
12749                     prevvalue = save_prevvalue;
12750                     continue;
12751                 }
12752             }
12753         }
12754
12755         /* Deal with this element of the class */
12756         if (! SIZE_ONLY) {
12757 #ifndef EBCDIC
12758             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12759 #else
12760             UV* this_range = _new_invlist(1);
12761             _append_range_to_invlist(this_range, prevvalue, value);
12762
12763             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12764              * If this range was specified using something like 'i-j', we want
12765              * to include only the 'i' and the 'j', and not anything in
12766              * between, so exclude non-ASCII, non-alphabetics from it.
12767              * However, if the range was specified with something like
12768              * [\x89-\x91] or [\x89-j], all code points within it should be
12769              * included.  literal_endpoint==2 means both ends of the range used
12770              * a literal character, not \x{foo} */
12771             if (literal_endpoint == 2
12772                 && (prevvalue >= 'a' && value <= 'z')
12773                     || (prevvalue >= 'A' && value <= 'Z'))
12774             {
12775                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12776                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12777             }
12778             _invlist_union(cp_list, this_range, &cp_list);
12779             literal_endpoint = 0;
12780 #endif
12781         }
12782
12783         range = 0; /* this range (if it was one) is done now */
12784     } /* End of loop through all the text within the brackets */
12785
12786     /* If anything in the class expands to more than one character, we have to
12787      * deal with them by building up a substitute parse string, and recursively
12788      * calling reg() on it, instead of proceeding */
12789     if (multi_char_matches) {
12790         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12791         I32 cp_count;
12792         STRLEN len;
12793         char *save_end = RExC_end;
12794         char *save_parse = RExC_parse;
12795         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
12796                                        a "|" */
12797         I32 reg_flags;
12798
12799         assert(! invert);
12800 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
12801            because too confusing */
12802         if (invert) {
12803             sv_catpv(substitute_parse, "(?:");
12804         }
12805 #endif
12806
12807         /* Look at the longest folds first */
12808         for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12809
12810             if (av_exists(multi_char_matches, cp_count)) {
12811                 AV** this_array_ptr;
12812                 SV* this_sequence;
12813
12814                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12815                                                  cp_count, FALSE);
12816                 while ((this_sequence = av_pop(*this_array_ptr)) !=
12817                                                                 &PL_sv_undef)
12818                 {
12819                     if (! first_time) {
12820                         sv_catpv(substitute_parse, "|");
12821                     }
12822                     first_time = FALSE;
12823
12824                     sv_catpv(substitute_parse, SvPVX(this_sequence));
12825                 }
12826             }
12827         }
12828
12829         /* If the character class contains anything else besides these
12830          * multi-character folds, have to include it in recursive parsing */
12831         if (element_count) {
12832             sv_catpv(substitute_parse, "|[");
12833             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12834             sv_catpv(substitute_parse, "]");
12835         }
12836
12837         sv_catpv(substitute_parse, ")");
12838 #if 0
12839         if (invert) {
12840             /* This is a way to get the parse to skip forward a whole named
12841              * sequence instead of matching the 2nd character when it fails the
12842              * first */
12843             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12844         }
12845 #endif
12846
12847         RExC_parse = SvPV(substitute_parse, len);
12848         RExC_end = RExC_parse + len;
12849         RExC_in_multi_char_class = 1;
12850         RExC_emit = (regnode *)orig_emit;
12851
12852         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
12853
12854         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12855
12856         RExC_parse = save_parse;
12857         RExC_end = save_end;
12858         RExC_in_multi_char_class = 0;
12859         SvREFCNT_dec_NN(multi_char_matches);
12860         SvREFCNT_dec_NN(listsv);
12861         return ret;
12862     }
12863
12864     /* If the character class contains only a single element, it may be
12865      * optimizable into another node type which is smaller and runs faster.
12866      * Check if this is the case for this class */
12867     if (element_count == 1 && ! ret_invlist) {
12868         U8 op = END;
12869         U8 arg = 0;
12870
12871         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12872                                               [:digit:] or \p{foo} */
12873
12874             /* All named classes are mapped into POSIXish nodes, with its FLAG
12875              * argument giving which class it is */
12876             switch ((I32)namedclass) {
12877                 case ANYOF_UNIPROP:
12878                     break;
12879
12880                 /* These don't depend on the charset modifiers.  They always
12881                  * match under /u rules */
12882                 case ANYOF_NHORIZWS:
12883                 case ANYOF_HORIZWS:
12884                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
12885                     /* FALLTHROUGH */
12886
12887                 case ANYOF_NVERTWS:
12888                 case ANYOF_VERTWS:
12889                     op = POSIXU;
12890                     goto join_posix;
12891
12892                 /* The actual POSIXish node for all the rest depends on the
12893                  * charset modifier.  The ones in the first set depend only on
12894                  * ASCII or, if available on this platform, locale */
12895                 case ANYOF_ASCII:
12896                 case ANYOF_NASCII:
12897 #ifdef HAS_ISASCII
12898                     op = (LOC) ? POSIXL : POSIXA;
12899 #else
12900                     op = POSIXA;
12901 #endif
12902                     goto join_posix;
12903
12904                 case ANYOF_NCASED:
12905                 case ANYOF_LOWER:
12906                 case ANYOF_NLOWER:
12907                 case ANYOF_UPPER:
12908                 case ANYOF_NUPPER:
12909                     /* under /a could be alpha */
12910                     if (FOLD) {
12911                         if (ASCII_RESTRICTED) {
12912                             namedclass = ANYOF_ALPHA + (namedclass % 2);
12913                         }
12914                         else if (! LOC) {
12915                             break;
12916                         }
12917                     }
12918                     /* FALLTHROUGH */
12919
12920                 /* The rest have more possibilities depending on the charset.
12921                  * We take advantage of the enum ordering of the charset
12922                  * modifiers to get the exact node type, */
12923                 default:
12924                     op = POSIXD + get_regex_charset(RExC_flags);
12925                     if (op > POSIXA) { /* /aa is same as /a */
12926                         op = POSIXA;
12927                     }
12928 #ifndef HAS_ISBLANK
12929                     if (op == POSIXL
12930                         && (namedclass == ANYOF_BLANK
12931                             || namedclass == ANYOF_NBLANK))
12932                     {
12933                         op = POSIXA;
12934                     }
12935 #endif
12936
12937                 join_posix:
12938                     /* The odd numbered ones are the complements of the
12939                      * next-lower even number one */
12940                     if (namedclass % 2 == 1) {
12941                         invert = ! invert;
12942                         namedclass--;
12943                     }
12944                     arg = namedclass_to_classnum(namedclass);
12945                     break;
12946             }
12947         }
12948         else if (value == prevvalue) {
12949
12950             /* Here, the class consists of just a single code point */
12951
12952             if (invert) {
12953                 if (! LOC && value == '\n') {
12954                     op = REG_ANY; /* Optimize [^\n] */
12955                     *flagp |= HASWIDTH|SIMPLE;
12956                     RExC_naughty++;
12957                 }
12958             }
12959             else if (value < 256 || UTF) {
12960
12961                 /* Optimize a single value into an EXACTish node, but not if it
12962                  * would require converting the pattern to UTF-8. */
12963                 op = compute_EXACTish(pRExC_state);
12964             }
12965         } /* Otherwise is a range */
12966         else if (! LOC) {   /* locale could vary these */
12967             if (prevvalue == '0') {
12968                 if (value == '9') {
12969                     arg = _CC_DIGIT;
12970                     op = POSIXA;
12971                 }
12972             }
12973         }
12974
12975         /* Here, we have changed <op> away from its initial value iff we found
12976          * an optimization */
12977         if (op != END) {
12978
12979             /* Throw away this ANYOF regnode, and emit the calculated one,
12980              * which should correspond to the beginning, not current, state of
12981              * the parse */
12982             const char * cur_parse = RExC_parse;
12983             RExC_parse = (char *)orig_parse;
12984             if ( SIZE_ONLY) {
12985                 if (! LOC) {
12986
12987                     /* To get locale nodes to not use the full ANYOF size would
12988                      * require moving the code above that writes the portions
12989                      * of it that aren't in other nodes to after this point.
12990                      * e.g.  ANYOF_CLASS_SET */
12991                     RExC_size = orig_size;
12992                 }
12993             }
12994             else {
12995                 RExC_emit = (regnode *)orig_emit;
12996                 if (PL_regkind[op] == POSIXD) {
12997                     if (invert) {
12998                         op += NPOSIXD - POSIXD;
12999                     }
13000                 }
13001             }
13002
13003             ret = reg_node(pRExC_state, op);
13004
13005             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13006                 if (! SIZE_ONLY) {
13007                     FLAGS(ret) = arg;
13008                 }
13009                 *flagp |= HASWIDTH|SIMPLE;
13010             }
13011             else if (PL_regkind[op] == EXACT) {
13012                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13013             }
13014
13015             RExC_parse = (char *) cur_parse;
13016
13017             SvREFCNT_dec(posixes);
13018             SvREFCNT_dec_NN(listsv);
13019             SvREFCNT_dec(cp_list);
13020             return ret;
13021         }
13022     }
13023
13024     if (SIZE_ONLY)
13025         return ret;
13026     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13027
13028     /* If folding, we calculate all characters that could fold to or from the
13029      * ones already on the list */
13030     if (FOLD && cp_list) {
13031         UV start, end;  /* End points of code point ranges */
13032
13033         SV* fold_intersection = NULL;
13034
13035         /* If the highest code point is within Latin1, we can use the
13036          * compiled-in Alphas list, and not have to go out to disk.  This
13037          * yields two false positives, the masculine and feminine ordinal
13038          * indicators, which are weeded out below using the
13039          * IS_IN_SOME_FOLD_L1() macro */
13040         if (invlist_highest(cp_list) < 256) {
13041             _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13042                                                            &fold_intersection);
13043         }
13044         else {
13045
13046             /* Here, there are non-Latin1 code points, so we will have to go
13047              * fetch the list of all the characters that participate in folds
13048              */
13049             if (! PL_utf8_foldable) {
13050                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13051                                        &PL_sv_undef, 1, 0);
13052                 PL_utf8_foldable = _get_swash_invlist(swash);
13053                 SvREFCNT_dec_NN(swash);
13054             }
13055
13056             /* This is a hash that for a particular fold gives all characters
13057              * that are involved in it */
13058             if (! PL_utf8_foldclosures) {
13059
13060                 /* If we were unable to find any folds, then we likely won't be
13061                  * able to find the closures.  So just create an empty list.
13062                  * Folding will effectively be restricted to the non-Unicode
13063                  * rules hard-coded into Perl.  (This case happens legitimately
13064                  * during compilation of Perl itself before the Unicode tables
13065                  * are generated) */
13066                 if (_invlist_len(PL_utf8_foldable) == 0) {
13067                     PL_utf8_foldclosures = newHV();
13068                 }
13069                 else {
13070                     /* If the folds haven't been read in, call a fold function
13071                      * to force that */
13072                     if (! PL_utf8_tofold) {
13073                         U8 dummy[UTF8_MAXBYTES+1];
13074
13075                         /* This string is just a short named one above \xff */
13076                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13077                         assert(PL_utf8_tofold); /* Verify that worked */
13078                     }
13079                     PL_utf8_foldclosures =
13080                                     _swash_inversion_hash(PL_utf8_tofold);
13081                 }
13082             }
13083
13084             /* Only the characters in this class that participate in folds need
13085              * be checked.  Get the intersection of this class and all the
13086              * possible characters that are foldable.  This can quickly narrow
13087              * down a large class */
13088             _invlist_intersection(PL_utf8_foldable, cp_list,
13089                                   &fold_intersection);
13090         }
13091
13092         /* Now look at the foldable characters in this class individually */
13093         invlist_iterinit(fold_intersection);
13094         while (invlist_iternext(fold_intersection, &start, &end)) {
13095             UV j;
13096
13097             /* Locale folding for Latin1 characters is deferred until runtime */
13098             if (LOC && start < 256) {
13099                 start = 256;
13100             }
13101
13102             /* Look at every character in the range */
13103             for (j = start; j <= end; j++) {
13104
13105                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13106                 STRLEN foldlen;
13107                 SV** listp;
13108
13109                 if (j < 256) {
13110
13111                     /* We have the latin1 folding rules hard-coded here so that
13112                      * an innocent-looking character class, like /[ks]/i won't
13113                      * have to go out to disk to find the possible matches.
13114                      * XXX It would be better to generate these via regen, in
13115                      * case a new version of the Unicode standard adds new
13116                      * mappings, though that is not really likely, and may be
13117                      * caught by the default: case of the switch below. */
13118
13119                     if (IS_IN_SOME_FOLD_L1(j)) {
13120
13121                         /* ASCII is always matched; non-ASCII is matched only
13122                          * under Unicode rules */
13123                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13124                             cp_list =
13125                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13126                         }
13127                         else {
13128                             depends_list =
13129                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13130                         }
13131                     }
13132
13133                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13134                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13135                     {
13136                         /* Certain Latin1 characters have matches outside
13137                          * Latin1.  To get here, <j> is one of those
13138                          * characters.   None of these matches is valid for
13139                          * ASCII characters under /aa, which is why the 'if'
13140                          * just above excludes those.  These matches only
13141                          * happen when the target string is utf8.  The code
13142                          * below adds the single fold closures for <j> to the
13143                          * inversion list. */
13144                         switch (j) {
13145                             case 'k':
13146                             case 'K':
13147                                 cp_list =
13148                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
13149                                 break;
13150                             case 's':
13151                             case 'S':
13152                                 cp_list = add_cp_to_invlist(cp_list,
13153                                                     LATIN_SMALL_LETTER_LONG_S);
13154                                 break;
13155                             case MICRO_SIGN:
13156                                 cp_list = add_cp_to_invlist(cp_list,
13157                                                     GREEK_CAPITAL_LETTER_MU);
13158                                 cp_list = add_cp_to_invlist(cp_list,
13159                                                     GREEK_SMALL_LETTER_MU);
13160                                 break;
13161                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13162                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13163                                 cp_list =
13164                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13165                                 break;
13166                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13167                                 cp_list = add_cp_to_invlist(cp_list,
13168                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13169                                 break;
13170                             case LATIN_SMALL_LETTER_SHARP_S:
13171                                 cp_list = add_cp_to_invlist(cp_list,
13172                                                 LATIN_CAPITAL_LETTER_SHARP_S);
13173                                 break;
13174                             case 'F': case 'f':
13175                             case 'I': case 'i':
13176                             case 'L': case 'l':
13177                             case 'T': case 't':
13178                             case 'A': case 'a':
13179                             case 'H': case 'h':
13180                             case 'J': case 'j':
13181                             case 'N': case 'n':
13182                             case 'W': case 'w':
13183                             case 'Y': case 'y':
13184                                 /* These all are targets of multi-character
13185                                  * folds from code points that require UTF8 to
13186                                  * express, so they can't match unless the
13187                                  * target string is in UTF-8, so no action here
13188                                  * is necessary, as regexec.c properly handles
13189                                  * the general case for UTF-8 matching and
13190                                  * multi-char folds */
13191                                 break;
13192                             default:
13193                                 /* Use deprecated warning to increase the
13194                                  * chances of this being output */
13195                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13196                                 break;
13197                         }
13198                     }
13199                     continue;
13200                 }
13201
13202                 /* Here is an above Latin1 character.  We don't have the rules
13203                  * hard-coded for it.  First, get its fold.  This is the simple
13204                  * fold, as the multi-character folds have been handled earlier
13205                  * and separated out */
13206                 _to_uni_fold_flags(j, foldbuf, &foldlen,
13207                                                ((LOC)
13208                                                ? FOLD_FLAGS_LOCALE
13209                                                : (ASCII_FOLD_RESTRICTED)
13210                                                   ? FOLD_FLAGS_NOMIX_ASCII
13211                                                   : 0));
13212
13213                 /* Single character fold of above Latin1.  Add everything in
13214                  * its fold closure to the list that this node should match.
13215                  * The fold closures data structure is a hash with the keys
13216                  * being the UTF-8 of every character that is folded to, like
13217                  * 'k', and the values each an array of all code points that
13218                  * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13219                  * Multi-character folds are not included */
13220                 if ((listp = hv_fetch(PL_utf8_foldclosures,
13221                                       (char *) foldbuf, foldlen, FALSE)))
13222                 {
13223                     AV* list = (AV*) *listp;
13224                     IV k;
13225                     for (k = 0; k <= av_len(list); k++) {
13226                         SV** c_p = av_fetch(list, k, FALSE);
13227                         UV c;
13228                         if (c_p == NULL) {
13229                             Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13230                         }
13231                         c = SvUV(*c_p);
13232
13233                         /* /aa doesn't allow folds between ASCII and non-; /l
13234                          * doesn't allow them between above and below 256 */
13235                         if ((ASCII_FOLD_RESTRICTED
13236                                   && (isASCII(c) != isASCII(j)))
13237                             || (LOC && ((c < 256) != (j < 256))))
13238                         {
13239                             continue;
13240                         }
13241
13242                         /* Folds involving non-ascii Latin1 characters
13243                          * under /d are added to a separate list */
13244                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13245                         {
13246                             cp_list = add_cp_to_invlist(cp_list, c);
13247                         }
13248                         else {
13249                           depends_list = add_cp_to_invlist(depends_list, c);
13250                         }
13251                     }
13252                 }
13253             }
13254         }
13255         SvREFCNT_dec_NN(fold_intersection);
13256     }
13257
13258     /* And combine the result (if any) with any inversion list from posix
13259      * classes.  The lists are kept separate up to now because we don't want to
13260      * fold the classes (folding of those is automatically handled by the swash
13261      * fetching code) */
13262     if (posixes) {
13263         if (! DEPENDS_SEMANTICS) {
13264             if (cp_list) {
13265                 _invlist_union(cp_list, posixes, &cp_list);
13266                 SvREFCNT_dec_NN(posixes);
13267             }
13268             else {
13269                 cp_list = posixes;
13270             }
13271         }
13272         else {
13273             /* Under /d, we put into a separate list the Latin1 things that
13274              * match only when the target string is utf8 */
13275             SV* nonascii_but_latin1_properties = NULL;
13276             _invlist_intersection(posixes, PL_Latin1,
13277                                   &nonascii_but_latin1_properties);
13278             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13279                               &nonascii_but_latin1_properties);
13280             _invlist_subtract(posixes, nonascii_but_latin1_properties,
13281                               &posixes);
13282             if (cp_list) {
13283                 _invlist_union(cp_list, posixes, &cp_list);
13284                 SvREFCNT_dec_NN(posixes);
13285             }
13286             else {
13287                 cp_list = posixes;
13288             }
13289
13290             if (depends_list) {
13291                 _invlist_union(depends_list, nonascii_but_latin1_properties,
13292                                &depends_list);
13293                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13294             }
13295             else {
13296                 depends_list = nonascii_but_latin1_properties;
13297             }
13298         }
13299     }
13300
13301     /* And combine the result (if any) with any inversion list from properties.
13302      * The lists are kept separate up to now so that we can distinguish the two
13303      * in regards to matching above-Unicode.  A run-time warning is generated
13304      * if a Unicode property is matched against a non-Unicode code point. But,
13305      * we allow user-defined properties to match anything, without any warning,
13306      * and we also suppress the warning if there is a portion of the character
13307      * class that isn't a Unicode property, and which matches above Unicode, \W
13308      * or [\x{110000}] for example.
13309      * (Note that in this case, unlike the Posix one above, there is no
13310      * <depends_list>, because having a Unicode property forces Unicode
13311      * semantics */
13312     if (properties) {
13313         bool warn_super = ! has_user_defined_property;
13314         if (cp_list) {
13315
13316             /* If it matters to the final outcome, see if a non-property
13317              * component of the class matches above Unicode.  If so, the
13318              * warning gets suppressed.  This is true even if just a single
13319              * such code point is specified, as though not strictly correct if
13320              * another such code point is matched against, the fact that they
13321              * are using above-Unicode code points indicates they should know
13322              * the issues involved */
13323             if (warn_super) {
13324                 bool non_prop_matches_above_Unicode =
13325                             runtime_posix_matches_above_Unicode
13326                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13327                 if (invert) {
13328                     non_prop_matches_above_Unicode =
13329                                             !  non_prop_matches_above_Unicode;
13330                 }
13331                 warn_super = ! non_prop_matches_above_Unicode;
13332             }
13333
13334             _invlist_union(properties, cp_list, &cp_list);
13335             SvREFCNT_dec_NN(properties);
13336         }
13337         else {
13338             cp_list = properties;
13339         }
13340
13341         if (warn_super) {
13342             OP(ret) = ANYOF_WARN_SUPER;
13343         }
13344     }
13345
13346     /* Here, we have calculated what code points should be in the character
13347      * class.
13348      *
13349      * Now we can see about various optimizations.  Fold calculation (which we
13350      * did above) needs to take place before inversion.  Otherwise /[^k]/i
13351      * would invert to include K, which under /i would match k, which it
13352      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13353      * folded until runtime */
13354
13355     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13356      * at compile time.  Besides not inverting folded locale now, we can't
13357      * invert if there are things such as \w, which aren't known until runtime
13358      * */
13359     if (invert
13360         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13361         && ! depends_list
13362         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13363     {
13364         _invlist_invert(cp_list);
13365
13366         /* Any swash can't be used as-is, because we've inverted things */
13367         if (swash) {
13368             SvREFCNT_dec_NN(swash);
13369             swash = NULL;
13370         }
13371
13372         /* Clear the invert flag since have just done it here */
13373         invert = FALSE;
13374     }
13375
13376     if (ret_invlist) {
13377         *ret_invlist = cp_list;
13378
13379         /* Discard the generated node */
13380         if (SIZE_ONLY) {
13381             RExC_size = orig_size;
13382         }
13383         else {
13384             RExC_emit = orig_emit;
13385         }
13386         return END;
13387     }
13388
13389     /* If we didn't do folding, it's because some information isn't available
13390      * until runtime; set the run-time fold flag for these.  (We don't have to
13391      * worry about properties folding, as that is taken care of by the swash
13392      * fetching) */
13393     if (FOLD && LOC)
13394     {
13395        ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13396     }
13397
13398     /* Some character classes are equivalent to other nodes.  Such nodes take
13399      * up less room and generally fewer operations to execute than ANYOF nodes.
13400      * Above, we checked for and optimized into some such equivalents for
13401      * certain common classes that are easy to test.  Getting to this point in
13402      * the code means that the class didn't get optimized there.  Since this
13403      * code is only executed in Pass 2, it is too late to save space--it has
13404      * been allocated in Pass 1, and currently isn't given back.  But turning
13405      * things into an EXACTish node can allow the optimizer to join it to any
13406      * adjacent such nodes.  And if the class is equivalent to things like /./,
13407      * expensive run-time swashes can be avoided.  Now that we have more
13408      * complete information, we can find things necessarily missed by the
13409      * earlier code.  I (khw) am not sure how much to look for here.  It would
13410      * be easy, but perhaps too slow, to check any candidates against all the
13411      * node types they could possibly match using _invlistEQ(). */
13412
13413     if (cp_list
13414         && ! invert
13415         && ! depends_list
13416         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13417         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13418     {
13419         UV start, end;
13420         U8 op = END;  /* The optimzation node-type */
13421         const char * cur_parse= RExC_parse;
13422
13423         invlist_iterinit(cp_list);
13424         if (! invlist_iternext(cp_list, &start, &end)) {
13425
13426             /* Here, the list is empty.  This happens, for example, when a
13427              * Unicode property is the only thing in the character class, and
13428              * it doesn't match anything.  (perluniprops.pod notes such
13429              * properties) */
13430             op = OPFAIL;
13431             *flagp |= HASWIDTH|SIMPLE;
13432         }
13433         else if (start == end) {    /* The range is a single code point */
13434             if (! invlist_iternext(cp_list, &start, &end)
13435
13436                     /* Don't do this optimization if it would require changing
13437                      * the pattern to UTF-8 */
13438                 && (start < 256 || UTF))
13439             {
13440                 /* Here, the list contains a single code point.  Can optimize
13441                  * into an EXACT node */
13442
13443                 value = start;
13444
13445                 if (! FOLD) {
13446                     op = EXACT;
13447                 }
13448                 else if (LOC) {
13449
13450                     /* A locale node under folding with one code point can be
13451                      * an EXACTFL, as its fold won't be calculated until
13452                      * runtime */
13453                     op = EXACTFL;
13454                 }
13455                 else {
13456
13457                     /* Here, we are generally folding, but there is only one
13458                      * code point to match.  If we have to, we use an EXACT
13459                      * node, but it would be better for joining with adjacent
13460                      * nodes in the optimization pass if we used the same
13461                      * EXACTFish node that any such are likely to be.  We can
13462                      * do this iff the code point doesn't participate in any
13463                      * folds.  For example, an EXACTF of a colon is the same as
13464                      * an EXACT one, since nothing folds to or from a colon. */
13465                     if (value < 256) {
13466                         if (IS_IN_SOME_FOLD_L1(value)) {
13467                             op = EXACT;
13468                         }
13469                     }
13470                     else {
13471                         if (! PL_utf8_foldable) {
13472                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13473                                                 &PL_sv_undef, 1, 0);
13474                             PL_utf8_foldable = _get_swash_invlist(swash);
13475                             SvREFCNT_dec_NN(swash);
13476                         }
13477                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13478                             op = EXACT;
13479                         }
13480                     }
13481
13482                     /* If we haven't found the node type, above, it means we
13483                      * can use the prevailing one */
13484                     if (op == END) {
13485                         op = compute_EXACTish(pRExC_state);
13486                     }
13487                 }
13488             }
13489         }
13490         else if (start == 0) {
13491             if (end == UV_MAX) {
13492                 op = SANY;
13493                 *flagp |= HASWIDTH|SIMPLE;
13494                 RExC_naughty++;
13495             }
13496             else if (end == '\n' - 1
13497                     && invlist_iternext(cp_list, &start, &end)
13498                     && start == '\n' + 1 && end == UV_MAX)
13499             {
13500                 op = REG_ANY;
13501                 *flagp |= HASWIDTH|SIMPLE;
13502                 RExC_naughty++;
13503             }
13504         }
13505         invlist_iterfinish(cp_list);
13506
13507         if (op != END) {
13508             RExC_parse = (char *)orig_parse;
13509             RExC_emit = (regnode *)orig_emit;
13510
13511             ret = reg_node(pRExC_state, op);
13512
13513             RExC_parse = (char *)cur_parse;
13514
13515             if (PL_regkind[op] == EXACT) {
13516                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13517             }
13518
13519             SvREFCNT_dec_NN(cp_list);
13520             SvREFCNT_dec_NN(listsv);
13521             return ret;
13522         }
13523     }
13524
13525     /* Here, <cp_list> contains all the code points we can determine at
13526      * compile time that match under all conditions.  Go through it, and
13527      * for things that belong in the bitmap, put them there, and delete from
13528      * <cp_list>.  While we are at it, see if everything above 255 is in the
13529      * list, and if so, set a flag to speed up execution */
13530     ANYOF_BITMAP_ZERO(ret);
13531     if (cp_list) {
13532
13533         /* This gets set if we actually need to modify things */
13534         bool change_invlist = FALSE;
13535
13536         UV start, end;
13537
13538         /* Start looking through <cp_list> */
13539         invlist_iterinit(cp_list);
13540         while (invlist_iternext(cp_list, &start, &end)) {
13541             UV high;
13542             int i;
13543
13544             if (end == UV_MAX && start <= 256) {
13545                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13546             }
13547
13548             /* Quit if are above what we should change */
13549             if (start > 255) {
13550                 break;
13551             }
13552
13553             change_invlist = TRUE;
13554
13555             /* Set all the bits in the range, up to the max that we are doing */
13556             high = (end < 255) ? end : 255;
13557             for (i = start; i <= (int) high; i++) {
13558                 if (! ANYOF_BITMAP_TEST(ret, i)) {
13559                     ANYOF_BITMAP_SET(ret, i);
13560                     prevvalue = value;
13561                     value = i;
13562                 }
13563             }
13564         }
13565         invlist_iterfinish(cp_list);
13566
13567         /* Done with loop; remove any code points that are in the bitmap from
13568          * <cp_list> */
13569         if (change_invlist) {
13570             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13571         }
13572
13573         /* If have completely emptied it, remove it completely */
13574         if (_invlist_len(cp_list) == 0) {
13575             SvREFCNT_dec_NN(cp_list);
13576             cp_list = NULL;
13577         }
13578     }
13579
13580     if (invert) {
13581         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13582     }
13583
13584     /* Here, the bitmap has been populated with all the Latin1 code points that
13585      * always match.  Can now add to the overall list those that match only
13586      * when the target string is UTF-8 (<depends_list>). */
13587     if (depends_list) {
13588         if (cp_list) {
13589             _invlist_union(cp_list, depends_list, &cp_list);
13590             SvREFCNT_dec_NN(depends_list);
13591         }
13592         else {
13593             cp_list = depends_list;
13594         }
13595     }
13596
13597     /* If there is a swash and more than one element, we can't use the swash in
13598      * the optimization below. */
13599     if (swash && element_count > 1) {
13600         SvREFCNT_dec_NN(swash);
13601         swash = NULL;
13602     }
13603
13604     if (! cp_list
13605         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13606     {
13607         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13608         SvREFCNT_dec_NN(listsv);
13609     }
13610     else {
13611         /* av[0] stores the character class description in its textual form:
13612          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13613          *       appropriate swash, and is also useful for dumping the regnode.
13614          * av[1] if NULL, is a placeholder to later contain the swash computed
13615          *       from av[0].  But if no further computation need be done, the
13616          *       swash is stored there now.
13617          * av[2] stores the cp_list inversion list for use in addition or
13618          *       instead of av[0]; used only if av[1] is NULL
13619          * av[3] is set if any component of the class is from a user-defined
13620          *       property; used only if av[1] is NULL */
13621         AV * const av = newAV();
13622         SV *rv;
13623
13624         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13625                         ? listsv
13626                         : (SvREFCNT_dec_NN(listsv), &PL_sv_undef));
13627         if (swash) {
13628             av_store(av, 1, swash);
13629             SvREFCNT_dec_NN(cp_list);
13630         }
13631         else {
13632             av_store(av, 1, NULL);
13633             if (cp_list) {
13634                 av_store(av, 2, cp_list);
13635                 av_store(av, 3, newSVuv(has_user_defined_property));
13636             }
13637         }
13638
13639         rv = newRV_noinc(MUTABLE_SV(av));
13640         n = add_data(pRExC_state, 1, "s");
13641         RExC_rxi->data->data[n] = (void*)rv;
13642         ARG_SET(ret, n);
13643     }
13644
13645     *flagp |= HASWIDTH|SIMPLE;
13646     return ret;
13647 }
13648 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13649
13650
13651 /* reg_skipcomment()
13652
13653    Absorbs an /x style # comments from the input stream.
13654    Returns true if there is more text remaining in the stream.
13655    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13656    terminates the pattern without including a newline.
13657
13658    Note its the callers responsibility to ensure that we are
13659    actually in /x mode
13660
13661 */
13662
13663 STATIC bool
13664 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13665 {
13666     bool ended = 0;
13667
13668     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13669
13670     while (RExC_parse < RExC_end)
13671         if (*RExC_parse++ == '\n') {
13672             ended = 1;
13673             break;
13674         }
13675     if (!ended) {
13676         /* we ran off the end of the pattern without ending
13677            the comment, so we have to add an \n when wrapping */
13678         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13679         return 0;
13680     } else
13681         return 1;
13682 }
13683
13684 /* nextchar()
13685
13686    Advances the parse position, and optionally absorbs
13687    "whitespace" from the inputstream.
13688
13689    Without /x "whitespace" means (?#...) style comments only,
13690    with /x this means (?#...) and # comments and whitespace proper.
13691
13692    Returns the RExC_parse point from BEFORE the scan occurs.
13693
13694    This is the /x friendly way of saying RExC_parse++.
13695 */
13696
13697 STATIC char*
13698 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13699 {
13700     char* const retval = RExC_parse++;
13701
13702     PERL_ARGS_ASSERT_NEXTCHAR;
13703
13704     for (;;) {
13705         if (RExC_end - RExC_parse >= 3
13706             && *RExC_parse == '('
13707             && RExC_parse[1] == '?'
13708             && RExC_parse[2] == '#')
13709         {
13710             while (*RExC_parse != ')') {
13711                 if (RExC_parse == RExC_end)
13712                     FAIL("Sequence (?#... not terminated");
13713                 RExC_parse++;
13714             }
13715             RExC_parse++;
13716             continue;
13717         }
13718         if (RExC_flags & RXf_PMf_EXTENDED) {
13719             if (isSPACE(*RExC_parse)) {
13720                 RExC_parse++;
13721                 continue;
13722             }
13723             else if (*RExC_parse == '#') {
13724                 if ( reg_skipcomment( pRExC_state ) )
13725                     continue;
13726             }
13727         }
13728         return retval;
13729     }
13730 }
13731
13732 /*
13733 - reg_node - emit a node
13734 */
13735 STATIC regnode *                        /* Location. */
13736 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13737 {
13738     dVAR;
13739     regnode *ptr;
13740     regnode * const ret = RExC_emit;
13741     GET_RE_DEBUG_FLAGS_DECL;
13742
13743     PERL_ARGS_ASSERT_REG_NODE;
13744
13745     if (SIZE_ONLY) {
13746         SIZE_ALIGN(RExC_size);
13747         RExC_size += 1;
13748         return(ret);
13749     }
13750     if (RExC_emit >= RExC_emit_bound)
13751         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13752                    op, RExC_emit, RExC_emit_bound);
13753
13754     NODE_ALIGN_FILL(ret);
13755     ptr = ret;
13756     FILL_ADVANCE_NODE(ptr, op);
13757 #ifdef RE_TRACK_PATTERN_OFFSETS
13758     if (RExC_offsets) {         /* MJD */
13759         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13760               "reg_node", __LINE__,
13761               PL_reg_name[op],
13762               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13763                 ? "Overwriting end of array!\n" : "OK",
13764               (UV)(RExC_emit - RExC_emit_start),
13765               (UV)(RExC_parse - RExC_start),
13766               (UV)RExC_offsets[0]));
13767         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13768     }
13769 #endif
13770     RExC_emit = ptr;
13771     return(ret);
13772 }
13773
13774 /*
13775 - reganode - emit a node with an argument
13776 */
13777 STATIC regnode *                        /* Location. */
13778 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13779 {
13780     dVAR;
13781     regnode *ptr;
13782     regnode * const ret = RExC_emit;
13783     GET_RE_DEBUG_FLAGS_DECL;
13784
13785     PERL_ARGS_ASSERT_REGANODE;
13786
13787     if (SIZE_ONLY) {
13788         SIZE_ALIGN(RExC_size);
13789         RExC_size += 2;
13790         /*
13791            We can't do this:
13792
13793            assert(2==regarglen[op]+1);
13794
13795            Anything larger than this has to allocate the extra amount.
13796            If we changed this to be:
13797
13798            RExC_size += (1 + regarglen[op]);
13799
13800            then it wouldn't matter. Its not clear what side effect
13801            might come from that so its not done so far.
13802            -- dmq
13803         */
13804         return(ret);
13805     }
13806     if (RExC_emit >= RExC_emit_bound)
13807         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13808                    op, RExC_emit, RExC_emit_bound);
13809
13810     NODE_ALIGN_FILL(ret);
13811     ptr = ret;
13812     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13813 #ifdef RE_TRACK_PATTERN_OFFSETS
13814     if (RExC_offsets) {         /* MJD */
13815         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13816               "reganode",
13817               __LINE__,
13818               PL_reg_name[op],
13819               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13820               "Overwriting end of array!\n" : "OK",
13821               (UV)(RExC_emit - RExC_emit_start),
13822               (UV)(RExC_parse - RExC_start),
13823               (UV)RExC_offsets[0]));
13824         Set_Cur_Node_Offset;
13825     }
13826 #endif
13827     RExC_emit = ptr;
13828     return(ret);
13829 }
13830
13831 /*
13832 - reguni - emit (if appropriate) a Unicode character
13833 */
13834 STATIC STRLEN
13835 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13836 {
13837     dVAR;
13838
13839     PERL_ARGS_ASSERT_REGUNI;
13840
13841     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13842 }
13843
13844 /*
13845 - reginsert - insert an operator in front of already-emitted operand
13846 *
13847 * Means relocating the operand.
13848 */
13849 STATIC void
13850 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13851 {
13852     dVAR;
13853     regnode *src;
13854     regnode *dst;
13855     regnode *place;
13856     const int offset = regarglen[(U8)op];
13857     const int size = NODE_STEP_REGNODE + offset;
13858     GET_RE_DEBUG_FLAGS_DECL;
13859
13860     PERL_ARGS_ASSERT_REGINSERT;
13861     PERL_UNUSED_ARG(depth);
13862 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13863     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13864     if (SIZE_ONLY) {
13865         RExC_size += size;
13866         return;
13867     }
13868
13869     src = RExC_emit;
13870     RExC_emit += size;
13871     dst = RExC_emit;
13872     if (RExC_open_parens) {
13873         int paren;
13874         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13875         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13876             if ( RExC_open_parens[paren] >= opnd ) {
13877                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13878                 RExC_open_parens[paren] += size;
13879             } else {
13880                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13881             }
13882             if ( RExC_close_parens[paren] >= opnd ) {
13883                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13884                 RExC_close_parens[paren] += size;
13885             } else {
13886                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13887             }
13888         }
13889     }
13890
13891     while (src > opnd) {
13892         StructCopy(--src, --dst, regnode);
13893 #ifdef RE_TRACK_PATTERN_OFFSETS
13894         if (RExC_offsets) {     /* MJD 20010112 */
13895             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13896                   "reg_insert",
13897                   __LINE__,
13898                   PL_reg_name[op],
13899                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13900                     ? "Overwriting end of array!\n" : "OK",
13901                   (UV)(src - RExC_emit_start),
13902                   (UV)(dst - RExC_emit_start),
13903                   (UV)RExC_offsets[0]));
13904             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13905             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13906         }
13907 #endif
13908     }
13909
13910
13911     place = opnd;               /* Op node, where operand used to be. */
13912 #ifdef RE_TRACK_PATTERN_OFFSETS
13913     if (RExC_offsets) {         /* MJD */
13914         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13915               "reginsert",
13916               __LINE__,
13917               PL_reg_name[op],
13918               (UV)(place - RExC_emit_start) > RExC_offsets[0]
13919               ? "Overwriting end of array!\n" : "OK",
13920               (UV)(place - RExC_emit_start),
13921               (UV)(RExC_parse - RExC_start),
13922               (UV)RExC_offsets[0]));
13923         Set_Node_Offset(place, RExC_parse);
13924         Set_Node_Length(place, 1);
13925     }
13926 #endif
13927     src = NEXTOPER(place);
13928     FILL_ADVANCE_NODE(place, op);
13929     Zero(src, offset, regnode);
13930 }
13931
13932 /*
13933 - regtail - set the next-pointer at the end of a node chain of p to val.
13934 - SEE ALSO: regtail_study
13935 */
13936 /* TODO: All three parms should be const */
13937 STATIC void
13938 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13939 {
13940     dVAR;
13941     regnode *scan;
13942     GET_RE_DEBUG_FLAGS_DECL;
13943
13944     PERL_ARGS_ASSERT_REGTAIL;
13945 #ifndef DEBUGGING
13946     PERL_UNUSED_ARG(depth);
13947 #endif
13948
13949     if (SIZE_ONLY)
13950         return;
13951
13952     /* Find last node. */
13953     scan = p;
13954     for (;;) {
13955         regnode * const temp = regnext(scan);
13956         DEBUG_PARSE_r({
13957             SV * const mysv=sv_newmortal();
13958             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13959             regprop(RExC_rx, mysv, scan);
13960             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13961                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13962                     (temp == NULL ? "->" : ""),
13963                     (temp == NULL ? PL_reg_name[OP(val)] : "")
13964             );
13965         });
13966         if (temp == NULL)
13967             break;
13968         scan = temp;
13969     }
13970
13971     if (reg_off_by_arg[OP(scan)]) {
13972         ARG_SET(scan, val - scan);
13973     }
13974     else {
13975         NEXT_OFF(scan) = val - scan;
13976     }
13977 }
13978
13979 #ifdef DEBUGGING
13980 /*
13981 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13982 - Look for optimizable sequences at the same time.
13983 - currently only looks for EXACT chains.
13984
13985 This is experimental code. The idea is to use this routine to perform
13986 in place optimizations on branches and groups as they are constructed,
13987 with the long term intention of removing optimization from study_chunk so
13988 that it is purely analytical.
13989
13990 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13991 to control which is which.
13992
13993 */
13994 /* TODO: All four parms should be const */
13995
13996 STATIC U8
13997 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13998 {
13999     dVAR;
14000     regnode *scan;
14001     U8 exact = PSEUDO;
14002 #ifdef EXPERIMENTAL_INPLACESCAN
14003     I32 min = 0;
14004 #endif
14005     GET_RE_DEBUG_FLAGS_DECL;
14006
14007     PERL_ARGS_ASSERT_REGTAIL_STUDY;
14008
14009
14010     if (SIZE_ONLY)
14011         return exact;
14012
14013     /* Find last node. */
14014
14015     scan = p;
14016     for (;;) {
14017         regnode * const temp = regnext(scan);
14018 #ifdef EXPERIMENTAL_INPLACESCAN
14019         if (PL_regkind[OP(scan)] == EXACT) {
14020             bool has_exactf_sharp_s;    /* Unexamined in this routine */
14021             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14022                 return EXACT;
14023         }
14024 #endif
14025         if ( exact ) {
14026             switch (OP(scan)) {
14027                 case EXACT:
14028                 case EXACTF:
14029                 case EXACTFA:
14030                 case EXACTFU:
14031                 case EXACTFU_SS:
14032                 case EXACTFU_TRICKYFOLD:
14033                 case EXACTFL:
14034                         if( exact == PSEUDO )
14035                             exact= OP(scan);
14036                         else if ( exact != OP(scan) )
14037                             exact= 0;
14038                 case NOTHING:
14039                     break;
14040                 default:
14041                     exact= 0;
14042             }
14043         }
14044         DEBUG_PARSE_r({
14045             SV * const mysv=sv_newmortal();
14046             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14047             regprop(RExC_rx, mysv, scan);
14048             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14049                 SvPV_nolen_const(mysv),
14050                 REG_NODE_NUM(scan),
14051                 PL_reg_name[exact]);
14052         });
14053         if (temp == NULL)
14054             break;
14055         scan = temp;
14056     }
14057     DEBUG_PARSE_r({
14058         SV * const mysv_val=sv_newmortal();
14059         DEBUG_PARSE_MSG("");
14060         regprop(RExC_rx, mysv_val, val);
14061         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14062                       SvPV_nolen_const(mysv_val),
14063                       (IV)REG_NODE_NUM(val),
14064                       (IV)(val - scan)
14065         );
14066     });
14067     if (reg_off_by_arg[OP(scan)]) {
14068         ARG_SET(scan, val - scan);
14069     }
14070     else {
14071         NEXT_OFF(scan) = val - scan;
14072     }
14073
14074     return exact;
14075 }
14076 #endif
14077
14078 /*
14079  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14080  */
14081 #ifdef DEBUGGING
14082 static void
14083 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14084 {
14085     int bit;
14086     int set=0;
14087     regex_charset cs;
14088
14089     for (bit=0; bit<32; bit++) {
14090         if (flags & (1<<bit)) {
14091             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
14092                 continue;
14093             }
14094             if (!set++ && lead)
14095                 PerlIO_printf(Perl_debug_log, "%s",lead);
14096             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14097         }
14098     }
14099     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14100             if (!set++ && lead) {
14101                 PerlIO_printf(Perl_debug_log, "%s",lead);
14102             }
14103             switch (cs) {
14104                 case REGEX_UNICODE_CHARSET:
14105                     PerlIO_printf(Perl_debug_log, "UNICODE");
14106                     break;
14107                 case REGEX_LOCALE_CHARSET:
14108                     PerlIO_printf(Perl_debug_log, "LOCALE");
14109                     break;
14110                 case REGEX_ASCII_RESTRICTED_CHARSET:
14111                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14112                     break;
14113                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14114                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14115                     break;
14116                 default:
14117                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14118                     break;
14119             }
14120     }
14121     if (lead)  {
14122         if (set)
14123             PerlIO_printf(Perl_debug_log, "\n");
14124         else
14125             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14126     }
14127 }
14128 #endif
14129
14130 void
14131 Perl_regdump(pTHX_ const regexp *r)
14132 {
14133 #ifdef DEBUGGING
14134     dVAR;
14135     SV * const sv = sv_newmortal();
14136     SV *dsv= sv_newmortal();
14137     RXi_GET_DECL(r,ri);
14138     GET_RE_DEBUG_FLAGS_DECL;
14139
14140     PERL_ARGS_ASSERT_REGDUMP;
14141
14142     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14143
14144     /* Header fields of interest. */
14145     if (r->anchored_substr) {
14146         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14147             RE_SV_DUMPLEN(r->anchored_substr), 30);
14148         PerlIO_printf(Perl_debug_log,
14149                       "anchored %s%s at %"IVdf" ",
14150                       s, RE_SV_TAIL(r->anchored_substr),
14151                       (IV)r->anchored_offset);
14152     } else if (r->anchored_utf8) {
14153         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14154             RE_SV_DUMPLEN(r->anchored_utf8), 30);
14155         PerlIO_printf(Perl_debug_log,
14156                       "anchored utf8 %s%s at %"IVdf" ",
14157                       s, RE_SV_TAIL(r->anchored_utf8),
14158                       (IV)r->anchored_offset);
14159     }
14160     if (r->float_substr) {
14161         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14162             RE_SV_DUMPLEN(r->float_substr), 30);
14163         PerlIO_printf(Perl_debug_log,
14164                       "floating %s%s at %"IVdf"..%"UVuf" ",
14165                       s, RE_SV_TAIL(r->float_substr),
14166                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14167     } else if (r->float_utf8) {
14168         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14169             RE_SV_DUMPLEN(r->float_utf8), 30);
14170         PerlIO_printf(Perl_debug_log,
14171                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14172                       s, RE_SV_TAIL(r->float_utf8),
14173                       (IV)r->float_min_offset, (UV)r->float_max_offset);
14174     }
14175     if (r->check_substr || r->check_utf8)
14176         PerlIO_printf(Perl_debug_log,
14177                       (const char *)
14178                       (r->check_substr == r->float_substr
14179                        && r->check_utf8 == r->float_utf8
14180                        ? "(checking floating" : "(checking anchored"));
14181     if (r->extflags & RXf_NOSCAN)
14182         PerlIO_printf(Perl_debug_log, " noscan");
14183     if (r->extflags & RXf_CHECK_ALL)
14184         PerlIO_printf(Perl_debug_log, " isall");
14185     if (r->check_substr || r->check_utf8)
14186         PerlIO_printf(Perl_debug_log, ") ");
14187
14188     if (ri->regstclass) {
14189         regprop(r, sv, ri->regstclass);
14190         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14191     }
14192     if (r->extflags & RXf_ANCH) {
14193         PerlIO_printf(Perl_debug_log, "anchored");
14194         if (r->extflags & RXf_ANCH_BOL)
14195             PerlIO_printf(Perl_debug_log, "(BOL)");
14196         if (r->extflags & RXf_ANCH_MBOL)
14197             PerlIO_printf(Perl_debug_log, "(MBOL)");
14198         if (r->extflags & RXf_ANCH_SBOL)
14199             PerlIO_printf(Perl_debug_log, "(SBOL)");
14200         if (r->extflags & RXf_ANCH_GPOS)
14201             PerlIO_printf(Perl_debug_log, "(GPOS)");
14202         PerlIO_putc(Perl_debug_log, ' ');
14203     }
14204     if (r->extflags & RXf_GPOS_SEEN)
14205         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14206     if (r->intflags & PREGf_SKIP)
14207         PerlIO_printf(Perl_debug_log, "plus ");
14208     if (r->intflags & PREGf_IMPLICIT)
14209         PerlIO_printf(Perl_debug_log, "implicit ");
14210     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14211     if (r->extflags & RXf_EVAL_SEEN)
14212         PerlIO_printf(Perl_debug_log, "with eval ");
14213     PerlIO_printf(Perl_debug_log, "\n");
14214     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14215 #else
14216     PERL_ARGS_ASSERT_REGDUMP;
14217     PERL_UNUSED_CONTEXT;
14218     PERL_UNUSED_ARG(r);
14219 #endif  /* DEBUGGING */
14220 }
14221
14222 /*
14223 - regprop - printable representation of opcode
14224 */
14225 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14226 STMT_START { \
14227         if (do_sep) {                           \
14228             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14229             if (flags & ANYOF_INVERT)           \
14230                 /*make sure the invert info is in each */ \
14231                 sv_catpvs(sv, "^");             \
14232             do_sep = 0;                         \
14233         }                                       \
14234 } STMT_END
14235
14236 void
14237 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14238 {
14239 #ifdef DEBUGGING
14240     dVAR;
14241     int k;
14242
14243     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14244     static const char * const anyofs[] = {
14245 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14246     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14247     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14248     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14249     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14250     || _CC_VERTSPACE != 16
14251   #error Need to adjust order of anyofs[]
14252 #endif
14253         "[\\w]",
14254         "[\\W]",
14255         "[\\d]",
14256         "[\\D]",
14257         "[:alpha:]",
14258         "[:^alpha:]",
14259         "[:lower:]",
14260         "[:^lower:]",
14261         "[:upper:]",
14262         "[:^upper:]",
14263         "[:punct:]",
14264         "[:^punct:]",
14265         "[:print:]",
14266         "[:^print:]",
14267         "[:alnum:]",
14268         "[:^alnum:]",
14269         "[:graph:]",
14270         "[:^graph:]",
14271         "[:cased:]",
14272         "[:^cased:]",
14273         "[\\s]",
14274         "[\\S]",
14275         "[:blank:]",
14276         "[:^blank:]",
14277         "[:xdigit:]",
14278         "[:^xdigit:]",
14279         "[:space:]",
14280         "[:^space:]",
14281         "[:cntrl:]",
14282         "[:^cntrl:]",
14283         "[:ascii:]",
14284         "[:^ascii:]",
14285         "[\\v]",
14286         "[\\V]"
14287     };
14288     RXi_GET_DECL(prog,progi);
14289     GET_RE_DEBUG_FLAGS_DECL;
14290
14291     PERL_ARGS_ASSERT_REGPROP;
14292
14293     sv_setpvs(sv, "");
14294
14295     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
14296         /* It would be nice to FAIL() here, but this may be called from
14297            regexec.c, and it would be hard to supply pRExC_state. */
14298         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14299     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14300
14301     k = PL_regkind[OP(o)];
14302
14303     if (k == EXACT) {
14304         sv_catpvs(sv, " ");
14305         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14306          * is a crude hack but it may be the best for now since
14307          * we have no flag "this EXACTish node was UTF-8"
14308          * --jhi */
14309         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14310                   PERL_PV_ESCAPE_UNI_DETECT |
14311                   PERL_PV_ESCAPE_NONASCII   |
14312                   PERL_PV_PRETTY_ELLIPSES   |
14313                   PERL_PV_PRETTY_LTGT       |
14314                   PERL_PV_PRETTY_NOCLEAR
14315                   );
14316     } else if (k == TRIE) {
14317         /* print the details of the trie in dumpuntil instead, as
14318          * progi->data isn't available here */
14319         const char op = OP(o);
14320         const U32 n = ARG(o);
14321         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14322                (reg_ac_data *)progi->data->data[n] :
14323                NULL;
14324         const reg_trie_data * const trie
14325             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14326
14327         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14328         DEBUG_TRIE_COMPILE_r(
14329             Perl_sv_catpvf(aTHX_ sv,
14330                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14331                 (UV)trie->startstate,
14332                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14333                 (UV)trie->wordcount,
14334                 (UV)trie->minlen,
14335                 (UV)trie->maxlen,
14336                 (UV)TRIE_CHARCOUNT(trie),
14337                 (UV)trie->uniquecharcount
14338             )
14339         );
14340         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14341             int i;
14342             int rangestart = -1;
14343             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14344             sv_catpvs(sv, "[");
14345             for (i = 0; i <= 256; i++) {
14346                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
14347                     if (rangestart == -1)
14348                         rangestart = i;
14349                 } else if (rangestart != -1) {
14350                     if (i <= rangestart + 3)
14351                         for (; rangestart < i; rangestart++)
14352                             put_byte(sv, rangestart);
14353                     else {
14354                         put_byte(sv, rangestart);
14355                         sv_catpvs(sv, "-");
14356                         put_byte(sv, i - 1);
14357                     }
14358                     rangestart = -1;
14359                 }
14360             }
14361             sv_catpvs(sv, "]");
14362         }
14363
14364     } else if (k == CURLY) {
14365         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14366             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14367         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14368     }
14369     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
14370         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14371     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14372         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
14373         if ( RXp_PAREN_NAMES(prog) ) {
14374             if ( k != REF || (OP(o) < NREF)) {
14375                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14376                 SV **name= av_fetch(list, ARG(o), 0 );
14377                 if (name)
14378                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14379             }
14380             else {
14381                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14382                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14383                 I32 *nums=(I32*)SvPVX(sv_dat);
14384                 SV **name= av_fetch(list, nums[0], 0 );
14385                 I32 n;
14386                 if (name) {
14387                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
14388                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14389                                     (n ? "," : ""), (IV)nums[n]);
14390                     }
14391                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14392                 }
14393             }
14394         }
14395     } else if (k == GOSUB)
14396         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14397     else if (k == VERB) {
14398         if (!o->flags)
14399             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14400                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14401     } else if (k == LOGICAL)
14402         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
14403     else if (k == ANYOF) {
14404         int i, rangestart = -1;
14405         const U8 flags = ANYOF_FLAGS(o);
14406         int do_sep = 0;
14407
14408
14409         if (flags & ANYOF_LOCALE)
14410             sv_catpvs(sv, "{loc}");
14411         if (flags & ANYOF_LOC_FOLD)
14412             sv_catpvs(sv, "{i}");
14413         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14414         if (flags & ANYOF_INVERT)
14415             sv_catpvs(sv, "^");
14416
14417         /* output what the standard cp 0-255 bitmap matches */
14418         for (i = 0; i <= 256; i++) {
14419             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14420                 if (rangestart == -1)
14421                     rangestart = i;
14422             } else if (rangestart != -1) {
14423                 if (i <= rangestart + 3)
14424                     for (; rangestart < i; rangestart++)
14425                         put_byte(sv, rangestart);
14426                 else {
14427                     put_byte(sv, rangestart);
14428                     sv_catpvs(sv, "-");
14429                     put_byte(sv, i - 1);
14430                 }
14431                 do_sep = 1;
14432                 rangestart = -1;
14433             }
14434         }
14435
14436         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14437         /* output any special charclass tests (used entirely under use locale) */
14438         if (ANYOF_CLASS_TEST_ANY_SET(o))
14439             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14440                 if (ANYOF_CLASS_TEST(o,i)) {
14441                     sv_catpv(sv, anyofs[i]);
14442                     do_sep = 1;
14443                 }
14444
14445         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14446
14447         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14448             sv_catpvs(sv, "{non-utf8-latin1-all}");
14449         }
14450
14451         /* output information about the unicode matching */
14452         if (flags & ANYOF_UNICODE_ALL)
14453             sv_catpvs(sv, "{unicode_all}");
14454         else if (ANYOF_NONBITMAP(o))
14455             sv_catpvs(sv, "{unicode}");
14456         if (flags & ANYOF_NONBITMAP_NON_UTF8)
14457             sv_catpvs(sv, "{outside bitmap}");
14458
14459         if (ANYOF_NONBITMAP(o)) {
14460             SV *lv; /* Set if there is something outside the bit map */
14461             SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14462             bool byte_output = FALSE;   /* If something in the bitmap has been
14463                                            output */
14464
14465             if (lv && lv != &PL_sv_undef) {
14466                 if (sw) {
14467                     U8 s[UTF8_MAXBYTES_CASE+1];
14468
14469                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14470                         uvchr_to_utf8(s, i);
14471
14472                         if (i < 256
14473                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14474                                                                things already
14475                                                                output as part
14476                                                                of the bitmap */
14477                             && swash_fetch(sw, s, TRUE))
14478                         {
14479                             if (rangestart == -1)
14480                                 rangestart = i;
14481                         } else if (rangestart != -1) {
14482                             byte_output = TRUE;
14483                             if (i <= rangestart + 3)
14484                                 for (; rangestart < i; rangestart++) {
14485                                     put_byte(sv, rangestart);
14486                                 }
14487                             else {
14488                                 put_byte(sv, rangestart);
14489                                 sv_catpvs(sv, "-");
14490                                 put_byte(sv, i-1);
14491                             }
14492                             rangestart = -1;
14493                         }
14494                     }
14495                 }
14496
14497                 {
14498                     char *s = savesvpv(lv);
14499                     char * const origs = s;
14500
14501                     while (*s && *s != '\n')
14502                         s++;
14503
14504                     if (*s == '\n') {
14505                         const char * const t = ++s;
14506
14507                         if (byte_output) {
14508                             sv_catpvs(sv, " ");
14509                         }
14510
14511                         while (*s) {
14512                             if (*s == '\n') {
14513
14514                                 /* Truncate very long output */
14515                                 if (s - origs > 256) {
14516                                     Perl_sv_catpvf(aTHX_ sv,
14517                                                    "%.*s...",
14518                                                    (int) (s - origs - 1),
14519                                                    t);
14520                                     goto out_dump;
14521                                 }
14522                                 *s = ' ';
14523                             }
14524                             else if (*s == '\t') {
14525                                 *s = '-';
14526                             }
14527                             s++;
14528                         }
14529                         if (s[-1] == ' ')
14530                             s[-1] = 0;
14531
14532                         sv_catpv(sv, t);
14533                     }
14534
14535                 out_dump:
14536
14537                     Safefree(origs);
14538                 }
14539                 SvREFCNT_dec_NN(lv);
14540             }
14541         }
14542
14543         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14544     }
14545     else if (k == POSIXD || k == NPOSIXD) {
14546         U8 index = FLAGS(o) * 2;
14547         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14548             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14549         }
14550         else {
14551             sv_catpv(sv, anyofs[index]);
14552         }
14553     }
14554     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14555         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14556 #else
14557     PERL_UNUSED_CONTEXT;
14558     PERL_UNUSED_ARG(sv);
14559     PERL_UNUSED_ARG(o);
14560     PERL_UNUSED_ARG(prog);
14561 #endif  /* DEBUGGING */
14562 }
14563
14564 SV *
14565 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14566 {                               /* Assume that RE_INTUIT is set */
14567     dVAR;
14568     struct regexp *const prog = ReANY(r);
14569     GET_RE_DEBUG_FLAGS_DECL;
14570
14571     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14572     PERL_UNUSED_CONTEXT;
14573
14574     DEBUG_COMPILE_r(
14575         {
14576             const char * const s = SvPV_nolen_const(prog->check_substr
14577                       ? prog->check_substr : prog->check_utf8);
14578
14579             if (!PL_colorset) reginitcolors();
14580             PerlIO_printf(Perl_debug_log,
14581                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14582                       PL_colors[4],
14583                       prog->check_substr ? "" : "utf8 ",
14584                       PL_colors[5],PL_colors[0],
14585                       s,
14586                       PL_colors[1],
14587                       (strlen(s) > 60 ? "..." : ""));
14588         } );
14589
14590     return prog->check_substr ? prog->check_substr : prog->check_utf8;
14591 }
14592
14593 /*
14594    pregfree()
14595
14596    handles refcounting and freeing the perl core regexp structure. When
14597    it is necessary to actually free the structure the first thing it
14598    does is call the 'free' method of the regexp_engine associated to
14599    the regexp, allowing the handling of the void *pprivate; member
14600    first. (This routine is not overridable by extensions, which is why
14601    the extensions free is called first.)
14602
14603    See regdupe and regdupe_internal if you change anything here.
14604 */
14605 #ifndef PERL_IN_XSUB_RE
14606 void
14607 Perl_pregfree(pTHX_ REGEXP *r)
14608 {
14609     SvREFCNT_dec(r);
14610 }
14611
14612 void
14613 Perl_pregfree2(pTHX_ REGEXP *rx)
14614 {
14615     dVAR;
14616     struct regexp *const r = ReANY(rx);
14617     GET_RE_DEBUG_FLAGS_DECL;
14618
14619     PERL_ARGS_ASSERT_PREGFREE2;
14620
14621     if (r->mother_re) {
14622         ReREFCNT_dec(r->mother_re);
14623     } else {
14624         CALLREGFREE_PVT(rx); /* free the private data */
14625         SvREFCNT_dec(RXp_PAREN_NAMES(r));
14626         Safefree(r->xpv_len_u.xpvlenu_pv);
14627     }
14628     if (r->substrs) {
14629         SvREFCNT_dec(r->anchored_substr);
14630         SvREFCNT_dec(r->anchored_utf8);
14631         SvREFCNT_dec(r->float_substr);
14632         SvREFCNT_dec(r->float_utf8);
14633         Safefree(r->substrs);
14634     }
14635     RX_MATCH_COPY_FREE(rx);
14636 #ifdef PERL_ANY_COW
14637     SvREFCNT_dec(r->saved_copy);
14638 #endif
14639     Safefree(r->offs);
14640     SvREFCNT_dec(r->qr_anoncv);
14641     rx->sv_u.svu_rx = 0;
14642 }
14643
14644 /*  reg_temp_copy()
14645
14646     This is a hacky workaround to the structural issue of match results
14647     being stored in the regexp structure which is in turn stored in
14648     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14649     could be PL_curpm in multiple contexts, and could require multiple
14650     result sets being associated with the pattern simultaneously, such
14651     as when doing a recursive match with (??{$qr})
14652
14653     The solution is to make a lightweight copy of the regexp structure
14654     when a qr// is returned from the code executed by (??{$qr}) this
14655     lightweight copy doesn't actually own any of its data except for
14656     the starp/end and the actual regexp structure itself.
14657
14658 */
14659
14660
14661 REGEXP *
14662 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14663 {
14664     struct regexp *ret;
14665     struct regexp *const r = ReANY(rx);
14666     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14667
14668     PERL_ARGS_ASSERT_REG_TEMP_COPY;
14669
14670     if (!ret_x)
14671         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14672     else {
14673         SvOK_off((SV *)ret_x);
14674         if (islv) {
14675             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14676                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
14677                made both spots point to the same regexp body.) */
14678             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14679             assert(!SvPVX(ret_x));
14680             ret_x->sv_u.svu_rx = temp->sv_any;
14681             temp->sv_any = NULL;
14682             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14683             SvREFCNT_dec_NN(temp);
14684             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14685                ing below will not set it. */
14686             SvCUR_set(ret_x, SvCUR(rx));
14687         }
14688     }
14689     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14690        sv_force_normal(sv) is called.  */
14691     SvFAKE_on(ret_x);
14692     ret = ReANY(ret_x);
14693
14694     SvFLAGS(ret_x) |= SvUTF8(rx);
14695     /* We share the same string buffer as the original regexp, on which we
14696        hold a reference count, incremented when mother_re is set below.
14697        The string pointer is copied here, being part of the regexp struct.
14698      */
14699     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14700            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14701     if (r->offs) {
14702         const I32 npar = r->nparens+1;
14703         Newx(ret->offs, npar, regexp_paren_pair);
14704         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14705     }
14706     if (r->substrs) {
14707         Newx(ret->substrs, 1, struct reg_substr_data);
14708         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14709
14710         SvREFCNT_inc_void(ret->anchored_substr);
14711         SvREFCNT_inc_void(ret->anchored_utf8);
14712         SvREFCNT_inc_void(ret->float_substr);
14713         SvREFCNT_inc_void(ret->float_utf8);
14714
14715         /* check_substr and check_utf8, if non-NULL, point to either their
14716            anchored or float namesakes, and don't hold a second reference.  */
14717     }
14718     RX_MATCH_COPIED_off(ret_x);
14719 #ifdef PERL_ANY_COW
14720     ret->saved_copy = NULL;
14721 #endif
14722     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14723     SvREFCNT_inc_void(ret->qr_anoncv);
14724
14725     return ret_x;
14726 }
14727 #endif
14728
14729 /* regfree_internal()
14730
14731    Free the private data in a regexp. This is overloadable by
14732    extensions. Perl takes care of the regexp structure in pregfree(),
14733    this covers the *pprivate pointer which technically perl doesn't
14734    know about, however of course we have to handle the
14735    regexp_internal structure when no extension is in use.
14736
14737    Note this is called before freeing anything in the regexp
14738    structure.
14739  */
14740
14741 void
14742 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14743 {
14744     dVAR;
14745     struct regexp *const r = ReANY(rx);
14746     RXi_GET_DECL(r,ri);
14747     GET_RE_DEBUG_FLAGS_DECL;
14748
14749     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14750
14751     DEBUG_COMPILE_r({
14752         if (!PL_colorset)
14753             reginitcolors();
14754         {
14755             SV *dsv= sv_newmortal();
14756             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14757                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14758             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14759                 PL_colors[4],PL_colors[5],s);
14760         }
14761     });
14762 #ifdef RE_TRACK_PATTERN_OFFSETS
14763     if (ri->u.offsets)
14764         Safefree(ri->u.offsets);             /* 20010421 MJD */
14765 #endif
14766     if (ri->code_blocks) {
14767         int n;
14768         for (n = 0; n < ri->num_code_blocks; n++)
14769             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14770         Safefree(ri->code_blocks);
14771     }
14772
14773     if (ri->data) {
14774         int n = ri->data->count;
14775
14776         while (--n >= 0) {
14777           /* If you add a ->what type here, update the comment in regcomp.h */
14778             switch (ri->data->what[n]) {
14779             case 'a':
14780             case 'r':
14781             case 's':
14782             case 'S':
14783             case 'u':
14784                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14785                 break;
14786             case 'f':
14787                 Safefree(ri->data->data[n]);
14788                 break;
14789             case 'l':
14790             case 'L':
14791                 break;
14792             case 'T':
14793                 { /* Aho Corasick add-on structure for a trie node.
14794                      Used in stclass optimization only */
14795                     U32 refcount;
14796                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14797                     OP_REFCNT_LOCK;
14798                     refcount = --aho->refcount;
14799                     OP_REFCNT_UNLOCK;
14800                     if ( !refcount ) {
14801                         PerlMemShared_free(aho->states);
14802                         PerlMemShared_free(aho->fail);
14803                          /* do this last!!!! */
14804                         PerlMemShared_free(ri->data->data[n]);
14805                         PerlMemShared_free(ri->regstclass);
14806                     }
14807                 }
14808                 break;
14809             case 't':
14810                 {
14811                     /* trie structure. */
14812                     U32 refcount;
14813                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14814                     OP_REFCNT_LOCK;
14815                     refcount = --trie->refcount;
14816                     OP_REFCNT_UNLOCK;
14817                     if ( !refcount ) {
14818                         PerlMemShared_free(trie->charmap);
14819                         PerlMemShared_free(trie->states);
14820                         PerlMemShared_free(trie->trans);
14821                         if (trie->bitmap)
14822                             PerlMemShared_free(trie->bitmap);
14823                         if (trie->jump)
14824                             PerlMemShared_free(trie->jump);
14825                         PerlMemShared_free(trie->wordinfo);
14826                         /* do this last!!!! */
14827                         PerlMemShared_free(ri->data->data[n]);
14828                     }
14829                 }
14830                 break;
14831             default:
14832                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14833             }
14834         }
14835         Safefree(ri->data->what);
14836         Safefree(ri->data);
14837     }
14838
14839     Safefree(ri);
14840 }
14841
14842 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14843 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14844 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14845
14846 /*
14847    re_dup - duplicate a regexp.
14848
14849    This routine is expected to clone a given regexp structure. It is only
14850    compiled under USE_ITHREADS.
14851
14852    After all of the core data stored in struct regexp is duplicated
14853    the regexp_engine.dupe method is used to copy any private data
14854    stored in the *pprivate pointer. This allows extensions to handle
14855    any duplication it needs to do.
14856
14857    See pregfree() and regfree_internal() if you change anything here.
14858 */
14859 #if defined(USE_ITHREADS)
14860 #ifndef PERL_IN_XSUB_RE
14861 void
14862 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14863 {
14864     dVAR;
14865     I32 npar;
14866     const struct regexp *r = ReANY(sstr);
14867     struct regexp *ret = ReANY(dstr);
14868
14869     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14870
14871     npar = r->nparens+1;
14872     Newx(ret->offs, npar, regexp_paren_pair);
14873     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14874     if(ret->swap) {
14875         /* no need to copy these */
14876         Newx(ret->swap, npar, regexp_paren_pair);
14877     }
14878
14879     if (ret->substrs) {
14880         /* Do it this way to avoid reading from *r after the StructCopy().
14881            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14882            cache, it doesn't matter.  */
14883         const bool anchored = r->check_substr
14884             ? r->check_substr == r->anchored_substr
14885             : r->check_utf8 == r->anchored_utf8;
14886         Newx(ret->substrs, 1, struct reg_substr_data);
14887         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14888
14889         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14890         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14891         ret->float_substr = sv_dup_inc(ret->float_substr, param);
14892         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14893
14894         /* check_substr and check_utf8, if non-NULL, point to either their
14895            anchored or float namesakes, and don't hold a second reference.  */
14896
14897         if (ret->check_substr) {
14898             if (anchored) {
14899                 assert(r->check_utf8 == r->anchored_utf8);
14900                 ret->check_substr = ret->anchored_substr;
14901                 ret->check_utf8 = ret->anchored_utf8;
14902             } else {
14903                 assert(r->check_substr == r->float_substr);
14904                 assert(r->check_utf8 == r->float_utf8);
14905                 ret->check_substr = ret->float_substr;
14906                 ret->check_utf8 = ret->float_utf8;
14907             }
14908         } else if (ret->check_utf8) {
14909             if (anchored) {
14910                 ret->check_utf8 = ret->anchored_utf8;
14911             } else {
14912                 ret->check_utf8 = ret->float_utf8;
14913             }
14914         }
14915     }
14916
14917     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14918     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14919
14920     if (ret->pprivate)
14921         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14922
14923     if (RX_MATCH_COPIED(dstr))
14924         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
14925     else
14926         ret->subbeg = NULL;
14927 #ifdef PERL_ANY_COW
14928     ret->saved_copy = NULL;
14929 #endif
14930
14931     /* Whether mother_re be set or no, we need to copy the string.  We
14932        cannot refrain from copying it when the storage points directly to
14933        our mother regexp, because that's
14934                1: a buffer in a different thread
14935                2: something we no longer hold a reference on
14936                so we need to copy it locally.  */
14937     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14938     ret->mother_re   = NULL;
14939     ret->gofs = 0;
14940 }
14941 #endif /* PERL_IN_XSUB_RE */
14942
14943 /*
14944    regdupe_internal()
14945
14946    This is the internal complement to regdupe() which is used to copy
14947    the structure pointed to by the *pprivate pointer in the regexp.
14948    This is the core version of the extension overridable cloning hook.
14949    The regexp structure being duplicated will be copied by perl prior
14950    to this and will be provided as the regexp *r argument, however
14951    with the /old/ structures pprivate pointer value. Thus this routine
14952    may override any copying normally done by perl.
14953
14954    It returns a pointer to the new regexp_internal structure.
14955 */
14956
14957 void *
14958 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14959 {
14960     dVAR;
14961     struct regexp *const r = ReANY(rx);
14962     regexp_internal *reti;
14963     int len;
14964     RXi_GET_DECL(r,ri);
14965
14966     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14967
14968     len = ProgLen(ri);
14969
14970     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14971     Copy(ri->program, reti->program, len+1, regnode);
14972
14973     reti->num_code_blocks = ri->num_code_blocks;
14974     if (ri->code_blocks) {
14975         int n;
14976         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14977                 struct reg_code_block);
14978         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14979                 struct reg_code_block);
14980         for (n = 0; n < ri->num_code_blocks; n++)
14981              reti->code_blocks[n].src_regex = (REGEXP*)
14982                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14983     }
14984     else
14985         reti->code_blocks = NULL;
14986
14987     reti->regstclass = NULL;
14988
14989     if (ri->data) {
14990         struct reg_data *d;
14991         const int count = ri->data->count;
14992         int i;
14993
14994         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14995                 char, struct reg_data);
14996         Newx(d->what, count, U8);
14997
14998         d->count = count;
14999         for (i = 0; i < count; i++) {
15000             d->what[i] = ri->data->what[i];
15001             switch (d->what[i]) {
15002                 /* see also regcomp.h and regfree_internal() */
15003             case 'a': /* actually an AV, but the dup function is identical.  */
15004             case 'r':
15005             case 's':
15006             case 'S':
15007             case 'u': /* actually an HV, but the dup function is identical.  */
15008                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15009                 break;
15010             case 'f':
15011                 /* This is cheating. */
15012                 Newx(d->data[i], 1, struct regnode_charclass_class);
15013                 StructCopy(ri->data->data[i], d->data[i],
15014                             struct regnode_charclass_class);
15015                 reti->regstclass = (regnode*)d->data[i];
15016                 break;
15017             case 'T':
15018                 /* Trie stclasses are readonly and can thus be shared
15019                  * without duplication. We free the stclass in pregfree
15020                  * when the corresponding reg_ac_data struct is freed.
15021                  */
15022                 reti->regstclass= ri->regstclass;
15023                 /* Fall through */
15024             case 't':
15025                 OP_REFCNT_LOCK;
15026                 ((reg_trie_data*)ri->data->data[i])->refcount++;
15027                 OP_REFCNT_UNLOCK;
15028                 /* Fall through */
15029             case 'l':
15030             case 'L':
15031                 d->data[i] = ri->data->data[i];
15032                 break;
15033             default:
15034                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15035             }
15036         }
15037
15038         reti->data = d;
15039     }
15040     else
15041         reti->data = NULL;
15042
15043     reti->name_list_idx = ri->name_list_idx;
15044
15045 #ifdef RE_TRACK_PATTERN_OFFSETS
15046     if (ri->u.offsets) {
15047         Newx(reti->u.offsets, 2*len+1, U32);
15048         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15049     }
15050 #else
15051     SetProgLen(reti,len);
15052 #endif
15053
15054     return (void*)reti;
15055 }
15056
15057 #endif    /* USE_ITHREADS */
15058
15059 #ifndef PERL_IN_XSUB_RE
15060
15061 /*
15062  - regnext - dig the "next" pointer out of a node
15063  */
15064 regnode *
15065 Perl_regnext(pTHX_ regnode *p)
15066 {
15067     dVAR;
15068     I32 offset;
15069
15070     if (!p)
15071         return(NULL);
15072
15073     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
15074         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15075     }
15076
15077     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15078     if (offset == 0)
15079         return(NULL);
15080
15081     return(p+offset);
15082 }
15083 #endif
15084
15085 STATIC void
15086 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15087 {
15088     va_list args;
15089     STRLEN l1 = strlen(pat1);
15090     STRLEN l2 = strlen(pat2);
15091     char buf[512];
15092     SV *msv;
15093     const char *message;
15094
15095     PERL_ARGS_ASSERT_RE_CROAK2;
15096
15097     if (l1 > 510)
15098         l1 = 510;
15099     if (l1 + l2 > 510)
15100         l2 = 510 - l1;
15101     Copy(pat1, buf, l1 , char);
15102     Copy(pat2, buf + l1, l2 , char);
15103     buf[l1 + l2] = '\n';
15104     buf[l1 + l2 + 1] = '\0';
15105 #ifdef I_STDARG
15106     /* ANSI variant takes additional second argument */
15107     va_start(args, pat2);
15108 #else
15109     va_start(args);
15110 #endif
15111     msv = vmess(buf, &args);
15112     va_end(args);
15113     message = SvPV_const(msv,l1);
15114     if (l1 > 512)
15115         l1 = 512;
15116     Copy(message, buf, l1 , char);
15117     buf[l1-1] = '\0';                   /* Overwrite \n */
15118     Perl_croak(aTHX_ "%s", buf);
15119 }
15120
15121 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15122
15123 #ifndef PERL_IN_XSUB_RE
15124 void
15125 Perl_save_re_context(pTHX)
15126 {
15127     dVAR;
15128
15129     struct re_save_state *state;
15130
15131     SAVEVPTR(PL_curcop);
15132     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15133
15134     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15135     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15136     SSPUSHUV(SAVEt_RE_STATE);
15137
15138     Copy(&PL_reg_state, state, 1, struct re_save_state);
15139
15140     PL_reg_oldsaved = NULL;
15141     PL_reg_oldsavedlen = 0;
15142     PL_reg_oldsavedoffset = 0;
15143     PL_reg_oldsavedcoffset = 0;
15144     PL_reg_maxiter = 0;
15145     PL_reg_leftiter = 0;
15146     PL_reg_poscache = NULL;
15147     PL_reg_poscache_size = 0;
15148 #ifdef PERL_ANY_COW
15149     PL_nrs = NULL;
15150 #endif
15151
15152     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15153     if (PL_curpm) {
15154         const REGEXP * const rx = PM_GETRE(PL_curpm);
15155         if (rx) {
15156             U32 i;
15157             for (i = 1; i <= RX_NPARENS(rx); i++) {
15158                 char digits[TYPE_CHARS(long)];
15159                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15160                 GV *const *const gvp
15161                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15162
15163                 if (gvp) {
15164                     GV * const gv = *gvp;
15165                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15166                         save_scalar(gv);
15167                 }
15168             }
15169         }
15170     }
15171 }
15172 #endif
15173
15174 #ifdef DEBUGGING
15175
15176 STATIC void
15177 S_put_byte(pTHX_ SV *sv, int c)
15178 {
15179     PERL_ARGS_ASSERT_PUT_BYTE;
15180
15181     /* Our definition of isPRINT() ignores locales, so only bytes that are
15182        not part of UTF-8 are considered printable. I assume that the same
15183        holds for UTF-EBCDIC.
15184        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15185        which Wikipedia says:
15186
15187        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15188        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15189        identical, to the ASCII delete (DEL) or rubout control character.
15190        ) So the old condition can be simplified to !isPRINT(c)  */
15191     if (!isPRINT(c)) {
15192         if (c < 256) {
15193             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15194         }
15195         else {
15196             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15197         }
15198     }
15199     else {
15200         const char string = c;
15201         if (c == '-' || c == ']' || c == '\\' || c == '^')
15202             sv_catpvs(sv, "\\");
15203         sv_catpvn(sv, &string, 1);
15204     }
15205 }
15206
15207
15208 #define CLEAR_OPTSTART \
15209     if (optstart) STMT_START { \
15210             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15211             optstart=NULL; \
15212     } STMT_END
15213
15214 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15215
15216 STATIC const regnode *
15217 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15218             const regnode *last, const regnode *plast,
15219             SV* sv, I32 indent, U32 depth)
15220 {
15221     dVAR;
15222     U8 op = PSEUDO;     /* Arbitrary non-END op. */
15223     const regnode *next;
15224     const regnode *optstart= NULL;
15225
15226     RXi_GET_DECL(r,ri);
15227     GET_RE_DEBUG_FLAGS_DECL;
15228
15229     PERL_ARGS_ASSERT_DUMPUNTIL;
15230
15231 #ifdef DEBUG_DUMPUNTIL
15232     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15233         last ? last-start : 0,plast ? plast-start : 0);
15234 #endif
15235
15236     if (plast && plast < last)
15237         last= plast;
15238
15239     while (PL_regkind[op] != END && (!last || node < last)) {
15240         /* While that wasn't END last time... */
15241         NODE_ALIGN(node);
15242         op = OP(node);
15243         if (op == CLOSE || op == WHILEM)
15244             indent--;
15245         next = regnext((regnode *)node);
15246
15247         /* Where, what. */
15248         if (OP(node) == OPTIMIZED) {
15249             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15250                 optstart = node;
15251             else
15252                 goto after_print;
15253         } else
15254             CLEAR_OPTSTART;
15255
15256         regprop(r, sv, node);
15257         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15258                       (int)(2*indent + 1), "", SvPVX_const(sv));
15259
15260         if (OP(node) != OPTIMIZED) {
15261             if (next == NULL)           /* Next ptr. */
15262                 PerlIO_printf(Perl_debug_log, " (0)");
15263             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15264                 PerlIO_printf(Perl_debug_log, " (FAIL)");
15265             else
15266                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15267             (void)PerlIO_putc(Perl_debug_log, '\n');
15268         }
15269
15270       after_print:
15271         if (PL_regkind[(U8)op] == BRANCHJ) {
15272             assert(next);
15273             {
15274                 const regnode *nnode = (OP(next) == LONGJMP
15275                                        ? regnext((regnode *)next)
15276                                        : next);
15277                 if (last && nnode > last)
15278                     nnode = last;
15279                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15280             }
15281         }
15282         else if (PL_regkind[(U8)op] == BRANCH) {
15283             assert(next);
15284             DUMPUNTIL(NEXTOPER(node), next);
15285         }
15286         else if ( PL_regkind[(U8)op]  == TRIE ) {
15287             const regnode *this_trie = node;
15288             const char op = OP(node);
15289             const U32 n = ARG(node);
15290             const reg_ac_data * const ac = op>=AHOCORASICK ?
15291                (reg_ac_data *)ri->data->data[n] :
15292                NULL;
15293             const reg_trie_data * const trie =
15294                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15295 #ifdef DEBUGGING
15296             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15297 #endif
15298             const regnode *nextbranch= NULL;
15299             I32 word_idx;
15300             sv_setpvs(sv, "");
15301             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15302                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15303
15304                 PerlIO_printf(Perl_debug_log, "%*s%s ",
15305                    (int)(2*(indent+3)), "",
15306                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15307                             PL_colors[0], PL_colors[1],
15308                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15309                             PERL_PV_PRETTY_ELLIPSES    |
15310                             PERL_PV_PRETTY_LTGT
15311                             )
15312                             : "???"
15313                 );
15314                 if (trie->jump) {
15315                     U16 dist= trie->jump[word_idx+1];
15316                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15317                                   (UV)((dist ? this_trie + dist : next) - start));
15318                     if (dist) {
15319                         if (!nextbranch)
15320                             nextbranch= this_trie + trie->jump[0];
15321                         DUMPUNTIL(this_trie + dist, nextbranch);
15322                     }
15323                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15324                         nextbranch= regnext((regnode *)nextbranch);
15325                 } else {
15326                     PerlIO_printf(Perl_debug_log, "\n");
15327                 }
15328             }
15329             if (last && next > last)
15330                 node= last;
15331             else
15332                 node= next;
15333         }
15334         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15335             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15336                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15337         }
15338         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15339             assert(next);
15340             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15341         }
15342         else if ( op == PLUS || op == STAR) {
15343             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15344         }
15345         else if (PL_regkind[(U8)op] == ANYOF) {
15346             /* arglen 1 + class block */
15347             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15348                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15349             node = NEXTOPER(node);
15350         }
15351         else if (PL_regkind[(U8)op] == EXACT) {
15352             /* Literal string, where present. */
15353             node += NODE_SZ_STR(node) - 1;
15354             node = NEXTOPER(node);
15355         }
15356         else {
15357             node = NEXTOPER(node);
15358             node += regarglen[(U8)op];
15359         }
15360         if (op == CURLYX || op == OPEN)
15361             indent++;
15362     }
15363     CLEAR_OPTSTART;
15364 #ifdef DEBUG_DUMPUNTIL
15365     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15366 #endif
15367     return node;
15368 }
15369
15370 #endif  /* DEBUGGING */
15371
15372 /*
15373  * Local variables:
15374  * c-indentation-style: bsd
15375  * c-basic-offset: 4
15376  * indent-tabs-mode: nil
15377  * End:
15378  *
15379  * ex: set ts=8 sts=4 sw=4 et:
15380  */