regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89 #ifndef PERL_IN_XSUB_RE
  90 #  include "charclass_invlists.h"
  91 #endif
  92
  93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  94
  95 #ifdef op
  96 #undef op
  97 #endif /* op */
  98
  99 #ifdef MSDOS
 100 #  if defined(BUGGY_MSC6)
 101  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 102 #    pragma optimize("a",off)
 103  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 104 #    pragma optimize("w",on )
 105 #  endif /* BUGGY_MSC6 */
 106 #endif /* MSDOS */
 107
 108 #ifndef STATIC
 109 #define STATIC  static
 110 #endif
 111
 112
 113 typedef struct RExC_state_t {
 114     U32         flags;                  /* RXf_* are we folding, multilining? */
 115     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 116     char        *precomp;               /* uncompiled string. */
 117     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 118     regexp      *rx;                    /* perl core regexp structure */
 119     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 120     char        *start;                 /* Start of input for compile */
 121     char        *end;                   /* End of input for compile */
 122     char        *parse;                 /* Input-scan pointer. */
 123     I32         whilem_seen;            /* number of WHILEM in this expr */
 124     regnode     *emit_start;            /* Start of emitted-code area */
 125     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 126     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 127     I32         naughty;                /* How bad is this pattern? */
 128     I32         sawback;                /* Did we see \1, ...? */
 129     U32         seen;
 130     I32         size;                   /* Code size. */
 131     I32         npar;                   /* Capture buffer count, (OPEN). */
 132     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 133     I32         nestroot;               /* root parens we are in - used by accept */
 134     I32         extralen;
 135     I32         seen_zerolen;
 136     regnode     **open_parens;          /* pointers to open parens */
 137     regnode     **close_parens;         /* pointers to close parens */
 138     regnode     *opend;                 /* END node in program */
 139     I32         utf8;           /* whether the pattern is utf8 or not */
 140     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 141                                 /* XXX use this for future optimisation of case
 142                                  * where pattern must be upgraded to utf8. */
 143     I32         uni_semantics;  /* If a d charset modifier should use unicode
 144                                    rules, even if the pattern is not in
 145                                    utf8 */
 146     HV          *paren_names;           /* Paren names */
 147
 148     regnode     **recurse;              /* Recurse regops */
 149     I32         recurse_count;          /* Number of recurse regops */
 150     I32         in_lookbehind;
 151     I32         contains_locale;
 152     I32         override_recoding;
 153     struct reg_code_block *code_blocks; /* positions of literal (?{})
 154                                             within pattern */
 155     int         num_code_blocks;        /* size of code_blocks[] */
 156     int         code_index;             /* next code_blocks[] slot */
 157 #if ADD_TO_REGEXEC
 158     char        *starttry;              /* -Dr: where regtry was called. */
 159 #define RExC_starttry   (pRExC_state->starttry)
 160 #endif
 161     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 162 #ifdef DEBUGGING
 163     const char  *lastparse;
 164     I32         lastnum;
 165     AV          *paren_name_list;       /* idx -> name */
 166 #define RExC_lastparse  (pRExC_state->lastparse)
 167 #define RExC_lastnum    (pRExC_state->lastnum)
 168 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 169 #endif
 170 } RExC_state_t;
 171
 172 #define RExC_flags      (pRExC_state->flags)
 173 #define RExC_pm_flags   (pRExC_state->pm_flags)
 174 #define RExC_precomp    (pRExC_state->precomp)
 175 #define RExC_rx_sv      (pRExC_state->rx_sv)
 176 #define RExC_rx         (pRExC_state->rx)
 177 #define RExC_rxi        (pRExC_state->rxi)
 178 #define RExC_start      (pRExC_state->start)
 179 #define RExC_end        (pRExC_state->end)
 180 #define RExC_parse      (pRExC_state->parse)
 181 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 182 #ifdef RE_TRACK_PATTERN_OFFSETS
 183 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 184 #endif
 185 #define RExC_emit       (pRExC_state->emit)
 186 #define RExC_emit_start (pRExC_state->emit_start)
 187 #define RExC_emit_bound (pRExC_state->emit_bound)
 188 #define RExC_naughty    (pRExC_state->naughty)
 189 #define RExC_sawback    (pRExC_state->sawback)
 190 #define RExC_seen       (pRExC_state->seen)
 191 #define RExC_size       (pRExC_state->size)
 192 #define RExC_npar       (pRExC_state->npar)
 193 #define RExC_nestroot   (pRExC_state->nestroot)
 194 #define RExC_extralen   (pRExC_state->extralen)
 195 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 196 #define RExC_utf8       (pRExC_state->utf8)
 197 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 198 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 199 #define RExC_open_parens        (pRExC_state->open_parens)
 200 #define RExC_close_parens       (pRExC_state->close_parens)
 201 #define RExC_opend      (pRExC_state->opend)
 202 #define RExC_paren_names        (pRExC_state->paren_names)
 203 #define RExC_recurse    (pRExC_state->recurse)
 204 #define RExC_recurse_count      (pRExC_state->recurse_count)
 205 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 206 #define RExC_contains_locale    (pRExC_state->contains_locale)
 207 #define RExC_override_recoding  (pRExC_state->override_recoding)
 208
 209
 210 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 211 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 212         ((*s) == '{' && regcurly(s)))
 213
 214 #ifdef SPSTART
 215 #undef SPSTART          /* dratted cpp namespace... */
 216 #endif
 217 /*
 218  * Flags to be passed up and down.
 219  */
 220 #define WORST           0       /* Worst case. */
 221 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 222
 223 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 224  * character, and if utf8, must be invariant.  Note that this is not the same
 225  * thing as REGNODE_SIMPLE */
 226 #define SIMPLE          0x02
 227 #define SPSTART         0x04    /* Starts with * or +. */
 228 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 229 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 230
 231 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 232
 233 /* whether trie related optimizations are enabled */
 234 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 235 #define TRIE_STUDY_OPT
 236 #define FULL_TRIE_STUDY
 237 #define TRIE_STCLASS
 238 #endif
 239
 240
 241
 242 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 243 #define PBITVAL(paren) (1 << ((paren) & 7))
 244 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 245 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 246 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 247
 248 /* If not already in utf8, do a longjmp back to the beginning */
 249 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 250 #define REQUIRE_UTF8    STMT_START {                                       \
 251                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 252                         } STMT_END
 253
 254 /* About scan_data_t.
 255
 256   During optimisation we recurse through the regexp program performing
 257   various inplace (keyhole style) optimisations. In addition study_chunk
 258   and scan_commit populate this data structure with information about
 259   what strings MUST appear in the pattern. We look for the longest
 260   string that must appear at a fixed location, and we look for the
 261   longest string that may appear at a floating location. So for instance
 262   in the pattern:
 263
 264     /FOO[xX]A.*B[xX]BAR/
 265
 266   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 267   strings (because they follow a .* construct). study_chunk will identify
 268   both FOO and BAR as being the longest fixed and floating strings respectively.
 269
 270   The strings can be composites, for instance
 271
 272      /(f)(o)(o)/
 273
 274   will result in a composite fixed substring 'foo'.
 275
 276   For each string some basic information is maintained:
 277
 278   - offset or min_offset
 279     This is the position the string must appear at, or not before.
 280     It also implicitly (when combined with minlenp) tells us how many
 281     characters must match before the string we are searching for.
 282     Likewise when combined with minlenp and the length of the string it
 283     tells us how many characters must appear after the string we have
 284     found.
 285
 286   - max_offset
 287     Only used for floating strings. This is the rightmost point that
 288     the string can appear at. If set to I32 max it indicates that the
 289     string can occur infinitely far to the right.
 290
 291   - minlenp
 292     A pointer to the minimum length of the pattern that the string
 293     was found inside. This is important as in the case of positive
 294     lookahead or positive lookbehind we can have multiple patterns
 295     involved. Consider
 296
 297     /(?=FOO).*F/
 298
 299     The minimum length of the pattern overall is 3, the minimum length
 300     of the lookahead part is 3, but the minimum length of the part that
 301     will actually match is 1. So 'FOO's minimum length is 3, but the
 302     minimum length for the F is 1. This is important as the minimum length
 303     is used to determine offsets in front of and behind the string being
 304     looked for.  Since strings can be composites this is the length of the
 305     pattern at the time it was committed with a scan_commit. Note that
 306     the length is calculated by study_chunk, so that the minimum lengths
 307     are not known until the full pattern has been compiled, thus the
 308     pointer to the value.
 309
 310   - lookbehind
 311
 312     In the case of lookbehind the string being searched for can be
 313     offset past the start point of the final matching string.
 314     If this value was just blithely removed from the min_offset it would
 315     invalidate some of the calculations for how many chars must match
 316     before or after (as they are derived from min_offset and minlen and
 317     the length of the string being searched for).
 318     When the final pattern is compiled and the data is moved from the
 319     scan_data_t structure into the regexp structure the information
 320     about lookbehind is factored in, with the information that would
 321     have been lost precalculated in the end_shift field for the
 322     associated string.
 323
 324   The fields pos_min and pos_delta are used to store the minimum offset
 325   and the delta to the maximum offset at the current point in the pattern.
 326
 327 */
 328
 329 typedef struct scan_data_t {
 330     /*I32 len_min;      unused */
 331     /*I32 len_delta;    unused */
 332     I32 pos_min;
 333     I32 pos_delta;
 334     SV *last_found;
 335     I32 last_end;           /* min value, <0 unless valid. */
 336     I32 last_start_min;
 337     I32 last_start_max;
 338     SV **longest;           /* Either &l_fixed, or &l_float. */
 339     SV *longest_fixed;      /* longest fixed string found in pattern */
 340     I32 offset_fixed;       /* offset where it starts */
 341     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 342     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 343     SV *longest_float;      /* longest floating string found in pattern */
 344     I32 offset_float_min;   /* earliest point in string it can appear */
 345     I32 offset_float_max;   /* latest point in string it can appear */
 346     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 347     I32 lookbehind_float;   /* is the position of the string modified by LB */
 348     I32 flags;
 349     I32 whilem_c;
 350     I32 *last_closep;
 351     struct regnode_charclass_class *start_class;
 352 } scan_data_t;
 353
 354 /*
 355  * Forward declarations for pregcomp()'s friends.
 356  */
 357
 358 static const scan_data_t zero_scan_data =
 359   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 360
 361 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 362 #define SF_BEFORE_SEOL          0x0001
 363 #define SF_BEFORE_MEOL          0x0002
 364 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 365 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 366
 367 #ifdef NO_UNARY_PLUS
 368 #  define SF_FIX_SHIFT_EOL      (0+2)
 369 #  define SF_FL_SHIFT_EOL               (0+4)
 370 #else
 371 #  define SF_FIX_SHIFT_EOL      (+2)
 372 #  define SF_FL_SHIFT_EOL               (+4)
 373 #endif
 374
 375 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 376 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 377
 378 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 379 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 380 #define SF_IS_INF               0x0040
 381 #define SF_HAS_PAR              0x0080
 382 #define SF_IN_PAR               0x0100
 383 #define SF_HAS_EVAL             0x0200
 384 #define SCF_DO_SUBSTR           0x0400
 385 #define SCF_DO_STCLASS_AND      0x0800
 386 #define SCF_DO_STCLASS_OR       0x1000
 387 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 388 #define SCF_WHILEM_VISITED_POS  0x2000
 389
 390 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 391 #define SCF_SEEN_ACCEPT         0x8000
 392
 393 #define UTF cBOOL(RExC_utf8)
 394
 395 /* The enums for all these are ordered so things work out correctly */
 396 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 397 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 398 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 399 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 400 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 401 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 402 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 403
 404 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 405
 406 #define OOB_UNICODE             12345678
 407 #define OOB_NAMEDCLASS          -1
 408
 409 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 410 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 411
 412
 413 /* length of regex to show in messages that don't mark a position within */
 414 #define RegexLengthToShowInErrorMessages 127
 415
 416 /*
 417  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 418  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 419  * op/pragma/warn/regcomp.
 420  */
 421 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 422 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 423
 424 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 425
 426 /*
 427  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 428  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 429  * "...".
 430  */
 431 #define _FAIL(code) STMT_START {                                        \
 432     const char *ellipses = "";                                          \
 433     IV len = RExC_end - RExC_precomp;                                   \
 434                                                                         \
 435     if (!SIZE_ONLY)                                                     \
 436         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 437     if (len > RegexLengthToShowInErrorMessages) {                       \
 438         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 439         len = RegexLengthToShowInErrorMessages - 10;                    \
 440         ellipses = "...";                                               \
 441     }                                                                   \
 442     code;                                                               \
 443 } STMT_END
 444
 445 #define FAIL(msg) _FAIL(                            \
 446     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 447             msg, (int)len, RExC_precomp, ellipses))
 448
 449 #define FAIL2(msg,arg) _FAIL(                       \
 450     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 451             arg, (int)len, RExC_precomp, ellipses))
 452
 453 /*
 454  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 455  */
 456 #define Simple_vFAIL(m) STMT_START {                                    \
 457     const IV offset = RExC_parse - RExC_precomp;                        \
 458     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 459             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 460 } STMT_END
 461
 462 /*
 463  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 464  */
 465 #define vFAIL(m) STMT_START {                           \
 466     if (!SIZE_ONLY)                                     \
 467         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 468     Simple_vFAIL(m);                                    \
 469 } STMT_END
 470
 471 /*
 472  * Like Simple_vFAIL(), but accepts two arguments.
 473  */
 474 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 475     const IV offset = RExC_parse - RExC_precomp;                        \
 476     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 477             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 478 } STMT_END
 479
 480 /*
 481  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 482  */
 483 #define vFAIL2(m,a1) STMT_START {                       \
 484     if (!SIZE_ONLY)                                     \
 485         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 486     Simple_vFAIL2(m, a1);                               \
 487 } STMT_END
 488
 489
 490 /*
 491  * Like Simple_vFAIL(), but accepts three arguments.
 492  */
 493 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 494     const IV offset = RExC_parse - RExC_precomp;                \
 495     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 496             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 497 } STMT_END
 498
 499 /*
 500  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 501  */
 502 #define vFAIL3(m,a1,a2) STMT_START {                    \
 503     if (!SIZE_ONLY)                                     \
 504         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 505     Simple_vFAIL3(m, a1, a2);                           \
 506 } STMT_END
 507
 508 /*
 509  * Like Simple_vFAIL(), but accepts four arguments.
 510  */
 511 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 512     const IV offset = RExC_parse - RExC_precomp;                \
 513     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 514             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 515 } STMT_END
 516
 517 #define ckWARNreg(loc,m) STMT_START {                                   \
 518     const IV offset = loc - RExC_precomp;                               \
 519     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 520             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 521 } STMT_END
 522
 523 #define ckWARNregdep(loc,m) STMT_START {                                \
 524     const IV offset = loc - RExC_precomp;                               \
 525     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 526             m REPORT_LOCATION,                                          \
 527             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 528 } STMT_END
 529
 530 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 533             m REPORT_LOCATION,                                          \
 534             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 535 } STMT_END
 536
 537 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 538     const IV offset = loc - RExC_precomp;                               \
 539     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 540             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 541 } STMT_END
 542
 543 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 544     const IV offset = loc - RExC_precomp;                               \
 545     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 546             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 547 } STMT_END
 548
 549 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 550     const IV offset = loc - RExC_precomp;                               \
 551     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 552             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 553 } STMT_END
 554
 555 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 556     const IV offset = loc - RExC_precomp;                               \
 557     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 558             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 559 } STMT_END
 560
 561 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 562     const IV offset = loc - RExC_precomp;                               \
 563     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 564             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 565 } STMT_END
 566
 567 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 568     const IV offset = loc - RExC_precomp;                               \
 569     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 570             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 571 } STMT_END
 572
 573
 574 /* Allow for side effects in s */
 575 #define REGC(c,s) STMT_START {                  \
 576     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 577 } STMT_END
 578
 579 /* Macros for recording node offsets.   20001227 mjd@plover.com
 580  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 581  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 582  * Element 0 holds the number n.
 583  * Position is 1 indexed.
 584  */
 585 #ifndef RE_TRACK_PATTERN_OFFSETS
 586 #define Set_Node_Offset_To_R(node,byte)
 587 #define Set_Node_Offset(node,byte)
 588 #define Set_Cur_Node_Offset
 589 #define Set_Node_Length_To_R(node,len)
 590 #define Set_Node_Length(node,len)
 591 #define Set_Node_Cur_Length(node)
 592 #define Node_Offset(n)
 593 #define Node_Length(n)
 594 #define Set_Node_Offset_Length(node,offset,len)
 595 #define ProgLen(ri) ri->u.proglen
 596 #define SetProgLen(ri,x) ri->u.proglen = x
 597 #else
 598 #define ProgLen(ri) ri->u.offsets[0]
 599 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 600 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 601     if (! SIZE_ONLY) {                                                  \
 602         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 603                     __LINE__, (int)(node), (int)(byte)));               \
 604         if((node) < 0) {                                                \
 605             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 606         } else {                                                        \
 607             RExC_offsets[2*(node)-1] = (byte);                          \
 608         }                                                               \
 609     }                                                                   \
 610 } STMT_END
 611
 612 #define Set_Node_Offset(node,byte) \
 613     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 614 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 615
 616 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 617     if (! SIZE_ONLY) {                                                  \
 618         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 619                 __LINE__, (int)(node), (int)(len)));                    \
 620         if((node) < 0) {                                                \
 621             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 622         } else {                                                        \
 623             RExC_offsets[2*(node)] = (len);                             \
 624         }                                                               \
 625     }                                                                   \
 626 } STMT_END
 627
 628 #define Set_Node_Length(node,len) \
 629     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 630 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 631 #define Set_Node_Cur_Length(node) \
 632     Set_Node_Length(node, RExC_parse - parse_start)
 633
 634 /* Get offsets and lengths */
 635 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 636 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 637
 638 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 639     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 640     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 641 } STMT_END
 642 #endif
 643
 644 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 645 #define EXPERIMENTAL_INPLACESCAN
 646 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 647
 648 #define DEBUG_STUDYDATA(str,data,depth)                              \
 649 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 650     PerlIO_printf(Perl_debug_log,                                    \
 651         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 652         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 653         (int)(depth)*2, "",                                          \
 654         (IV)((data)->pos_min),                                       \
 655         (IV)((data)->pos_delta),                                     \
 656         (UV)((data)->flags),                                         \
 657         (IV)((data)->whilem_c),                                      \
 658         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 659         is_inf ? "INF " : ""                                         \
 660     );                                                               \
 661     if ((data)->last_found)                                          \
 662         PerlIO_printf(Perl_debug_log,                                \
 663             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 664             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 665             SvPVX_const((data)->last_found),                         \
 666             (IV)((data)->last_end),                                  \
 667             (IV)((data)->last_start_min),                            \
 668             (IV)((data)->last_start_max),                            \
 669             ((data)->longest &&                                      \
 670              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 671             SvPVX_const((data)->longest_fixed),                      \
 672             (IV)((data)->offset_fixed),                              \
 673             ((data)->longest &&                                      \
 674              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 675             SvPVX_const((data)->longest_float),                      \
 676             (IV)((data)->offset_float_min),                          \
 677             (IV)((data)->offset_float_max)                           \
 678         );                                                           \
 679     PerlIO_printf(Perl_debug_log,"\n");                              \
 680 });
 681
 682 static void clear_re(pTHX_ void *r);
 683
 684 /* Mark that we cannot extend a found fixed substring at this point.
 685    Update the longest found anchored substring and the longest found
 686    floating substrings if needed. */
 687
 688 STATIC void
 689 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 690 {
 691     const STRLEN l = CHR_SVLEN(data->last_found);
 692     const STRLEN old_l = CHR_SVLEN(*data->longest);
 693     GET_RE_DEBUG_FLAGS_DECL;
 694
 695     PERL_ARGS_ASSERT_SCAN_COMMIT;
 696
 697     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 698         SvSetMagicSV(*data->longest, data->last_found);
 699         if (*data->longest == data->longest_fixed) {
 700             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 701             if (data->flags & SF_BEFORE_EOL)
 702                 data->flags
 703                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 704             else
 705                 data->flags &= ~SF_FIX_BEFORE_EOL;
 706             data->minlen_fixed=minlenp;
 707             data->lookbehind_fixed=0;
 708         }
 709         else { /* *data->longest == data->longest_float */
 710             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 711             data->offset_float_max = (l
 712                                       ? data->last_start_max
 713                                       : data->pos_min + data->pos_delta);
 714             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 715                 data->offset_float_max = I32_MAX;
 716             if (data->flags & SF_BEFORE_EOL)
 717                 data->flags
 718                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 719             else
 720                 data->flags &= ~SF_FL_BEFORE_EOL;
 721             data->minlen_float=minlenp;
 722             data->lookbehind_float=0;
 723         }
 724     }
 725     SvCUR_set(data->last_found, 0);
 726     {
 727         SV * const sv = data->last_found;
 728         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 729             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 730             if (mg)
 731                 mg->mg_len = 0;
 732         }
 733     }
 734     data->last_end = -1;
 735     data->flags &= ~SF_BEFORE_EOL;
 736     DEBUG_STUDYDATA("commit: ",data,0);
 737 }
 738
 739 /* Can match anything (initialization) */
 740 STATIC void
 741 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 742 {
 743     PERL_ARGS_ASSERT_CL_ANYTHING;
 744
 745     ANYOF_BITMAP_SETALL(cl);
 746     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 747                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 748
 749     /* If any portion of the regex is to operate under locale rules,
 750      * initialization includes it.  The reason this isn't done for all regexes
 751      * is that the optimizer was written under the assumption that locale was
 752      * all-or-nothing.  Given the complexity and lack of documentation in the
 753      * optimizer, and that there are inadequate test cases for locale, so many
 754      * parts of it may not work properly, it is safest to avoid locale unless
 755      * necessary. */
 756     if (RExC_contains_locale) {
 757         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 758         cl->flags |= ANYOF_LOCALE;
 759     }
 760     else {
 761         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 762     }
 763 }
 764
 765 /* Can match anything (initialization) */
 766 STATIC int
 767 S_cl_is_anything(const struct regnode_charclass_class *cl)
 768 {
 769     int value;
 770
 771     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 772
 773     for (value = 0; value <= ANYOF_MAX; value += 2)
 774         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 775             return 1;
 776     if (!(cl->flags & ANYOF_UNICODE_ALL))
 777         return 0;
 778     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 779         return 0;
 780     return 1;
 781 }
 782
 783 /* Can match anything (initialization) */
 784 STATIC void
 785 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 786 {
 787     PERL_ARGS_ASSERT_CL_INIT;
 788
 789     Zero(cl, 1, struct regnode_charclass_class);
 790     cl->type = ANYOF;
 791     cl_anything(pRExC_state, cl);
 792     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 793 }
 794
 795 /* These two functions currently do the exact same thing */
 796 #define cl_init_zero            S_cl_init
 797
 798 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 799  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 800  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 801 STATIC void
 802 S_cl_and(struct regnode_charclass_class *cl,
 803         const struct regnode_charclass_class *and_with)
 804 {
 805     PERL_ARGS_ASSERT_CL_AND;
 806
 807     assert(and_with->type == ANYOF);
 808
 809     /* I (khw) am not sure all these restrictions are necessary XXX */
 810     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 811         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 812         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 813         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 814         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 815         int i;
 816
 817         if (and_with->flags & ANYOF_INVERT)
 818             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 819                 cl->bitmap[i] &= ~and_with->bitmap[i];
 820         else
 821             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 822                 cl->bitmap[i] &= and_with->bitmap[i];
 823     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 824
 825     if (and_with->flags & ANYOF_INVERT) {
 826
 827         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 828          * aren't affected by the inversion.  Those that are affected are
 829          * handled individually below */
 830         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 831         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 832         cl->flags |= affected_flags;
 833
 834         /* We currently don't know how to deal with things that aren't in the
 835          * bitmap, but we know that the intersection is no greater than what
 836          * is already in cl, so let there be false positives that get sorted
 837          * out after the synthetic start class succeeds, and the node is
 838          * matched for real. */
 839
 840         /* The inversion of these two flags indicate that the resulting
 841          * intersection doesn't have them */
 842         if (and_with->flags & ANYOF_UNICODE_ALL) {
 843             cl->flags &= ~ANYOF_UNICODE_ALL;
 844         }
 845         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 846             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 847         }
 848     }
 849     else {   /* and'd node is not inverted */
 850         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 851
 852         if (! ANYOF_NONBITMAP(and_with)) {
 853
 854             /* Here 'and_with' doesn't match anything outside the bitmap
 855              * (except possibly ANYOF_UNICODE_ALL), which means the
 856              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 857              * which case we don't know what the intersection is, but it's no
 858              * greater than what cl already has, so can just leave it alone,
 859              * with possible false positives */
 860             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 861                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 862                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 863             }
 864         }
 865         else if (! ANYOF_NONBITMAP(cl)) {
 866
 867             /* Here, 'and_with' does match something outside the bitmap, and cl
 868              * doesn't have a list of things to match outside the bitmap.  If
 869              * cl can match all code points above 255, the intersection will
 870              * be those above-255 code points that 'and_with' matches.  If cl
 871              * can't match all Unicode code points, it means that it can't
 872              * match anything outside the bitmap (since the 'if' that got us
 873              * into this block tested for that), so we leave the bitmap empty.
 874              */
 875             if (cl->flags & ANYOF_UNICODE_ALL) {
 876                 ARG_SET(cl, ARG(and_with));
 877
 878                 /* and_with's ARG may match things that don't require UTF8.
 879                  * And now cl's will too, in spite of this being an 'and'.  See
 880                  * the comments below about the kludge */
 881                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 882             }
 883         }
 884         else {
 885             /* Here, both 'and_with' and cl match something outside the
 886              * bitmap.  Currently we do not do the intersection, so just match
 887              * whatever cl had at the beginning.  */
 888         }
 889
 890
 891         /* Take the intersection of the two sets of flags.  However, the
 892          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 893          * kludge around the fact that this flag is not treated like the others
 894          * which are initialized in cl_anything().  The way the optimizer works
 895          * is that the synthetic start class (SSC) is initialized to match
 896          * anything, and then the first time a real node is encountered, its
 897          * values are AND'd with the SSC's with the result being the values of
 898          * the real node.  However, there are paths through the optimizer where
 899          * the AND never gets called, so those initialized bits are set
 900          * inappropriately, which is not usually a big deal, as they just cause
 901          * false positives in the SSC, which will just mean a probably
 902          * imperceptible slow down in execution.  However this bit has a
 903          * higher false positive consequence in that it can cause utf8.pm,
 904          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 905          * bigger slowdown and also causes significant extra memory to be used.
 906          * In order to prevent this, the code now takes a different tack.  The
 907          * bit isn't set unless some part of the regular expression needs it,
 908          * but once set it won't get cleared.  This means that these extra
 909          * modules won't get loaded unless there was some path through the
 910          * pattern that would have required them anyway, and  so any false
 911          * positives that occur by not ANDing them out when they could be
 912          * aren't as severe as they would be if we treated this bit like all
 913          * the others */
 914         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 915                                       & ANYOF_NONBITMAP_NON_UTF8;
 916         cl->flags &= and_with->flags;
 917         cl->flags |= outside_bitmap_but_not_utf8;
 918     }
 919 }
 920
 921 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 922  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 923  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 924 STATIC void
 925 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 926 {
 927     PERL_ARGS_ASSERT_CL_OR;
 928
 929     if (or_with->flags & ANYOF_INVERT) {
 930
 931         /* Here, the or'd node is to be inverted.  This means we take the
 932          * complement of everything not in the bitmap, but currently we don't
 933          * know what that is, so give up and match anything */
 934         if (ANYOF_NONBITMAP(or_with)) {
 935             cl_anything(pRExC_state, cl);
 936         }
 937         /* We do not use
 938          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 939          *   <= (B1 | !B2) | (CL1 | !CL2)
 940          * which is wasteful if CL2 is small, but we ignore CL2:
 941          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 942          * XXXX Can we handle case-fold?  Unclear:
 943          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 944          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 945          */
 946         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 947              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 948              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 949             int i;
 950
 951             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 952                 cl->bitmap[i] |= ~or_with->bitmap[i];
 953         } /* XXXX: logic is complicated otherwise */
 954         else {
 955             cl_anything(pRExC_state, cl);
 956         }
 957
 958         /* And, we can just take the union of the flags that aren't affected
 959          * by the inversion */
 960         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 961
 962         /* For the remaining flags:
 963             ANYOF_UNICODE_ALL and inverted means to not match anything above
 964                     255, which means that the union with cl should just be
 965                     what cl has in it, so can ignore this flag
 966             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 967                     is 127-255 to match them, but then invert that, so the
 968                     union with cl should just be what cl has in it, so can
 969                     ignore this flag
 970          */
 971     } else {    /* 'or_with' is not inverted */
 972         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 973         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 974              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 975                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 976             int i;
 977
 978             /* OR char bitmap and class bitmap separately */
 979             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 980                 cl->bitmap[i] |= or_with->bitmap[i];
 981             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 982                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 983                     cl->classflags[i] |= or_with->classflags[i];
 984                 cl->flags |= ANYOF_CLASS;
 985             }
 986         }
 987         else { /* XXXX: logic is complicated, leave it along for a moment. */
 988             cl_anything(pRExC_state, cl);
 989         }
 990
 991         if (ANYOF_NONBITMAP(or_with)) {
 992
 993             /* Use the added node's outside-the-bit-map match if there isn't a
 994              * conflict.  If there is a conflict (both nodes match something
 995              * outside the bitmap, but what they match outside is not the same
 996              * pointer, and hence not easily compared until XXX we extend
 997              * inversion lists this far), give up and allow the start class to
 998              * match everything outside the bitmap.  If that stuff is all above
 999              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1000             if (! ANYOF_NONBITMAP(cl)) {
1001                 ARG_SET(cl, ARG(or_with));
1002             }
1003             else if (ARG(cl) != ARG(or_with)) {
1004
1005                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1006                     cl_anything(pRExC_state, cl);
1007                 }
1008                 else {
1009                     cl->flags |= ANYOF_UNICODE_ALL;
1010                 }
1011             }
1012         }
1013
1014         /* Take the union */
1015         cl->flags |= or_with->flags;
1016     }
1017 }
1018
1019 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1020 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1021 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1022 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1023
1024
1025 #ifdef DEBUGGING
1026 /*
1027    dump_trie(trie,widecharmap,revcharmap)
1028    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1029    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1030
1031    These routines dump out a trie in a somewhat readable format.
1032    The _interim_ variants are used for debugging the interim
1033    tables that are used to generate the final compressed
1034    representation which is what dump_trie expects.
1035
1036    Part of the reason for their existence is to provide a form
1037    of documentation as to how the different representations function.
1038
1039 */
1040
1041 /*
1042   Dumps the final compressed table form of the trie to Perl_debug_log.
1043   Used for debugging make_trie().
1044 */
1045
1046 STATIC void
1047 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1048             AV *revcharmap, U32 depth)
1049 {
1050     U32 state;
1051     SV *sv=sv_newmortal();
1052     int colwidth= widecharmap ? 6 : 4;
1053     U16 word;
1054     GET_RE_DEBUG_FLAGS_DECL;
1055
1056     PERL_ARGS_ASSERT_DUMP_TRIE;
1057
1058     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1059         (int)depth * 2 + 2,"",
1060         "Match","Base","Ofs" );
1061
1062     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1063         SV ** const tmp = av_fetch( revcharmap, state, 0);
1064         if ( tmp ) {
1065             PerlIO_printf( Perl_debug_log, "%*s",
1066                 colwidth,
1067                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1068                             PL_colors[0], PL_colors[1],
1069                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1070                             PERL_PV_ESCAPE_FIRSTCHAR
1071                 )
1072             );
1073         }
1074     }
1075     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1076         (int)depth * 2 + 2,"");
1077
1078     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1079         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1080     PerlIO_printf( Perl_debug_log, "\n");
1081
1082     for( state = 1 ; state < trie->statecount ; state++ ) {
1083         const U32 base = trie->states[ state ].trans.base;
1084
1085         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1086
1087         if ( trie->states[ state ].wordnum ) {
1088             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1089         } else {
1090             PerlIO_printf( Perl_debug_log, "%6s", "" );
1091         }
1092
1093         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1094
1095         if ( base ) {
1096             U32 ofs = 0;
1097
1098             while( ( base + ofs  < trie->uniquecharcount ) ||
1099                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1100                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1101                     ofs++;
1102
1103             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1104
1105             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1106                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1107                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1108                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1109                 {
1110                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1111                     colwidth,
1112                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1113                 } else {
1114                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1115                 }
1116             }
1117
1118             PerlIO_printf( Perl_debug_log, "]");
1119
1120         }
1121         PerlIO_printf( Perl_debug_log, "\n" );
1122     }
1123     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1124     for (word=1; word <= trie->wordcount; word++) {
1125         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1126             (int)word, (int)(trie->wordinfo[word].prev),
1127             (int)(trie->wordinfo[word].len));
1128     }
1129     PerlIO_printf(Perl_debug_log, "\n" );
1130 }
1131 /*
1132   Dumps a fully constructed but uncompressed trie in list form.
1133   List tries normally only are used for construction when the number of
1134   possible chars (trie->uniquecharcount) is very high.
1135   Used for debugging make_trie().
1136 */
1137 STATIC void
1138 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1139                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1140                          U32 depth)
1141 {
1142     U32 state;
1143     SV *sv=sv_newmortal();
1144     int colwidth= widecharmap ? 6 : 4;
1145     GET_RE_DEBUG_FLAGS_DECL;
1146
1147     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1148
1149     /* print out the table precompression.  */
1150     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1151         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1152         "------:-----+-----------------\n" );
1153
1154     for( state=1 ; state < next_alloc ; state ++ ) {
1155         U16 charid;
1156
1157         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1158             (int)depth * 2 + 2,"", (UV)state  );
1159         if ( ! trie->states[ state ].wordnum ) {
1160             PerlIO_printf( Perl_debug_log, "%5s| ","");
1161         } else {
1162             PerlIO_printf( Perl_debug_log, "W%4x| ",
1163                 trie->states[ state ].wordnum
1164             );
1165         }
1166         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1167             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1168             if ( tmp ) {
1169                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1170                     colwidth,
1171                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1172                             PL_colors[0], PL_colors[1],
1173                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1174                             PERL_PV_ESCAPE_FIRSTCHAR
1175                     ) ,
1176                     TRIE_LIST_ITEM(state,charid).forid,
1177                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1178                 );
1179                 if (!(charid % 10))
1180                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1181                         (int)((depth * 2) + 14), "");
1182             }
1183         }
1184         PerlIO_printf( Perl_debug_log, "\n");
1185     }
1186 }
1187
1188 /*
1189   Dumps a fully constructed but uncompressed trie in table form.
1190   This is the normal DFA style state transition table, with a few
1191   twists to facilitate compression later.
1192   Used for debugging make_trie().
1193 */
1194 STATIC void
1195 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1196                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1197                           U32 depth)
1198 {
1199     U32 state;
1200     U16 charid;
1201     SV *sv=sv_newmortal();
1202     int colwidth= widecharmap ? 6 : 4;
1203     GET_RE_DEBUG_FLAGS_DECL;
1204
1205     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1206
1207     /*
1208        print out the table precompression so that we can do a visual check
1209        that they are identical.
1210      */
1211
1212     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1213
1214     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1215         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1216         if ( tmp ) {
1217             PerlIO_printf( Perl_debug_log, "%*s",
1218                 colwidth,
1219                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1220                             PL_colors[0], PL_colors[1],
1221                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1222                             PERL_PV_ESCAPE_FIRSTCHAR
1223                 )
1224             );
1225         }
1226     }
1227
1228     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1229
1230     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1231         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1232     }
1233
1234     PerlIO_printf( Perl_debug_log, "\n" );
1235
1236     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1237
1238         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1239             (int)depth * 2 + 2,"",
1240             (UV)TRIE_NODENUM( state ) );
1241
1242         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1243             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1244             if (v)
1245                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1246             else
1247                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1248         }
1249         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1250             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1251         } else {
1252             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1253             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1254         }
1255     }
1256 }
1257
1258 #endif
1259
1260
1261 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1262   startbranch: the first branch in the whole branch sequence
1263   first      : start branch of sequence of branch-exact nodes.
1264                May be the same as startbranch
1265   last       : Thing following the last branch.
1266                May be the same as tail.
1267   tail       : item following the branch sequence
1268   count      : words in the sequence
1269   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1270   depth      : indent depth
1271
1272 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1273
1274 A trie is an N'ary tree where the branches are determined by digital
1275 decomposition of the key. IE, at the root node you look up the 1st character and
1276 follow that branch repeat until you find the end of the branches. Nodes can be
1277 marked as "accepting" meaning they represent a complete word. Eg:
1278
1279   /he|she|his|hers/
1280
1281 would convert into the following structure. Numbers represent states, letters
1282 following numbers represent valid transitions on the letter from that state, if
1283 the number is in square brackets it represents an accepting state, otherwise it
1284 will be in parenthesis.
1285
1286       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1287       |    |
1288       |   (2)
1289       |    |
1290      (1)   +-i->(6)-+-s->[7]
1291       |
1292       +-s->(3)-+-h->(4)-+-e->[5]
1293
1294       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1295
1296 This shows that when matching against the string 'hers' we will begin at state 1
1297 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1298 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1299 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1300 single traverse. We store a mapping from accepting to state to which word was
1301 matched, and then when we have multiple possibilities we try to complete the
1302 rest of the regex in the order in which they occured in the alternation.
1303
1304 The only prior NFA like behaviour that would be changed by the TRIE support is
1305 the silent ignoring of duplicate alternations which are of the form:
1306
1307  / (DUPE|DUPE) X? (?{ ... }) Y /x
1308
1309 Thus EVAL blocks following a trie may be called a different number of times with
1310 and without the optimisation. With the optimisations dupes will be silently
1311 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1312 the following demonstrates:
1313
1314  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1315
1316 which prints out 'word' three times, but
1317
1318  'words'=~/(word|word|word)(?{ print $1 })S/
1319
1320 which doesnt print it out at all. This is due to other optimisations kicking in.
1321
1322 Example of what happens on a structural level:
1323
1324 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1325
1326    1: CURLYM[1] {1,32767}(18)
1327    5:   BRANCH(8)
1328    6:     EXACT <ac>(16)
1329    8:   BRANCH(11)
1330    9:     EXACT <ad>(16)
1331   11:   BRANCH(14)
1332   12:     EXACT <ab>(16)
1333   16:   SUCCEED(0)
1334   17:   NOTHING(18)
1335   18: END(0)
1336
1337 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1338 and should turn into:
1339
1340    1: CURLYM[1] {1,32767}(18)
1341    5:   TRIE(16)
1342         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1343           <ac>
1344           <ad>
1345           <ab>
1346   16:   SUCCEED(0)
1347   17:   NOTHING(18)
1348   18: END(0)
1349
1350 Cases where tail != last would be like /(?foo|bar)baz/:
1351
1352    1: BRANCH(4)
1353    2:   EXACT <foo>(8)
1354    4: BRANCH(7)
1355    5:   EXACT <bar>(8)
1356    7: TAIL(8)
1357    8: EXACT <baz>(10)
1358   10: END(0)
1359
1360 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1361 and would end up looking like:
1362
1363     1: TRIE(8)
1364       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1365         <foo>
1366         <bar>
1367    7: TAIL(8)
1368    8: EXACT <baz>(10)
1369   10: END(0)
1370
1371     d = uvuni_to_utf8_flags(d, uv, 0);
1372
1373 is the recommended Unicode-aware way of saying
1374
1375     *(d++) = uv;
1376 */
1377
1378 #define TRIE_STORE_REVCHAR(val)                                            \
1379     STMT_START {                                                           \
1380         if (UTF) {                                                         \
1381             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1382             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1383             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1384             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1385             SvPOK_on(zlopp);                                               \
1386             SvUTF8_on(zlopp);                                              \
1387             av_push(revcharmap, zlopp);                                    \
1388         } else {                                                           \
1389             char ooooff = (char)val;                                           \
1390             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1391         }                                                                  \
1392         } STMT_END
1393
1394 #define TRIE_READ_CHAR STMT_START {                                                     \
1395     wordlen++;                                                                          \
1396     if ( UTF ) {                                                                        \
1397         /* if it is UTF then it is either already folded, or does not need folding */   \
1398         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1399     }                                                                                   \
1400     else if (folder == PL_fold_latin1) {                                                \
1401         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1402         if ( foldlen > 0 ) {                                                            \
1403            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1404            foldlen -= len;                                                              \
1405            scan += len;                                                                 \
1406            len = 0;                                                                     \
1407         } else {                                                                        \
1408             len = 1;                                                                    \
1409             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1410             skiplen = UNISKIP(uvc);                                                     \
1411             foldlen -= skiplen;                                                         \
1412             scan = foldbuf + skiplen;                                                   \
1413         }                                                                               \
1414     } else {                                                                            \
1415         /* raw data, will be folded later if needed */                                  \
1416         uvc = (U32)*uc;                                                                 \
1417         len = 1;                                                                        \
1418     }                                                                                   \
1419 } STMT_END
1420
1421
1422
1423 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1424     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1425         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1426         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1427     }                                                           \
1428     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1429     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1430     TRIE_LIST_CUR( state )++;                                   \
1431 } STMT_END
1432
1433 #define TRIE_LIST_NEW(state) STMT_START {                       \
1434     Newxz( trie->states[ state ].trans.list,               \
1435         4, reg_trie_trans_le );                                 \
1436      TRIE_LIST_CUR( state ) = 1;                                \
1437      TRIE_LIST_LEN( state ) = 4;                                \
1438 } STMT_END
1439
1440 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1441     U16 dupe= trie->states[ state ].wordnum;                    \
1442     regnode * const noper_next = regnext( noper );              \
1443                                                                 \
1444     DEBUG_r({                                                   \
1445         /* store the word for dumping */                        \
1446         SV* tmp;                                                \
1447         if (OP(noper) != NOTHING)                               \
1448             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1449         else                                                    \
1450             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1451         av_push( trie_words, tmp );                             \
1452     });                                                         \
1453                                                                 \
1454     curword++;                                                  \
1455     trie->wordinfo[curword].prev   = 0;                         \
1456     trie->wordinfo[curword].len    = wordlen;                   \
1457     trie->wordinfo[curword].accept = state;                     \
1458                                                                 \
1459     if ( noper_next < tail ) {                                  \
1460         if (!trie->jump)                                        \
1461             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1462         trie->jump[curword] = (U16)(noper_next - convert);      \
1463         if (!jumper)                                            \
1464             jumper = noper_next;                                \
1465         if (!nextbranch)                                        \
1466             nextbranch= regnext(cur);                           \
1467     }                                                           \
1468                                                                 \
1469     if ( dupe ) {                                               \
1470         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1471         /* chain, so that when the bits of chain are later    */\
1472         /* linked together, the dups appear in the chain      */\
1473         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1474         trie->wordinfo[dupe].prev = curword;                    \
1475     } else {                                                    \
1476         /* we haven't inserted this word yet.                */ \
1477         trie->states[ state ].wordnum = curword;                \
1478     }                                                           \
1479 } STMT_END
1480
1481
1482 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1483      ( ( base + charid >=  ucharcount                                   \
1484          && base + charid < ubound                                      \
1485          && state == trie->trans[ base - ucharcount + charid ].check    \
1486          && trie->trans[ base - ucharcount + charid ].next )            \
1487            ? trie->trans[ base - ucharcount + charid ].next             \
1488            : ( state==1 ? special : 0 )                                 \
1489       )
1490
1491 #define MADE_TRIE       1
1492 #define MADE_JUMP_TRIE  2
1493 #define MADE_EXACT_TRIE 4
1494
1495 STATIC I32
1496 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1497 {
1498     dVAR;
1499     /* first pass, loop through and scan words */
1500     reg_trie_data *trie;
1501     HV *widecharmap = NULL;
1502     AV *revcharmap = newAV();
1503     regnode *cur;
1504     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1505     STRLEN len = 0;
1506     UV uvc = 0;
1507     U16 curword = 0;
1508     U32 next_alloc = 0;
1509     regnode *jumper = NULL;
1510     regnode *nextbranch = NULL;
1511     regnode *convert = NULL;
1512     U32 *prev_states; /* temp array mapping each state to previous one */
1513     /* we just use folder as a flag in utf8 */
1514     const U8 * folder = NULL;
1515
1516 #ifdef DEBUGGING
1517     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1518     AV *trie_words = NULL;
1519     /* along with revcharmap, this only used during construction but both are
1520      * useful during debugging so we store them in the struct when debugging.
1521      */
1522 #else
1523     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1524     STRLEN trie_charcount=0;
1525 #endif
1526     SV *re_trie_maxbuff;
1527     GET_RE_DEBUG_FLAGS_DECL;
1528
1529     PERL_ARGS_ASSERT_MAKE_TRIE;
1530 #ifndef DEBUGGING
1531     PERL_UNUSED_ARG(depth);
1532 #endif
1533
1534     switch (flags) {
1535         case EXACT: break;
1536         case EXACTFA:
1537         case EXACTFU_SS:
1538         case EXACTFU_TRICKYFOLD:
1539         case EXACTFU: folder = PL_fold_latin1; break;
1540         case EXACTF:  folder = PL_fold; break;
1541         case EXACTFL: folder = PL_fold_locale; break;
1542         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1543     }
1544
1545     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1546     trie->refcount = 1;
1547     trie->startstate = 1;
1548     trie->wordcount = word_count;
1549     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1550     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1551     if (flags == EXACT)
1552         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1553     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1554                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1555
1556     DEBUG_r({
1557         trie_words = newAV();
1558     });
1559
1560     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1561     if (!SvIOK(re_trie_maxbuff)) {
1562         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1563     }
1564     DEBUG_TRIE_COMPILE_r({
1565                 PerlIO_printf( Perl_debug_log,
1566                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1567                   (int)depth * 2 + 2, "",
1568                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1569                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1570                   (int)depth);
1571     });
1572
1573    /* Find the node we are going to overwrite */
1574     if ( first == startbranch && OP( last ) != BRANCH ) {
1575         /* whole branch chain */
1576         convert = first;
1577     } else {
1578         /* branch sub-chain */
1579         convert = NEXTOPER( first );
1580     }
1581
1582     /*  -- First loop and Setup --
1583
1584        We first traverse the branches and scan each word to determine if it
1585        contains widechars, and how many unique chars there are, this is
1586        important as we have to build a table with at least as many columns as we
1587        have unique chars.
1588
1589        We use an array of integers to represent the character codes 0..255
1590        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1591        native representation of the character value as the key and IV's for the
1592        coded index.
1593
1594        *TODO* If we keep track of how many times each character is used we can
1595        remap the columns so that the table compression later on is more
1596        efficient in terms of memory by ensuring the most common value is in the
1597        middle and the least common are on the outside.  IMO this would be better
1598        than a most to least common mapping as theres a decent chance the most
1599        common letter will share a node with the least common, meaning the node
1600        will not be compressible. With a middle is most common approach the worst
1601        case is when we have the least common nodes twice.
1602
1603      */
1604
1605     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1606         regnode *noper = NEXTOPER( cur );
1607         const U8 *uc = (U8*)STRING( noper );
1608         const U8 *e  = uc + STR_LEN( noper );
1609         STRLEN foldlen = 0;
1610         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1611         STRLEN skiplen = 0;
1612         const U8 *scan = (U8*)NULL;
1613         U32 wordlen      = 0;         /* required init */
1614         STRLEN chars = 0;
1615         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1616
1617         if (OP(noper) == NOTHING) {
1618             regnode *noper_next= regnext(noper);
1619             if (noper_next != tail && OP(noper_next) == flags) {
1620                 noper = noper_next;
1621                 uc= (U8*)STRING(noper);
1622                 e= uc + STR_LEN(noper);
1623                 trie->minlen= STR_LEN(noper);
1624             } else {
1625                 trie->minlen= 0;
1626                 continue;
1627             }
1628         }
1629
1630         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1631             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1632                                           regardless of encoding */
1633             if (OP( noper ) == EXACTFU_SS) {
1634                 /* false positives are ok, so just set this */
1635                 TRIE_BITMAP_SET(trie,0xDF);
1636             }
1637         }
1638         for ( ; uc < e ; uc += len ) {
1639             TRIE_CHARCOUNT(trie)++;
1640             TRIE_READ_CHAR;
1641             chars++;
1642             if ( uvc < 256 ) {
1643                 if ( folder ) {
1644                     U8 folded= folder[ (U8) uvc ];
1645                     if ( !trie->charmap[ folded ] ) {
1646                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1647                         TRIE_STORE_REVCHAR( folded );
1648                     }
1649                 }
1650                 if ( !trie->charmap[ uvc ] ) {
1651                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1652                     TRIE_STORE_REVCHAR( uvc );
1653                 }
1654                 if ( set_bit ) {
1655                     /* store the codepoint in the bitmap, and its folded
1656                      * equivalent. */
1657                     TRIE_BITMAP_SET(trie, uvc);
1658
1659                     /* store the folded codepoint */
1660                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1661
1662                     if ( !UTF ) {
1663                         /* store first byte of utf8 representation of
1664                            variant codepoints */
1665                         if (! UNI_IS_INVARIANT(uvc)) {
1666                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1667                         }
1668                     }
1669                     set_bit = 0; /* We've done our bit :-) */
1670                 }
1671             } else {
1672                 SV** svpp;
1673                 if ( !widecharmap )
1674                     widecharmap = newHV();
1675
1676                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1677
1678                 if ( !svpp )
1679                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1680
1681                 if ( !SvTRUE( *svpp ) ) {
1682                     sv_setiv( *svpp, ++trie->uniquecharcount );
1683                     TRIE_STORE_REVCHAR(uvc);
1684                 }
1685             }
1686         }
1687         if( cur == first ) {
1688             trie->minlen = chars;
1689             trie->maxlen = chars;
1690         } else if (chars < trie->minlen) {
1691             trie->minlen = chars;
1692         } else if (chars > trie->maxlen) {
1693             trie->maxlen = chars;
1694         }
1695         if (OP( noper ) == EXACTFU_SS) {
1696             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1697             if (trie->minlen > 1)
1698                 trie->minlen= 1;
1699         }
1700         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1701             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1702              *                - We assume that any such sequence might match a 2 byte string */
1703             if (trie->minlen > 2 )
1704                 trie->minlen= 2;
1705         }
1706
1707     } /* end first pass */
1708     DEBUG_TRIE_COMPILE_r(
1709         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1710                 (int)depth * 2 + 2,"",
1711                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1712                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1713                 (int)trie->minlen, (int)trie->maxlen )
1714     );
1715
1716     /*
1717         We now know what we are dealing with in terms of unique chars and
1718         string sizes so we can calculate how much memory a naive
1719         representation using a flat table  will take. If it's over a reasonable
1720         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1721         conservative but potentially much slower representation using an array
1722         of lists.
1723
1724         At the end we convert both representations into the same compressed
1725         form that will be used in regexec.c for matching with. The latter
1726         is a form that cannot be used to construct with but has memory
1727         properties similar to the list form and access properties similar
1728         to the table form making it both suitable for fast searches and
1729         small enough that its feasable to store for the duration of a program.
1730
1731         See the comment in the code where the compressed table is produced
1732         inplace from the flat tabe representation for an explanation of how
1733         the compression works.
1734
1735     */
1736
1737
1738     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1739     prev_states[1] = 0;
1740
1741     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1742         /*
1743             Second Pass -- Array Of Lists Representation
1744
1745             Each state will be represented by a list of charid:state records
1746             (reg_trie_trans_le) the first such element holds the CUR and LEN
1747             points of the allocated array. (See defines above).
1748
1749             We build the initial structure using the lists, and then convert
1750             it into the compressed table form which allows faster lookups
1751             (but cant be modified once converted).
1752         */
1753
1754         STRLEN transcount = 1;
1755
1756         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1757             "%*sCompiling trie using list compiler\n",
1758             (int)depth * 2 + 2, ""));
1759
1760         trie->states = (reg_trie_state *)
1761             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1762                                   sizeof(reg_trie_state) );
1763         TRIE_LIST_NEW(1);
1764         next_alloc = 2;
1765
1766         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1767
1768             regnode *noper   = NEXTOPER( cur );
1769             U8 *uc           = (U8*)STRING( noper );
1770             const U8 *e      = uc + STR_LEN( noper );
1771             U32 state        = 1;         /* required init */
1772             U16 charid       = 0;         /* sanity init */
1773             U8 *scan         = (U8*)NULL; /* sanity init */
1774             STRLEN foldlen   = 0;         /* required init */
1775             U32 wordlen      = 0;         /* required init */
1776             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1777             STRLEN skiplen   = 0;
1778
1779             if (OP(noper) == NOTHING) {
1780                 regnode *noper_next= regnext(noper);
1781                 if (noper_next != tail && OP(noper_next) == flags) {
1782                     noper = noper_next;
1783                     uc= (U8*)STRING(noper);
1784                     e= uc + STR_LEN(noper);
1785                 }
1786             }
1787
1788             if (OP(noper) != NOTHING) {
1789                 for ( ; uc < e ; uc += len ) {
1790
1791                     TRIE_READ_CHAR;
1792
1793                     if ( uvc < 256 ) {
1794                         charid = trie->charmap[ uvc ];
1795                     } else {
1796                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1797                         if ( !svpp ) {
1798                             charid = 0;
1799                         } else {
1800                             charid=(U16)SvIV( *svpp );
1801                         }
1802                     }
1803                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1804                     if ( charid ) {
1805
1806                         U16 check;
1807                         U32 newstate = 0;
1808
1809                         charid--;
1810                         if ( !trie->states[ state ].trans.list ) {
1811                             TRIE_LIST_NEW( state );
1812                         }
1813                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1814                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1815                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1816                                 break;
1817                             }
1818                         }
1819                         if ( ! newstate ) {
1820                             newstate = next_alloc++;
1821                             prev_states[newstate] = state;
1822                             TRIE_LIST_PUSH( state, charid, newstate );
1823                             transcount++;
1824                         }
1825                         state = newstate;
1826                     } else {
1827                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1828                     }
1829                 }
1830             }
1831             TRIE_HANDLE_WORD(state);
1832
1833         } /* end second pass */
1834
1835         /* next alloc is the NEXT state to be allocated */
1836         trie->statecount = next_alloc;
1837         trie->states = (reg_trie_state *)
1838             PerlMemShared_realloc( trie->states,
1839                                    next_alloc
1840                                    * sizeof(reg_trie_state) );
1841
1842         /* and now dump it out before we compress it */
1843         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1844                                                          revcharmap, next_alloc,
1845                                                          depth+1)
1846         );
1847
1848         trie->trans = (reg_trie_trans *)
1849             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1850         {
1851             U32 state;
1852             U32 tp = 0;
1853             U32 zp = 0;
1854
1855
1856             for( state=1 ; state < next_alloc ; state ++ ) {
1857                 U32 base=0;
1858
1859                 /*
1860                 DEBUG_TRIE_COMPILE_MORE_r(
1861                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1862                 );
1863                 */
1864
1865                 if (trie->states[state].trans.list) {
1866                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1867                     U16 maxid=minid;
1868                     U16 idx;
1869
1870                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1871                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1872                         if ( forid < minid ) {
1873                             minid=forid;
1874                         } else if ( forid > maxid ) {
1875                             maxid=forid;
1876                         }
1877                     }
1878                     if ( transcount < tp + maxid - minid + 1) {
1879                         transcount *= 2;
1880                         trie->trans = (reg_trie_trans *)
1881                             PerlMemShared_realloc( trie->trans,
1882                                                      transcount
1883                                                      * sizeof(reg_trie_trans) );
1884                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1885                     }
1886                     base = trie->uniquecharcount + tp - minid;
1887                     if ( maxid == minid ) {
1888                         U32 set = 0;
1889                         for ( ; zp < tp ; zp++ ) {
1890                             if ( ! trie->trans[ zp ].next ) {
1891                                 base = trie->uniquecharcount + zp - minid;
1892                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1893                                 trie->trans[ zp ].check = state;
1894                                 set = 1;
1895                                 break;
1896                             }
1897                         }
1898                         if ( !set ) {
1899                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1900                             trie->trans[ tp ].check = state;
1901                             tp++;
1902                             zp = tp;
1903                         }
1904                     } else {
1905                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1906                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1907                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1908                             trie->trans[ tid ].check = state;
1909                         }
1910                         tp += ( maxid - minid + 1 );
1911                     }
1912                     Safefree(trie->states[ state ].trans.list);
1913                 }
1914                 /*
1915                 DEBUG_TRIE_COMPILE_MORE_r(
1916                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1917                 );
1918                 */
1919                 trie->states[ state ].trans.base=base;
1920             }
1921             trie->lasttrans = tp + 1;
1922         }
1923     } else {
1924         /*
1925            Second Pass -- Flat Table Representation.
1926
1927            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1928            We know that we will need Charcount+1 trans at most to store the data
1929            (one row per char at worst case) So we preallocate both structures
1930            assuming worst case.
1931
1932            We then construct the trie using only the .next slots of the entry
1933            structs.
1934
1935            We use the .check field of the first entry of the node temporarily to
1936            make compression both faster and easier by keeping track of how many non
1937            zero fields are in the node.
1938
1939            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1940            transition.
1941
1942            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1943            number representing the first entry of the node, and state as a
1944            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1945            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1946            are 2 entrys per node. eg:
1947
1948              A B       A B
1949           1. 2 4    1. 3 7
1950           2. 0 3    3. 0 5
1951           3. 0 0    5. 0 0
1952           4. 0 0    7. 0 0
1953
1954            The table is internally in the right hand, idx form. However as we also
1955            have to deal with the states array which is indexed by nodenum we have to
1956            use TRIE_NODENUM() to convert.
1957
1958         */
1959         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1960             "%*sCompiling trie using table compiler\n",
1961             (int)depth * 2 + 2, ""));
1962
1963         trie->trans = (reg_trie_trans *)
1964             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1965                                   * trie->uniquecharcount + 1,
1966                                   sizeof(reg_trie_trans) );
1967         trie->states = (reg_trie_state *)
1968             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1969                                   sizeof(reg_trie_state) );
1970         next_alloc = trie->uniquecharcount + 1;
1971
1972
1973         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1974
1975             regnode *noper   = NEXTOPER( cur );
1976             const U8 *uc     = (U8*)STRING( noper );
1977             const U8 *e      = uc + STR_LEN( noper );
1978
1979             U32 state        = 1;         /* required init */
1980
1981             U16 charid       = 0;         /* sanity init */
1982             U32 accept_state = 0;         /* sanity init */
1983             U8 *scan         = (U8*)NULL; /* sanity init */
1984
1985             STRLEN foldlen   = 0;         /* required init */
1986             U32 wordlen      = 0;         /* required init */
1987             STRLEN skiplen   = 0;
1988             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1989
1990             if (OP(noper) == NOTHING) {
1991                 regnode *noper_next= regnext(noper);
1992                 if (noper_next != tail && OP(noper_next) == flags) {
1993                     noper = noper_next;
1994                     uc= (U8*)STRING(noper);
1995                     e= uc + STR_LEN(noper);
1996                 }
1997             }
1998
1999             if ( OP(noper) != NOTHING ) {
2000                 for ( ; uc < e ; uc += len ) {
2001
2002                     TRIE_READ_CHAR;
2003
2004                     if ( uvc < 256 ) {
2005                         charid = trie->charmap[ uvc ];
2006                     } else {
2007                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2008                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2009                     }
2010                     if ( charid ) {
2011                         charid--;
2012                         if ( !trie->trans[ state + charid ].next ) {
2013                             trie->trans[ state + charid ].next = next_alloc;
2014                             trie->trans[ state ].check++;
2015                             prev_states[TRIE_NODENUM(next_alloc)]
2016                                     = TRIE_NODENUM(state);
2017                             next_alloc += trie->uniquecharcount;
2018                         }
2019                         state = trie->trans[ state + charid ].next;
2020                     } else {
2021                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2022                     }
2023                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2024                 }
2025             }
2026             accept_state = TRIE_NODENUM( state );
2027             TRIE_HANDLE_WORD(accept_state);
2028
2029         } /* end second pass */
2030
2031         /* and now dump it out before we compress it */
2032         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2033                                                           revcharmap,
2034                                                           next_alloc, depth+1));
2035
2036         {
2037         /*
2038            * Inplace compress the table.*
2039
2040            For sparse data sets the table constructed by the trie algorithm will
2041            be mostly 0/FAIL transitions or to put it another way mostly empty.
2042            (Note that leaf nodes will not contain any transitions.)
2043
2044            This algorithm compresses the tables by eliminating most such
2045            transitions, at the cost of a modest bit of extra work during lookup:
2046
2047            - Each states[] entry contains a .base field which indicates the
2048            index in the state[] array wheres its transition data is stored.
2049
2050            - If .base is 0 there are no valid transitions from that node.
2051
2052            - If .base is nonzero then charid is added to it to find an entry in
2053            the trans array.
2054
2055            -If trans[states[state].base+charid].check!=state then the
2056            transition is taken to be a 0/Fail transition. Thus if there are fail
2057            transitions at the front of the node then the .base offset will point
2058            somewhere inside the previous nodes data (or maybe even into a node
2059            even earlier), but the .check field determines if the transition is
2060            valid.
2061
2062            XXX - wrong maybe?
2063            The following process inplace converts the table to the compressed
2064            table: We first do not compress the root node 1,and mark all its
2065            .check pointers as 1 and set its .base pointer as 1 as well. This
2066            allows us to do a DFA construction from the compressed table later,
2067            and ensures that any .base pointers we calculate later are greater
2068            than 0.
2069
2070            - We set 'pos' to indicate the first entry of the second node.
2071
2072            - We then iterate over the columns of the node, finding the first and
2073            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2074            and set the .check pointers accordingly, and advance pos
2075            appropriately and repreat for the next node. Note that when we copy
2076            the next pointers we have to convert them from the original
2077            NODEIDX form to NODENUM form as the former is not valid post
2078            compression.
2079
2080            - If a node has no transitions used we mark its base as 0 and do not
2081            advance the pos pointer.
2082
2083            - If a node only has one transition we use a second pointer into the
2084            structure to fill in allocated fail transitions from other states.
2085            This pointer is independent of the main pointer and scans forward
2086            looking for null transitions that are allocated to a state. When it
2087            finds one it writes the single transition into the "hole".  If the
2088            pointer doesnt find one the single transition is appended as normal.
2089
2090            - Once compressed we can Renew/realloc the structures to release the
2091            excess space.
2092
2093            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2094            specifically Fig 3.47 and the associated pseudocode.
2095
2096            demq
2097         */
2098         const U32 laststate = TRIE_NODENUM( next_alloc );
2099         U32 state, charid;
2100         U32 pos = 0, zp=0;
2101         trie->statecount = laststate;
2102
2103         for ( state = 1 ; state < laststate ; state++ ) {
2104             U8 flag = 0;
2105             const U32 stateidx = TRIE_NODEIDX( state );
2106             const U32 o_used = trie->trans[ stateidx ].check;
2107             U32 used = trie->trans[ stateidx ].check;
2108             trie->trans[ stateidx ].check = 0;
2109
2110             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2111                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2112                     if ( trie->trans[ stateidx + charid ].next ) {
2113                         if (o_used == 1) {
2114                             for ( ; zp < pos ; zp++ ) {
2115                                 if ( ! trie->trans[ zp ].next ) {
2116                                     break;
2117                                 }
2118                             }
2119                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2120                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2121                             trie->trans[ zp ].check = state;
2122                             if ( ++zp > pos ) pos = zp;
2123                             break;
2124                         }
2125                         used--;
2126                     }
2127                     if ( !flag ) {
2128                         flag = 1;
2129                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2130                     }
2131                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2132                     trie->trans[ pos ].check = state;
2133                     pos++;
2134                 }
2135             }
2136         }
2137         trie->lasttrans = pos + 1;
2138         trie->states = (reg_trie_state *)
2139             PerlMemShared_realloc( trie->states, laststate
2140                                    * sizeof(reg_trie_state) );
2141         DEBUG_TRIE_COMPILE_MORE_r(
2142                 PerlIO_printf( Perl_debug_log,
2143                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2144                     (int)depth * 2 + 2,"",
2145                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2146                     (IV)next_alloc,
2147                     (IV)pos,
2148                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2149             );
2150
2151         } /* end table compress */
2152     }
2153     DEBUG_TRIE_COMPILE_MORE_r(
2154             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2155                 (int)depth * 2 + 2, "",
2156                 (UV)trie->statecount,
2157                 (UV)trie->lasttrans)
2158     );
2159     /* resize the trans array to remove unused space */
2160     trie->trans = (reg_trie_trans *)
2161         PerlMemShared_realloc( trie->trans, trie->lasttrans
2162                                * sizeof(reg_trie_trans) );
2163
2164     {   /* Modify the program and insert the new TRIE node */
2165         U8 nodetype =(U8)(flags & 0xFF);
2166         char *str=NULL;
2167
2168 #ifdef DEBUGGING
2169         regnode *optimize = NULL;
2170 #ifdef RE_TRACK_PATTERN_OFFSETS
2171
2172         U32 mjd_offset = 0;
2173         U32 mjd_nodelen = 0;
2174 #endif /* RE_TRACK_PATTERN_OFFSETS */
2175 #endif /* DEBUGGING */
2176         /*
2177            This means we convert either the first branch or the first Exact,
2178            depending on whether the thing following (in 'last') is a branch
2179            or not and whther first is the startbranch (ie is it a sub part of
2180            the alternation or is it the whole thing.)
2181            Assuming its a sub part we convert the EXACT otherwise we convert
2182            the whole branch sequence, including the first.
2183          */
2184         /* Find the node we are going to overwrite */
2185         if ( first != startbranch || OP( last ) == BRANCH ) {
2186             /* branch sub-chain */
2187             NEXT_OFF( first ) = (U16)(last - first);
2188 #ifdef RE_TRACK_PATTERN_OFFSETS
2189             DEBUG_r({
2190                 mjd_offset= Node_Offset((convert));
2191                 mjd_nodelen= Node_Length((convert));
2192             });
2193 #endif
2194             /* whole branch chain */
2195         }
2196 #ifdef RE_TRACK_PATTERN_OFFSETS
2197         else {
2198             DEBUG_r({
2199                 const  regnode *nop = NEXTOPER( convert );
2200                 mjd_offset= Node_Offset((nop));
2201                 mjd_nodelen= Node_Length((nop));
2202             });
2203         }
2204         DEBUG_OPTIMISE_r(
2205             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2206                 (int)depth * 2 + 2, "",
2207                 (UV)mjd_offset, (UV)mjd_nodelen)
2208         );
2209 #endif
2210         /* But first we check to see if there is a common prefix we can
2211            split out as an EXACT and put in front of the TRIE node.  */
2212         trie->startstate= 1;
2213         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2214             U32 state;
2215             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2216                 U32 ofs = 0;
2217                 I32 idx = -1;
2218                 U32 count = 0;
2219                 const U32 base = trie->states[ state ].trans.base;
2220
2221                 if ( trie->states[state].wordnum )
2222                         count = 1;
2223
2224                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2225                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2226                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2227                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2228                     {
2229                         if ( ++count > 1 ) {
2230                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2231                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2232                             if ( state == 1 ) break;
2233                             if ( count == 2 ) {
2234                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2235                                 DEBUG_OPTIMISE_r(
2236                                     PerlIO_printf(Perl_debug_log,
2237                                         "%*sNew Start State=%"UVuf" Class: [",
2238                                         (int)depth * 2 + 2, "",
2239                                         (UV)state));
2240                                 if (idx >= 0) {
2241                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2242                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2243
2244                                     TRIE_BITMAP_SET(trie,*ch);
2245                                     if ( folder )
2246                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2247                                     DEBUG_OPTIMISE_r(
2248                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2249                                     );
2250                                 }
2251                             }
2252                             TRIE_BITMAP_SET(trie,*ch);
2253                             if ( folder )
2254                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2255                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2256                         }
2257                         idx = ofs;
2258                     }
2259                 }
2260                 if ( count == 1 ) {
2261                     SV **tmp = av_fetch( revcharmap, idx, 0);
2262                     STRLEN len;
2263                     char *ch = SvPV( *tmp, len );
2264                     DEBUG_OPTIMISE_r({
2265                         SV *sv=sv_newmortal();
2266                         PerlIO_printf( Perl_debug_log,
2267                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2268                             (int)depth * 2 + 2, "",
2269                             (UV)state, (UV)idx,
2270                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2271                                 PL_colors[0], PL_colors[1],
2272                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2273                                 PERL_PV_ESCAPE_FIRSTCHAR
2274                             )
2275                         );
2276                     });
2277                     if ( state==1 ) {
2278                         OP( convert ) = nodetype;
2279                         str=STRING(convert);
2280                         STR_LEN(convert)=0;
2281                     }
2282                     STR_LEN(convert) += len;
2283                     while (len--)
2284                         *str++ = *ch++;
2285                 } else {
2286 #ifdef DEBUGGING
2287                     if (state>1)
2288                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2289 #endif
2290                     break;
2291                 }
2292             }
2293             trie->prefixlen = (state-1);
2294             if (str) {
2295                 regnode *n = convert+NODE_SZ_STR(convert);
2296                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2297                 trie->startstate = state;
2298                 trie->minlen -= (state - 1);
2299                 trie->maxlen -= (state - 1);
2300 #ifdef DEBUGGING
2301                /* At least the UNICOS C compiler choked on this
2302                 * being argument to DEBUG_r(), so let's just have
2303                 * it right here. */
2304                if (
2305 #ifdef PERL_EXT_RE_BUILD
2306                    1
2307 #else
2308                    DEBUG_r_TEST
2309 #endif
2310                    ) {
2311                    regnode *fix = convert;
2312                    U32 word = trie->wordcount;
2313                    mjd_nodelen++;
2314                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2315                    while( ++fix < n ) {
2316                        Set_Node_Offset_Length(fix, 0, 0);
2317                    }
2318                    while (word--) {
2319                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2320                        if (tmp) {
2321                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2322                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2323                            else
2324                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2325                        }
2326                    }
2327                }
2328 #endif
2329                 if (trie->maxlen) {
2330                     convert = n;
2331                 } else {
2332                     NEXT_OFF(convert) = (U16)(tail - convert);
2333                     DEBUG_r(optimize= n);
2334                 }
2335             }
2336         }
2337         if (!jumper)
2338             jumper = last;
2339         if ( trie->maxlen ) {
2340             NEXT_OFF( convert ) = (U16)(tail - convert);
2341             ARG_SET( convert, data_slot );
2342             /* Store the offset to the first unabsorbed branch in
2343                jump[0], which is otherwise unused by the jump logic.
2344                We use this when dumping a trie and during optimisation. */
2345             if (trie->jump)
2346                 trie->jump[0] = (U16)(nextbranch - convert);
2347
2348             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2349              *   and there is a bitmap
2350              *   and the first "jump target" node we found leaves enough room
2351              * then convert the TRIE node into a TRIEC node, with the bitmap
2352              * embedded inline in the opcode - this is hypothetically faster.
2353              */
2354             if ( !trie->states[trie->startstate].wordnum
2355                  && trie->bitmap
2356                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2357             {
2358                 OP( convert ) = TRIEC;
2359                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2360                 PerlMemShared_free(trie->bitmap);
2361                 trie->bitmap= NULL;
2362             } else
2363                 OP( convert ) = TRIE;
2364
2365             /* store the type in the flags */
2366             convert->flags = nodetype;
2367             DEBUG_r({
2368             optimize = convert
2369                       + NODE_STEP_REGNODE
2370                       + regarglen[ OP( convert ) ];
2371             });
2372             /* XXX We really should free up the resource in trie now,
2373                    as we won't use them - (which resources?) dmq */
2374         }
2375         /* needed for dumping*/
2376         DEBUG_r(if (optimize) {
2377             regnode *opt = convert;
2378
2379             while ( ++opt < optimize) {
2380                 Set_Node_Offset_Length(opt,0,0);
2381             }
2382             /*
2383                 Try to clean up some of the debris left after the
2384                 optimisation.
2385              */
2386             while( optimize < jumper ) {
2387                 mjd_nodelen += Node_Length((optimize));
2388                 OP( optimize ) = OPTIMIZED;
2389                 Set_Node_Offset_Length(optimize,0,0);
2390                 optimize++;
2391             }
2392             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2393         });
2394     } /* end node insert */
2395     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2396
2397     /*  Finish populating the prev field of the wordinfo array.  Walk back
2398      *  from each accept state until we find another accept state, and if
2399      *  so, point the first word's .prev field at the second word. If the
2400      *  second already has a .prev field set, stop now. This will be the
2401      *  case either if we've already processed that word's accept state,
2402      *  or that state had multiple words, and the overspill words were
2403      *  already linked up earlier.
2404      */
2405     {
2406         U16 word;
2407         U32 state;
2408         U16 prev;
2409
2410         for (word=1; word <= trie->wordcount; word++) {
2411             prev = 0;
2412             if (trie->wordinfo[word].prev)
2413                 continue;
2414             state = trie->wordinfo[word].accept;
2415             while (state) {
2416                 state = prev_states[state];
2417                 if (!state)
2418                     break;
2419                 prev = trie->states[state].wordnum;
2420                 if (prev)
2421                     break;
2422             }
2423             trie->wordinfo[word].prev = prev;
2424         }
2425         Safefree(prev_states);
2426     }
2427
2428
2429     /* and now dump out the compressed format */
2430     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2431
2432     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2433 #ifdef DEBUGGING
2434     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2435     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2436 #else
2437     SvREFCNT_dec(revcharmap);
2438 #endif
2439     return trie->jump
2440            ? MADE_JUMP_TRIE
2441            : trie->startstate>1
2442              ? MADE_EXACT_TRIE
2443              : MADE_TRIE;
2444 }
2445
2446 STATIC void
2447 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2448 {
2449 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2450
2451    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2452    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2453    ISBN 0-201-10088-6
2454
2455    We find the fail state for each state in the trie, this state is the longest proper
2456    suffix of the current state's 'word' that is also a proper prefix of another word in our
2457    trie. State 1 represents the word '' and is thus the default fail state. This allows
2458    the DFA not to have to restart after its tried and failed a word at a given point, it
2459    simply continues as though it had been matching the other word in the first place.
2460    Consider
2461       'abcdgu'=~/abcdefg|cdgu/
2462    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2463    fail, which would bring us to the state representing 'd' in the second word where we would
2464    try 'g' and succeed, proceeding to match 'cdgu'.
2465  */
2466  /* add a fail transition */
2467     const U32 trie_offset = ARG(source);
2468     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2469     U32 *q;
2470     const U32 ucharcount = trie->uniquecharcount;
2471     const U32 numstates = trie->statecount;
2472     const U32 ubound = trie->lasttrans + ucharcount;
2473     U32 q_read = 0;
2474     U32 q_write = 0;
2475     U32 charid;
2476     U32 base = trie->states[ 1 ].trans.base;
2477     U32 *fail;
2478     reg_ac_data *aho;
2479     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2480     GET_RE_DEBUG_FLAGS_DECL;
2481
2482     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2483 #ifndef DEBUGGING
2484     PERL_UNUSED_ARG(depth);
2485 #endif
2486
2487
2488     ARG_SET( stclass, data_slot );
2489     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2490     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2491     aho->trie=trie_offset;
2492     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2493     Copy( trie->states, aho->states, numstates, reg_trie_state );
2494     Newxz( q, numstates, U32);
2495     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2496     aho->refcount = 1;
2497     fail = aho->fail;
2498     /* initialize fail[0..1] to be 1 so that we always have
2499        a valid final fail state */
2500     fail[ 0 ] = fail[ 1 ] = 1;
2501
2502     for ( charid = 0; charid < ucharcount ; charid++ ) {
2503         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2504         if ( newstate ) {
2505             q[ q_write ] = newstate;
2506             /* set to point at the root */
2507             fail[ q[ q_write++ ] ]=1;
2508         }
2509     }
2510     while ( q_read < q_write) {
2511         const U32 cur = q[ q_read++ % numstates ];
2512         base = trie->states[ cur ].trans.base;
2513
2514         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2515             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2516             if (ch_state) {
2517                 U32 fail_state = cur;
2518                 U32 fail_base;
2519                 do {
2520                     fail_state = fail[ fail_state ];
2521                     fail_base = aho->states[ fail_state ].trans.base;
2522                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2523
2524                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2525                 fail[ ch_state ] = fail_state;
2526                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2527                 {
2528                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2529                 }
2530                 q[ q_write++ % numstates] = ch_state;
2531             }
2532         }
2533     }
2534     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2535        when we fail in state 1, this allows us to use the
2536        charclass scan to find a valid start char. This is based on the principle
2537        that theres a good chance the string being searched contains lots of stuff
2538        that cant be a start char.
2539      */
2540     fail[ 0 ] = fail[ 1 ] = 0;
2541     DEBUG_TRIE_COMPILE_r({
2542         PerlIO_printf(Perl_debug_log,
2543                       "%*sStclass Failtable (%"UVuf" states): 0",
2544                       (int)(depth * 2), "", (UV)numstates
2545         );
2546         for( q_read=1; q_read<numstates; q_read++ ) {
2547             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2548         }
2549         PerlIO_printf(Perl_debug_log, "\n");
2550     });
2551     Safefree(q);
2552     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2553 }
2554
2555
2556 /*
2557  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2558  * These need to be revisited when a newer toolchain becomes available.
2559  */
2560 #if defined(__sparc64__) && defined(__GNUC__)
2561 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2562 #       undef  SPARC64_GCC_WORKAROUND
2563 #       define SPARC64_GCC_WORKAROUND 1
2564 #   endif
2565 #endif
2566
2567 #define DEBUG_PEEP(str,scan,depth) \
2568     DEBUG_OPTIMISE_r({if (scan){ \
2569        SV * const mysv=sv_newmortal(); \
2570        regnode *Next = regnext(scan); \
2571        regprop(RExC_rx, mysv, scan); \
2572        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2573        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2574        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2575    }});
2576
2577
2578 /* The below joins as many adjacent EXACTish nodes as possible into a single
2579  * one, and looks for problematic sequences of characters whose folds vs.
2580  * non-folds have sufficiently different lengths, that the optimizer would be
2581  * fooled into rejecting legitimate matches of them, and the trie construction
2582  * code can't cope with them.  The joining is only done if:
2583  * 1) there is room in the current conglomerated node to entirely contain the
2584  *    next one.
2585  * 2) they are the exact same node type
2586  *
2587  * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2588  * these get optimized out
2589  *
2590  * If there are problematic code sequences, *min_subtract is set to the delta
2591  * that the minimum size of the node can be less than its actual size.  And,
2592  * the node type of the result is changed to reflect that it contains these
2593  * sequences.
2594  *
2595  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2596  * and contains LATIN SMALL LETTER SHARP S
2597  *
2598  * This is as good a place as any to discuss the design of handling these
2599  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2600  * are three code points in Unicode whose folded lengths differ so much from
2601  * the un-folded lengths that it causes problems for the optimizer and trie
2602  * construction.  Why only these are problematic, and not others where lengths
2603  * also differ is something I (khw) do not understand.  New versions of Unicode
2604  * might add more such code points.  Hopefully the logic in fold_grind.t that
2605  * figures out what to test (in part by verifying that each size-combination
2606  * gets tested) will catch any that do come along, so they can be added to the
2607  * special handling below.  The chances of new ones are actually rather small,
2608  * as most, if not all, of the world's scripts that have casefolding have
2609  * already been encoded by Unicode.  Also, a number of Unicode's decisions were
2610  * made to allow compatibility with pre-existing standards, and almost all of
2611  * those have already been dealt with.  These would otherwise be the most
2612  * likely candidates for generating further tricky sequences.  In other words,
2613  * Unicode by itself is unlikely to add new ones unless it is for compatibility
2614  * with pre-existing standards, and there aren't many of those left.
2615  *
2616  * The previous designs for dealing with these involved assigning a special
2617  * node for them.  This approach doesn't work, as evidenced by this example:
2618  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2619  * Both these fold to "sss", but if the pattern is parsed to create a node of
2620  * that would match just the \xDF, it won't be able to handle the case where a
2621  * successful match would have to cross the node's boundary.  The new approach
2622  * that hopefully generally solves the problem generates an EXACTFU_SS node
2623  * that is "sss".
2624  *
2625  * There are a number of components to the approach (a lot of work for just
2626  * three code points!):
2627  * 1)   This routine examines each EXACTFish node that could contain the
2628  *      problematic sequences.  It returns in *min_subtract how much to
2629  *      subtract from the the actual length of the string to get a real minimum
2630  *      for one that could match it.  This number is usually 0 except for the
2631  *      problematic sequences.  This delta is used by the caller to adjust the
2632  *      min length of the match, and the delta between min and max, so that the
2633  *      optimizer doesn't reject these possibilities based on size constraints.
2634  * 2)   These sequences are not currently correctly handled by the trie code
2635  *      either, so it changes the joined node type to ops that are not handled
2636  *      by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2637  * 3)   This is sufficient for the two Greek sequences (described below), but
2638  *      the one involving the Sharp s (\xDF) needs more.  The node type
2639  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2640  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2641  *      case where there is a possible fold length change.  That means that a
2642  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2643  *      itself with length changes, and so can be processed faster.  regexec.c
2644  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2645  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2646  *      However, probably mostly for historical reasons, the pre-folding isn't
2647  *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2648  *      nodes, as what they fold to isn't known until runtime.)  The fold
2649  *      possibilities for the non-UTF8 patterns are quite simple, except for
2650  *      the sharp s.  All the ones that don't involve a UTF-8 target string
2651  *      are members of a fold-pair, and arrays are set up for all of them
2652  *      that quickly find the other member of the pair.  It might actually
2653  *      be faster to pre-fold these, but it isn't currently done, except for
2654  *      the sharp s.  Code elsewhere in this file makes sure that it gets
2655  *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
2656  *      issues described in the next item.
2657  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2658  *      'ss' or not is not knowable at compile time.  It will match iff the
2659  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2660  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2661  *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
2662  *      described in item 3).  An assumption that the optimizer part of
2663  *      regexec.c (probably unwittingly) makes is that a character in the
2664  *      pattern corresponds to at most a single character in the target string.
2665  *      (And I do mean character, and not byte here, unlike other parts of the
2666  *      documentation that have never been updated to account for multibyte
2667  *      Unicode.)  This assumption is wrong only in this case, as all other
2668  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2669  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2670  *      reluctant to try to change this assumption, so instead the code punts.
2671  *      This routine examines EXACTF nodes for the sharp s, and returns a
2672  *      boolean indicating whether or not the node is an EXACTF node that
2673  *      contains a sharp s.  When it is true, the caller sets a flag that later
2674  *      causes the optimizer in this file to not set values for the floating
2675  *      and fixed string lengths, and thus avoids the optimizer code in
2676  *      regexec.c that makes the invalid assumption.  Thus, there is no
2677  *      optimization based on string lengths for EXACTF nodes that contain the
2678  *      sharp s.  This only happens for /id rules (which means the pattern
2679  *      isn't in UTF-8).
2680  */
2681
2682 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2683     if (PL_regkind[OP(scan)] == EXACT) \
2684         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2685
2686 STATIC U32
2687 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2688     /* Merge several consecutive EXACTish nodes into one. */
2689     regnode *n = regnext(scan);
2690     U32 stringok = 1;
2691     regnode *next = scan + NODE_SZ_STR(scan);
2692     U32 merged = 0;
2693     U32 stopnow = 0;
2694 #ifdef DEBUGGING
2695     regnode *stop = scan;
2696     GET_RE_DEBUG_FLAGS_DECL;
2697 #else
2698     PERL_UNUSED_ARG(depth);
2699 #endif
2700
2701     PERL_ARGS_ASSERT_JOIN_EXACT;
2702 #ifndef EXPERIMENTAL_INPLACESCAN
2703     PERL_UNUSED_ARG(flags);
2704     PERL_UNUSED_ARG(val);
2705 #endif
2706     DEBUG_PEEP("join",scan,depth);
2707
2708     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2709      * EXACT ones that are mergeable to the current one. */
2710     while (n
2711            && (PL_regkind[OP(n)] == NOTHING
2712                || (stringok && OP(n) == OP(scan)))
2713            && NEXT_OFF(n)
2714            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2715     {
2716
2717         if (OP(n) == TAIL || n > next)
2718             stringok = 0;
2719         if (PL_regkind[OP(n)] == NOTHING) {
2720             DEBUG_PEEP("skip:",n,depth);
2721             NEXT_OFF(scan) += NEXT_OFF(n);
2722             next = n + NODE_STEP_REGNODE;
2723 #ifdef DEBUGGING
2724             if (stringok)
2725                 stop = n;
2726 #endif
2727             n = regnext(n);
2728         }
2729         else if (stringok) {
2730             const unsigned int oldl = STR_LEN(scan);
2731             regnode * const nnext = regnext(n);
2732
2733             if (oldl + STR_LEN(n) > U8_MAX)
2734                 break;
2735
2736             DEBUG_PEEP("merg",n,depth);
2737             merged++;
2738
2739             NEXT_OFF(scan) += NEXT_OFF(n);
2740             STR_LEN(scan) += STR_LEN(n);
2741             next = n + NODE_SZ_STR(n);
2742             /* Now we can overwrite *n : */
2743             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2744 #ifdef DEBUGGING
2745             stop = next - 1;
2746 #endif
2747             n = nnext;
2748             if (stopnow) break;
2749         }
2750
2751 #ifdef EXPERIMENTAL_INPLACESCAN
2752         if (flags && !NEXT_OFF(n)) {
2753             DEBUG_PEEP("atch", val, depth);
2754             if (reg_off_by_arg[OP(n)]) {
2755                 ARG_SET(n, val - n);
2756             }
2757             else {
2758                 NEXT_OFF(n) = val - n;
2759             }
2760             stopnow = 1;
2761         }
2762 #endif
2763     }
2764
2765     *min_subtract = 0;
2766     *has_exactf_sharp_s = FALSE;
2767
2768     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2769      * can now analyze for sequences of problematic code points.  (Prior to
2770      * this final joining, sequences could have been split over boundaries, and
2771      * hence missed).  The sequences only happen in folding, hence for any
2772      * non-EXACT EXACTish node */
2773     if (OP(scan) != EXACT) {
2774         U8 *s;
2775         U8 * s0 = (U8*) STRING(scan);
2776         U8 * const s_end = s0 + STR_LEN(scan);
2777
2778         /* The below is perhaps overboard, but this allows us to save a test
2779          * each time through the loop at the expense of a mask.  This is
2780          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2781          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2782          * This uses an exclusive 'or' to find that bit and then inverts it to
2783          * form a mask, with just a single 0, in the bit position where 'S' and
2784          * 's' differ. */
2785         const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2786         const U8 s_masked = 's' & S_or_s_mask;
2787
2788         /* One pass is made over the node's string looking for all the
2789          * possibilities.  to avoid some tests in the loop, there are two main
2790          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2791          * non-UTF-8 */
2792         if (UTF) {
2793
2794             /* There are two problematic Greek code points in Unicode
2795              * casefolding
2796              *
2797              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2798              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2799              *
2800              * which casefold to
2801              *
2802              * Unicode                      UTF-8
2803              *
2804              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2805              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2806              *
2807              * This means that in case-insensitive matching (or "loose
2808              * matching", as Unicode calls it), an EXACTF of length six (the
2809              * UTF-8 encoded byte length of the above casefolded versions) can
2810              * match a target string of length two (the byte length of UTF-8
2811              * encoded U+0390 or U+03B0).  This would rather mess up the
2812              * minimum length computation.  (there are other code points that
2813              * also fold to these two sequences, but the delta is smaller)
2814              *
2815              * If these sequences are found, the minimum length is decreased by
2816              * four (six minus two).
2817              *
2818              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2819              * LETTER SHARP S.  We decrease the min length by 1 for each
2820              * occurrence of 'ss' found */
2821
2822 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2823 #           define U390_first_byte 0xb4
2824             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2825 #           define U3B0_first_byte 0xb5
2826             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2827 #else
2828 #           define U390_first_byte 0xce
2829             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2830 #           define U3B0_first_byte 0xcf
2831             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2832 #endif
2833             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2834                                                  yields a net of 0 */
2835             /* Examine the string for one of the problematic sequences */
2836             for (s = s0;
2837                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2838                                  * sequence we are looking for is 2 */
2839                  s += UTF8SKIP(s))
2840             {
2841
2842                 /* Look for the first byte in each problematic sequence */
2843                 switch (*s) {
2844                     /* We don't have to worry about other things that fold to
2845                      * 's' (such as the long s, U+017F), as all above-latin1
2846                      * code points have been pre-folded */
2847                     case 's':
2848                     case 'S':
2849
2850                         /* Current character is an 's' or 'S'.  If next one is
2851                          * as well, we have the dreaded sequence */
2852                         if (((*(s+1) & S_or_s_mask) == s_masked)
2853                             /* These two node types don't have special handling
2854                              * for 'ss' */
2855                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2856                         {
2857                             *min_subtract += 1;
2858                             OP(scan) = EXACTFU_SS;
2859                             s++;    /* No need to look at this character again */
2860                         }
2861                         break;
2862
2863                     case U390_first_byte:
2864                         if (s_end - s >= len
2865
2866                             /* The 1's are because are skipping comparing the
2867                              * first byte */
2868                             && memEQ(s + 1, U390_tail, len - 1))
2869                         {
2870                             goto greek_sequence;
2871                         }
2872                         break;
2873
2874                     case U3B0_first_byte:
2875                         if (! (s_end - s >= len
2876                                && memEQ(s + 1, U3B0_tail, len - 1)))
2877                         {
2878                             break;
2879                         }
2880                       greek_sequence:
2881                         *min_subtract += 4;
2882
2883                         /* This can't currently be handled by trie's, so change
2884                          * the node type to indicate this.  If EXACTFA and
2885                          * EXACTFL were ever to be handled by trie's, this
2886                          * would have to be changed.  If this node has already
2887                          * been changed to EXACTFU_SS in this loop, leave it as
2888                          * is.  (I (khw) think it doesn't matter in regexec.c
2889                          * for UTF patterns, but no need to change it */
2890                         if (OP(scan) == EXACTFU) {
2891                             OP(scan) = EXACTFU_TRICKYFOLD;
2892                         }
2893                         s += 6; /* We already know what this sequence is.  Skip
2894                                    the rest of it */
2895                         break;
2896                 }
2897             }
2898         }
2899         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2900
2901             /* Here, the pattern is not UTF-8.  We need to look only for the
2902              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2903              * in the final position.  Otherwise we can stop looking 1 byte
2904              * earlier because have to find both the first and second 's' */
2905             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2906
2907             for (s = s0; s < upper; s++) {
2908                 switch (*s) {
2909                     case 'S':
2910                     case 's':
2911                         if (s_end - s > 1
2912                             && ((*(s+1) & S_or_s_mask) == s_masked))
2913                         {
2914                             *min_subtract += 1;
2915
2916                             /* EXACTF nodes need to know that the minimum
2917                              * length changed so that a sharp s in the string
2918                              * can match this ss in the pattern, but they
2919                              * remain EXACTF nodes, as they are not trie'able,
2920                              * so don't have to invent a new node type to
2921                              * exclude them from the trie code */
2922                             if (OP(scan) != EXACTF) {
2923                                 OP(scan) = EXACTFU_SS;
2924                             }
2925                             s++;
2926                         }
2927                         break;
2928                     case LATIN_SMALL_LETTER_SHARP_S:
2929                         if (OP(scan) == EXACTF) {
2930                             *has_exactf_sharp_s = TRUE;
2931                         }
2932                         break;
2933                 }
2934             }
2935         }
2936     }
2937
2938 #ifdef DEBUGGING
2939     /* Allow dumping but overwriting the collection of skipped
2940      * ops and/or strings with fake optimized ops */
2941     n = scan + NODE_SZ_STR(scan);
2942     while (n <= stop) {
2943         OP(n) = OPTIMIZED;
2944         FLAGS(n) = 0;
2945         NEXT_OFF(n) = 0;
2946         n++;
2947     }
2948 #endif
2949     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2950     return stopnow;
2951 }
2952
2953 /* REx optimizer.  Converts nodes into quicker variants "in place".
2954    Finds fixed substrings.  */
2955
2956 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2957    to the position after last scanned or to NULL. */
2958
2959 #define INIT_AND_WITHP \
2960     assert(!and_withp); \
2961     Newx(and_withp,1,struct regnode_charclass_class); \
2962     SAVEFREEPV(and_withp)
2963
2964 /* this is a chain of data about sub patterns we are processing that
2965    need to be handled separately/specially in study_chunk. Its so
2966    we can simulate recursion without losing state.  */
2967 struct scan_frame;
2968 typedef struct scan_frame {
2969     regnode *last;  /* last node to process in this frame */
2970     regnode *next;  /* next node to process when last is reached */
2971     struct scan_frame *prev; /*previous frame*/
2972     I32 stop; /* what stopparen do we use */
2973 } scan_frame;
2974
2975
2976 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2977
2978 #define CASE_SYNST_FNC(nAmE)                                       \
2979 case nAmE:                                                         \
2980     if (flags & SCF_DO_STCLASS_AND) {                              \
2981             for (value = 0; value < 256; value++)                  \
2982                 if (!is_ ## nAmE ## _cp(value))                       \
2983                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2984     }                                                              \
2985     else {                                                         \
2986             for (value = 0; value < 256; value++)                  \
2987                 if (is_ ## nAmE ## _cp(value))                        \
2988                     ANYOF_BITMAP_SET(data->start_class, value);    \
2989     }                                                              \
2990     break;                                                         \
2991 case N ## nAmE:                                                    \
2992     if (flags & SCF_DO_STCLASS_AND) {                              \
2993             for (value = 0; value < 256; value++)                   \
2994                 if (is_ ## nAmE ## _cp(value))                         \
2995                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2996     }                                                               \
2997     else {                                                          \
2998             for (value = 0; value < 256; value++)                   \
2999                 if (!is_ ## nAmE ## _cp(value))                        \
3000                     ANYOF_BITMAP_SET(data->start_class, value);     \
3001     }                                                               \
3002     break
3003
3004
3005
3006 STATIC I32
3007 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3008                         I32 *minlenp, I32 *deltap,
3009                         regnode *last,
3010                         scan_data_t *data,
3011                         I32 stopparen,
3012                         U8* recursed,
3013                         struct regnode_charclass_class *and_withp,
3014                         U32 flags, U32 depth)
3015                         /* scanp: Start here (read-write). */
3016                         /* deltap: Write maxlen-minlen here. */
3017                         /* last: Stop before this one. */
3018                         /* data: string data about the pattern */
3019                         /* stopparen: treat close N as END */
3020                         /* recursed: which subroutines have we recursed into */
3021                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3022 {
3023     dVAR;
3024     I32 min = 0, pars = 0, code;
3025     regnode *scan = *scanp, *next;
3026     I32 delta = 0;
3027     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3028     int is_inf_internal = 0;            /* The studied chunk is infinite */
3029     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3030     scan_data_t data_fake;
3031     SV *re_trie_maxbuff = NULL;
3032     regnode *first_non_open = scan;
3033     I32 stopmin = I32_MAX;
3034     scan_frame *frame = NULL;
3035     GET_RE_DEBUG_FLAGS_DECL;
3036
3037     PERL_ARGS_ASSERT_STUDY_CHUNK;
3038
3039 #ifdef DEBUGGING
3040     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3041 #endif
3042
3043     if ( depth == 0 ) {
3044         while (first_non_open && OP(first_non_open) == OPEN)
3045             first_non_open=regnext(first_non_open);
3046     }
3047
3048
3049   fake_study_recurse:
3050     while ( scan && OP(scan) != END && scan < last ){
3051         UV min_subtract = 0;    /* How much to subtract from the minimum node
3052                                    length to get a real minimum (because the
3053                                    folded version may be shorter) */
3054         bool has_exactf_sharp_s = FALSE;
3055         /* Peephole optimizer: */
3056         DEBUG_STUDYDATA("Peep:", data,depth);
3057         DEBUG_PEEP("Peep",scan,depth);
3058
3059         /* Its not clear to khw or hv why this is done here, and not in the
3060          * clauses that deal with EXACT nodes.  khw's guess is that it's
3061          * because of a previous design */
3062         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3063
3064         /* Follow the next-chain of the current node and optimize
3065            away all the NOTHINGs from it.  */
3066         if (OP(scan) != CURLYX) {
3067             const int max = (reg_off_by_arg[OP(scan)]
3068                        ? I32_MAX
3069                        /* I32 may be smaller than U16 on CRAYs! */
3070                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3071             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3072             int noff;
3073             regnode *n = scan;
3074
3075             /* Skip NOTHING and LONGJMP. */
3076             while ((n = regnext(n))
3077                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3078                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3079                    && off + noff < max)
3080                 off += noff;
3081             if (reg_off_by_arg[OP(scan)])
3082                 ARG(scan) = off;
3083             else
3084                 NEXT_OFF(scan) = off;
3085         }
3086
3087
3088
3089         /* The principal pseudo-switch.  Cannot be a switch, since we
3090            look into several different things.  */
3091         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3092                    || OP(scan) == IFTHEN) {
3093             next = regnext(scan);
3094             code = OP(scan);
3095             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3096
3097             if (OP(next) == code || code == IFTHEN) {
3098                 /* NOTE - There is similar code to this block below for handling
3099                    TRIE nodes on a re-study.  If you change stuff here check there
3100                    too. */
3101                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3102                 struct regnode_charclass_class accum;
3103                 regnode * const startbranch=scan;
3104
3105                 if (flags & SCF_DO_SUBSTR)
3106                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3107                 if (flags & SCF_DO_STCLASS)
3108                     cl_init_zero(pRExC_state, &accum);
3109
3110                 while (OP(scan) == code) {
3111                     I32 deltanext, minnext, f = 0, fake;
3112                     struct regnode_charclass_class this_class;
3113
3114                     num++;
3115                     data_fake.flags = 0;
3116                     if (data) {
3117                         data_fake.whilem_c = data->whilem_c;
3118                         data_fake.last_closep = data->last_closep;
3119                     }
3120                     else
3121                         data_fake.last_closep = &fake;
3122
3123                     data_fake.pos_delta = delta;
3124                     next = regnext(scan);
3125                     scan = NEXTOPER(scan);
3126                     if (code != BRANCH)
3127                         scan = NEXTOPER(scan);
3128                     if (flags & SCF_DO_STCLASS) {
3129                         cl_init(pRExC_state, &this_class);
3130                         data_fake.start_class = &this_class;
3131                         f = SCF_DO_STCLASS_AND;
3132                     }
3133                     if (flags & SCF_WHILEM_VISITED_POS)
3134                         f |= SCF_WHILEM_VISITED_POS;
3135
3136                     /* we suppose the run is continuous, last=next...*/
3137                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3138                                           next, &data_fake,
3139                                           stopparen, recursed, NULL, f,depth+1);
3140                     if (min1 > minnext)
3141                         min1 = minnext;
3142                     if (max1 < minnext + deltanext)
3143                         max1 = minnext + deltanext;
3144                     if (deltanext == I32_MAX)
3145                         is_inf = is_inf_internal = 1;
3146                     scan = next;
3147                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3148                         pars++;
3149                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3150                         if ( stopmin > minnext)
3151                             stopmin = min + min1;
3152                         flags &= ~SCF_DO_SUBSTR;
3153                         if (data)
3154                             data->flags |= SCF_SEEN_ACCEPT;
3155                     }
3156                     if (data) {
3157                         if (data_fake.flags & SF_HAS_EVAL)
3158                             data->flags |= SF_HAS_EVAL;
3159                         data->whilem_c = data_fake.whilem_c;
3160                     }
3161                     if (flags & SCF_DO_STCLASS)
3162                         cl_or(pRExC_state, &accum, &this_class);
3163                 }
3164                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3165                     min1 = 0;
3166                 if (flags & SCF_DO_SUBSTR) {
3167                     data->pos_min += min1;
3168                     data->pos_delta += max1 - min1;
3169                     if (max1 != min1 || is_inf)
3170                         data->longest = &(data->longest_float);
3171                 }
3172                 min += min1;
3173                 delta += max1 - min1;
3174                 if (flags & SCF_DO_STCLASS_OR) {
3175                     cl_or(pRExC_state, data->start_class, &accum);
3176                     if (min1) {
3177                         cl_and(data->start_class, and_withp);
3178                         flags &= ~SCF_DO_STCLASS;
3179                     }
3180                 }
3181                 else if (flags & SCF_DO_STCLASS_AND) {
3182                     if (min1) {
3183                         cl_and(data->start_class, &accum);
3184                         flags &= ~SCF_DO_STCLASS;
3185                     }
3186                     else {
3187                         /* Switch to OR mode: cache the old value of
3188                          * data->start_class */
3189                         INIT_AND_WITHP;
3190                         StructCopy(data->start_class, and_withp,
3191                                    struct regnode_charclass_class);
3192                         flags &= ~SCF_DO_STCLASS_AND;
3193                         StructCopy(&accum, data->start_class,
3194                                    struct regnode_charclass_class);
3195                         flags |= SCF_DO_STCLASS_OR;
3196                         data->start_class->flags |= ANYOF_EOS;
3197                     }
3198                 }
3199
3200                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3201                 /* demq.
3202
3203                    Assuming this was/is a branch we are dealing with: 'scan' now
3204                    points at the item that follows the branch sequence, whatever
3205                    it is. We now start at the beginning of the sequence and look
3206                    for subsequences of
3207
3208                    BRANCH->EXACT=>x1
3209                    BRANCH->EXACT=>x2
3210                    tail
3211
3212                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3213
3214                    If we can find such a subsequence we need to turn the first
3215                    element into a trie and then add the subsequent branch exact
3216                    strings to the trie.
3217
3218                    We have two cases
3219
3220                      1. patterns where the whole set of branches can be converted.
3221
3222                      2. patterns where only a subset can be converted.
3223
3224                    In case 1 we can replace the whole set with a single regop
3225                    for the trie. In case 2 we need to keep the start and end
3226                    branches so
3227
3228                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3229                      becomes BRANCH TRIE; BRANCH X;
3230
3231                   There is an additional case, that being where there is a
3232                   common prefix, which gets split out into an EXACT like node
3233                   preceding the TRIE node.
3234
3235                   If x(1..n)==tail then we can do a simple trie, if not we make
3236                   a "jump" trie, such that when we match the appropriate word
3237                   we "jump" to the appropriate tail node. Essentially we turn
3238                   a nested if into a case structure of sorts.
3239
3240                 */
3241
3242                     int made=0;
3243                     if (!re_trie_maxbuff) {
3244                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3245                         if (!SvIOK(re_trie_maxbuff))
3246                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3247                     }
3248                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3249                         regnode *cur;
3250                         regnode *first = (regnode *)NULL;
3251                         regnode *last = (regnode *)NULL;
3252                         regnode *tail = scan;
3253                         U8 trietype = 0;
3254                         U32 count=0;
3255
3256 #ifdef DEBUGGING
3257                         SV * const mysv = sv_newmortal();       /* for dumping */
3258 #endif
3259                         /* var tail is used because there may be a TAIL
3260                            regop in the way. Ie, the exacts will point to the
3261                            thing following the TAIL, but the last branch will
3262                            point at the TAIL. So we advance tail. If we
3263                            have nested (?:) we may have to move through several
3264                            tails.
3265                          */
3266
3267                         while ( OP( tail ) == TAIL ) {
3268                             /* this is the TAIL generated by (?:) */
3269                             tail = regnext( tail );
3270                         }
3271
3272
3273                         DEBUG_TRIE_COMPILE_r({
3274                             regprop(RExC_rx, mysv, tail );
3275                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3276                                 (int)depth * 2 + 2, "",
3277                                 "Looking for TRIE'able sequences. Tail node is: ",
3278                                 SvPV_nolen_const( mysv )
3279                             );
3280                         });
3281
3282                         /*
3283
3284                             Step through the branches
3285                                 cur represents each branch,
3286                                 noper is the first thing to be matched as part of that branch
3287                                 noper_next is the regnext() of that node.
3288
3289                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3290                             via a "jump trie" but we also support building with NOJUMPTRIE,
3291                             which restricts the trie logic to structures like /FOO|BAR/.
3292
3293                             If noper is a trieable nodetype then the branch is a possible optimization
3294                             target. If we are building under NOJUMPTRIE then we require that noper_next
3295                             is the same as scan (our current position in the regex program).
3296
3297                             Once we have two or more consecutive such branches we can create a
3298                             trie of the EXACT's contents and stitch it in place into the program.
3299
3300                             If the sequence represents all of the branches in the alternation we
3301                             replace the entire thing with a single TRIE node.
3302
3303                             Otherwise when it is a subsequence we need to stitch it in place and
3304                             replace only the relevant branches. This means the first branch has
3305                             to remain as it is used by the alternation logic, and its next pointer,
3306                             and needs to be repointed at the item on the branch chain following
3307                             the last branch we have optimized away.
3308
3309                             This could be either a BRANCH, in which case the subsequence is internal,
3310                             or it could be the item following the branch sequence in which case the
3311                             subsequence is at the end (which does not necessarily mean the first node
3312                             is the start of the alternation).
3313
3314                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3315
3316                                 optype          |  trietype
3317                                 ----------------+-----------
3318                                 NOTHING         | NOTHING
3319                                 EXACT           | EXACT
3320                                 EXACTFU         | EXACTFU
3321                                 EXACTFU_SS      | EXACTFU
3322                                 EXACTFU_TRICKYFOLD | EXACTFU
3323                                 EXACTFA         | 0
3324
3325
3326                         */
3327 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3328                        ( EXACT == (X) )   ? EXACT :        \
3329                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3330                        0 )
3331
3332                         /* dont use tail as the end marker for this traverse */
3333                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3334                             regnode * const noper = NEXTOPER( cur );
3335                             U8 noper_type = OP( noper );
3336                             U8 noper_trietype = TRIE_TYPE( noper_type );
3337 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3338                             regnode * const noper_next = regnext( noper );
3339                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3340                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3341 #endif
3342
3343                             DEBUG_TRIE_COMPILE_r({
3344                                 regprop(RExC_rx, mysv, cur);
3345                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3346                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3347
3348                                 regprop(RExC_rx, mysv, noper);
3349                                 PerlIO_printf( Perl_debug_log, " -> %s",
3350                                     SvPV_nolen_const(mysv));
3351
3352                                 if ( noper_next ) {
3353                                   regprop(RExC_rx, mysv, noper_next );
3354                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3355                                     SvPV_nolen_const(mysv));
3356                                 }
3357                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3358                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3359                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3360                                 );
3361                             });
3362
3363                             /* Is noper a trieable nodetype that can be merged with the
3364                              * current trie (if there is one)? */
3365                             if ( noper_trietype
3366                                   &&
3367                                   (
3368                                         ( noper_trietype == NOTHING)
3369                                         || ( trietype == NOTHING )
3370                                         || ( trietype == noper_trietype )
3371                                   )
3372 #ifdef NOJUMPTRIE
3373                                   && noper_next == tail
3374 #endif
3375                                   && count < U16_MAX)
3376                             {
3377                                 /* Handle mergable triable node
3378                                  * Either we are the first node in a new trieable sequence,
3379                                  * in which case we do some bookkeeping, otherwise we update
3380                                  * the end pointer. */
3381                                 if ( !first ) {
3382                                     first = cur;
3383                                     trietype = noper_trietype;
3384                                     if ( noper_trietype == NOTHING ) {
3385 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3386                                         regnode * const noper_next = regnext( noper );
3387                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3388                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3389 #endif
3390
3391                                         if ( noper_next_trietype )
3392                                             trietype = noper_next_trietype;
3393                                     }
3394                                 } else {
3395                                     if ( trietype == NOTHING )
3396                                         trietype = noper_trietype;
3397                                     last = cur;
3398                                 }
3399                                 if (first)
3400                                     count++;
3401                             } /* end handle mergable triable node */
3402                             else {
3403                                 /* handle unmergable node -
3404                                  * noper may either be a triable node which can not be tried
3405                                  * together with the current trie, or a non triable node */
3406                                 if ( last ) {
3407                                     /* If last is set and trietype is not NOTHING then we have found
3408                                      * at least two triable branch sequences in a row of a similar
3409                                      * trietype so we can turn them into a trie. If/when we
3410                                      * allow NOTHING to start a trie sequence this condition will be
3411                                      * required, and it isn't expensive so we leave it in for now. */
3412                                     if ( trietype != NOTHING )
3413                                         make_trie( pRExC_state,
3414                                                 startbranch, first, cur, tail, count,
3415                                                 trietype, depth+1 );
3416                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3417                                 }
3418                                 if ( noper_trietype
3419 #ifdef NOJUMPTRIE
3420                                      && noper_next == tail
3421 #endif
3422                                 ){
3423                                     /* noper is triable, so we can start a new trie sequence */
3424                                     count = 1;
3425                                     first = cur;
3426                                     trietype = noper_trietype;
3427                                 } else if (first) {
3428                                     /* if we already saw a first but the current node is not triable then we have
3429                                      * to reset the first information. */
3430                                     count = 0;
3431                                     first = NULL;
3432                                     trietype = 0;
3433                                 }
3434                             } /* end handle unmergable node */
3435                         } /* loop over branches */
3436                         DEBUG_TRIE_COMPILE_r({
3437                             regprop(RExC_rx, mysv, cur);
3438                             PerlIO_printf( Perl_debug_log,
3439                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3440                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3441
3442                         });
3443                         if ( last ) {
3444                             if ( trietype != NOTHING ) {
3445                                 /* the last branch of the sequence was part of a trie,
3446                                  * so we have to construct it here outside of the loop
3447                                  */
3448                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3449 #ifdef TRIE_STUDY_OPT
3450                                 if ( ((made == MADE_EXACT_TRIE &&
3451                                      startbranch == first)
3452                                      || ( first_non_open == first )) &&
3453                                      depth==0 ) {
3454                                     flags |= SCF_TRIE_RESTUDY;
3455                                     if ( startbranch == first
3456                                          && scan == tail )
3457                                     {
3458                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3459                                     }
3460                                 }
3461 #endif
3462                             } else {
3463                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3464                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3465                                  */
3466                                 if ( startbranch == first ) {
3467                                     regnode *opt;
3468                                     /* the entire thing is a NOTHING sequence, something like this:
3469                                      * (?:|) So we can turn it into a plain NOTHING op. */
3470                                     DEBUG_TRIE_COMPILE_r({
3471                                         regprop(RExC_rx, mysv, cur);
3472                                         PerlIO_printf( Perl_debug_log,
3473                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3474                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3475
3476                                     });
3477                                     OP(startbranch)= NOTHING;
3478                                     NEXT_OFF(startbranch)= tail - startbranch;
3479                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3480                                         OP(opt)= OPTIMIZED;
3481                                 }
3482                             }
3483                         } /* end if ( last) */
3484                     } /* TRIE_MAXBUF is non zero */
3485
3486                 } /* do trie */
3487
3488             }
3489             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3490                 scan = NEXTOPER(NEXTOPER(scan));
3491             } else                      /* single branch is optimized. */
3492                 scan = NEXTOPER(scan);
3493             continue;
3494         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3495             scan_frame *newframe = NULL;
3496             I32 paren;
3497             regnode *start;
3498             regnode *end;
3499
3500             if (OP(scan) != SUSPEND) {
3501             /* set the pointer */
3502                 if (OP(scan) == GOSUB) {
3503                     paren = ARG(scan);
3504                     RExC_recurse[ARG2L(scan)] = scan;
3505                     start = RExC_open_parens[paren-1];
3506                     end   = RExC_close_parens[paren-1];
3507                 } else {
3508                     paren = 0;
3509                     start = RExC_rxi->program + 1;
3510                     end   = RExC_opend;
3511                 }
3512                 if (!recursed) {
3513                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3514                     SAVEFREEPV(recursed);
3515                 }
3516                 if (!PAREN_TEST(recursed,paren+1)) {
3517                     PAREN_SET(recursed,paren+1);
3518                     Newx(newframe,1,scan_frame);
3519                 } else {
3520                     if (flags & SCF_DO_SUBSTR) {
3521                         SCAN_COMMIT(pRExC_state,data,minlenp);
3522                         data->longest = &(data->longest_float);
3523                     }
3524                     is_inf = is_inf_internal = 1;
3525                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3526                         cl_anything(pRExC_state, data->start_class);
3527                     flags &= ~SCF_DO_STCLASS;
3528                 }
3529             } else {
3530                 Newx(newframe,1,scan_frame);
3531                 paren = stopparen;
3532                 start = scan+2;
3533                 end = regnext(scan);
3534             }
3535             if (newframe) {
3536                 assert(start);
3537                 assert(end);
3538                 SAVEFREEPV(newframe);
3539                 newframe->next = regnext(scan);
3540                 newframe->last = last;
3541                 newframe->stop = stopparen;
3542                 newframe->prev = frame;
3543
3544                 frame = newframe;
3545                 scan =  start;
3546                 stopparen = paren;
3547                 last = end;
3548
3549                 continue;
3550             }
3551         }
3552         else if (OP(scan) == EXACT) {
3553             I32 l = STR_LEN(scan);
3554             UV uc;
3555             if (UTF) {
3556                 const U8 * const s = (U8*)STRING(scan);
3557                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3558                 l = utf8_length(s, s + l);
3559             } else {
3560                 uc = *((U8*)STRING(scan));
3561             }
3562             min += l;
3563             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3564                 /* The code below prefers earlier match for fixed
3565                    offset, later match for variable offset.  */
3566                 if (data->last_end == -1) { /* Update the start info. */
3567                     data->last_start_min = data->pos_min;
3568                     data->last_start_max = is_inf
3569                         ? I32_MAX : data->pos_min + data->pos_delta;
3570                 }
3571                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3572                 if (UTF)
3573                     SvUTF8_on(data->last_found);
3574                 {
3575                     SV * const sv = data->last_found;
3576                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3577                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3578                     if (mg && mg->mg_len >= 0)
3579                         mg->mg_len += utf8_length((U8*)STRING(scan),
3580                                                   (U8*)STRING(scan)+STR_LEN(scan));
3581                 }
3582                 data->last_end = data->pos_min + l;
3583                 data->pos_min += l; /* As in the first entry. */
3584                 data->flags &= ~SF_BEFORE_EOL;
3585             }
3586             if (flags & SCF_DO_STCLASS_AND) {
3587                 /* Check whether it is compatible with what we know already! */
3588                 int compat = 1;
3589
3590
3591                 /* If compatible, we or it in below.  It is compatible if is
3592                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3593                  * it's for a locale.  Even if there isn't unicode semantics
3594                  * here, at runtime there may be because of matching against a
3595                  * utf8 string, so accept a possible false positive for
3596                  * latin1-range folds */
3597                 if (uc >= 0x100 ||
3598                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3599                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3600                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3601                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3602                     )
3603                 {
3604                     compat = 0;
3605                 }
3606                 ANYOF_CLASS_ZERO(data->start_class);
3607                 ANYOF_BITMAP_ZERO(data->start_class);
3608                 if (compat)
3609                     ANYOF_BITMAP_SET(data->start_class, uc);
3610                 else if (uc >= 0x100) {
3611                     int i;
3612
3613                     /* Some Unicode code points fold to the Latin1 range; as
3614                      * XXX temporary code, instead of figuring out if this is
3615                      * one, just assume it is and set all the start class bits
3616                      * that could be some such above 255 code point's fold
3617                      * which will generate fals positives.  As the code
3618                      * elsewhere that does compute the fold settles down, it
3619                      * can be extracted out and re-used here */
3620                     for (i = 0; i < 256; i++){
3621                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3622                             ANYOF_BITMAP_SET(data->start_class, i);
3623                         }
3624                     }
3625                 }
3626                 data->start_class->flags &= ~ANYOF_EOS;
3627                 if (uc < 0x100)
3628                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3629             }
3630             else if (flags & SCF_DO_STCLASS_OR) {
3631                 /* false positive possible if the class is case-folded */
3632                 if (uc < 0x100)
3633                     ANYOF_BITMAP_SET(data->start_class, uc);
3634                 else
3635                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3636                 data->start_class->flags &= ~ANYOF_EOS;
3637                 cl_and(data->start_class, and_withp);
3638             }
3639             flags &= ~SCF_DO_STCLASS;
3640         }
3641         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3642             I32 l = STR_LEN(scan);
3643             UV uc = *((U8*)STRING(scan));
3644
3645             /* Search for fixed substrings supports EXACT only. */
3646             if (flags & SCF_DO_SUBSTR) {
3647                 assert(data);
3648                 SCAN_COMMIT(pRExC_state, data, minlenp);
3649             }
3650             if (UTF) {
3651                 const U8 * const s = (U8 *)STRING(scan);
3652                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3653                 l = utf8_length(s, s + l);
3654             }
3655             else if (has_exactf_sharp_s) {
3656                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3657             }
3658             min += l - min_subtract;
3659             if (min < 0) {
3660                 min = 0;
3661             }
3662             delta += min_subtract;
3663             if (flags & SCF_DO_SUBSTR) {
3664                 data->pos_min += l - min_subtract;
3665                 if (data->pos_min < 0) {
3666                     data->pos_min = 0;
3667                 }
3668                 data->pos_delta += min_subtract;
3669                 if (min_subtract) {
3670                     data->longest = &(data->longest_float);
3671                 }
3672             }
3673             if (flags & SCF_DO_STCLASS_AND) {
3674                 /* Check whether it is compatible with what we know already! */
3675                 int compat = 1;
3676                 if (uc >= 0x100 ||
3677                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3678                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3679                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3680                 {
3681                     compat = 0;
3682                 }
3683                 ANYOF_CLASS_ZERO(data->start_class);
3684                 ANYOF_BITMAP_ZERO(data->start_class);
3685                 if (compat) {
3686                     ANYOF_BITMAP_SET(data->start_class, uc);
3687                     data->start_class->flags &= ~ANYOF_EOS;
3688                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3689                     if (OP(scan) == EXACTFL) {
3690                         /* XXX This set is probably no longer necessary, and
3691                          * probably wrong as LOCALE now is on in the initial
3692                          * state */
3693                         data->start_class->flags |= ANYOF_LOCALE;
3694                     }
3695                     else {
3696
3697                         /* Also set the other member of the fold pair.  In case
3698                          * that unicode semantics is called for at runtime, use
3699                          * the full latin1 fold.  (Can't do this for locale,
3700                          * because not known until runtime) */
3701                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3702
3703                         /* All other (EXACTFL handled above) folds except under
3704                          * /iaa that include s, S, and sharp_s also may include
3705                          * the others */
3706                         if (OP(scan) != EXACTFA) {
3707                             if (uc == 's' || uc == 'S') {
3708                                 ANYOF_BITMAP_SET(data->start_class,
3709                                                  LATIN_SMALL_LETTER_SHARP_S);
3710                             }
3711                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3712                                 ANYOF_BITMAP_SET(data->start_class, 's');
3713                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3714                             }
3715                         }
3716                     }
3717                 }
3718                 else if (uc >= 0x100) {
3719                     int i;
3720                     for (i = 0; i < 256; i++){
3721                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3722                             ANYOF_BITMAP_SET(data->start_class, i);
3723                         }
3724                     }
3725                 }
3726             }
3727             else if (flags & SCF_DO_STCLASS_OR) {
3728                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3729                     /* false positive possible if the class is case-folded.
3730                        Assume that the locale settings are the same... */
3731                     if (uc < 0x100) {
3732                         ANYOF_BITMAP_SET(data->start_class, uc);
3733                         if (OP(scan) != EXACTFL) {
3734
3735                             /* And set the other member of the fold pair, but
3736                              * can't do that in locale because not known until
3737                              * run-time */
3738                             ANYOF_BITMAP_SET(data->start_class,
3739                                              PL_fold_latin1[uc]);
3740
3741                             /* All folds except under /iaa that include s, S,
3742                              * and sharp_s also may include the others */
3743                             if (OP(scan) != EXACTFA) {
3744                                 if (uc == 's' || uc == 'S') {
3745                                     ANYOF_BITMAP_SET(data->start_class,
3746                                                    LATIN_SMALL_LETTER_SHARP_S);
3747                                 }
3748                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3749                                     ANYOF_BITMAP_SET(data->start_class, 's');
3750                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3751                                 }
3752                             }
3753                         }
3754                     }
3755                     data->start_class->flags &= ~ANYOF_EOS;
3756                 }
3757                 cl_and(data->start_class, and_withp);
3758             }
3759             flags &= ~SCF_DO_STCLASS;
3760         }
3761         else if (REGNODE_VARIES(OP(scan))) {
3762             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3763             I32 f = flags, pos_before = 0;
3764             regnode * const oscan = scan;
3765             struct regnode_charclass_class this_class;
3766             struct regnode_charclass_class *oclass = NULL;
3767             I32 next_is_eval = 0;
3768
3769             switch (PL_regkind[OP(scan)]) {
3770             case WHILEM:                /* End of (?:...)* . */
3771                 scan = NEXTOPER(scan);
3772                 goto finish;
3773             case PLUS:
3774                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3775                     next = NEXTOPER(scan);
3776                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3777                         mincount = 1;
3778                         maxcount = REG_INFTY;
3779                         next = regnext(scan);
3780                         scan = NEXTOPER(scan);
3781                         goto do_curly;
3782                     }
3783                 }
3784                 if (flags & SCF_DO_SUBSTR)
3785                     data->pos_min++;
3786                 min++;
3787                 /* Fall through. */
3788             case STAR:
3789                 if (flags & SCF_DO_STCLASS) {
3790                     mincount = 0;
3791                     maxcount = REG_INFTY;
3792                     next = regnext(scan);
3793                     scan = NEXTOPER(scan);
3794                     goto do_curly;
3795                 }
3796                 is_inf = is_inf_internal = 1;
3797                 scan = regnext(scan);
3798                 if (flags & SCF_DO_SUBSTR) {
3799                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3800                     data->longest = &(data->longest_float);
3801                 }
3802                 goto optimize_curly_tail;
3803             case CURLY:
3804                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3805                     && (scan->flags == stopparen))
3806                 {
3807                     mincount = 1;
3808                     maxcount = 1;
3809                 } else {
3810                     mincount = ARG1(scan);
3811                     maxcount = ARG2(scan);
3812                 }
3813                 next = regnext(scan);
3814                 if (OP(scan) == CURLYX) {
3815                     I32 lp = (data ? *(data->last_closep) : 0);
3816                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3817                 }
3818                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3819                 next_is_eval = (OP(scan) == EVAL);
3820               do_curly:
3821                 if (flags & SCF_DO_SUBSTR) {
3822                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3823                     pos_before = data->pos_min;
3824                 }
3825                 if (data) {
3826                     fl = data->flags;
3827                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3828                     if (is_inf)
3829                         data->flags |= SF_IS_INF;
3830                 }
3831                 if (flags & SCF_DO_STCLASS) {
3832                     cl_init(pRExC_state, &this_class);
3833                     oclass = data->start_class;
3834                     data->start_class = &this_class;
3835                     f |= SCF_DO_STCLASS_AND;
3836                     f &= ~SCF_DO_STCLASS_OR;
3837                 }
3838                 /* Exclude from super-linear cache processing any {n,m}
3839                    regops for which the combination of input pos and regex
3840                    pos is not enough information to determine if a match
3841                    will be possible.
3842
3843                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3844                    regex pos at the \s*, the prospects for a match depend not
3845                    only on the input position but also on how many (bar\s*)
3846                    repeats into the {4,8} we are. */
3847                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3848                     f &= ~SCF_WHILEM_VISITED_POS;
3849
3850                 /* This will finish on WHILEM, setting scan, or on NULL: */
3851                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3852                                       last, data, stopparen, recursed, NULL,
3853                                       (mincount == 0
3854                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3855
3856                 if (flags & SCF_DO_STCLASS)
3857                     data->start_class = oclass;
3858                 if (mincount == 0 || minnext == 0) {
3859                     if (flags & SCF_DO_STCLASS_OR) {
3860                         cl_or(pRExC_state, data->start_class, &this_class);
3861                     }
3862                     else if (flags & SCF_DO_STCLASS_AND) {
3863                         /* Switch to OR mode: cache the old value of
3864                          * data->start_class */
3865                         INIT_AND_WITHP;
3866                         StructCopy(data->start_class, and_withp,
3867                                    struct regnode_charclass_class);
3868                         flags &= ~SCF_DO_STCLASS_AND;
3869                         StructCopy(&this_class, data->start_class,
3870                                    struct regnode_charclass_class);
3871                         flags |= SCF_DO_STCLASS_OR;
3872                         data->start_class->flags |= ANYOF_EOS;
3873                     }
3874                 } else {                /* Non-zero len */
3875                     if (flags & SCF_DO_STCLASS_OR) {
3876                         cl_or(pRExC_state, data->start_class, &this_class);
3877                         cl_and(data->start_class, and_withp);
3878                     }
3879                     else if (flags & SCF_DO_STCLASS_AND)
3880                         cl_and(data->start_class, &this_class);
3881                     flags &= ~SCF_DO_STCLASS;
3882                 }
3883                 if (!scan)              /* It was not CURLYX, but CURLY. */
3884                     scan = next;
3885                 if ( /* ? quantifier ok, except for (?{ ... }) */
3886                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3887                     && (minnext == 0) && (deltanext == 0)
3888                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3889                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3890                 {
3891                     ckWARNreg(RExC_parse,
3892                               "Quantifier unexpected on zero-length expression");
3893                 }
3894
3895                 min += minnext * mincount;
3896                 is_inf_internal |= ((maxcount == REG_INFTY
3897                                      && (minnext + deltanext) > 0)
3898                                     || deltanext == I32_MAX);
3899                 is_inf |= is_inf_internal;
3900                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3901
3902                 /* Try powerful optimization CURLYX => CURLYN. */
3903                 if (  OP(oscan) == CURLYX && data
3904                       && data->flags & SF_IN_PAR
3905                       && !(data->flags & SF_HAS_EVAL)
3906                       && !deltanext && minnext == 1 ) {
3907                     /* Try to optimize to CURLYN.  */
3908                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3909                     regnode * const nxt1 = nxt;
3910 #ifdef DEBUGGING
3911                     regnode *nxt2;
3912 #endif
3913
3914                     /* Skip open. */
3915                     nxt = regnext(nxt);
3916                     if (!REGNODE_SIMPLE(OP(nxt))
3917                         && !(PL_regkind[OP(nxt)] == EXACT
3918                              && STR_LEN(nxt) == 1))
3919                         goto nogo;
3920 #ifdef DEBUGGING
3921                     nxt2 = nxt;
3922 #endif
3923                     nxt = regnext(nxt);
3924                     if (OP(nxt) != CLOSE)
3925                         goto nogo;
3926                     if (RExC_open_parens) {
3927                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3928                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3929                     }
3930                     /* Now we know that nxt2 is the only contents: */
3931                     oscan->flags = (U8)ARG(nxt);
3932                     OP(oscan) = CURLYN;
3933                     OP(nxt1) = NOTHING; /* was OPEN. */
3934
3935 #ifdef DEBUGGING
3936                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3937                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3938                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3939                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3940                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3941                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3942 #endif
3943                 }
3944               nogo:
3945
3946                 /* Try optimization CURLYX => CURLYM. */
3947                 if (  OP(oscan) == CURLYX && data
3948                       && !(data->flags & SF_HAS_PAR)
3949                       && !(data->flags & SF_HAS_EVAL)
3950                       && !deltanext     /* atom is fixed width */
3951                       && minnext != 0   /* CURLYM can't handle zero width */
3952                 ) {
3953                     /* XXXX How to optimize if data == 0? */
3954                     /* Optimize to a simpler form.  */
3955                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3956                     regnode *nxt2;
3957
3958                     OP(oscan) = CURLYM;
3959                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3960                             && (OP(nxt2) != WHILEM))
3961                         nxt = nxt2;
3962                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3963                     /* Need to optimize away parenths. */
3964                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3965                         /* Set the parenth number.  */
3966                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3967
3968                         oscan->flags = (U8)ARG(nxt);
3969                         if (RExC_open_parens) {
3970                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3971                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3972                         }
3973                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3974                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3975
3976 #ifdef DEBUGGING
3977                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3978                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3979                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3980                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3981 #endif
3982 #if 0
3983                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3984                             regnode *nnxt = regnext(nxt1);
3985                             if (nnxt == nxt) {
3986                                 if (reg_off_by_arg[OP(nxt1)])
3987                                     ARG_SET(nxt1, nxt2 - nxt1);
3988                                 else if (nxt2 - nxt1 < U16_MAX)
3989                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3990                                 else
3991                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3992                             }
3993                             nxt1 = nnxt;
3994                         }
3995 #endif
3996                         /* Optimize again: */
3997                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3998                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3999                     }
4000                     else
4001                         oscan->flags = 0;
4002                 }
4003                 else if ((OP(oscan) == CURLYX)
4004                          && (flags & SCF_WHILEM_VISITED_POS)
4005                          /* See the comment on a similar expression above.
4006                             However, this time it's not a subexpression
4007                             we care about, but the expression itself. */
4008                          && (maxcount == REG_INFTY)
4009                          && data && ++data->whilem_c < 16) {
4010                     /* This stays as CURLYX, we can put the count/of pair. */
4011                     /* Find WHILEM (as in regexec.c) */
4012                     regnode *nxt = oscan + NEXT_OFF(oscan);
4013
4014                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4015                         nxt += ARG(nxt);
4016                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4017                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4018                 }
4019                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4020                     pars++;
4021                 if (flags & SCF_DO_SUBSTR) {
4022                     SV *last_str = NULL;
4023                     int counted = mincount != 0;
4024
4025                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4026 #if defined(SPARC64_GCC_WORKAROUND)
4027                         I32 b = 0;
4028                         STRLEN l = 0;
4029                         const char *s = NULL;
4030                         I32 old = 0;
4031
4032                         if (pos_before >= data->last_start_min)
4033                             b = pos_before;
4034                         else
4035                             b = data->last_start_min;
4036
4037                         l = 0;
4038                         s = SvPV_const(data->last_found, l);
4039                         old = b - data->last_start_min;
4040
4041 #else
4042                         I32 b = pos_before >= data->last_start_min
4043                             ? pos_before : data->last_start_min;
4044                         STRLEN l;
4045                         const char * const s = SvPV_const(data->last_found, l);
4046                         I32 old = b - data->last_start_min;
4047 #endif
4048
4049                         if (UTF)
4050                             old = utf8_hop((U8*)s, old) - (U8*)s;
4051                         l -= old;
4052                         /* Get the added string: */
4053                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4054                         if (deltanext == 0 && pos_before == b) {
4055                             /* What was added is a constant string */
4056                             if (mincount > 1) {
4057                                 SvGROW(last_str, (mincount * l) + 1);
4058                                 repeatcpy(SvPVX(last_str) + l,
4059                                           SvPVX_const(last_str), l, mincount - 1);
4060                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4061                                 /* Add additional parts. */
4062                                 SvCUR_set(data->last_found,
4063                                           SvCUR(data->last_found) - l);
4064                                 sv_catsv(data->last_found, last_str);
4065                                 {
4066                                     SV * sv = data->last_found;
4067                                     MAGIC *mg =
4068                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4069                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4070                                     if (mg && mg->mg_len >= 0)
4071                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4072                                 }
4073                                 data->last_end += l * (mincount - 1);
4074                             }
4075                         } else {
4076                             /* start offset must point into the last copy */
4077                             data->last_start_min += minnext * (mincount - 1);
4078                             data->last_start_max += is_inf ? I32_MAX
4079                                 : (maxcount - 1) * (minnext + data->pos_delta);
4080                         }
4081                     }
4082                     /* It is counted once already... */
4083                     data->pos_min += minnext * (mincount - counted);
4084                     data->pos_delta += - counted * deltanext +
4085                         (minnext + deltanext) * maxcount - minnext * mincount;
4086                     if (mincount != maxcount) {
4087                          /* Cannot extend fixed substrings found inside
4088                             the group.  */
4089                         SCAN_COMMIT(pRExC_state,data,minlenp);
4090                         if (mincount && last_str) {
4091                             SV * const sv = data->last_found;
4092                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4093                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4094
4095                             if (mg)
4096                                 mg->mg_len = -1;
4097                             sv_setsv(sv, last_str);
4098                             data->last_end = data->pos_min;
4099                             data->last_start_min =
4100                                 data->pos_min - CHR_SVLEN(last_str);
4101                             data->last_start_max = is_inf
4102                                 ? I32_MAX
4103                                 : data->pos_min + data->pos_delta
4104                                 - CHR_SVLEN(last_str);
4105                         }
4106                         data->longest = &(data->longest_float);
4107                     }
4108                     SvREFCNT_dec(last_str);
4109                 }
4110                 if (data && (fl & SF_HAS_EVAL))
4111                     data->flags |= SF_HAS_EVAL;
4112               optimize_curly_tail:
4113                 if (OP(oscan) != CURLYX) {
4114                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4115                            && NEXT_OFF(next))
4116                         NEXT_OFF(oscan) += NEXT_OFF(next);
4117                 }
4118                 continue;
4119             default:                    /* REF, ANYOFV, and CLUMP only? */
4120                 if (flags & SCF_DO_SUBSTR) {
4121                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4122                     data->longest = &(data->longest_float);
4123                 }
4124                 is_inf = is_inf_internal = 1;
4125                 if (flags & SCF_DO_STCLASS_OR)
4126                     cl_anything(pRExC_state, data->start_class);
4127                 flags &= ~SCF_DO_STCLASS;
4128                 break;
4129             }
4130         }
4131         else if (OP(scan) == LNBREAK) {
4132             if (flags & SCF_DO_STCLASS) {
4133                 int value = 0;
4134                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4135                 if (flags & SCF_DO_STCLASS_AND) {
4136                     for (value = 0; value < 256; value++)
4137                         if (!is_VERTWS_cp(value))
4138                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4139                 }
4140                 else {
4141                     for (value = 0; value < 256; value++)
4142                         if (is_VERTWS_cp(value))
4143                             ANYOF_BITMAP_SET(data->start_class, value);
4144                 }
4145                 if (flags & SCF_DO_STCLASS_OR)
4146                     cl_and(data->start_class, and_withp);
4147                 flags &= ~SCF_DO_STCLASS;
4148             }
4149             min += 1;
4150             delta += 1;
4151             if (flags & SCF_DO_SUBSTR) {
4152                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4153                 data->pos_min += 1;
4154                 data->pos_delta += 1;
4155                 data->longest = &(data->longest_float);
4156             }
4157         }
4158         else if (REGNODE_SIMPLE(OP(scan))) {
4159             int value = 0;
4160
4161             if (flags & SCF_DO_SUBSTR) {
4162                 SCAN_COMMIT(pRExC_state,data,minlenp);
4163                 data->pos_min++;
4164             }
4165             min++;
4166             if (flags & SCF_DO_STCLASS) {
4167                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4168
4169                 /* Some of the logic below assumes that switching
4170                    locale on will only add false positives. */
4171                 switch (PL_regkind[OP(scan)]) {
4172                 case SANY:
4173                 default:
4174                   do_default:
4175                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4176                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4177                         cl_anything(pRExC_state, data->start_class);
4178                     break;
4179                 case REG_ANY:
4180                     if (OP(scan) == SANY)
4181                         goto do_default;
4182                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4183                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4184                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4185                         cl_anything(pRExC_state, data->start_class);
4186                     }
4187                     if (flags & SCF_DO_STCLASS_AND || !value)
4188                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4189                     break;
4190                 case ANYOF:
4191                     if (flags & SCF_DO_STCLASS_AND)
4192                         cl_and(data->start_class,
4193                                (struct regnode_charclass_class*)scan);
4194                     else
4195                         cl_or(pRExC_state, data->start_class,
4196                               (struct regnode_charclass_class*)scan);
4197                     break;
4198                 case ALNUM:
4199                     if (flags & SCF_DO_STCLASS_AND) {
4200                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4201                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4202                             if (OP(scan) == ALNUMU) {
4203                                 for (value = 0; value < 256; value++) {
4204                                     if (!isWORDCHAR_L1(value)) {
4205                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4206                                     }
4207                                 }
4208                             } else {
4209                                 for (value = 0; value < 256; value++) {
4210                                     if (!isALNUM(value)) {
4211                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4212                                     }
4213                                 }
4214                             }
4215                         }
4216                     }
4217                     else {
4218                         if (data->start_class->flags & ANYOF_LOCALE)
4219                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4220
4221                         /* Even if under locale, set the bits for non-locale
4222                          * in case it isn't a true locale-node.  This will
4223                          * create false positives if it truly is locale */
4224                         if (OP(scan) == ALNUMU) {
4225                             for (value = 0; value < 256; value++) {
4226                                 if (isWORDCHAR_L1(value)) {
4227                                     ANYOF_BITMAP_SET(data->start_class, value);
4228                                 }
4229                             }
4230                         } else {
4231                             for (value = 0; value < 256; value++) {
4232                                 if (isALNUM(value)) {
4233                                     ANYOF_BITMAP_SET(data->start_class, value);
4234                                 }
4235                             }
4236                         }
4237                     }
4238                     break;
4239                 case NALNUM:
4240                     if (flags & SCF_DO_STCLASS_AND) {
4241                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4242                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4243                             if (OP(scan) == NALNUMU) {
4244                                 for (value = 0; value < 256; value++) {
4245                                     if (isWORDCHAR_L1(value)) {
4246                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4247                                     }
4248                                 }
4249                             } else {
4250                                 for (value = 0; value < 256; value++) {
4251                                     if (isALNUM(value)) {
4252                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4253                                     }
4254                                 }
4255                             }
4256                         }
4257                     }
4258                     else {
4259                         if (data->start_class->flags & ANYOF_LOCALE)
4260                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4261
4262                         /* Even if under locale, set the bits for non-locale in
4263                          * case it isn't a true locale-node.  This will create
4264                          * false positives if it truly is locale */
4265                         if (OP(scan) == NALNUMU) {
4266                             for (value = 0; value < 256; value++) {
4267                                 if (! isWORDCHAR_L1(value)) {
4268                                     ANYOF_BITMAP_SET(data->start_class, value);
4269                                 }
4270                             }
4271                         } else {
4272                             for (value = 0; value < 256; value++) {
4273                                 if (! isALNUM(value)) {
4274                                     ANYOF_BITMAP_SET(data->start_class, value);
4275                                 }
4276                             }
4277                         }
4278                     }
4279                     break;
4280                 case SPACE:
4281                     if (flags & SCF_DO_STCLASS_AND) {
4282                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4283                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4284                             if (OP(scan) == SPACEU) {
4285                                 for (value = 0; value < 256; value++) {
4286                                     if (!isSPACE_L1(value)) {
4287                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4288                                     }
4289                                 }
4290                             } else {
4291                                 for (value = 0; value < 256; value++) {
4292                                     if (!isSPACE(value)) {
4293                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4294                                     }
4295                                 }
4296                             }
4297                         }
4298                     }
4299                     else {
4300                         if (data->start_class->flags & ANYOF_LOCALE) {
4301                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4302                         }
4303                         if (OP(scan) == SPACEU) {
4304                             for (value = 0; value < 256; value++) {
4305                                 if (isSPACE_L1(value)) {
4306                                     ANYOF_BITMAP_SET(data->start_class, value);
4307                                 }
4308                             }
4309                         } else {
4310                             for (value = 0; value < 256; value++) {
4311                                 if (isSPACE(value)) {
4312                                     ANYOF_BITMAP_SET(data->start_class, value);
4313                                 }
4314                             }
4315                         }
4316                     }
4317                     break;
4318                 case NSPACE:
4319                     if (flags & SCF_DO_STCLASS_AND) {
4320                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4321                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4322                             if (OP(scan) == NSPACEU) {
4323                                 for (value = 0; value < 256; value++) {
4324                                     if (isSPACE_L1(value)) {
4325                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4326                                     }
4327                                 }
4328                             } else {
4329                                 for (value = 0; value < 256; value++) {
4330                                     if (isSPACE(value)) {
4331                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4332                                     }
4333                                 }
4334                             }
4335                         }
4336                     }
4337                     else {
4338                         if (data->start_class->flags & ANYOF_LOCALE)
4339                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4340                         if (OP(scan) == NSPACEU) {
4341                             for (value = 0; value < 256; value++) {
4342                                 if (!isSPACE_L1(value)) {
4343                                     ANYOF_BITMAP_SET(data->start_class, value);
4344                                 }
4345                             }
4346                         }
4347                         else {
4348                             for (value = 0; value < 256; value++) {
4349                                 if (!isSPACE(value)) {
4350                                     ANYOF_BITMAP_SET(data->start_class, value);
4351                                 }
4352                             }
4353                         }
4354                     }
4355                     break;
4356                 case DIGIT:
4357                     if (flags & SCF_DO_STCLASS_AND) {
4358                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4359                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4360                             for (value = 0; value < 256; value++)
4361                                 if (!isDIGIT(value))
4362                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4363                         }
4364                     }
4365                     else {
4366                         if (data->start_class->flags & ANYOF_LOCALE)
4367                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4368                         for (value = 0; value < 256; value++)
4369                             if (isDIGIT(value))
4370                                 ANYOF_BITMAP_SET(data->start_class, value);
4371                     }
4372                     break;
4373                 case NDIGIT:
4374                     if (flags & SCF_DO_STCLASS_AND) {
4375                         if (!(data->start_class->flags & ANYOF_LOCALE))
4376                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4377                         for (value = 0; value < 256; value++)
4378                             if (isDIGIT(value))
4379                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4380                     }
4381                     else {
4382                         if (data->start_class->flags & ANYOF_LOCALE)
4383                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4384                         for (value = 0; value < 256; value++)
4385                             if (!isDIGIT(value))
4386                                 ANYOF_BITMAP_SET(data->start_class, value);
4387                     }
4388                     break;
4389                 CASE_SYNST_FNC(VERTWS);
4390                 CASE_SYNST_FNC(HORIZWS);
4391
4392                 }
4393                 if (flags & SCF_DO_STCLASS_OR)
4394                     cl_and(data->start_class, and_withp);
4395                 flags &= ~SCF_DO_STCLASS;
4396             }
4397         }
4398         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4399             data->flags |= (OP(scan) == MEOL
4400                             ? SF_BEFORE_MEOL
4401                             : SF_BEFORE_SEOL);
4402         }
4403         else if (  PL_regkind[OP(scan)] == BRANCHJ
4404                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4405                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4406                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4407             if ( OP(scan) == UNLESSM &&
4408                  scan->flags == 0 &&
4409                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4410                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4411             ) {
4412                 regnode *opt;
4413                 regnode *upto= regnext(scan);
4414                 DEBUG_PARSE_r({
4415                     SV * const mysv_val=sv_newmortal();
4416                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4417
4418                     /*DEBUG_PARSE_MSG("opfail");*/
4419                     regprop(RExC_rx, mysv_val, upto);
4420                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4421                                   SvPV_nolen_const(mysv_val),
4422                                   (IV)REG_NODE_NUM(upto),
4423                                   (IV)(upto - scan)
4424                     );
4425                 });
4426                 OP(scan) = OPFAIL;
4427                 NEXT_OFF(scan) = upto - scan;
4428                 for (opt= scan + 1; opt < upto ; opt++)
4429                     OP(opt) = OPTIMIZED;
4430                 scan= upto;
4431                 continue;
4432             }
4433             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4434                 || OP(scan) == UNLESSM )
4435             {
4436                 /* Negative Lookahead/lookbehind
4437                    In this case we can't do fixed string optimisation.
4438                 */
4439
4440                 I32 deltanext, minnext, fake = 0;
4441                 regnode *nscan;
4442                 struct regnode_charclass_class intrnl;
4443                 int f = 0;
4444
4445                 data_fake.flags = 0;
4446                 if (data) {
4447                     data_fake.whilem_c = data->whilem_c;
4448                     data_fake.last_closep = data->last_closep;
4449                 }
4450                 else
4451                     data_fake.last_closep = &fake;
4452                 data_fake.pos_delta = delta;
4453                 if ( flags & SCF_DO_STCLASS && !scan->flags
4454                      && OP(scan) == IFMATCH ) { /* Lookahead */
4455                     cl_init(pRExC_state, &intrnl);
4456                     data_fake.start_class = &intrnl;
4457                     f |= SCF_DO_STCLASS_AND;
4458                 }
4459                 if (flags & SCF_WHILEM_VISITED_POS)
4460                     f |= SCF_WHILEM_VISITED_POS;
4461                 next = regnext(scan);
4462                 nscan = NEXTOPER(NEXTOPER(scan));
4463                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4464                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4465                 if (scan->flags) {
4466                     if (deltanext) {
4467                         FAIL("Variable length lookbehind not implemented");
4468                     }
4469                     else if (minnext > (I32)U8_MAX) {
4470                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4471                     }
4472                     scan->flags = (U8)minnext;
4473                 }
4474                 if (data) {
4475                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4476                         pars++;
4477                     if (data_fake.flags & SF_HAS_EVAL)
4478                         data->flags |= SF_HAS_EVAL;
4479                     data->whilem_c = data_fake.whilem_c;
4480                 }
4481                 if (f & SCF_DO_STCLASS_AND) {
4482                     if (flags & SCF_DO_STCLASS_OR) {
4483                         /* OR before, AND after: ideally we would recurse with
4484                          * data_fake to get the AND applied by study of the
4485                          * remainder of the pattern, and then derecurse;
4486                          * *** HACK *** for now just treat as "no information".
4487                          * See [perl #56690].
4488                          */
4489                         cl_init(pRExC_state, data->start_class);
4490                     }  else {
4491                         /* AND before and after: combine and continue */
4492                         const int was = (data->start_class->flags & ANYOF_EOS);
4493
4494                         cl_and(data->start_class, &intrnl);
4495                         if (was)
4496                             data->start_class->flags |= ANYOF_EOS;
4497                     }
4498                 }
4499             }
4500 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4501             else {
4502                 /* Positive Lookahead/lookbehind
4503                    In this case we can do fixed string optimisation,
4504                    but we must be careful about it. Note in the case of
4505                    lookbehind the positions will be offset by the minimum
4506                    length of the pattern, something we won't know about
4507                    until after the recurse.
4508                 */
4509                 I32 deltanext, fake = 0;
4510                 regnode *nscan;
4511                 struct regnode_charclass_class intrnl;
4512                 int f = 0;
4513                 /* We use SAVEFREEPV so that when the full compile
4514                     is finished perl will clean up the allocated
4515                     minlens when it's all done. This way we don't
4516                     have to worry about freeing them when we know
4517                     they wont be used, which would be a pain.
4518                  */
4519                 I32 *minnextp;
4520                 Newx( minnextp, 1, I32 );
4521                 SAVEFREEPV(minnextp);
4522
4523                 if (data) {
4524                     StructCopy(data, &data_fake, scan_data_t);
4525                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4526                         f |= SCF_DO_SUBSTR;
4527                         if (scan->flags)
4528                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4529                         data_fake.last_found=newSVsv(data->last_found);
4530                     }
4531                 }
4532                 else
4533                     data_fake.last_closep = &fake;
4534                 data_fake.flags = 0;
4535                 data_fake.pos_delta = delta;
4536                 if (is_inf)
4537                     data_fake.flags |= SF_IS_INF;
4538                 if ( flags & SCF_DO_STCLASS && !scan->flags
4539                      && OP(scan) == IFMATCH ) { /* Lookahead */
4540                     cl_init(pRExC_state, &intrnl);
4541                     data_fake.start_class = &intrnl;
4542                     f |= SCF_DO_STCLASS_AND;
4543                 }
4544                 if (flags & SCF_WHILEM_VISITED_POS)
4545                     f |= SCF_WHILEM_VISITED_POS;
4546                 next = regnext(scan);
4547                 nscan = NEXTOPER(NEXTOPER(scan));
4548
4549                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4550                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4551                 if (scan->flags) {
4552                     if (deltanext) {
4553                         FAIL("Variable length lookbehind not implemented");
4554                     }
4555                     else if (*minnextp > (I32)U8_MAX) {
4556                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4557                     }
4558                     scan->flags = (U8)*minnextp;
4559                 }
4560
4561                 *minnextp += min;
4562
4563                 if (f & SCF_DO_STCLASS_AND) {
4564                     const int was = (data->start_class->flags & ANYOF_EOS);
4565
4566                     cl_and(data->start_class, &intrnl);
4567                     if (was)
4568                         data->start_class->flags |= ANYOF_EOS;
4569                 }
4570                 if (data) {
4571                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4572                         pars++;
4573                     if (data_fake.flags & SF_HAS_EVAL)
4574                         data->flags |= SF_HAS_EVAL;
4575                     data->whilem_c = data_fake.whilem_c;
4576                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4577                         if (RExC_rx->minlen<*minnextp)
4578                             RExC_rx->minlen=*minnextp;
4579                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4580                         SvREFCNT_dec(data_fake.last_found);
4581
4582                         if ( data_fake.minlen_fixed != minlenp )
4583                         {
4584                             data->offset_fixed= data_fake.offset_fixed;
4585                             data->minlen_fixed= data_fake.minlen_fixed;
4586                             data->lookbehind_fixed+= scan->flags;
4587                         }
4588                         if ( data_fake.minlen_float != minlenp )
4589                         {
4590                             data->minlen_float= data_fake.minlen_float;
4591                             data->offset_float_min=data_fake.offset_float_min;
4592                             data->offset_float_max=data_fake.offset_float_max;
4593                             data->lookbehind_float+= scan->flags;
4594                         }
4595                     }
4596                 }
4597             }
4598 #endif
4599         }
4600         else if (OP(scan) == OPEN) {
4601             if (stopparen != (I32)ARG(scan))
4602                 pars++;
4603         }
4604         else if (OP(scan) == CLOSE) {
4605             if (stopparen == (I32)ARG(scan)) {
4606                 break;
4607             }
4608             if ((I32)ARG(scan) == is_par) {
4609                 next = regnext(scan);
4610
4611                 if ( next && (OP(next) != WHILEM) && next < last)
4612                     is_par = 0;         /* Disable optimization */
4613             }
4614             if (data)
4615                 *(data->last_closep) = ARG(scan);
4616         }
4617         else if (OP(scan) == EVAL) {
4618                 if (data)
4619                     data->flags |= SF_HAS_EVAL;
4620         }
4621         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4622             if (flags & SCF_DO_SUBSTR) {
4623                 SCAN_COMMIT(pRExC_state,data,minlenp);
4624                 flags &= ~SCF_DO_SUBSTR;
4625             }
4626             if (data && OP(scan)==ACCEPT) {
4627                 data->flags |= SCF_SEEN_ACCEPT;
4628                 if (stopmin > min)
4629                     stopmin = min;
4630             }
4631         }
4632         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4633         {
4634                 if (flags & SCF_DO_SUBSTR) {
4635                     SCAN_COMMIT(pRExC_state,data,minlenp);
4636                     data->longest = &(data->longest_float);
4637                 }
4638                 is_inf = is_inf_internal = 1;
4639                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4640                     cl_anything(pRExC_state, data->start_class);
4641                 flags &= ~SCF_DO_STCLASS;
4642         }
4643         else if (OP(scan) == GPOS) {
4644             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4645                 !(delta || is_inf || (data && data->pos_delta)))
4646             {
4647                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4648                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4649                 if (RExC_rx->gofs < (U32)min)
4650                     RExC_rx->gofs = min;
4651             } else {
4652                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4653                 RExC_rx->gofs = 0;
4654             }
4655         }
4656 #ifdef TRIE_STUDY_OPT
4657 #ifdef FULL_TRIE_STUDY
4658         else if (PL_regkind[OP(scan)] == TRIE) {
4659             /* NOTE - There is similar code to this block above for handling
4660                BRANCH nodes on the initial study.  If you change stuff here
4661                check there too. */
4662             regnode *trie_node= scan;
4663             regnode *tail= regnext(scan);
4664             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4665             I32 max1 = 0, min1 = I32_MAX;
4666             struct regnode_charclass_class accum;
4667
4668             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4669                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4670             if (flags & SCF_DO_STCLASS)
4671                 cl_init_zero(pRExC_state, &accum);
4672
4673             if (!trie->jump) {
4674                 min1= trie->minlen;
4675                 max1= trie->maxlen;
4676             } else {
4677                 const regnode *nextbranch= NULL;
4678                 U32 word;
4679
4680                 for ( word=1 ; word <= trie->wordcount ; word++)
4681                 {
4682                     I32 deltanext=0, minnext=0, f = 0, fake;
4683                     struct regnode_charclass_class this_class;
4684
4685                     data_fake.flags = 0;
4686                     if (data) {
4687                         data_fake.whilem_c = data->whilem_c;
4688                         data_fake.last_closep = data->last_closep;
4689                     }
4690                     else
4691                         data_fake.last_closep = &fake;
4692                     data_fake.pos_delta = delta;
4693                     if (flags & SCF_DO_STCLASS) {
4694                         cl_init(pRExC_state, &this_class);
4695                         data_fake.start_class = &this_class;
4696                         f = SCF_DO_STCLASS_AND;
4697                     }
4698                     if (flags & SCF_WHILEM_VISITED_POS)
4699                         f |= SCF_WHILEM_VISITED_POS;
4700
4701                     if (trie->jump[word]) {
4702                         if (!nextbranch)
4703                             nextbranch = trie_node + trie->jump[0];
4704                         scan= trie_node + trie->jump[word];
4705                         /* We go from the jump point to the branch that follows
4706                            it. Note this means we need the vestigal unused branches
4707                            even though they arent otherwise used.
4708                          */
4709                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4710                             &deltanext, (regnode *)nextbranch, &data_fake,
4711                             stopparen, recursed, NULL, f,depth+1);
4712                     }
4713                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4714                         nextbranch= regnext((regnode*)nextbranch);
4715
4716                     if (min1 > (I32)(minnext + trie->minlen))
4717                         min1 = minnext + trie->minlen;
4718                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4719                         max1 = minnext + deltanext + trie->maxlen;
4720                     if (deltanext == I32_MAX)
4721                         is_inf = is_inf_internal = 1;
4722
4723                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4724                         pars++;
4725                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4726                         if ( stopmin > min + min1)
4727                             stopmin = min + min1;
4728                         flags &= ~SCF_DO_SUBSTR;
4729                         if (data)
4730                             data->flags |= SCF_SEEN_ACCEPT;
4731                     }
4732                     if (data) {
4733                         if (data_fake.flags & SF_HAS_EVAL)
4734                             data->flags |= SF_HAS_EVAL;
4735                         data->whilem_c = data_fake.whilem_c;
4736                     }
4737                     if (flags & SCF_DO_STCLASS)
4738                         cl_or(pRExC_state, &accum, &this_class);
4739                 }
4740             }
4741             if (flags & SCF_DO_SUBSTR) {
4742                 data->pos_min += min1;
4743                 data->pos_delta += max1 - min1;
4744                 if (max1 != min1 || is_inf)
4745                     data->longest = &(data->longest_float);
4746             }
4747             min += min1;
4748             delta += max1 - min1;
4749             if (flags & SCF_DO_STCLASS_OR) {
4750                 cl_or(pRExC_state, data->start_class, &accum);
4751                 if (min1) {
4752                     cl_and(data->start_class, and_withp);
4753                     flags &= ~SCF_DO_STCLASS;
4754                 }
4755             }
4756             else if (flags & SCF_DO_STCLASS_AND) {
4757                 if (min1) {
4758                     cl_and(data->start_class, &accum);
4759                     flags &= ~SCF_DO_STCLASS;
4760                 }
4761                 else {
4762                     /* Switch to OR mode: cache the old value of
4763                      * data->start_class */
4764                     INIT_AND_WITHP;
4765                     StructCopy(data->start_class, and_withp,
4766                                struct regnode_charclass_class);
4767                     flags &= ~SCF_DO_STCLASS_AND;
4768                     StructCopy(&accum, data->start_class,
4769                                struct regnode_charclass_class);
4770                     flags |= SCF_DO_STCLASS_OR;
4771                     data->start_class->flags |= ANYOF_EOS;
4772                 }
4773             }
4774             scan= tail;
4775             continue;
4776         }
4777 #else
4778         else if (PL_regkind[OP(scan)] == TRIE) {
4779             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4780             U8*bang=NULL;
4781
4782             min += trie->minlen;
4783             delta += (trie->maxlen - trie->minlen);
4784             flags &= ~SCF_DO_STCLASS; /* xxx */
4785             if (flags & SCF_DO_SUBSTR) {
4786                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4787                 data->pos_min += trie->minlen;
4788                 data->pos_delta += (trie->maxlen - trie->minlen);
4789                 if (trie->maxlen != trie->minlen)
4790                     data->longest = &(data->longest_float);
4791             }
4792             if (trie->jump) /* no more substrings -- for now /grr*/
4793                 flags &= ~SCF_DO_SUBSTR;
4794         }
4795 #endif /* old or new */
4796 #endif /* TRIE_STUDY_OPT */
4797
4798         /* Else: zero-length, ignore. */
4799         scan = regnext(scan);
4800     }
4801     if (frame) {
4802         last = frame->last;
4803         scan = frame->next;
4804         stopparen = frame->stop;
4805         frame = frame->prev;
4806         goto fake_study_recurse;
4807     }
4808
4809   finish:
4810     assert(!frame);
4811     DEBUG_STUDYDATA("pre-fin:",data,depth);
4812
4813     *scanp = scan;
4814     *deltap = is_inf_internal ? I32_MAX : delta;
4815     if (flags & SCF_DO_SUBSTR && is_inf)
4816         data->pos_delta = I32_MAX - data->pos_min;
4817     if (is_par > (I32)U8_MAX)
4818         is_par = 0;
4819     if (is_par && pars==1 && data) {
4820         data->flags |= SF_IN_PAR;
4821         data->flags &= ~SF_HAS_PAR;
4822     }
4823     else if (pars && data) {
4824         data->flags |= SF_HAS_PAR;
4825         data->flags &= ~SF_IN_PAR;
4826     }
4827     if (flags & SCF_DO_STCLASS_OR)
4828         cl_and(data->start_class, and_withp);
4829     if (flags & SCF_TRIE_RESTUDY)
4830         data->flags |=  SCF_TRIE_RESTUDY;
4831
4832     DEBUG_STUDYDATA("post-fin:",data,depth);
4833
4834     return min < stopmin ? min : stopmin;
4835 }
4836
4837 STATIC U32
4838 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4839 {
4840     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4841
4842     PERL_ARGS_ASSERT_ADD_DATA;
4843
4844     Renewc(RExC_rxi->data,
4845            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4846            char, struct reg_data);
4847     if(count)
4848         Renew(RExC_rxi->data->what, count + n, U8);
4849     else
4850         Newx(RExC_rxi->data->what, n, U8);
4851     RExC_rxi->data->count = count + n;
4852     Copy(s, RExC_rxi->data->what + count, n, U8);
4853     return count;
4854 }
4855
4856 /*XXX: todo make this not included in a non debugging perl */
4857 #ifndef PERL_IN_XSUB_RE
4858 void
4859 Perl_reginitcolors(pTHX)
4860 {
4861     dVAR;
4862     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4863     if (s) {
4864         char *t = savepv(s);
4865         int i = 0;
4866         PL_colors[0] = t;
4867         while (++i < 6) {
4868             t = strchr(t, '\t');
4869             if (t) {
4870                 *t = '\0';
4871                 PL_colors[i] = ++t;
4872             }
4873             else
4874                 PL_colors[i] = t = (char *)"";
4875         }
4876     } else {
4877         int i = 0;
4878         while (i < 6)
4879             PL_colors[i++] = (char *)"";
4880     }
4881     PL_colorset = 1;
4882 }
4883 #endif
4884
4885
4886 #ifdef TRIE_STUDY_OPT
4887 #define CHECK_RESTUDY_GOTO                                  \
4888         if (                                                \
4889               (data.flags & SCF_TRIE_RESTUDY)               \
4890               && ! restudied++                              \
4891         )     goto reStudy
4892 #else
4893 #define CHECK_RESTUDY_GOTO
4894 #endif
4895
4896 /*
4897  * pregcomp - compile a regular expression into internal code
4898  *
4899  * Decides which engine's compiler to call based on the hint currently in
4900  * scope
4901  */
4902
4903 #ifndef PERL_IN_XSUB_RE
4904
4905 /* return the currently in-scope regex engine (or the default if none)  */
4906
4907 regexp_engine const *
4908 Perl_current_re_engine(pTHX)
4909 {
4910     dVAR;
4911
4912     if (IN_PERL_COMPILETIME) {
4913         HV * const table = GvHV(PL_hintgv);
4914         SV **ptr;
4915
4916         if (!table)
4917             return &PL_core_reg_engine;
4918         ptr = hv_fetchs(table, "regcomp", FALSE);
4919         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4920             return &PL_core_reg_engine;
4921         return INT2PTR(regexp_engine*,SvIV(*ptr));
4922     }
4923     else {
4924         SV *ptr;
4925         if (!PL_curcop->cop_hints_hash)
4926             return &PL_core_reg_engine;
4927         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4928         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4929             return &PL_core_reg_engine;
4930         return INT2PTR(regexp_engine*,SvIV(ptr));
4931     }
4932 }
4933
4934
4935 REGEXP *
4936 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4937 {
4938     dVAR;
4939     regexp_engine const *eng = current_re_engine();
4940     GET_RE_DEBUG_FLAGS_DECL;
4941
4942     PERL_ARGS_ASSERT_PREGCOMP;
4943
4944     /* Dispatch a request to compile a regexp to correct regexp engine. */
4945     DEBUG_COMPILE_r({
4946         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4947                         PTR2UV(eng));
4948     });
4949     return CALLREGCOMP_ENG(eng, pattern, flags);
4950 }
4951 #endif
4952
4953 /* public(ish) wrapper for Perl_re_op_compile that only takes an SV
4954  * pattern rather than a list of OPs */
4955
4956 REGEXP *
4957 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4958 {
4959     SV *pat = pattern; /* defeat constness! */
4960     PERL_ARGS_ASSERT_RE_COMPILE;
4961     return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
4962                                 NULL, NULL, rx_flags, 0);
4963 }
4964
4965 /* see if there are any run-time code blocks in the pattern.
4966  * False positives are allowed */
4967
4968 static bool
4969 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4970                     U32 pm_flags, char *pat, STRLEN plen)
4971 {
4972     int n = 0;
4973     STRLEN s;
4974
4975     /* avoid infinitely recursing when we recompile the pattern parcelled up
4976      * as qr'...'. A single constant qr// string can't have have any
4977      * run-time component in it, and thus, no runtime code. (A non-qr
4978      * string, however, can, e.g. $x =~ '(?{})') */
4979     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4980         return 0;
4981
4982     for (s = 0; s < plen; s++) {
4983         if (n < pRExC_state->num_code_blocks
4984             && s == pRExC_state->code_blocks[n].start)
4985         {
4986             s = pRExC_state->code_blocks[n].end;
4987             n++;
4988             continue;
4989         }
4990         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4991          * positives here */
4992         if (pat[s] == '(' && pat[s+1] == '?' &&
4993             (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4994         )
4995             return 1;
4996     }
4997     return 0;
4998 }
4999
5000 /* Handle run-time code blocks. We will already have compiled any direct
5001  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5002  * copy of it, but with any literal code blocks blanked out and
5003  * appropriate chars escaped; then feed it into
5004  *
5005  *    eval "qr'modified_pattern'"
5006  *
5007  * For example,
5008  *
5009  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5010  *
5011  * becomes
5012  *
5013  *    qr'a\\bc                       def\'ghi\\\\jkl(?{"this is runtime"})mno'
5014  *
5015  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5016  * and merge them with any code blocks of the original regexp.
5017  *
5018  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5019  * instead, just save the qr and return FALSE; this tells our caller that
5020  * the original pattern needs upgrading to utf8.
5021  */
5022
5023 bool
5024 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5025     char *pat, STRLEN plen)
5026 {
5027     SV *qr;
5028
5029     GET_RE_DEBUG_FLAGS_DECL;
5030
5031     if (pRExC_state->runtime_code_qr) {
5032         /* this is the second time we've been called; this should
5033          * only happen if the main pattern got upgraded to utf8
5034          * during compilation; re-use the qr we compiled first time
5035          * round (which should be utf8 too)
5036          */
5037         qr = pRExC_state->runtime_code_qr;
5038         pRExC_state->runtime_code_qr = NULL;
5039         assert(RExC_utf8 && SvUTF8(qr));
5040     }
5041     else {
5042         int n = 0;
5043         STRLEN s;
5044         char *p, *newpat;
5045         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5046         SV *sv, *qr_ref;
5047         dSP;
5048
5049         /* determine how many extra chars we need for ' and \ escaping */
5050         for (s = 0; s < plen; s++) {
5051             if (pat[s] == '\'' || pat[s] == '\\')
5052                 newlen++;
5053         }
5054
5055         Newx(newpat, newlen, char);
5056         p = newpat;
5057         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5058
5059         for (s = 0; s < plen; s++) {
5060             if (n < pRExC_state->num_code_blocks
5061                 && s == pRExC_state->code_blocks[n].start)
5062             {
5063                 /* blank out literal code block */
5064                 assert(pat[s] == '(');
5065                 while (s <= pRExC_state->code_blocks[n].end) {
5066                     *p++ = ' ';
5067                     s++;
5068                 }
5069                 s--;
5070                 n++;
5071                 continue;
5072             }
5073             if (pat[s] == '\'' || pat[s] == '\\')
5074                 *p++ = '\\';
5075             *p++ = pat[s];
5076         }
5077         *p++ = '\'';
5078         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5079             *p++ = 'x';
5080         *p++ = '\0';
5081         DEBUG_COMPILE_r({
5082             PerlIO_printf(Perl_debug_log,
5083                 "%sre-parsing pattern for runtime code:%s %s\n",
5084                 PL_colors[4],PL_colors[5],newpat);
5085         });
5086
5087         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5088         Safefree(newpat);
5089
5090         ENTER;
5091         SAVETMPS;
5092         save_re_context();
5093         PUSHSTACKi(PERLSI_REQUIRE);
5094         /* this causes the toker to collapse \\ into \ when parsing
5095          * qr''; normally only q'' does this. It also alters hints
5096          * handling */
5097         PL_reg_state.re_reparsing = TRUE;
5098         eval_sv(sv, G_SCALAR);
5099         SvREFCNT_dec(sv);
5100         SPAGAIN;
5101         qr_ref = POPs;
5102         PUTBACK;
5103         if (SvTRUE(ERRSV))
5104             Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5105         assert(SvROK(qr_ref));
5106         qr = SvRV(qr_ref);
5107         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5108         /* the leaving below frees the tmp qr_ref.
5109          * Give qr a life of its own */
5110         SvREFCNT_inc(qr);
5111         POPSTACK;
5112         FREETMPS;
5113         LEAVE;
5114
5115     }
5116
5117     if (!RExC_utf8 && SvUTF8(qr)) {
5118         /* first time through; the pattern got upgraded; save the
5119          * qr for the next time through */
5120         assert(!pRExC_state->runtime_code_qr);
5121         pRExC_state->runtime_code_qr = qr;
5122         return 0;
5123     }
5124
5125
5126     /* extract any code blocks within the returned qr//  */
5127
5128
5129     /* merge the main (r1) and run-time (r2) code blocks into one */
5130     {
5131         RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5132         struct reg_code_block *new_block, *dst;
5133         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5134         int i1 = 0, i2 = 0;
5135
5136         if (!r2->num_code_blocks) /* we guessed wrong */
5137             return 1;
5138
5139         Newx(new_block,
5140             r1->num_code_blocks + r2->num_code_blocks,
5141             struct reg_code_block);
5142         dst = new_block;
5143
5144         while (    i1 < r1->num_code_blocks
5145                 || i2 < r2->num_code_blocks)
5146         {
5147             struct reg_code_block *src;
5148             bool is_qr = 0;
5149
5150             if (i1 == r1->num_code_blocks) {
5151                 src = &r2->code_blocks[i2++];
5152                 is_qr = 1;
5153             }
5154             else if (i2 == r2->num_code_blocks)
5155                 src = &r1->code_blocks[i1++];
5156             else if (  r1->code_blocks[i1].start
5157                      < r2->code_blocks[i2].start)
5158             {
5159                 src = &r1->code_blocks[i1++];
5160                 assert(src->end < r2->code_blocks[i2].start);
5161             }
5162             else {
5163                 assert(  r1->code_blocks[i1].start
5164                        > r2->code_blocks[i2].start);
5165                 src = &r2->code_blocks[i2++];
5166                 is_qr = 1;
5167                 assert(src->end < r1->code_blocks[i1].start);
5168             }
5169
5170             assert(pat[src->start] == '(');
5171             assert(pat[src->end]   == ')');
5172             dst->start      = src->start;
5173             dst->end        = src->end;
5174             dst->block      = src->block;
5175             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5176                                     : src->src_regex;
5177             dst++;
5178         }
5179         r1->num_code_blocks += r2->num_code_blocks;
5180         Safefree(r1->code_blocks);
5181         r1->code_blocks = new_block;
5182     }
5183
5184     SvREFCNT_dec(qr);
5185     return 1;
5186 }
5187
5188
5189 /*
5190  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5191  * regular expression into internal code.
5192  * The pattern may be passed either as:
5193  *    a list of SVs (patternp plus pat_count)
5194  *    a list of OPs (expr)
5195  * If both are passed, the SV list is used, but the OP list indicates
5196  * which SVs are actually pre-compiled code blocks
5197  *
5198  * The SVs in the list have magic and qr overloading applied to them (and
5199  * the list may be modified in-place with replacement SVs in the latter
5200  * case).
5201  *
5202  * If the pattern hasn't changed from old_re, then old_re will be
5203  * returned.
5204  *
5205  * eng is the current engine. If that engine has an op_comp method, then
5206  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5207  * do the initial concatenation of arguments and pass on to the external
5208  * engine.
5209  *
5210  * If is_bare_re is not null, set it to a boolean indicating whether the
5211  * arg list reduced (after overloading) to a single bare regex which has
5212  * been returned (i.e. /$qr/).
5213  *
5214  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5215  *
5216  * pm_flags contains the PMf_* flags, typically based on those from the
5217  * pm_flags field of the related PMOP. Currently we're only interested in
5218  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5219  *
5220  * We can't allocate space until we know how big the compiled form will be,
5221  * but we can't compile it (and thus know how big it is) until we've got a
5222  * place to put the code.  So we cheat:  we compile it twice, once with code
5223  * generation turned off and size counting turned on, and once "for real".
5224  * This also means that we don't allocate space until we are sure that the
5225  * thing really will compile successfully, and we never have to move the
5226  * code and thus invalidate pointers into it.  (Note that it has to be in
5227  * one piece because free() must be able to free it all.) [NB: not true in perl]
5228  *
5229  * Beware that the optimization-preparation code in here knows about some
5230  * of the structure of the compiled regexp.  [I'll say.]
5231  */
5232
5233 REGEXP *
5234 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5235                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5236                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5237 {
5238     dVAR;
5239     REGEXP *rx;
5240     struct regexp *r;
5241     register regexp_internal *ri;
5242     STRLEN plen;
5243     char  * VOL exp;
5244     char* xend;
5245     regnode *scan;
5246     I32 flags;
5247     I32 minlen = 0;
5248     U32 rx_flags;
5249     SV * VOL pat;
5250
5251     /* these are all flags - maybe they should be turned
5252      * into a single int with different bit masks */
5253     I32 sawlookahead = 0;
5254     I32 sawplus = 0;
5255     I32 sawopen = 0;
5256     bool used_setjump = FALSE;
5257     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5258     bool code_is_utf8 = 0;
5259     bool VOL recompile = 0;
5260     bool runtime_code = 0;
5261     U8 jump_ret = 0;
5262     dJMPENV;
5263     scan_data_t data;
5264     RExC_state_t RExC_state;
5265     RExC_state_t * const pRExC_state = &RExC_state;
5266 #ifdef TRIE_STUDY_OPT
5267     int restudied;
5268     RExC_state_t copyRExC_state;
5269 #endif
5270     GET_RE_DEBUG_FLAGS_DECL;
5271
5272     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5273
5274     DEBUG_r(if (!PL_colorset) reginitcolors());
5275
5276 #ifndef PERL_IN_XSUB_RE
5277     /* Initialize these here instead of as-needed, as is quick and avoids
5278      * having to test them each time otherwise */
5279     if (! PL_AboveLatin1) {
5280         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5281         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5282         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5283
5284         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5285         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5286
5287         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5288         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5289
5290         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5291         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5292
5293         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5294
5295         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5296         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5297
5298         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5299
5300         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5301         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5302
5303         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5304         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5305
5306         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5307         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5308
5309         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5310         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5311
5312         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5313         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5314
5315         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5316         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5317
5318         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5319         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5320
5321         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5322         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5323
5324         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5325
5326         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5327         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5328
5329         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5330         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5331     }
5332 #endif
5333
5334     pRExC_state->code_blocks = NULL;
5335     pRExC_state->num_code_blocks = 0;
5336
5337     if (is_bare_re)
5338         *is_bare_re = FALSE;
5339
5340     if (expr && (expr->op_type == OP_LIST ||
5341                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5342
5343         /* is the source UTF8, and how many code blocks are there? */
5344         OP *o;
5345         int ncode = 0;
5346
5347         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5348             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5349                 code_is_utf8 = 1;
5350             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5351                 /* count of DO blocks */
5352                 ncode++;
5353         }
5354         if (ncode) {
5355             pRExC_state->num_code_blocks = ncode;
5356             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5357         }
5358     }
5359
5360     if (pat_count) {
5361         /* handle a list of SVs */
5362
5363         SV **svp;
5364
5365         /* apply magic and RE overloading to each arg */
5366         for (svp = patternp; svp < patternp + pat_count; svp++) {
5367             SV *rx = *svp;
5368             SvGETMAGIC(rx);
5369             if (SvROK(rx) && SvAMAGIC(rx)) {
5370                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5371                 if (sv) {
5372                     if (SvROK(sv))
5373                         sv = SvRV(sv);
5374                     if (SvTYPE(sv) != SVt_REGEXP)
5375                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5376                     *svp = sv;
5377                 }
5378             }
5379         }
5380
5381         if (pat_count > 1) {
5382             /* concat multiple args and find any code block indexes */
5383
5384             OP *o = NULL;
5385             int n = 0;
5386             bool utf8 = 0;
5387             STRLEN orig_patlen = 0;
5388
5389             if (pRExC_state->num_code_blocks) {
5390                 o = cLISTOPx(expr)->op_first;
5391                 assert(o->op_type == OP_PUSHMARK);
5392                 o = o->op_sibling;
5393             }
5394
5395             pat = newSVpvn("", 0);
5396             SAVEFREESV(pat);
5397
5398             /* determine if the pattern is going to be utf8 (needed
5399              * in advance to align code block indices correctly).
5400              * XXX This could fail to be detected for an arg with
5401              * overloading but not concat overloading; but the main effect
5402              * in this obscure case is to need a 'use re eval' for a
5403              * literal code block */
5404             for (svp = patternp; svp < patternp + pat_count; svp++) {
5405                 if (SvUTF8(*svp))
5406                     utf8 = 1;
5407             }
5408             if (utf8)
5409                 SvUTF8_on(pat);
5410
5411             for (svp = patternp; svp < patternp + pat_count; svp++) {
5412                 SV *sv, *msv = *svp;
5413                 SV *rx;
5414                 bool code = 0;
5415                 if (o) {
5416                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5417                         assert(n < pRExC_state->num_code_blocks);
5418                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5419                         pRExC_state->code_blocks[n].block = o;
5420                         pRExC_state->code_blocks[n].src_regex = NULL;
5421                         n++;
5422                         code = 1;
5423                         o = o->op_sibling; /* skip CONST */
5424                         assert(o);
5425                     }
5426                     o = o->op_sibling;;
5427                 }
5428
5429                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5430                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5431                 {
5432                     sv_setsv(pat, sv);
5433                     /* overloading involved: all bets are off over literal
5434                      * code. Pretend we haven't seen it */
5435                     pRExC_state->num_code_blocks -= n;
5436                     n = 0;
5437                     rx = NULL;
5438
5439                 }
5440                 else  {
5441                     while (SvAMAGIC(msv)
5442                             && (sv = AMG_CALLunary(msv, string_amg))
5443                             && sv != msv)
5444                     {
5445                         msv = sv;
5446                         SvGETMAGIC(msv);
5447                     }
5448                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5449                         msv = SvRV(msv);
5450                     orig_patlen = SvCUR(pat);
5451                     sv_catsv_nomg(pat, msv);
5452                     rx = msv;
5453                     if (code)
5454                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5455                 }
5456
5457                 /* extract any code blocks within any embedded qr//'s */
5458                 if (rx && SvTYPE(rx) == SVt_REGEXP
5459                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5460                 {
5461
5462                     RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5463                     if (ri->num_code_blocks) {
5464                         int i;
5465                         /* the presence of an embedded qr// with code means
5466                          * we should always recompile: the text of the
5467                          * qr// may not have changed, but it may be a
5468                          * different closure than last time */
5469                         recompile = 1;
5470                         Renew(pRExC_state->code_blocks,
5471                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5472                             struct reg_code_block);
5473                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5474                         for (i=0; i < ri->num_code_blocks; i++) {
5475                             struct reg_code_block *src, *dst;
5476                             STRLEN offset =  orig_patlen
5477                                 + ((struct regexp *)SvANY(rx))->pre_prefix;
5478                             assert(n < pRExC_state->num_code_blocks);
5479                             src = &ri->code_blocks[i];
5480                             dst = &pRExC_state->code_blocks[n];
5481                             dst->start      = src->start + offset;
5482                             dst->end        = src->end   + offset;
5483                             dst->block      = src->block;
5484                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5485                                                     src->src_regex
5486                                                         ? src->src_regex
5487                                                         : (REGEXP*)rx);
5488                             n++;
5489                         }
5490                     }
5491                 }
5492             }
5493             SvSETMAGIC(pat);
5494         }
5495         else {
5496             SV *sv;
5497             pat = *patternp;
5498             while (SvAMAGIC(pat)
5499                     && (sv = AMG_CALLunary(pat, string_amg))
5500                     && sv != pat)
5501             {
5502                 pat = sv;
5503                 SvGETMAGIC(pat);
5504             }
5505         }
5506
5507         /* handle bare regex: foo =~ $re */
5508         {
5509             SV *re = pat;
5510             if (SvROK(re))
5511                 re = SvRV(re);
5512             if (SvTYPE(re) == SVt_REGEXP) {
5513                 if (is_bare_re)
5514                     *is_bare_re = TRUE;
5515                 SvREFCNT_inc(re);
5516                 Safefree(pRExC_state->code_blocks);
5517                 return (REGEXP*)re;
5518             }
5519         }
5520     }
5521     else {
5522         /* not a list of SVs, so must be a list of OPs */
5523         assert(expr);
5524         if (expr->op_type == OP_LIST) {
5525             int i = -1;
5526             bool is_code = 0;
5527             OP *o;
5528
5529             pat = newSVpvn("", 0);
5530             SAVEFREESV(pat);
5531             if (code_is_utf8)
5532                 SvUTF8_on(pat);
5533
5534             /* given a list of CONSTs and DO blocks in expr, append all
5535              * the CONSTs to pat, and record the start and end of each
5536              * code block in code_blocks[] (each DO{} op is followed by an
5537              * OP_CONST containing the corresponding literal '(?{...})
5538              * text)
5539              */
5540             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5541                 if (o->op_type == OP_CONST) {
5542                     sv_catsv(pat, cSVOPo_sv);
5543                     if (is_code) {
5544                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5545                         is_code = 0;
5546                     }
5547                 }
5548                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5549                     assert(i+1 < pRExC_state->num_code_blocks);
5550                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5551                     pRExC_state->code_blocks[i].block = o;
5552                     pRExC_state->code_blocks[i].src_regex = NULL;
5553                     is_code = 1;
5554                 }
5555             }
5556         }
5557         else {
5558             assert(expr->op_type == OP_CONST);
5559             pat = cSVOPx_sv(expr);
5560         }
5561     }
5562
5563     exp = SvPV_nomg(pat, plen);
5564
5565     if (!eng->op_comp) {
5566         if ((SvUTF8(pat) && IN_BYTES)
5567                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5568         {
5569             /* make a temporary copy; either to convert to bytes,
5570              * or to avoid repeating get-magic / overloaded stringify */
5571             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5572                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5573         }
5574         Safefree(pRExC_state->code_blocks);
5575         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5576     }
5577
5578     /* ignore the utf8ness if the pattern is 0 length */
5579     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5580     RExC_uni_semantics = 0;
5581     RExC_contains_locale = 0;
5582     pRExC_state->runtime_code_qr = NULL;
5583
5584     /****************** LONG JUMP TARGET HERE***********************/
5585     /* Longjmp back to here if have to switch in midstream to utf8 */
5586     if (! RExC_orig_utf8) {
5587         JMPENV_PUSH(jump_ret);
5588         used_setjump = TRUE;
5589     }
5590
5591     if (jump_ret == 0) {    /* First time through */
5592         xend = exp + plen;
5593
5594         DEBUG_COMPILE_r({
5595             SV *dsv= sv_newmortal();
5596             RE_PV_QUOTED_DECL(s, RExC_utf8,
5597                 dsv, exp, plen, 60);
5598             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5599                            PL_colors[4],PL_colors[5],s);
5600         });
5601     }
5602     else {  /* longjumped back */
5603         U8 *src, *dst;
5604         int n=0;
5605         STRLEN s = 0, d = 0;
5606         bool do_end = 0;
5607
5608         /* If the cause for the longjmp was other than changing to utf8, pop
5609          * our own setjmp, and longjmp to the correct handler */
5610         if (jump_ret != UTF8_LONGJMP) {
5611             JMPENV_POP;
5612             JMPENV_JUMP(jump_ret);
5613         }
5614
5615         GET_RE_DEBUG_FLAGS;
5616
5617         /* It's possible to write a regexp in ascii that represents Unicode
5618         codepoints outside of the byte range, such as via \x{100}. If we
5619         detect such a sequence we have to convert the entire pattern to utf8
5620         and then recompile, as our sizing calculation will have been based
5621         on 1 byte == 1 character, but we will need to use utf8 to encode
5622         at least some part of the pattern, and therefore must convert the whole
5623         thing.
5624         -- dmq */
5625         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5626             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5627
5628         /* upgrade pattern to UTF8, and if there are code blocks,
5629          * recalculate the indices.
5630          * This is essentially an unrolled Perl_bytes_to_utf8() */
5631
5632         src = (U8*)SvPV_nomg(pat, plen);
5633         Newx(dst, plen * 2 + 1, U8);
5634
5635         while (s < plen) {
5636             const UV uv = NATIVE_TO_ASCII(src[s]);
5637             if (UNI_IS_INVARIANT(uv))
5638                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5639             else {
5640                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5641                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5642             }
5643             if (n < pRExC_state->num_code_blocks) {
5644                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5645                     pRExC_state->code_blocks[n].start = d;
5646                     assert(dst[d] == '(');
5647                     do_end = 1;
5648                 }
5649                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5650                     pRExC_state->code_blocks[n].end = d;
5651                     assert(dst[d] == ')');
5652                     do_end = 0;
5653                     n++;
5654                 }
5655             }
5656             s++;
5657             d++;
5658         }
5659         dst[d] = '\0';
5660         plen = d;
5661         exp = (char*) dst;
5662         xend = exp + plen;
5663         SAVEFREEPV(exp);
5664         RExC_orig_utf8 = RExC_utf8 = 1;
5665     }
5666
5667     /* return old regex if pattern hasn't changed */
5668
5669     if (   old_re
5670         && !recompile
5671         && !!RX_UTF8(old_re) == !!RExC_utf8
5672         && RX_PRECOMP(old_re)
5673         && RX_PRELEN(old_re) == plen
5674         && memEQ(RX_PRECOMP(old_re), exp, plen))
5675     {
5676         /* with runtime code, always recompile */
5677         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5678                                             exp, plen);
5679         if (!runtime_code) {
5680             ReREFCNT_inc(old_re);
5681             if (used_setjump) {
5682                 JMPENV_POP;
5683             }
5684             Safefree(pRExC_state->code_blocks);
5685             return old_re;
5686         }
5687     }
5688     else if ((pm_flags & PMf_USE_RE_EVAL)
5689                 /* this second condition covers the non-regex literal case,
5690                  * i.e.  $foo =~ '(?{})'. */
5691                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5692                     && (PL_hints & HINT_RE_EVAL))
5693     )
5694         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5695                             exp, plen);
5696
5697 #ifdef TRIE_STUDY_OPT
5698     restudied = 0;
5699 #endif
5700
5701     rx_flags = orig_rx_flags;
5702
5703     if (initial_charset == REGEX_LOCALE_CHARSET) {
5704         RExC_contains_locale = 1;
5705     }
5706     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5707
5708         /* Set to use unicode semantics if the pattern is in utf8 and has the
5709          * 'depends' charset specified, as it means unicode when utf8  */
5710         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5711     }
5712
5713     RExC_precomp = exp;
5714     RExC_flags = rx_flags;
5715     RExC_pm_flags = pm_flags;
5716
5717     if (runtime_code) {
5718         if (PL_tainting && PL_tainted)
5719             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5720
5721         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5722             /* whoops, we have a non-utf8 pattern, whilst run-time code
5723              * got compiled as utf8. Try again with a utf8 pattern */
5724              JMPENV_JUMP(UTF8_LONGJMP);
5725         }
5726     }
5727     assert(!pRExC_state->runtime_code_qr);
5728
5729     RExC_sawback = 0;
5730
5731     RExC_seen = 0;
5732     RExC_in_lookbehind = 0;
5733     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5734     RExC_extralen = 0;
5735     RExC_override_recoding = 0;
5736
5737     /* First pass: determine size, legality. */
5738     RExC_parse = exp;
5739     RExC_start = exp;
5740     RExC_end = xend;
5741     RExC_naughty = 0;
5742     RExC_npar = 1;
5743     RExC_nestroot = 0;
5744     RExC_size = 0L;
5745     RExC_emit = &PL_regdummy;
5746     RExC_whilem_seen = 0;
5747     RExC_open_parens = NULL;
5748     RExC_close_parens = NULL;
5749     RExC_opend = NULL;
5750     RExC_paren_names = NULL;
5751 #ifdef DEBUGGING
5752     RExC_paren_name_list = NULL;
5753 #endif
5754     RExC_recurse = NULL;
5755     RExC_recurse_count = 0;
5756     pRExC_state->code_index = 0;
5757
5758 #if 0 /* REGC() is (currently) a NOP at the first pass.
5759        * Clever compilers notice this and complain. --jhi */
5760     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5761 #endif
5762     DEBUG_PARSE_r(
5763         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5764         RExC_lastnum=0;
5765         RExC_lastparse=NULL;
5766     );
5767     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5768         RExC_precomp = NULL;
5769         Safefree(pRExC_state->code_blocks);
5770         return(NULL);
5771     }
5772
5773     /* Here, finished first pass.  Get rid of any added setjmp */
5774     if (used_setjump) {
5775         JMPENV_POP;
5776     }
5777
5778     DEBUG_PARSE_r({
5779         PerlIO_printf(Perl_debug_log,
5780             "Required size %"IVdf" nodes\n"
5781             "Starting second pass (creation)\n",
5782             (IV)RExC_size);
5783         RExC_lastnum=0;
5784         RExC_lastparse=NULL;
5785     });
5786
5787     /* The first pass could have found things that force Unicode semantics */
5788     if ((RExC_utf8 || RExC_uni_semantics)
5789          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5790     {
5791         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5792     }
5793
5794     /* Small enough for pointer-storage convention?
5795        If extralen==0, this means that we will not need long jumps. */
5796     if (RExC_size >= 0x10000L && RExC_extralen)
5797         RExC_size += RExC_extralen;
5798     else
5799         RExC_extralen = 0;
5800     if (RExC_whilem_seen > 15)
5801         RExC_whilem_seen = 15;
5802
5803     /* Allocate space and zero-initialize. Note, the two step process
5804        of zeroing when in debug mode, thus anything assigned has to
5805        happen after that */
5806     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5807     r = (struct regexp*)SvANY(rx);
5808     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5809          char, regexp_internal);
5810     if ( r == NULL || ri == NULL )
5811         FAIL("Regexp out of space");
5812 #ifdef DEBUGGING
5813     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5814     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5815 #else
5816     /* bulk initialize base fields with 0. */
5817     Zero(ri, sizeof(regexp_internal), char);
5818 #endif
5819
5820     /* non-zero initialization begins here */
5821     RXi_SET( r, ri );
5822     r->engine= eng;
5823     r->extflags = rx_flags;
5824     if (pm_flags & PMf_IS_QR) {
5825         ri->code_blocks = pRExC_state->code_blocks;
5826         ri->num_code_blocks = pRExC_state->num_code_blocks;
5827     }
5828     else
5829         SAVEFREEPV(pRExC_state->code_blocks);
5830
5831     {
5832         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5833         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5834
5835         /* The caret is output if there are any defaults: if not all the STD
5836          * flags are set, or if no character set specifier is needed */
5837         bool has_default =
5838                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5839                     || ! has_charset);
5840         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5841         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5842                             >> RXf_PMf_STD_PMMOD_SHIFT);
5843         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5844         char *p;
5845         /* Allocate for the worst case, which is all the std flags are turned
5846          * on.  If more precision is desired, we could do a population count of
5847          * the flags set.  This could be done with a small lookup table, or by
5848          * shifting, masking and adding, or even, when available, assembly
5849          * language for a machine-language population count.
5850          * We never output a minus, as all those are defaults, so are
5851          * covered by the caret */
5852         const STRLEN wraplen = plen + has_p + has_runon
5853             + has_default       /* If needs a caret */
5854
5855                 /* If needs a character set specifier */
5856             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5857             + (sizeof(STD_PAT_MODS) - 1)
5858             + (sizeof("(?:)") - 1);
5859
5860         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5861         SvPOK_on(rx);
5862         if (RExC_utf8)
5863             SvFLAGS(rx) |= SVf_UTF8;
5864         *p++='('; *p++='?';
5865
5866         /* If a default, cover it using the caret */
5867         if (has_default) {
5868             *p++= DEFAULT_PAT_MOD;
5869         }
5870         if (has_charset) {
5871             STRLEN len;
5872             const char* const name = get_regex_charset_name(r->extflags, &len);
5873             Copy(name, p, len, char);
5874             p += len;
5875         }
5876         if (has_p)
5877             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5878         {
5879             char ch;
5880             while((ch = *fptr++)) {
5881                 if(reganch & 1)
5882                     *p++ = ch;
5883                 reganch >>= 1;
5884             }
5885         }
5886
5887         *p++ = ':';
5888         Copy(RExC_precomp, p, plen, char);
5889         assert ((RX_WRAPPED(rx) - p) < 16);
5890         r->pre_prefix = p - RX_WRAPPED(rx);
5891         p += plen;
5892         if (has_runon)
5893             *p++ = '\n';
5894         *p++ = ')';
5895         *p = 0;
5896         SvCUR_set(rx, p - SvPVX_const(rx));
5897     }
5898
5899     r->intflags = 0;
5900     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5901
5902     if (RExC_seen & REG_SEEN_RECURSE) {
5903         Newxz(RExC_open_parens, RExC_npar,regnode *);
5904         SAVEFREEPV(RExC_open_parens);
5905         Newxz(RExC_close_parens,RExC_npar,regnode *);
5906         SAVEFREEPV(RExC_close_parens);
5907     }
5908
5909     /* Useful during FAIL. */
5910 #ifdef RE_TRACK_PATTERN_OFFSETS
5911     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5912     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5913                           "%s %"UVuf" bytes for offset annotations.\n",
5914                           ri->u.offsets ? "Got" : "Couldn't get",
5915                           (UV)((2*RExC_size+1) * sizeof(U32))));
5916 #endif
5917     SetProgLen(ri,RExC_size);
5918     RExC_rx_sv = rx;
5919     RExC_rx = r;
5920     RExC_rxi = ri;
5921     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5922
5923     /* Second pass: emit code. */
5924     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5925     RExC_pm_flags = pm_flags;
5926     RExC_parse = exp;
5927     RExC_end = xend;
5928     RExC_naughty = 0;
5929     RExC_npar = 1;
5930     RExC_emit_start = ri->program;
5931     RExC_emit = ri->program;
5932     RExC_emit_bound = ri->program + RExC_size + 1;
5933     pRExC_state->code_index = 0;
5934
5935     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5936     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5937         ReREFCNT_dec(rx);
5938         return(NULL);
5939     }
5940     /* XXXX To minimize changes to RE engine we always allocate
5941        3-units-long substrs field. */
5942     Newx(r->substrs, 1, struct reg_substr_data);
5943     if (RExC_recurse_count) {
5944         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5945         SAVEFREEPV(RExC_recurse);
5946     }
5947
5948 reStudy:
5949     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5950     Zero(r->substrs, 1, struct reg_substr_data);
5951
5952 #ifdef TRIE_STUDY_OPT
5953     if (!restudied) {
5954         StructCopy(&zero_scan_data, &data, scan_data_t);
5955         copyRExC_state = RExC_state;
5956     } else {
5957         U32 seen=RExC_seen;
5958         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5959
5960         RExC_state = copyRExC_state;
5961         if (seen & REG_TOP_LEVEL_BRANCHES)
5962             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5963         else
5964             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5965         if (data.last_found) {
5966             SvREFCNT_dec(data.longest_fixed);
5967             SvREFCNT_dec(data.longest_float);
5968             SvREFCNT_dec(data.last_found);
5969         }
5970         StructCopy(&zero_scan_data, &data, scan_data_t);
5971     }
5972 #else
5973     StructCopy(&zero_scan_data, &data, scan_data_t);
5974 #endif
5975
5976     /* Dig out information for optimizations. */
5977     r->extflags = RExC_flags; /* was pm_op */
5978     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5979
5980     if (UTF)
5981         SvUTF8_on(rx);  /* Unicode in it? */
5982     ri->regstclass = NULL;
5983     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5984         r->intflags |= PREGf_NAUGHTY;
5985     scan = ri->program + 1;             /* First BRANCH. */
5986
5987     /* testing for BRANCH here tells us whether there is "must appear"
5988        data in the pattern. If there is then we can use it for optimisations */
5989     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5990         I32 fake;
5991         STRLEN longest_float_length, longest_fixed_length;
5992         struct regnode_charclass_class ch_class; /* pointed to by data */
5993         int stclass_flag;
5994         I32 last_close = 0; /* pointed to by data */
5995         regnode *first= scan;
5996         regnode *first_next= regnext(first);
5997         /*
5998          * Skip introductions and multiplicators >= 1
5999          * so that we can extract the 'meat' of the pattern that must
6000          * match in the large if() sequence following.
6001          * NOTE that EXACT is NOT covered here, as it is normally
6002          * picked up by the optimiser separately.
6003          *
6004          * This is unfortunate as the optimiser isnt handling lookahead
6005          * properly currently.
6006          *
6007          */
6008         while ((OP(first) == OPEN && (sawopen = 1)) ||
6009                /* An OR of *one* alternative - should not happen now. */
6010             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6011             /* for now we can't handle lookbehind IFMATCH*/
6012             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6013             (OP(first) == PLUS) ||
6014             (OP(first) == MINMOD) ||
6015                /* An {n,m} with n>0 */
6016             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6017             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6018         {
6019                 /*
6020                  * the only op that could be a regnode is PLUS, all the rest
6021                  * will be regnode_1 or regnode_2.
6022                  *
6023                  */
6024                 if (OP(first) == PLUS)
6025                     sawplus = 1;
6026                 else
6027                     first += regarglen[OP(first)];
6028
6029                 first = NEXTOPER(first);
6030                 first_next= regnext(first);
6031         }
6032
6033         /* Starting-point info. */
6034       again:
6035         DEBUG_PEEP("first:",first,0);
6036         /* Ignore EXACT as we deal with it later. */
6037         if (PL_regkind[OP(first)] == EXACT) {
6038             if (OP(first) == EXACT)
6039                 NOOP;   /* Empty, get anchored substr later. */
6040             else
6041                 ri->regstclass = first;
6042         }
6043 #ifdef TRIE_STCLASS
6044         else if (PL_regkind[OP(first)] == TRIE &&
6045                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6046         {
6047             regnode *trie_op;
6048             /* this can happen only on restudy */
6049             if ( OP(first) == TRIE ) {
6050                 struct regnode_1 *trieop = (struct regnode_1 *)
6051                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6052                 StructCopy(first,trieop,struct regnode_1);
6053                 trie_op=(regnode *)trieop;
6054             } else {
6055                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6056                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6057                 StructCopy(first,trieop,struct regnode_charclass);
6058                 trie_op=(regnode *)trieop;
6059             }
6060             OP(trie_op)+=2;
6061             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6062             ri->regstclass = trie_op;
6063         }
6064 #endif
6065         else if (REGNODE_SIMPLE(OP(first)))
6066             ri->regstclass = first;
6067         else if (PL_regkind[OP(first)] == BOUND ||
6068                  PL_regkind[OP(first)] == NBOUND)
6069             ri->regstclass = first;
6070         else if (PL_regkind[OP(first)] == BOL) {
6071             r->extflags |= (OP(first) == MBOL
6072                            ? RXf_ANCH_MBOL
6073                            : (OP(first) == SBOL
6074                               ? RXf_ANCH_SBOL
6075                               : RXf_ANCH_BOL));
6076             first = NEXTOPER(first);
6077             goto again;
6078         }
6079         else if (OP(first) == GPOS) {
6080             r->extflags |= RXf_ANCH_GPOS;
6081             first = NEXTOPER(first);
6082             goto again;
6083         }
6084         else if ((!sawopen || !RExC_sawback) &&
6085             (OP(first) == STAR &&
6086             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6087             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6088         {
6089             /* turn .* into ^.* with an implied $*=1 */
6090             const int type =
6091                 (OP(NEXTOPER(first)) == REG_ANY)
6092                     ? RXf_ANCH_MBOL
6093                     : RXf_ANCH_SBOL;
6094             r->extflags |= type;
6095             r->intflags |= PREGf_IMPLICIT;
6096             first = NEXTOPER(first);
6097             goto again;
6098         }
6099         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6100             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6101             /* x+ must match at the 1st pos of run of x's */
6102             r->intflags |= PREGf_SKIP;
6103
6104         /* Scan is after the zeroth branch, first is atomic matcher. */
6105 #ifdef TRIE_STUDY_OPT
6106         DEBUG_PARSE_r(
6107             if (!restudied)
6108                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6109                               (IV)(first - scan + 1))
6110         );
6111 #else
6112         DEBUG_PARSE_r(
6113             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6114                 (IV)(first - scan + 1))
6115         );
6116 #endif
6117
6118
6119         /*
6120         * If there's something expensive in the r.e., find the
6121         * longest literal string that must appear and make it the
6122         * regmust.  Resolve ties in favor of later strings, since
6123         * the regstart check works with the beginning of the r.e.
6124         * and avoiding duplication strengthens checking.  Not a
6125         * strong reason, but sufficient in the absence of others.
6126         * [Now we resolve ties in favor of the earlier string if
6127         * it happens that c_offset_min has been invalidated, since the
6128         * earlier string may buy us something the later one won't.]
6129         */
6130
6131         data.longest_fixed = newSVpvs("");
6132         data.longest_float = newSVpvs("");
6133         data.last_found = newSVpvs("");
6134         data.longest = &(data.longest_fixed);
6135         first = scan;
6136         if (!ri->regstclass) {
6137             cl_init(pRExC_state, &ch_class);
6138             data.start_class = &ch_class;
6139             stclass_flag = SCF_DO_STCLASS_AND;
6140         } else                          /* XXXX Check for BOUND? */
6141             stclass_flag = 0;
6142         data.last_closep = &last_close;
6143
6144         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6145             &data, -1, NULL, NULL,
6146             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6147
6148
6149         CHECK_RESTUDY_GOTO;
6150
6151
6152         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6153              && data.last_start_min == 0 && data.last_end > 0
6154              && !RExC_seen_zerolen
6155              && !(RExC_seen & REG_SEEN_VERBARG)
6156              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6157             r->extflags |= RXf_CHECK_ALL;
6158         scan_commit(pRExC_state, &data,&minlen,0);
6159         SvREFCNT_dec(data.last_found);
6160
6161         /* Note that code very similar to this but for anchored string
6162            follows immediately below, changes may need to be made to both.
6163            Be careful.
6164          */
6165         longest_float_length = CHR_SVLEN(data.longest_float);
6166         if (longest_float_length
6167             || (data.flags & SF_FL_BEFORE_EOL
6168                 && (!(data.flags & SF_FL_BEFORE_MEOL)
6169                     || (RExC_flags & RXf_PMf_MULTILINE))))
6170         {
6171             I32 t,ml;
6172
6173             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
6174             if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
6175                 || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6176                     && data.offset_fixed == data.offset_float_min
6177                     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6178                     goto remove_float;          /* As in (a)+. */
6179
6180             /* copy the information about the longest float from the reg_scan_data
6181                over to the program. */
6182             if (SvUTF8(data.longest_float)) {
6183                 r->float_utf8 = data.longest_float;
6184                 r->float_substr = NULL;
6185             } else {
6186                 r->float_substr = data.longest_float;
6187                 r->float_utf8 = NULL;
6188             }
6189             /* float_end_shift is how many chars that must be matched that
6190                follow this item. We calculate it ahead of time as once the
6191                lookbehind offset is added in we lose the ability to correctly
6192                calculate it.*/
6193             ml = data.minlen_float ? *(data.minlen_float)
6194                                    : (I32)longest_float_length;
6195             r->float_end_shift = ml - data.offset_float_min
6196                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
6197                 + data.lookbehind_float;
6198             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6199             r->float_max_offset = data.offset_float_max;
6200             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6201                 r->float_max_offset -= data.lookbehind_float;
6202
6203             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
6204                        && (!(data.flags & SF_FL_BEFORE_MEOL)
6205                            || (RExC_flags & RXf_PMf_MULTILINE)));
6206             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
6207         }
6208         else {
6209           remove_float:
6210             r->float_substr = r->float_utf8 = NULL;
6211             SvREFCNT_dec(data.longest_float);
6212             longest_float_length = 0;
6213         }
6214
6215         /* Note that code very similar to this but for floating string
6216            is immediately above, changes may need to be made to both.
6217            Be careful.
6218          */
6219         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6220
6221         /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
6222         if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
6223             && (longest_fixed_length
6224                 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
6225                     && (!(data.flags & SF_FIX_BEFORE_MEOL)
6226                         || (RExC_flags & RXf_PMf_MULTILINE)))) )
6227         {
6228             I32 t,ml;
6229
6230             /* copy the information about the longest fixed
6231                from the reg_scan_data over to the program. */
6232             if (SvUTF8(data.longest_fixed)) {
6233                 r->anchored_utf8 = data.longest_fixed;
6234                 r->anchored_substr = NULL;
6235             } else {
6236                 r->anchored_substr = data.longest_fixed;
6237                 r->anchored_utf8 = NULL;
6238             }
6239             /* fixed_end_shift is how many chars that must be matched that
6240                follow this item. We calculate it ahead of time as once the
6241                lookbehind offset is added in we lose the ability to correctly
6242                calculate it.*/
6243             ml = data.minlen_fixed ? *(data.minlen_fixed)
6244                                    : (I32)longest_fixed_length;
6245             r->anchored_end_shift = ml - data.offset_fixed
6246                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
6247                 + data.lookbehind_fixed;
6248             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6249
6250             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
6251                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
6252                      || (RExC_flags & RXf_PMf_MULTILINE)));
6253             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
6254         }
6255         else {
6256             r->anchored_substr = r->anchored_utf8 = NULL;
6257             SvREFCNT_dec(data.longest_fixed);
6258             longest_fixed_length = 0;
6259         }
6260         if (ri->regstclass
6261             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6262             ri->regstclass = NULL;
6263
6264         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6265             && stclass_flag
6266             && !(data.start_class->flags & ANYOF_EOS)
6267             && !cl_is_anything(data.start_class))
6268         {
6269             const U32 n = add_data(pRExC_state, 1, "f");
6270             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6271
6272             Newx(RExC_rxi->data->data[n], 1,
6273                 struct regnode_charclass_class);
6274             StructCopy(data.start_class,
6275                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6276                        struct regnode_charclass_class);
6277             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6278             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6279             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6280                       regprop(r, sv, (regnode*)data.start_class);
6281                       PerlIO_printf(Perl_debug_log,
6282                                     "synthetic stclass \"%s\".\n",
6283                                     SvPVX_const(sv));});
6284         }
6285
6286         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6287         if (longest_fixed_length > longest_float_length) {
6288             r->check_end_shift = r->anchored_end_shift;
6289             r->check_substr = r->anchored_substr;
6290             r->check_utf8 = r->anchored_utf8;
6291             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6292             if (r->extflags & RXf_ANCH_SINGLE)
6293                 r->extflags |= RXf_NOSCAN;
6294         }
6295         else {
6296             r->check_end_shift = r->float_end_shift;
6297             r->check_substr = r->float_substr;
6298             r->check_utf8 = r->float_utf8;
6299             r->check_offset_min = r->float_min_offset;
6300             r->check_offset_max = r->float_max_offset;
6301         }
6302         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6303            This should be changed ASAP!  */
6304         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6305             r->extflags |= RXf_USE_INTUIT;
6306             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6307                 r->extflags |= RXf_INTUIT_TAIL;
6308         }
6309         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6310         if ( (STRLEN)minlen < longest_float_length )
6311             minlen= longest_float_length;
6312         if ( (STRLEN)minlen < longest_fixed_length )
6313             minlen= longest_fixed_length;
6314         */
6315     }
6316     else {
6317         /* Several toplevels. Best we can is to set minlen. */
6318         I32 fake;
6319         struct regnode_charclass_class ch_class;
6320         I32 last_close = 0;
6321
6322         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6323
6324         scan = ri->program + 1;
6325         cl_init(pRExC_state, &ch_class);
6326         data.start_class = &ch_class;
6327         data.last_closep = &last_close;
6328
6329
6330         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6331             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6332
6333         CHECK_RESTUDY_GOTO;
6334
6335         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6336                 = r->float_substr = r->float_utf8 = NULL;
6337
6338         if (!(data.start_class->flags & ANYOF_EOS)
6339             && !cl_is_anything(data.start_class))
6340         {
6341             const U32 n = add_data(pRExC_state, 1, "f");
6342             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6343
6344             Newx(RExC_rxi->data->data[n], 1,
6345                 struct regnode_charclass_class);
6346             StructCopy(data.start_class,
6347                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6348                        struct regnode_charclass_class);
6349             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6350             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6351             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6352                       regprop(r, sv, (regnode*)data.start_class);
6353                       PerlIO_printf(Perl_debug_log,
6354                                     "synthetic stclass \"%s\".\n",
6355                                     SvPVX_const(sv));});
6356         }
6357     }
6358
6359     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6360        the "real" pattern. */
6361     DEBUG_OPTIMISE_r({
6362         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6363                       (IV)minlen, (IV)r->minlen);
6364     });
6365     r->minlenret = minlen;
6366     if (r->minlen < minlen)
6367         r->minlen = minlen;
6368
6369     if (RExC_seen & REG_SEEN_GPOS)
6370         r->extflags |= RXf_GPOS_SEEN;
6371     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6372         r->extflags |= RXf_LOOKBEHIND_SEEN;
6373     if (pRExC_state->num_code_blocks)
6374         r->extflags |= RXf_EVAL_SEEN;
6375     if (RExC_seen & REG_SEEN_CANY)
6376         r->extflags |= RXf_CANY_SEEN;
6377     if (RExC_seen & REG_SEEN_VERBARG)
6378         r->intflags |= PREGf_VERBARG_SEEN;
6379     if (RExC_seen & REG_SEEN_CUTGROUP)
6380         r->intflags |= PREGf_CUTGROUP_SEEN;
6381     if (pm_flags & PMf_USE_RE_EVAL)
6382         r->intflags |= PREGf_USE_RE_EVAL;
6383     if (RExC_paren_names)
6384         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6385     else
6386         RXp_PAREN_NAMES(r) = NULL;
6387
6388 #ifdef STUPID_PATTERN_CHECKS
6389     if (RX_PRELEN(rx) == 0)
6390         r->extflags |= RXf_NULL;
6391     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6392         /* XXX: this should happen BEFORE we compile */
6393         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6394     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6395         r->extflags |= RXf_WHITE;
6396     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6397         r->extflags |= RXf_START_ONLY;
6398 #else
6399     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6400             /* XXX: this should happen BEFORE we compile */
6401             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6402     else {
6403         regnode *first = ri->program + 1;
6404         U8 fop = OP(first);
6405
6406         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6407             r->extflags |= RXf_NULL;
6408         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6409             r->extflags |= RXf_START_ONLY;
6410         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6411                              && OP(regnext(first)) == END)
6412             r->extflags |= RXf_WHITE;
6413     }
6414 #endif
6415 #ifdef DEBUGGING
6416     if (RExC_paren_names) {
6417         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6418         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6419     } else
6420 #endif
6421         ri->name_list_idx = 0;
6422
6423     if (RExC_recurse_count) {
6424         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6425             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6426             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6427         }
6428     }
6429     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6430     /* assume we don't need to swap parens around before we match */
6431
6432     DEBUG_DUMP_r({
6433         PerlIO_printf(Perl_debug_log,"Final program:\n");
6434         regdump(r);
6435     });
6436 #ifdef RE_TRACK_PATTERN_OFFSETS
6437     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6438         const U32 len = ri->u.offsets[0];
6439         U32 i;
6440         GET_RE_DEBUG_FLAGS_DECL;
6441         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6442         for (i = 1; i <= len; i++) {
6443             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6444                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6445                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6446             }
6447         PerlIO_printf(Perl_debug_log, "\n");
6448     });
6449 #endif
6450     return rx;
6451 }
6452
6453
6454 SV*
6455 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6456                     const U32 flags)
6457 {
6458     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6459
6460     PERL_UNUSED_ARG(value);
6461
6462     if (flags & RXapif_FETCH) {
6463         return reg_named_buff_fetch(rx, key, flags);
6464     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6465         Perl_croak_no_modify(aTHX);
6466         return NULL;
6467     } else if (flags & RXapif_EXISTS) {
6468         return reg_named_buff_exists(rx, key, flags)
6469             ? &PL_sv_yes
6470             : &PL_sv_no;
6471     } else if (flags & RXapif_REGNAMES) {
6472         return reg_named_buff_all(rx, flags);
6473     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6474         return reg_named_buff_scalar(rx, flags);
6475     } else {
6476         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6477         return NULL;
6478     }
6479 }
6480
6481 SV*
6482 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6483                          const U32 flags)
6484 {
6485     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6486     PERL_UNUSED_ARG(lastkey);
6487
6488     if (flags & RXapif_FIRSTKEY)
6489         return reg_named_buff_firstkey(rx, flags);
6490     else if (flags & RXapif_NEXTKEY)
6491         return reg_named_buff_nextkey(rx, flags);
6492     else {
6493         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6494         return NULL;
6495     }
6496 }
6497
6498 SV*
6499 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6500                           const U32 flags)
6501 {
6502     AV *retarray = NULL;
6503     SV *ret;
6504     struct regexp *const rx = (struct regexp *)SvANY(r);
6505
6506     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6507
6508     if (flags & RXapif_ALL)
6509         retarray=newAV();
6510
6511     if (rx && RXp_PAREN_NAMES(rx)) {
6512         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6513         if (he_str) {
6514             IV i;
6515             SV* sv_dat=HeVAL(he_str);
6516             I32 *nums=(I32*)SvPVX(sv_dat);
6517             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6518                 if ((I32)(rx->nparens) >= nums[i]
6519                     && rx->offs[nums[i]].start != -1
6520                     && rx->offs[nums[i]].end != -1)
6521                 {
6522                     ret = newSVpvs("");
6523                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6524                     if (!retarray)
6525                         return ret;
6526                 } else {
6527                     if (retarray)
6528                         ret = newSVsv(&PL_sv_undef);
6529                 }
6530                 if (retarray)
6531                     av_push(retarray, ret);
6532             }
6533             if (retarray)
6534                 return newRV_noinc(MUTABLE_SV(retarray));
6535         }
6536     }
6537     return NULL;
6538 }
6539
6540 bool
6541 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6542                            const U32 flags)
6543 {
6544     struct regexp *const rx = (struct regexp *)SvANY(r);
6545
6546     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6547
6548     if (rx && RXp_PAREN_NAMES(rx)) {
6549         if (flags & RXapif_ALL) {
6550             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6551         } else {
6552             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6553             if (sv) {
6554                 SvREFCNT_dec(sv);
6555                 return TRUE;
6556             } else {
6557                 return FALSE;
6558             }
6559         }
6560     } else {
6561         return FALSE;
6562     }
6563 }
6564
6565 SV*
6566 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6567 {
6568     struct regexp *const rx = (struct regexp *)SvANY(r);
6569
6570     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6571
6572     if ( rx && RXp_PAREN_NAMES(rx) ) {
6573         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6574
6575         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6576     } else {
6577         return FALSE;
6578     }
6579 }
6580
6581 SV*
6582 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6583 {
6584     struct regexp *const rx = (struct regexp *)SvANY(r);
6585     GET_RE_DEBUG_FLAGS_DECL;
6586
6587     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6588
6589     if (rx && RXp_PAREN_NAMES(rx)) {
6590         HV *hv = RXp_PAREN_NAMES(rx);
6591         HE *temphe;
6592         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6593             IV i;
6594             IV parno = 0;
6595             SV* sv_dat = HeVAL(temphe);
6596             I32 *nums = (I32*)SvPVX(sv_dat);
6597             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6598                 if ((I32)(rx->lastparen) >= nums[i] &&
6599                     rx->offs[nums[i]].start != -1 &&
6600                     rx->offs[nums[i]].end != -1)
6601                 {
6602                     parno = nums[i];
6603                     break;
6604                 }
6605             }
6606             if (parno || flags & RXapif_ALL) {
6607                 return newSVhek(HeKEY_hek(temphe));
6608             }
6609         }
6610     }
6611     return NULL;
6612 }
6613
6614 SV*
6615 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6616 {
6617     SV *ret;
6618     AV *av;
6619     I32 length;
6620     struct regexp *const rx = (struct regexp *)SvANY(r);
6621
6622     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6623
6624     if (rx && RXp_PAREN_NAMES(rx)) {
6625         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6626             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6627         } else if (flags & RXapif_ONE) {
6628             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6629             av = MUTABLE_AV(SvRV(ret));
6630             length = av_len(av);
6631             SvREFCNT_dec(ret);
6632             return newSViv(length + 1);
6633         } else {
6634             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6635             return NULL;
6636         }
6637     }
6638     return &PL_sv_undef;
6639 }
6640
6641 SV*
6642 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6643 {
6644     struct regexp *const rx = (struct regexp *)SvANY(r);
6645     AV *av = newAV();
6646
6647     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6648
6649     if (rx && RXp_PAREN_NAMES(rx)) {
6650         HV *hv= RXp_PAREN_NAMES(rx);
6651         HE *temphe;
6652         (void)hv_iterinit(hv);
6653         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6654             IV i;
6655             IV parno = 0;
6656             SV* sv_dat = HeVAL(temphe);
6657             I32 *nums = (I32*)SvPVX(sv_dat);
6658             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6659                 if ((I32)(rx->lastparen) >= nums[i] &&
6660                     rx->offs[nums[i]].start != -1 &&
6661                     rx->offs[nums[i]].end != -1)
6662                 {
6663                     parno = nums[i];
6664                     break;
6665                 }
6666             }
6667             if (parno || flags & RXapif_ALL) {
6668                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6669             }
6670         }
6671     }
6672
6673     return newRV_noinc(MUTABLE_SV(av));
6674 }
6675
6676 void
6677 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6678                              SV * const sv)
6679 {
6680     struct regexp *const rx = (struct regexp *)SvANY(r);
6681     char *s = NULL;
6682     I32 i = 0;
6683     I32 s1, t1;
6684
6685     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6686
6687     if (!rx->subbeg) {
6688         sv_setsv(sv,&PL_sv_undef);
6689         return;
6690     }
6691     else
6692     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6693         /* $` */
6694         i = rx->offs[0].start;
6695         s = rx->subbeg;
6696     }
6697     else
6698     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6699         /* $' */
6700         s = rx->subbeg + rx->offs[0].end;
6701         i = rx->sublen - rx->offs[0].end;
6702     }
6703     else
6704     if ( 0 <= paren && paren <= (I32)rx->nparens &&
6705         (s1 = rx->offs[paren].start) != -1 &&
6706         (t1 = rx->offs[paren].end) != -1)
6707     {
6708         /* $& $1 ... */
6709         i = t1 - s1;
6710         s = rx->subbeg + s1;
6711     } else {
6712         sv_setsv(sv,&PL_sv_undef);
6713         return;
6714     }
6715     assert(rx->sublen >= (s - rx->subbeg) + i );
6716     if (i >= 0) {
6717         const int oldtainted = PL_tainted;
6718         TAINT_NOT;
6719         sv_setpvn(sv, s, i);
6720         PL_tainted = oldtainted;
6721         if ( (rx->extflags & RXf_CANY_SEEN)
6722             ? (RXp_MATCH_UTF8(rx)
6723                         && (!i || is_utf8_string((U8*)s, i)))
6724             : (RXp_MATCH_UTF8(rx)) )
6725         {
6726             SvUTF8_on(sv);
6727         }
6728         else
6729             SvUTF8_off(sv);
6730         if (PL_tainting) {
6731             if (RXp_MATCH_TAINTED(rx)) {
6732                 if (SvTYPE(sv) >= SVt_PVMG) {
6733                     MAGIC* const mg = SvMAGIC(sv);
6734                     MAGIC* mgt;
6735                     PL_tainted = 1;
6736                     SvMAGIC_set(sv, mg->mg_moremagic);
6737                     SvTAINT(sv);
6738                     if ((mgt = SvMAGIC(sv))) {
6739                         mg->mg_moremagic = mgt;
6740                         SvMAGIC_set(sv, mg);
6741                     }
6742                 } else {
6743                     PL_tainted = 1;
6744                     SvTAINT(sv);
6745                 }
6746             } else
6747                 SvTAINTED_off(sv);
6748         }
6749     } else {
6750         sv_setsv(sv,&PL_sv_undef);
6751         return;
6752     }
6753 }
6754
6755 void
6756 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6757                                                          SV const * const value)
6758 {
6759     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6760
6761     PERL_UNUSED_ARG(rx);
6762     PERL_UNUSED_ARG(paren);
6763     PERL_UNUSED_ARG(value);
6764
6765     if (!PL_localizing)
6766         Perl_croak_no_modify(aTHX);
6767 }
6768
6769 I32
6770 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6771                               const I32 paren)
6772 {
6773     struct regexp *const rx = (struct regexp *)SvANY(r);
6774     I32 i;
6775     I32 s1, t1;
6776
6777     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6778
6779     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6780         switch (paren) {
6781       /* $` / ${^PREMATCH} */
6782       case RX_BUFF_IDX_PREMATCH:
6783         if (rx->offs[0].start != -1) {
6784                         i = rx->offs[0].start;
6785                         if (i > 0) {
6786                                 s1 = 0;
6787                                 t1 = i;
6788                                 goto getlen;
6789                         }
6790             }
6791         return 0;
6792       /* $' / ${^POSTMATCH} */
6793       case RX_BUFF_IDX_POSTMATCH:
6794             if (rx->offs[0].end != -1) {
6795                         i = rx->sublen - rx->offs[0].end;
6796                         if (i > 0) {
6797                                 s1 = rx->offs[0].end;
6798                                 t1 = rx->sublen;
6799                                 goto getlen;
6800                         }
6801             }
6802         return 0;
6803       /* $& / ${^MATCH}, $1, $2, ... */
6804       default:
6805             if (paren <= (I32)rx->nparens &&
6806             (s1 = rx->offs[paren].start) != -1 &&
6807             (t1 = rx->offs[paren].end) != -1)
6808             {
6809             i = t1 - s1;
6810             goto getlen;
6811         } else {
6812             if (ckWARN(WARN_UNINITIALIZED))
6813                 report_uninit((const SV *)sv);
6814             return 0;
6815         }
6816     }
6817   getlen:
6818     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6819         const char * const s = rx->subbeg + s1;
6820         const U8 *ep;
6821         STRLEN el;
6822
6823         i = t1 - s1;
6824         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6825                         i = el;
6826     }
6827     return i;
6828 }
6829
6830 SV*
6831 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6832 {
6833     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6834         PERL_UNUSED_ARG(rx);
6835         if (0)
6836             return NULL;
6837         else
6838             return newSVpvs("Regexp");
6839 }
6840
6841 /* Scans the name of a named buffer from the pattern.
6842  * If flags is REG_RSN_RETURN_NULL returns null.
6843  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6844  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6845  * to the parsed name as looked up in the RExC_paren_names hash.
6846  * If there is an error throws a vFAIL().. type exception.
6847  */
6848
6849 #define REG_RSN_RETURN_NULL    0
6850 #define REG_RSN_RETURN_NAME    1
6851 #define REG_RSN_RETURN_DATA    2
6852
6853 STATIC SV*
6854 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6855 {
6856     char *name_start = RExC_parse;
6857
6858     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6859
6860     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6861          /* skip IDFIRST by using do...while */
6862         if (UTF)
6863             do {
6864                 RExC_parse += UTF8SKIP(RExC_parse);
6865             } while (isALNUM_utf8((U8*)RExC_parse));
6866         else
6867             do {
6868                 RExC_parse++;
6869             } while (isALNUM(*RExC_parse));
6870     }
6871
6872     if ( flags ) {
6873         SV* sv_name
6874             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6875                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6876         if ( flags == REG_RSN_RETURN_NAME)
6877             return sv_name;
6878         else if (flags==REG_RSN_RETURN_DATA) {
6879             HE *he_str = NULL;
6880             SV *sv_dat = NULL;
6881             if ( ! sv_name )      /* should not happen*/
6882                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6883             if (RExC_paren_names)
6884                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6885             if ( he_str )
6886                 sv_dat = HeVAL(he_str);
6887             if ( ! sv_dat )
6888                 vFAIL("Reference to nonexistent named group");
6889             return sv_dat;
6890         }
6891         else {
6892             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6893                        (unsigned long) flags);
6894         }
6895         assert(0); /* NOT REACHED */
6896     }
6897     return NULL;
6898 }
6899
6900 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6901     int rem=(int)(RExC_end - RExC_parse);                       \
6902     int cut;                                                    \
6903     int num;                                                    \
6904     int iscut=0;                                                \
6905     if (rem>10) {                                               \
6906         rem=10;                                                 \
6907         iscut=1;                                                \
6908     }                                                           \
6909     cut=10-rem;                                                 \
6910     if (RExC_lastparse!=RExC_parse)                             \
6911         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6912             rem, RExC_parse,                                    \
6913             cut + 4,                                            \
6914             iscut ? "..." : "<"                                 \
6915         );                                                      \
6916     else                                                        \
6917         PerlIO_printf(Perl_debug_log,"%16s","");                \
6918                                                                 \
6919     if (SIZE_ONLY)                                              \
6920        num = RExC_size + 1;                                     \
6921     else                                                        \
6922        num=REG_NODE_NUM(RExC_emit);                             \
6923     if (RExC_lastnum!=num)                                      \
6924        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6925     else                                                        \
6926        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6927     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6928         (int)((depth*2)), "",                                   \
6929         (funcname)                                              \
6930     );                                                          \
6931     RExC_lastnum=num;                                           \
6932     RExC_lastparse=RExC_parse;                                  \
6933 })
6934
6935
6936
6937 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6938     DEBUG_PARSE_MSG((funcname));                            \
6939     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6940 })
6941 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6942     DEBUG_PARSE_MSG((funcname));                            \
6943     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6944 })
6945
6946 /* This section of code defines the inversion list object and its methods.  The
6947  * interfaces are highly subject to change, so as much as possible is static to
6948  * this file.  An inversion list is here implemented as a malloc'd C UV array
6949  * with some added info that is placed as UVs at the beginning in a header
6950  * portion.  An inversion list for Unicode is an array of code points, sorted
6951  * by ordinal number.  The zeroth element is the first code point in the list.
6952  * The 1th element is the first element beyond that not in the list.  In other
6953  * words, the first range is
6954  *  invlist[0]..(invlist[1]-1)
6955  * The other ranges follow.  Thus every element whose index is divisible by two
6956  * marks the beginning of a range that is in the list, and every element not
6957  * divisible by two marks the beginning of a range not in the list.  A single
6958  * element inversion list that contains the single code point N generally
6959  * consists of two elements
6960  *  invlist[0] == N
6961  *  invlist[1] == N+1
6962  * (The exception is when N is the highest representable value on the
6963  * machine, in which case the list containing just it would be a single
6964  * element, itself.  By extension, if the last range in the list extends to
6965  * infinity, then the first element of that range will be in the inversion list
6966  * at a position that is divisible by two, and is the final element in the
6967  * list.)
6968  * Taking the complement (inverting) an inversion list is quite simple, if the
6969  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6970  * This implementation reserves an element at the beginning of each inversion list
6971  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6972  * beginning of the list is either that element if 0, or the next one if 1.
6973  *
6974  * More about inversion lists can be found in "Unicode Demystified"
6975  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6976  * More will be coming when functionality is added later.
6977  *
6978  * The inversion list data structure is currently implemented as an SV pointing
6979  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6980  * array of UV whose memory management is automatically handled by the existing
6981  * facilities for SV's.
6982  *
6983  * Some of the methods should always be private to the implementation, and some
6984  * should eventually be made public */
6985
6986 #define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
6987 #define INVLIST_ITER_OFFSET 1   /* Current iteration position */
6988
6989 /* This is a combination of a version and data structure type, so that one
6990  * being passed in can be validated to be an inversion list of the correct
6991  * vintage.  When the structure of the header is changed, a new random number
6992  * in the range 2**31-1 should be generated and the new() method changed to
6993  * insert that at this location.  Then, if an auxiliary program doesn't change
6994  * correspondingly, it will be discovered immediately */
6995 #define INVLIST_VERSION_ID_OFFSET 2
6996 #define INVLIST_VERSION_ID 1064334010
6997
6998 /* For safety, when adding new elements, remember to #undef them at the end of
6999  * the inversion list code section */
7000
7001 #define INVLIST_ZERO_OFFSET 3   /* 0 or 1; must be last element in header */
7002 /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
7003  * contains the code point U+00000, and begins here.  If 1, the inversion list
7004  * doesn't contain U+0000, and it begins at the next UV in the array.
7005  * Inverting an inversion list consists of adding or removing the 0 at the
7006  * beginning of it.  By reserving a space for that 0, inversion can be made
7007  * very fast */
7008
7009 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
7010
7011 /* Internally things are UVs */
7012 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7013 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7014
7015 #define INVLIST_INITIAL_LEN 10
7016
7017 PERL_STATIC_INLINE UV*
7018 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7019 {
7020     /* Returns a pointer to the first element in the inversion list's array.
7021      * This is called upon initialization of an inversion list.  Where the
7022      * array begins depends on whether the list has the code point U+0000
7023      * in it or not.  The other parameter tells it whether the code that
7024      * follows this call is about to put a 0 in the inversion list or not.
7025      * The first element is either the element with 0, if 0, or the next one,
7026      * if 1 */
7027
7028     UV* zero = get_invlist_zero_addr(invlist);
7029
7030     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7031
7032     /* Must be empty */
7033     assert(! *get_invlist_len_addr(invlist));
7034
7035     /* 1^1 = 0; 1^0 = 1 */
7036     *zero = 1 ^ will_have_0;
7037     return zero + *zero;
7038 }
7039
7040 PERL_STATIC_INLINE UV*
7041 S_invlist_array(pTHX_ SV* const invlist)
7042 {
7043     /* Returns the pointer to the inversion list's array.  Every time the
7044      * length changes, this needs to be called in case malloc or realloc moved
7045      * it */
7046
7047     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7048
7049     /* Must not be empty.  If these fail, you probably didn't check for <len>
7050      * being non-zero before trying to get the array */
7051     assert(*get_invlist_len_addr(invlist));
7052     assert(*get_invlist_zero_addr(invlist) == 0
7053            || *get_invlist_zero_addr(invlist) == 1);
7054
7055     /* The array begins either at the element reserved for zero if the
7056      * list contains 0 (that element will be set to 0), or otherwise the next
7057      * element (in which case the reserved element will be set to 1). */
7058     return (UV *) (get_invlist_zero_addr(invlist)
7059                    + *get_invlist_zero_addr(invlist));
7060 }
7061
7062 PERL_STATIC_INLINE UV*
7063 S_get_invlist_len_addr(pTHX_ SV* invlist)
7064 {
7065     /* Return the address of the UV that contains the current number
7066      * of used elements in the inversion list */
7067
7068     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
7069
7070     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
7071 }
7072
7073 PERL_STATIC_INLINE UV
7074 S_invlist_len(pTHX_ SV* const invlist)
7075 {
7076     /* Returns the current number of elements stored in the inversion list's
7077      * array */
7078
7079     PERL_ARGS_ASSERT_INVLIST_LEN;
7080
7081     return *get_invlist_len_addr(invlist);
7082 }
7083
7084 PERL_STATIC_INLINE void
7085 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7086 {
7087     /* Sets the current number of elements stored in the inversion list */
7088
7089     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7090
7091     *get_invlist_len_addr(invlist) = len;
7092
7093     assert(len <= SvLEN(invlist));
7094
7095     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7096     /* If the list contains U+0000, that element is part of the header,
7097      * and should not be counted as part of the array.  It will contain
7098      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7099      * subtract:
7100      *  SvCUR_set(invlist,
7101      *            TO_INTERNAL_SIZE(len
7102      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7103      * But, this is only valid if len is not 0.  The consequences of not doing
7104      * this is that the memory allocation code may think that 1 more UV is
7105      * being used than actually is, and so might do an unnecessary grow.  That
7106      * seems worth not bothering to make this the precise amount.
7107      *
7108      * Note that when inverting, SvCUR shouldn't change */
7109 }
7110
7111 PERL_STATIC_INLINE UV
7112 S_invlist_max(pTHX_ SV* const invlist)
7113 {
7114     /* Returns the maximum number of elements storable in the inversion list's
7115      * array, without having to realloc() */
7116
7117     PERL_ARGS_ASSERT_INVLIST_MAX;
7118
7119     return FROM_INTERNAL_SIZE(SvLEN(invlist));
7120 }
7121
7122 PERL_STATIC_INLINE UV*
7123 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7124 {
7125     /* Return the address of the UV that is reserved to hold 0 if the inversion
7126      * list contains 0.  This has to be the last element of the heading, as the
7127      * list proper starts with either it if 0, or the next element if not.
7128      * (But we force it to contain either 0 or 1) */
7129
7130     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7131
7132     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7133 }
7134
7135 #ifndef PERL_IN_XSUB_RE
7136 SV*
7137 Perl__new_invlist(pTHX_ IV initial_size)
7138 {
7139
7140     /* Return a pointer to a newly constructed inversion list, with enough
7141      * space to store 'initial_size' elements.  If that number is negative, a
7142      * system default is used instead */
7143
7144     SV* new_list;
7145
7146     if (initial_size < 0) {
7147         initial_size = INVLIST_INITIAL_LEN;
7148     }
7149
7150     /* Allocate the initial space */
7151     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7152     invlist_set_len(new_list, 0);
7153
7154     /* Force iterinit() to be used to get iteration to work */
7155     *get_invlist_iter_addr(new_list) = UV_MAX;
7156
7157     /* This should force a segfault if a method doesn't initialize this
7158      * properly */
7159     *get_invlist_zero_addr(new_list) = UV_MAX;
7160
7161     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7162 #if HEADER_LENGTH != 4
7163 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7164 #endif
7165
7166     return new_list;
7167 }
7168 #endif
7169
7170 STATIC SV*
7171 S__new_invlist_C_array(pTHX_ UV* list)
7172 {
7173     /* Return a pointer to a newly constructed inversion list, initialized to
7174      * point to <list>, which has to be in the exact correct inversion list
7175      * form, including internal fields.  Thus this is a dangerous routine that
7176      * should not be used in the wrong hands */
7177
7178     SV* invlist = newSV_type(SVt_PV);
7179
7180     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7181
7182     SvPV_set(invlist, (char *) list);
7183     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7184                                shouldn't touch it */
7185     SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
7186
7187     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7188         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7189     }
7190
7191     return invlist;
7192 }
7193
7194 STATIC void
7195 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7196 {
7197     /* Grow the maximum size of an inversion list */
7198
7199     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7200
7201     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7202 }
7203
7204 PERL_STATIC_INLINE void
7205 S_invlist_trim(pTHX_ SV* const invlist)
7206 {
7207     PERL_ARGS_ASSERT_INVLIST_TRIM;
7208
7209     /* Change the length of the inversion list to how many entries it currently
7210      * has */
7211
7212     SvPV_shrink_to_cur((SV *) invlist);
7213 }
7214
7215 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
7216  * etc */
7217 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
7218 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
7219
7220 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7221
7222 STATIC void
7223 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7224 {
7225    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7226     * the end of the inversion list.  The range must be above any existing
7227     * ones. */
7228
7229     UV* array;
7230     UV max = invlist_max(invlist);
7231     UV len = invlist_len(invlist);
7232
7233     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7234
7235     if (len == 0) { /* Empty lists must be initialized */
7236         array = _invlist_array_init(invlist, start == 0);
7237     }
7238     else {
7239         /* Here, the existing list is non-empty. The current max entry in the
7240          * list is generally the first value not in the set, except when the
7241          * set extends to the end of permissible values, in which case it is
7242          * the first entry in that final set, and so this call is an attempt to
7243          * append out-of-order */
7244
7245         UV final_element = len - 1;
7246         array = invlist_array(invlist);
7247         if (array[final_element] > start
7248             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7249         {
7250             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7251                        array[final_element], start,
7252                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7253         }
7254
7255         /* Here, it is a legal append.  If the new range begins with the first
7256          * value not in the set, it is extending the set, so the new first
7257          * value not in the set is one greater than the newly extended range.
7258          * */
7259         if (array[final_element] == start) {
7260             if (end != UV_MAX) {
7261                 array[final_element] = end + 1;
7262             }
7263             else {
7264                 /* But if the end is the maximum representable on the machine,
7265                  * just let the range that this would extend to have no end */
7266                 invlist_set_len(invlist, len - 1);
7267             }
7268             return;
7269         }
7270     }
7271
7272     /* Here the new range doesn't extend any existing set.  Add it */
7273
7274     len += 2;   /* Includes an element each for the start and end of range */
7275
7276     /* If overflows the existing space, extend, which may cause the array to be
7277      * moved */
7278     if (max < len) {
7279         invlist_extend(invlist, len);
7280         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7281                                            failure in invlist_array() */
7282         array = invlist_array(invlist);
7283     }
7284     else {
7285         invlist_set_len(invlist, len);
7286     }
7287
7288     /* The next item on the list starts the range, the one after that is
7289      * one past the new range.  */
7290     array[len - 2] = start;
7291     if (end != UV_MAX) {
7292         array[len - 1] = end + 1;
7293     }
7294     else {
7295         /* But if the end is the maximum representable on the machine, just let
7296          * the range have no end */
7297         invlist_set_len(invlist, len - 1);
7298     }
7299 }
7300
7301 #ifndef PERL_IN_XSUB_RE
7302
7303 STATIC IV
7304 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
7305 {
7306     /* Searches the inversion list for the entry that contains the input code
7307      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7308      * return value is the index into the list's array of the range that
7309      * contains <cp> */
7310
7311     IV low = 0;
7312     IV high = invlist_len(invlist);
7313     const UV * const array = invlist_array(invlist);
7314
7315     PERL_ARGS_ASSERT_INVLIST_SEARCH;
7316
7317     /* If list is empty or the code point is before the first element, return
7318      * failure. */
7319     if (high == 0 || cp < array[0]) {
7320         return -1;
7321     }
7322
7323     /* Binary search.  What we are looking for is <i> such that
7324      *  array[i] <= cp < array[i+1]
7325      * The loop below converges on the i+1. */
7326     while (low < high) {
7327         IV mid = (low + high) / 2;
7328         if (array[mid] <= cp) {
7329             low = mid + 1;
7330
7331             /* We could do this extra test to exit the loop early.
7332             if (cp < array[low]) {
7333                 return mid;
7334             }
7335             */
7336         }
7337         else { /* cp < array[mid] */
7338             high = mid;
7339         }
7340     }
7341
7342     return high - 1;
7343 }
7344
7345 void
7346 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7347 {
7348     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7349      * but is used when the swash has an inversion list.  This makes this much
7350      * faster, as it uses a binary search instead of a linear one.  This is
7351      * intimately tied to that function, and perhaps should be in utf8.c,
7352      * except it is intimately tied to inversion lists as well.  It assumes
7353      * that <swatch> is all 0's on input */
7354
7355     UV current = start;
7356     const IV len = invlist_len(invlist);
7357     IV i;
7358     const UV * array;
7359
7360     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7361
7362     if (len == 0) { /* Empty inversion list */
7363         return;
7364     }
7365
7366     array = invlist_array(invlist);
7367
7368     /* Find which element it is */
7369     i = invlist_search(invlist, start);
7370
7371     /* We populate from <start> to <end> */
7372     while (current < end) {
7373         UV upper;
7374
7375         /* The inversion list gives the results for every possible code point
7376          * after the first one in the list.  Only those ranges whose index is
7377          * even are ones that the inversion list matches.  For the odd ones,
7378          * and if the initial code point is not in the list, we have to skip
7379          * forward to the next element */
7380         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7381             i++;
7382             if (i >= len) { /* Finished if beyond the end of the array */
7383                 return;
7384             }
7385             current = array[i];
7386             if (current >= end) {   /* Finished if beyond the end of what we
7387                                        are populating */
7388                 return;
7389             }
7390         }
7391         assert(current >= start);
7392
7393         /* The current range ends one below the next one, except don't go past
7394          * <end> */
7395         i++;
7396         upper = (i < len && array[i] < end) ? array[i] : end;
7397
7398         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7399          * for each code point in it */
7400         for (; current < upper; current++) {
7401             const STRLEN offset = (STRLEN)(current - start);
7402             swatch[offset >> 3] |= 1 << (offset & 7);
7403         }
7404
7405         /* Quit if at the end of the list */
7406         if (i >= len) {
7407
7408             /* But first, have to deal with the highest possible code point on
7409              * the platform.  The previous code assumes that <end> is one
7410              * beyond where we want to populate, but that is impossible at the
7411              * platform's infinity, so have to handle it specially */
7412             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7413             {
7414                 const STRLEN offset = (STRLEN)(end - start);
7415                 swatch[offset >> 3] |= 1 << (offset & 7);
7416             }
7417             return;
7418         }
7419
7420         /* Advance to the next range, which will be for code points not in the
7421          * inversion list */
7422         current = array[i];
7423     }
7424
7425     return;
7426 }
7427
7428
7429 void
7430 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7431 {
7432     /* Take the union of two inversion lists and point <output> to it.  *output
7433      * should be defined upon input, and if it points to one of the two lists,
7434      * the reference count to that list will be decremented.  The first list,
7435      * <a>, may be NULL, in which case a copy of the second list is returned.
7436      * If <complement_b> is TRUE, the union is taken of the complement
7437      * (inversion) of <b> instead of b itself.
7438      *
7439      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7440      * Richard Gillam, published by Addison-Wesley, and explained at some
7441      * length there.  The preface says to incorporate its examples into your
7442      * code at your own risk.
7443      *
7444      * The algorithm is like a merge sort.
7445      *
7446      * XXX A potential performance improvement is to keep track as we go along
7447      * if only one of the inputs contributes to the result, meaning the other
7448      * is a subset of that one.  In that case, we can skip the final copy and
7449      * return the larger of the input lists, but then outside code might need
7450      * to keep track of whether to free the input list or not */
7451
7452     UV* array_a;    /* a's array */
7453     UV* array_b;
7454     UV len_a;       /* length of a's array */
7455     UV len_b;
7456
7457     SV* u;                      /* the resulting union */
7458     UV* array_u;
7459     UV len_u;
7460
7461     UV i_a = 0;             /* current index into a's array */
7462     UV i_b = 0;
7463     UV i_u = 0;
7464
7465     /* running count, as explained in the algorithm source book; items are
7466      * stopped accumulating and are output when the count changes to/from 0.
7467      * The count is incremented when we start a range that's in the set, and
7468      * decremented when we start a range that's not in the set.  So its range
7469      * is 0 to 2.  Only when the count is zero is something not in the set.
7470      */
7471     UV count = 0;
7472
7473     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7474     assert(a != b);
7475
7476     /* If either one is empty, the union is the other one */
7477     if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7478         if (*output == a) {
7479             if (a != NULL) {
7480                 SvREFCNT_dec(a);
7481             }
7482         }
7483         if (*output != b) {
7484             *output = invlist_clone(b);
7485             if (complement_b) {
7486                 _invlist_invert(*output);
7487             }
7488         } /* else *output already = b; */
7489         return;
7490     }
7491     else if ((len_b = invlist_len(b)) == 0) {
7492         if (*output == b) {
7493             SvREFCNT_dec(b);
7494         }
7495
7496         /* The complement of an empty list is a list that has everything in it,
7497          * so the union with <a> includes everything too */
7498         if (complement_b) {
7499             if (a == *output) {
7500                 SvREFCNT_dec(a);
7501             }
7502             *output = _new_invlist(1);
7503             _append_range_to_invlist(*output, 0, UV_MAX);
7504         }
7505         else if (*output != a) {
7506             *output = invlist_clone(a);
7507         }
7508         /* else *output already = a; */
7509         return;
7510     }
7511
7512     /* Here both lists exist and are non-empty */
7513     array_a = invlist_array(a);
7514     array_b = invlist_array(b);
7515
7516     /* If are to take the union of 'a' with the complement of b, set it
7517      * up so are looking at b's complement. */
7518     if (complement_b) {
7519
7520         /* To complement, we invert: if the first element is 0, remove it.  To
7521          * do this, we just pretend the array starts one later, and clear the
7522          * flag as we don't have to do anything else later */
7523         if (array_b[0] == 0) {
7524             array_b++;
7525             len_b--;
7526             complement_b = FALSE;
7527         }
7528         else {
7529
7530             /* But if the first element is not zero, we unshift a 0 before the
7531              * array.  The data structure reserves a space for that 0 (which
7532              * should be a '1' right now), so physical shifting is unneeded,
7533              * but temporarily change that element to 0.  Before exiting the
7534              * routine, we must restore the element to '1' */
7535             array_b--;
7536             len_b++;
7537             array_b[0] = 0;
7538         }
7539     }
7540
7541     /* Size the union for the worst case: that the sets are completely
7542      * disjoint */
7543     u = _new_invlist(len_a + len_b);
7544
7545     /* Will contain U+0000 if either component does */
7546     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7547                                       || (len_b > 0 && array_b[0] == 0));
7548
7549     /* Go through each list item by item, stopping when exhausted one of
7550      * them */
7551     while (i_a < len_a && i_b < len_b) {
7552         UV cp;      /* The element to potentially add to the union's array */
7553         bool cp_in_set;   /* is it in the the input list's set or not */
7554
7555         /* We need to take one or the other of the two inputs for the union.
7556          * Since we are merging two sorted lists, we take the smaller of the
7557          * next items.  In case of a tie, we take the one that is in its set
7558          * first.  If we took one not in the set first, it would decrement the
7559          * count, possibly to 0 which would cause it to be output as ending the
7560          * range, and the next time through we would take the same number, and
7561          * output it again as beginning the next range.  By doing it the
7562          * opposite way, there is no possibility that the count will be
7563          * momentarily decremented to 0, and thus the two adjoining ranges will
7564          * be seamlessly merged.  (In a tie and both are in the set or both not
7565          * in the set, it doesn't matter which we take first.) */
7566         if (array_a[i_a] < array_b[i_b]
7567             || (array_a[i_a] == array_b[i_b]
7568                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7569         {
7570             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7571             cp= array_a[i_a++];
7572         }
7573         else {
7574             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7575             cp= array_b[i_b++];
7576         }
7577
7578         /* Here, have chosen which of the two inputs to look at.  Only output
7579          * if the running count changes to/from 0, which marks the
7580          * beginning/end of a range in that's in the set */
7581         if (cp_in_set) {
7582             if (count == 0) {
7583                 array_u[i_u++] = cp;
7584             }
7585             count++;
7586         }
7587         else {
7588             count--;
7589             if (count == 0) {
7590                 array_u[i_u++] = cp;
7591             }
7592         }
7593     }
7594
7595     /* Here, we are finished going through at least one of the lists, which
7596      * means there is something remaining in at most one.  We check if the list
7597      * that hasn't been exhausted is positioned such that we are in the middle
7598      * of a range in its set or not.  (i_a and i_b point to the element beyond
7599      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7600      * is potentially more to output.
7601      * There are four cases:
7602      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7603      *     in the union is entirely from the non-exhausted set.
7604      *  2) Both were in their sets, count is 2.  Nothing further should
7605      *     be output, as everything that remains will be in the exhausted
7606      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7607      *     that
7608      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7609      *     Nothing further should be output because the union includes
7610      *     everything from the exhausted set.  Not decrementing ensures that.
7611      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7612      *     decrementing to 0 insures that we look at the remainder of the
7613      *     non-exhausted set */
7614     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7615         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7616     {
7617         count--;
7618     }
7619
7620     /* The final length is what we've output so far, plus what else is about to
7621      * be output.  (If 'count' is non-zero, then the input list we exhausted
7622      * has everything remaining up to the machine's limit in its set, and hence
7623      * in the union, so there will be no further output. */
7624     len_u = i_u;
7625     if (count == 0) {
7626         /* At most one of the subexpressions will be non-zero */
7627         len_u += (len_a - i_a) + (len_b - i_b);
7628     }
7629
7630     /* Set result to final length, which can change the pointer to array_u, so
7631      * re-find it */
7632     if (len_u != invlist_len(u)) {
7633         invlist_set_len(u, len_u);
7634         invlist_trim(u);
7635         array_u = invlist_array(u);
7636     }
7637
7638     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7639      * the other) ended with everything above it not in its set.  That means
7640      * that the remaining part of the union is precisely the same as the
7641      * non-exhausted list, so can just copy it unchanged.  (If both list were
7642      * exhausted at the same time, then the operations below will be both 0.)
7643      */
7644     if (count == 0) {
7645         IV copy_count; /* At most one will have a non-zero copy count */
7646         if ((copy_count = len_a - i_a) > 0) {
7647             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7648         }
7649         else if ((copy_count = len_b - i_b) > 0) {
7650             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7651         }
7652     }
7653
7654     /*  We may be removing a reference to one of the inputs */
7655     if (a == *output || b == *output) {
7656         SvREFCNT_dec(*output);
7657     }
7658
7659     /* If we've changed b, restore it */
7660     if (complement_b) {
7661         array_b[0] = 1;
7662     }
7663
7664     *output = u;
7665     return;
7666 }
7667
7668 void
7669 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7670 {
7671     /* Take the intersection of two inversion lists and point <i> to it.  *i
7672      * should be defined upon input, and if it points to one of the two lists,
7673      * the reference count to that list will be decremented.
7674      * If <complement_b> is TRUE, the result will be the intersection of <a>
7675      * and the complement (or inversion) of <b> instead of <b> directly.
7676      *
7677      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7678      * Richard Gillam, published by Addison-Wesley, and explained at some
7679      * length there.  The preface says to incorporate its examples into your
7680      * code at your own risk.  In fact, it had bugs
7681      *
7682      * The algorithm is like a merge sort, and is essentially the same as the
7683      * union above
7684      */
7685
7686     UV* array_a;                /* a's array */
7687     UV* array_b;
7688     UV len_a;   /* length of a's array */
7689     UV len_b;
7690
7691     SV* r;                   /* the resulting intersection */
7692     UV* array_r;
7693     UV len_r;
7694
7695     UV i_a = 0;             /* current index into a's array */
7696     UV i_b = 0;
7697     UV i_r = 0;
7698
7699     /* running count, as explained in the algorithm source book; items are
7700      * stopped accumulating and are output when the count changes to/from 2.
7701      * The count is incremented when we start a range that's in the set, and
7702      * decremented when we start a range that's not in the set.  So its range
7703      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7704      */
7705     UV count = 0;
7706
7707     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7708     assert(a != b);
7709
7710     /* Special case if either one is empty */
7711     len_a = invlist_len(a);
7712     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7713
7714         if (len_a != 0 && complement_b) {
7715
7716             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7717              * be empty.  Here, also we are using 'b's complement, which hence
7718              * must be every possible code point.  Thus the intersection is
7719              * simply 'a'. */
7720             if (*i != a) {
7721                 *i = invlist_clone(a);
7722
7723                 if (*i == b) {
7724                     SvREFCNT_dec(b);
7725                 }
7726             }
7727             /* else *i is already 'a' */
7728             return;
7729         }
7730
7731         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7732          * intersection must be empty */
7733         if (*i == a) {
7734             SvREFCNT_dec(a);
7735         }
7736         else if (*i == b) {
7737             SvREFCNT_dec(b);
7738         }
7739         *i = _new_invlist(0);
7740         return;
7741     }
7742
7743     /* Here both lists exist and are non-empty */
7744     array_a = invlist_array(a);
7745     array_b = invlist_array(b);
7746
7747     /* If are to take the intersection of 'a' with the complement of b, set it
7748      * up so are looking at b's complement. */
7749     if (complement_b) {
7750
7751         /* To complement, we invert: if the first element is 0, remove it.  To
7752          * do this, we just pretend the array starts one later, and clear the
7753          * flag as we don't have to do anything else later */
7754         if (array_b[0] == 0) {
7755             array_b++;
7756             len_b--;
7757             complement_b = FALSE;
7758         }
7759         else {
7760
7761             /* But if the first element is not zero, we unshift a 0 before the
7762              * array.  The data structure reserves a space for that 0 (which
7763              * should be a '1' right now), so physical shifting is unneeded,
7764              * but temporarily change that element to 0.  Before exiting the
7765              * routine, we must restore the element to '1' */
7766             array_b--;
7767             len_b++;
7768             array_b[0] = 0;
7769         }
7770     }
7771
7772     /* Size the intersection for the worst case: that the intersection ends up
7773      * fragmenting everything to be completely disjoint */
7774     r= _new_invlist(len_a + len_b);
7775
7776     /* Will contain U+0000 iff both components do */
7777     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7778                                      && len_b > 0 && array_b[0] == 0);
7779
7780     /* Go through each list item by item, stopping when exhausted one of
7781      * them */
7782     while (i_a < len_a && i_b < len_b) {
7783         UV cp;      /* The element to potentially add to the intersection's
7784                        array */
7785         bool cp_in_set; /* Is it in the input list's set or not */
7786
7787         /* We need to take one or the other of the two inputs for the
7788          * intersection.  Since we are merging two sorted lists, we take the
7789          * smaller of the next items.  In case of a tie, we take the one that
7790          * is not in its set first (a difference from the union algorithm).  If
7791          * we took one in the set first, it would increment the count, possibly
7792          * to 2 which would cause it to be output as starting a range in the
7793          * intersection, and the next time through we would take that same
7794          * number, and output it again as ending the set.  By doing it the
7795          * opposite of this, there is no possibility that the count will be
7796          * momentarily incremented to 2.  (In a tie and both are in the set or
7797          * both not in the set, it doesn't matter which we take first.) */
7798         if (array_a[i_a] < array_b[i_b]
7799             || (array_a[i_a] == array_b[i_b]
7800                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7801         {
7802             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7803             cp= array_a[i_a++];
7804         }
7805         else {
7806             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7807             cp= array_b[i_b++];
7808         }
7809
7810         /* Here, have chosen which of the two inputs to look at.  Only output
7811          * if the running count changes to/from 2, which marks the
7812          * beginning/end of a range that's in the intersection */
7813         if (cp_in_set) {
7814             count++;
7815             if (count == 2) {
7816                 array_r[i_r++] = cp;
7817             }
7818         }
7819         else {
7820             if (count == 2) {
7821                 array_r[i_r++] = cp;
7822             }
7823             count--;
7824         }
7825     }
7826
7827     /* Here, we are finished going through at least one of the lists, which
7828      * means there is something remaining in at most one.  We check if the list
7829      * that has been exhausted is positioned such that we are in the middle
7830      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7831      * the ones we care about.)  There are four cases:
7832      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7833      *     nothing left in the intersection.
7834      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7835      *     above 2.  What should be output is exactly that which is in the
7836      *     non-exhausted set, as everything it has is also in the intersection
7837      *     set, and everything it doesn't have can't be in the intersection
7838      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7839      *     gets incremented to 2.  Like the previous case, the intersection is
7840      *     everything that remains in the non-exhausted set.
7841      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7842      *     remains 1.  And the intersection has nothing more. */
7843     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7844         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7845     {
7846         count++;
7847     }
7848
7849     /* The final length is what we've output so far plus what else is in the
7850      * intersection.  At most one of the subexpressions below will be non-zero */
7851     len_r = i_r;
7852     if (count >= 2) {
7853         len_r += (len_a - i_a) + (len_b - i_b);
7854     }
7855
7856     /* Set result to final length, which can change the pointer to array_r, so
7857      * re-find it */
7858     if (len_r != invlist_len(r)) {
7859         invlist_set_len(r, len_r);
7860         invlist_trim(r);
7861         array_r = invlist_array(r);
7862     }
7863
7864     /* Finish outputting any remaining */
7865     if (count >= 2) { /* At most one will have a non-zero copy count */
7866         IV copy_count;
7867         if ((copy_count = len_a - i_a) > 0) {
7868             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7869         }
7870         else if ((copy_count = len_b - i_b) > 0) {
7871             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7872         }
7873     }
7874
7875     /*  We may be removing a reference to one of the inputs */
7876     if (a == *i || b == *i) {
7877         SvREFCNT_dec(*i);
7878     }
7879
7880     /* If we've changed b, restore it */
7881     if (complement_b) {
7882         array_b[0] = 1;
7883     }
7884
7885     *i = r;
7886     return;
7887 }
7888
7889 SV*
7890 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7891 {
7892     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7893      * set.  A pointer to the inversion list is returned.  This may actually be
7894      * a new list, in which case the passed in one has been destroyed.  The
7895      * passed in inversion list can be NULL, in which case a new one is created
7896      * with just the one range in it */
7897
7898     SV* range_invlist;
7899     UV len;
7900
7901     if (invlist == NULL) {
7902         invlist = _new_invlist(2);
7903         len = 0;
7904     }
7905     else {
7906         len = invlist_len(invlist);
7907     }
7908
7909     /* If comes after the final entry, can just append it to the end */
7910     if (len == 0
7911         || start >= invlist_array(invlist)
7912                                     [invlist_len(invlist) - 1])
7913     {
7914         _append_range_to_invlist(invlist, start, end);
7915         return invlist;
7916     }
7917
7918     /* Here, can't just append things, create and return a new inversion list
7919      * which is the union of this range and the existing inversion list */
7920     range_invlist = _new_invlist(2);
7921     _append_range_to_invlist(range_invlist, start, end);
7922
7923     _invlist_union(invlist, range_invlist, &invlist);
7924
7925     /* The temporary can be freed */
7926     SvREFCNT_dec(range_invlist);
7927
7928     return invlist;
7929 }
7930
7931 #endif
7932
7933 PERL_STATIC_INLINE SV*
7934 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7935     return _add_range_to_invlist(invlist, cp, cp);
7936 }
7937
7938 #ifndef PERL_IN_XSUB_RE
7939 void
7940 Perl__invlist_invert(pTHX_ SV* const invlist)
7941 {
7942     /* Complement the input inversion list.  This adds a 0 if the list didn't
7943      * have a zero; removes it otherwise.  As described above, the data
7944      * structure is set up so that this is very efficient */
7945
7946     UV* len_pos = get_invlist_len_addr(invlist);
7947
7948     PERL_ARGS_ASSERT__INVLIST_INVERT;
7949
7950     /* The inverse of matching nothing is matching everything */
7951     if (*len_pos == 0) {
7952         _append_range_to_invlist(invlist, 0, UV_MAX);
7953         return;
7954     }
7955
7956     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7957      * zero element was a 0, so it is being removed, so the length decrements
7958      * by 1; and vice-versa.  SvCUR is unaffected */
7959     if (*get_invlist_zero_addr(invlist) ^= 1) {
7960         (*len_pos)--;
7961     }
7962     else {
7963         (*len_pos)++;
7964     }
7965 }
7966
7967 void
7968 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7969 {
7970     /* Complement the input inversion list (which must be a Unicode property,
7971      * all of which don't match above the Unicode maximum code point.)  And
7972      * Perl has chosen to not have the inversion match above that either.  This
7973      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7974      */
7975
7976     UV len;
7977     UV* array;
7978
7979     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7980
7981     _invlist_invert(invlist);
7982
7983     len = invlist_len(invlist);
7984
7985     if (len != 0) { /* If empty do nothing */
7986         array = invlist_array(invlist);
7987         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7988             /* Add 0x110000.  First, grow if necessary */
7989             len++;
7990             if (invlist_max(invlist) < len) {
7991                 invlist_extend(invlist, len);
7992                 array = invlist_array(invlist);
7993             }
7994             invlist_set_len(invlist, len);
7995             array[len - 1] = PERL_UNICODE_MAX + 1;
7996         }
7997         else {  /* Remove the 0x110000 */
7998             invlist_set_len(invlist, len - 1);
7999         }
8000     }
8001
8002     return;
8003 }
8004 #endif
8005
8006 PERL_STATIC_INLINE SV*
8007 S_invlist_clone(pTHX_ SV* const invlist)
8008 {
8009
8010     /* Return a new inversion list that is a copy of the input one, which is
8011      * unchanged */
8012
8013     /* Need to allocate extra space to accommodate Perl's addition of a
8014      * trailing NUL to SvPV's, since it thinks they are always strings */
8015     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
8016     STRLEN length = SvCUR(invlist);
8017
8018     PERL_ARGS_ASSERT_INVLIST_CLONE;
8019
8020     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8021     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8022
8023     return new_invlist;
8024 }
8025
8026 PERL_STATIC_INLINE UV*
8027 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8028 {
8029     /* Return the address of the UV that contains the current iteration
8030      * position */
8031
8032     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8033
8034     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8035 }
8036
8037 PERL_STATIC_INLINE UV*
8038 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8039 {
8040     /* Return the address of the UV that contains the version id. */
8041
8042     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8043
8044     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8045 }
8046
8047 PERL_STATIC_INLINE void
8048 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8049 {
8050     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8051
8052     *get_invlist_iter_addr(invlist) = 0;
8053 }
8054
8055 STATIC bool
8056 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8057 {
8058     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8059      * This call sets in <*start> and <*end>, the next range in <invlist>.
8060      * Returns <TRUE> if successful and the next call will return the next
8061      * range; <FALSE> if was already at the end of the list.  If the latter,
8062      * <*start> and <*end> are unchanged, and the next call to this function
8063      * will start over at the beginning of the list */
8064
8065     UV* pos = get_invlist_iter_addr(invlist);
8066     UV len = invlist_len(invlist);
8067     UV *array;
8068
8069     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8070
8071     if (*pos >= len) {
8072         *pos = UV_MAX;  /* Force iternit() to be required next time */
8073         return FALSE;
8074     }
8075
8076     array = invlist_array(invlist);
8077
8078     *start = array[(*pos)++];
8079
8080     if (*pos >= len) {
8081         *end = UV_MAX;
8082     }
8083     else {
8084         *end = array[(*pos)++] - 1;
8085     }
8086
8087     return TRUE;
8088 }
8089
8090 #ifndef PERL_IN_XSUB_RE
8091 SV *
8092 Perl__invlist_contents(pTHX_ SV* const invlist)
8093 {
8094     /* Get the contents of an inversion list into a string SV so that they can
8095      * be printed out.  It uses the format traditionally done for debug tracing
8096      */
8097
8098     UV start, end;
8099     SV* output = newSVpvs("\n");
8100
8101     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8102
8103     invlist_iterinit(invlist);
8104     while (invlist_iternext(invlist, &start, &end)) {
8105         if (end == UV_MAX) {
8106             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8107         }
8108         else if (end != start) {
8109             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8110                     start,       end);
8111         }
8112         else {
8113             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8114         }
8115     }
8116
8117     return output;
8118 }
8119 #endif
8120
8121 #if 0
8122 void
8123 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8124 {
8125     /* Dumps out the ranges in an inversion list.  The string 'header'
8126      * if present is output on a line before the first range */
8127
8128     UV start, end;
8129
8130     if (header && strlen(header)) {
8131         PerlIO_printf(Perl_debug_log, "%s\n", header);
8132     }
8133     invlist_iterinit(invlist);
8134     while (invlist_iternext(invlist, &start, &end)) {
8135         if (end == UV_MAX) {
8136             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8137         }
8138         else {
8139             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8140         }
8141     }
8142 }
8143 #endif
8144
8145 #undef HEADER_LENGTH
8146 #undef INVLIST_INITIAL_LENGTH
8147 #undef TO_INTERNAL_SIZE
8148 #undef FROM_INTERNAL_SIZE
8149 #undef INVLIST_LEN_OFFSET
8150 #undef INVLIST_ZERO_OFFSET
8151 #undef INVLIST_ITER_OFFSET
8152 #undef INVLIST_VERSION_ID
8153
8154 /* End of inversion list object */
8155
8156 /*
8157  - reg - regular expression, i.e. main body or parenthesized thing
8158  *
8159  * Caller must absorb opening parenthesis.
8160  *
8161  * Combining parenthesis handling with the base level of regular expression
8162  * is a trifle forced, but the need to tie the tails of the branches to what
8163  * follows makes it hard to avoid.
8164  */
8165 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8166 #ifdef DEBUGGING
8167 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8168 #else
8169 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8170 #endif
8171
8172 STATIC regnode *
8173 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8174     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8175 {
8176     dVAR;
8177     register regnode *ret;              /* Will be the head of the group. */
8178     register regnode *br;
8179     register regnode *lastbr;
8180     register regnode *ender = NULL;
8181     register I32 parno = 0;
8182     I32 flags;
8183     U32 oregflags = RExC_flags;
8184     bool have_branch = 0;
8185     bool is_open = 0;
8186     I32 freeze_paren = 0;
8187     I32 after_freeze = 0;
8188
8189     /* for (?g), (?gc), and (?o) warnings; warning
8190        about (?c) will warn about (?g) -- japhy    */
8191
8192 #define WASTED_O  0x01
8193 #define WASTED_G  0x02
8194 #define WASTED_C  0x04
8195 #define WASTED_GC (0x02|0x04)
8196     I32 wastedflags = 0x00;
8197
8198     char * parse_start = RExC_parse; /* MJD */
8199     char * const oregcomp_parse = RExC_parse;
8200
8201     GET_RE_DEBUG_FLAGS_DECL;
8202
8203     PERL_ARGS_ASSERT_REG;
8204     DEBUG_PARSE("reg ");
8205
8206     *flagp = 0;                         /* Tentatively. */
8207
8208
8209     /* Make an OPEN node, if parenthesized. */
8210     if (paren) {
8211         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8212             char *start_verb = RExC_parse;
8213             STRLEN verb_len = 0;
8214             char *start_arg = NULL;
8215             unsigned char op = 0;
8216             int argok = 1;
8217             int internal_argval = 0; /* internal_argval is only useful if !argok */
8218             while ( *RExC_parse && *RExC_parse != ')' ) {
8219                 if ( *RExC_parse == ':' ) {
8220                     start_arg = RExC_parse + 1;
8221                     break;
8222                 }
8223                 RExC_parse++;
8224             }
8225             ++start_verb;
8226             verb_len = RExC_parse - start_verb;
8227             if ( start_arg ) {
8228                 RExC_parse++;
8229                 while ( *RExC_parse && *RExC_parse != ')' )
8230                     RExC_parse++;
8231                 if ( *RExC_parse != ')' )
8232                     vFAIL("Unterminated verb pattern argument");
8233                 if ( RExC_parse == start_arg )
8234                     start_arg = NULL;
8235             } else {
8236                 if ( *RExC_parse != ')' )
8237                     vFAIL("Unterminated verb pattern");
8238             }
8239
8240             switch ( *start_verb ) {
8241             case 'A':  /* (*ACCEPT) */
8242                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8243                     op = ACCEPT;
8244                     internal_argval = RExC_nestroot;
8245                 }
8246                 break;
8247             case 'C':  /* (*COMMIT) */
8248                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8249                     op = COMMIT;
8250                 break;
8251             case 'F':  /* (*FAIL) */
8252                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8253                     op = OPFAIL;
8254                     argok = 0;
8255                 }
8256                 break;
8257             case ':':  /* (*:NAME) */
8258             case 'M':  /* (*MARK:NAME) */
8259                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8260                     op = MARKPOINT;
8261                     argok = -1;
8262                 }
8263                 break;
8264             case 'P':  /* (*PRUNE) */
8265                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8266                     op = PRUNE;
8267                 break;
8268             case 'S':   /* (*SKIP) */
8269                 if ( memEQs(start_verb,verb_len,"SKIP") )
8270                     op = SKIP;
8271                 break;
8272             case 'T':  /* (*THEN) */
8273                 /* [19:06] <TimToady> :: is then */
8274                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8275                     op = CUTGROUP;
8276                     RExC_seen |= REG_SEEN_CUTGROUP;
8277                 }
8278                 break;
8279             }
8280             if ( ! op ) {
8281                 RExC_parse++;
8282                 vFAIL3("Unknown verb pattern '%.*s'",
8283                     verb_len, start_verb);
8284             }
8285             if ( argok ) {
8286                 if ( start_arg && internal_argval ) {
8287                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8288                         verb_len, start_verb);
8289                 } else if ( argok < 0 && !start_arg ) {
8290                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8291                         verb_len, start_verb);
8292                 } else {
8293                     ret = reganode(pRExC_state, op, internal_argval);
8294                     if ( ! internal_argval && ! SIZE_ONLY ) {
8295                         if (start_arg) {
8296                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8297                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8298                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8299                             ret->flags = 0;
8300                         } else {
8301                             ret->flags = 1;
8302                         }
8303                     }
8304                 }
8305                 if (!internal_argval)
8306                     RExC_seen |= REG_SEEN_VERBARG;
8307             } else if ( start_arg ) {
8308                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8309                         verb_len, start_verb);
8310             } else {
8311                 ret = reg_node(pRExC_state, op);
8312             }
8313             nextchar(pRExC_state);
8314             return ret;
8315         } else
8316         if (*RExC_parse == '?') { /* (?...) */
8317             bool is_logical = 0;
8318             const char * const seqstart = RExC_parse;
8319             bool has_use_defaults = FALSE;
8320
8321             RExC_parse++;
8322             paren = *RExC_parse++;
8323             ret = NULL;                 /* For look-ahead/behind. */
8324             switch (paren) {
8325
8326             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8327                 paren = *RExC_parse++;
8328                 if ( paren == '<')         /* (?P<...>) named capture */
8329                     goto named_capture;
8330                 else if (paren == '>') {   /* (?P>name) named recursion */
8331                     goto named_recursion;
8332                 }
8333                 else if (paren == '=') {   /* (?P=...)  named backref */
8334                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8335                        you change this make sure you change that */
8336                     char* name_start = RExC_parse;
8337                     U32 num = 0;
8338                     SV *sv_dat = reg_scan_name(pRExC_state,
8339                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8340                     if (RExC_parse == name_start || *RExC_parse != ')')
8341                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8342
8343                     if (!SIZE_ONLY) {
8344                         num = add_data( pRExC_state, 1, "S" );
8345                         RExC_rxi->data->data[num]=(void*)sv_dat;
8346                         SvREFCNT_inc_simple_void(sv_dat);
8347                     }
8348                     RExC_sawback = 1;
8349                     ret = reganode(pRExC_state,
8350                                    ((! FOLD)
8351                                      ? NREF
8352                                      : (MORE_ASCII_RESTRICTED)
8353                                        ? NREFFA
8354                                        : (AT_LEAST_UNI_SEMANTICS)
8355                                          ? NREFFU
8356                                          : (LOC)
8357                                            ? NREFFL
8358                                            : NREFF),
8359                                     num);
8360                     *flagp |= HASWIDTH;
8361
8362                     Set_Node_Offset(ret, parse_start+1);
8363                     Set_Node_Cur_Length(ret); /* MJD */
8364
8365                     nextchar(pRExC_state);
8366                     return ret;
8367                 }
8368                 RExC_parse++;
8369                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8370                 /*NOTREACHED*/
8371             case '<':           /* (?<...) */
8372                 if (*RExC_parse == '!')
8373                     paren = ',';
8374                 else if (*RExC_parse != '=')
8375               named_capture:
8376                 {               /* (?<...>) */
8377                     char *name_start;
8378                     SV *svname;
8379                     paren= '>';
8380             case '\'':          /* (?'...') */
8381                     name_start= RExC_parse;
8382                     svname = reg_scan_name(pRExC_state,
8383                         SIZE_ONLY ?  /* reverse test from the others */
8384                         REG_RSN_RETURN_NAME :
8385                         REG_RSN_RETURN_NULL);
8386                     if (RExC_parse == name_start) {
8387                         RExC_parse++;
8388                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8389                         /*NOTREACHED*/
8390                     }
8391                     if (*RExC_parse != paren)
8392                         vFAIL2("Sequence (?%c... not terminated",
8393                             paren=='>' ? '<' : paren);
8394                     if (SIZE_ONLY) {
8395                         HE *he_str;
8396                         SV *sv_dat = NULL;
8397                         if (!svname) /* shouldn't happen */
8398                             Perl_croak(aTHX_
8399                                 "panic: reg_scan_name returned NULL");
8400                         if (!RExC_paren_names) {
8401                             RExC_paren_names= newHV();
8402                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8403 #ifdef DEBUGGING
8404                             RExC_paren_name_list= newAV();
8405                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8406 #endif
8407                         }
8408                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8409                         if ( he_str )
8410                             sv_dat = HeVAL(he_str);
8411                         if ( ! sv_dat ) {
8412                             /* croak baby croak */
8413                             Perl_croak(aTHX_
8414                                 "panic: paren_name hash element allocation failed");
8415                         } else if ( SvPOK(sv_dat) ) {
8416                             /* (?|...) can mean we have dupes so scan to check
8417                                its already been stored. Maybe a flag indicating
8418                                we are inside such a construct would be useful,
8419                                but the arrays are likely to be quite small, so
8420                                for now we punt -- dmq */
8421                             IV count = SvIV(sv_dat);
8422                             I32 *pv = (I32*)SvPVX(sv_dat);
8423                             IV i;
8424                             for ( i = 0 ; i < count ; i++ ) {
8425                                 if ( pv[i] == RExC_npar ) {
8426                                     count = 0;
8427                                     break;
8428                                 }
8429                             }
8430                             if ( count ) {
8431                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8432                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8433                                 pv[count] = RExC_npar;
8434                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8435                             }
8436                         } else {
8437                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8438                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8439                             SvIOK_on(sv_dat);
8440                             SvIV_set(sv_dat, 1);
8441                         }
8442 #ifdef DEBUGGING
8443                         /* Yes this does cause a memory leak in debugging Perls */
8444                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8445                             SvREFCNT_dec(svname);
8446 #endif
8447
8448                         /*sv_dump(sv_dat);*/
8449                     }
8450                     nextchar(pRExC_state);
8451                     paren = 1;
8452                     goto capturing_parens;
8453                 }
8454                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8455                 RExC_in_lookbehind++;
8456                 RExC_parse++;
8457             case '=':           /* (?=...) */
8458                 RExC_seen_zerolen++;
8459                 break;
8460             case '!':           /* (?!...) */
8461                 RExC_seen_zerolen++;
8462                 if (*RExC_parse == ')') {
8463                     ret=reg_node(pRExC_state, OPFAIL);
8464                     nextchar(pRExC_state);
8465                     return ret;
8466                 }
8467                 break;
8468             case '|':           /* (?|...) */
8469                 /* branch reset, behave like a (?:...) except that
8470                    buffers in alternations share the same numbers */
8471                 paren = ':';
8472                 after_freeze = freeze_paren = RExC_npar;
8473                 break;
8474             case ':':           /* (?:...) */
8475             case '>':           /* (?>...) */
8476                 break;
8477             case '$':           /* (?$...) */
8478             case '@':           /* (?@...) */
8479                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8480                 break;
8481             case '#':           /* (?#...) */
8482                 while (*RExC_parse && *RExC_parse != ')')
8483                     RExC_parse++;
8484                 if (*RExC_parse != ')')
8485                     FAIL("Sequence (?#... not terminated");
8486                 nextchar(pRExC_state);
8487                 *flagp = TRYAGAIN;
8488                 return NULL;
8489             case '0' :           /* (?0) */
8490             case 'R' :           /* (?R) */
8491                 if (*RExC_parse != ')')
8492                     FAIL("Sequence (?R) not terminated");
8493                 ret = reg_node(pRExC_state, GOSTART);
8494                 *flagp |= POSTPONED;
8495                 nextchar(pRExC_state);
8496                 return ret;
8497                 /*notreached*/
8498             { /* named and numeric backreferences */
8499                 I32 num;
8500             case '&':            /* (?&NAME) */
8501                 parse_start = RExC_parse - 1;
8502               named_recursion:
8503                 {
8504                     SV *sv_dat = reg_scan_name(pRExC_state,
8505                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8506                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8507                 }
8508                 goto gen_recurse_regop;
8509                 assert(0); /* NOT REACHED */
8510             case '+':
8511                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8512                     RExC_parse++;
8513                     vFAIL("Illegal pattern");
8514                 }
8515                 goto parse_recursion;
8516                 /* NOT REACHED*/
8517             case '-': /* (?-1) */
8518                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8519                     RExC_parse--; /* rewind to let it be handled later */
8520                     goto parse_flags;
8521                 }
8522                 /*FALLTHROUGH */
8523             case '1': case '2': case '3': case '4': /* (?1) */
8524             case '5': case '6': case '7': case '8': case '9':
8525                 RExC_parse--;
8526               parse_recursion:
8527                 num = atoi(RExC_parse);
8528                 parse_start = RExC_parse - 1; /* MJD */
8529                 if (*RExC_parse == '-')
8530                     RExC_parse++;
8531                 while (isDIGIT(*RExC_parse))
8532                         RExC_parse++;
8533                 if (*RExC_parse!=')')
8534                     vFAIL("Expecting close bracket");
8535
8536               gen_recurse_regop:
8537                 if ( paren == '-' ) {
8538                     /*
8539                     Diagram of capture buffer numbering.
8540                     Top line is the normal capture buffer numbers
8541                     Bottom line is the negative indexing as from
8542                     the X (the (?-2))
8543
8544                     +   1 2    3 4 5 X          6 7
8545                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8546                     -   5 4    3 2 1 X          x x
8547
8548                     */
8549                     num = RExC_npar + num;
8550                     if (num < 1)  {
8551                         RExC_parse++;
8552                         vFAIL("Reference to nonexistent group");
8553                     }
8554                 } else if ( paren == '+' ) {
8555                     num = RExC_npar + num - 1;
8556                 }
8557
8558                 ret = reganode(pRExC_state, GOSUB, num);
8559                 if (!SIZE_ONLY) {
8560                     if (num > (I32)RExC_rx->nparens) {
8561                         RExC_parse++;
8562                         vFAIL("Reference to nonexistent group");
8563                     }
8564                     ARG2L_SET( ret, RExC_recurse_count++);
8565                     RExC_emit++;
8566                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8567                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8568                 } else {
8569                     RExC_size++;
8570                 }
8571                 RExC_seen |= REG_SEEN_RECURSE;
8572                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8573                 Set_Node_Offset(ret, parse_start); /* MJD */
8574
8575                 *flagp |= POSTPONED;
8576                 nextchar(pRExC_state);
8577                 return ret;
8578             } /* named and numeric backreferences */
8579             assert(0); /* NOT REACHED */
8580
8581             case '?':           /* (??...) */
8582                 is_logical = 1;
8583                 if (*RExC_parse != '{') {
8584                     RExC_parse++;
8585                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8586                     /*NOTREACHED*/
8587                 }
8588                 *flagp |= POSTPONED;
8589                 paren = *RExC_parse++;
8590                 /* FALL THROUGH */
8591             case '{':           /* (?{...}) */
8592             {
8593                 U32 n = 0;
8594                 struct reg_code_block *cb;
8595
8596                 RExC_seen_zerolen++;
8597
8598                 if (   !pRExC_state->num_code_blocks
8599                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8600                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8601                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8602                             - RExC_start)
8603                 ) {
8604                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8605                         FAIL("panic: Sequence (?{...}): no code block found\n");
8606                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8607                 }
8608                 /* this is a pre-compiled code block (?{...}) */
8609                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8610                 RExC_parse = RExC_start + cb->end;
8611                 if (!SIZE_ONLY) {
8612                     OP *o = cb->block;
8613                     if (cb->src_regex) {
8614                         n = add_data(pRExC_state, 2, "rl");
8615                         RExC_rxi->data->data[n] =
8616                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8617                         RExC_rxi->data->data[n+1] = (void*)o;
8618                     }
8619                     else {
8620                         n = add_data(pRExC_state, 1,
8621                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8622                         RExC_rxi->data->data[n] = (void*)o;
8623                     }
8624                 }
8625                 pRExC_state->code_index++;
8626                 nextchar(pRExC_state);
8627
8628                 if (is_logical) {
8629                     regnode *eval;
8630                     ret = reg_node(pRExC_state, LOGICAL);
8631                     eval = reganode(pRExC_state, EVAL, n);
8632                     if (!SIZE_ONLY) {
8633                         ret->flags = 2;
8634                         /* for later propagation into (??{}) return value */
8635                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8636                     }
8637                     REGTAIL(pRExC_state, ret, eval);
8638                     /* deal with the length of this later - MJD */
8639                     return ret;
8640                 }
8641                 ret = reganode(pRExC_state, EVAL, n);
8642                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8643                 Set_Node_Offset(ret, parse_start);
8644                 return ret;
8645             }
8646             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8647             {
8648                 int is_define= 0;
8649                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8650                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8651                         || RExC_parse[1] == '<'
8652                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8653                         I32 flag;
8654
8655                         ret = reg_node(pRExC_state, LOGICAL);
8656                         if (!SIZE_ONLY)
8657                             ret->flags = 1;
8658                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8659                         goto insert_if;
8660                     }
8661                 }
8662                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8663                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8664                 {
8665                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8666                     char *name_start= RExC_parse++;
8667                     U32 num = 0;
8668                     SV *sv_dat=reg_scan_name(pRExC_state,
8669                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8670                     if (RExC_parse == name_start || *RExC_parse != ch)
8671                         vFAIL2("Sequence (?(%c... not terminated",
8672                             (ch == '>' ? '<' : ch));
8673                     RExC_parse++;
8674                     if (!SIZE_ONLY) {
8675                         num = add_data( pRExC_state, 1, "S" );
8676                         RExC_rxi->data->data[num]=(void*)sv_dat;
8677                         SvREFCNT_inc_simple_void(sv_dat);
8678                     }
8679                     ret = reganode(pRExC_state,NGROUPP,num);
8680                     goto insert_if_check_paren;
8681                 }
8682                 else if (RExC_parse[0] == 'D' &&
8683                          RExC_parse[1] == 'E' &&
8684                          RExC_parse[2] == 'F' &&
8685                          RExC_parse[3] == 'I' &&
8686                          RExC_parse[4] == 'N' &&
8687                          RExC_parse[5] == 'E')
8688                 {
8689                     ret = reganode(pRExC_state,DEFINEP,0);
8690                     RExC_parse +=6 ;
8691                     is_define = 1;
8692                     goto insert_if_check_paren;
8693                 }
8694                 else if (RExC_parse[0] == 'R') {
8695                     RExC_parse++;
8696                     parno = 0;
8697                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8698                         parno = atoi(RExC_parse++);
8699                         while (isDIGIT(*RExC_parse))
8700                             RExC_parse++;
8701                     } else if (RExC_parse[0] == '&') {
8702                         SV *sv_dat;
8703                         RExC_parse++;
8704                         sv_dat = reg_scan_name(pRExC_state,
8705                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8706                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8707                     }
8708                     ret = reganode(pRExC_state,INSUBP,parno);
8709                     goto insert_if_check_paren;
8710                 }
8711                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8712                     /* (?(1)...) */
8713                     char c;
8714                     parno = atoi(RExC_parse++);
8715
8716                     while (isDIGIT(*RExC_parse))
8717                         RExC_parse++;
8718                     ret = reganode(pRExC_state, GROUPP, parno);
8719
8720                  insert_if_check_paren:
8721                     if ((c = *nextchar(pRExC_state)) != ')')
8722                         vFAIL("Switch condition not recognized");
8723                   insert_if:
8724                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8725                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8726                     if (br == NULL)
8727                         br = reganode(pRExC_state, LONGJMP, 0);
8728                     else
8729                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8730                     c = *nextchar(pRExC_state);
8731                     if (flags&HASWIDTH)
8732                         *flagp |= HASWIDTH;
8733                     if (c == '|') {
8734                         if (is_define)
8735                             vFAIL("(?(DEFINE)....) does not allow branches");
8736                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8737                         regbranch(pRExC_state, &flags, 1,depth+1);
8738                         REGTAIL(pRExC_state, ret, lastbr);
8739                         if (flags&HASWIDTH)
8740                             *flagp |= HASWIDTH;
8741                         c = *nextchar(pRExC_state);
8742                     }
8743                     else
8744                         lastbr = NULL;
8745                     if (c != ')')
8746                         vFAIL("Switch (?(condition)... contains too many branches");
8747                     ender = reg_node(pRExC_state, TAIL);
8748                     REGTAIL(pRExC_state, br, ender);
8749                     if (lastbr) {
8750                         REGTAIL(pRExC_state, lastbr, ender);
8751                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8752                     }
8753                     else
8754                         REGTAIL(pRExC_state, ret, ender);
8755                     RExC_size++; /* XXX WHY do we need this?!!
8756                                     For large programs it seems to be required
8757                                     but I can't figure out why. -- dmq*/
8758                     return ret;
8759                 }
8760                 else {
8761                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8762                 }
8763             }
8764             case 0:
8765                 RExC_parse--; /* for vFAIL to print correctly */
8766                 vFAIL("Sequence (? incomplete");
8767                 break;
8768             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8769                                        that follow */
8770                 has_use_defaults = TRUE;
8771                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8772                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8773                                                 ? REGEX_UNICODE_CHARSET
8774                                                 : REGEX_DEPENDS_CHARSET);
8775                 goto parse_flags;
8776             default:
8777                 --RExC_parse;
8778                 parse_flags:      /* (?i) */
8779             {
8780                 U32 posflags = 0, negflags = 0;
8781                 U32 *flagsp = &posflags;
8782                 char has_charset_modifier = '\0';
8783                 regex_charset cs = get_regex_charset(RExC_flags);
8784                 if (cs == REGEX_DEPENDS_CHARSET
8785                     && (RExC_utf8 || RExC_uni_semantics))
8786                 {
8787                     cs = REGEX_UNICODE_CHARSET;
8788                 }
8789
8790                 while (*RExC_parse) {
8791                     /* && strchr("iogcmsx", *RExC_parse) */
8792                     /* (?g), (?gc) and (?o) are useless here
8793                        and must be globally applied -- japhy */
8794                     switch (*RExC_parse) {
8795                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8796                     case LOCALE_PAT_MOD:
8797                         if (has_charset_modifier) {
8798                             goto excess_modifier;
8799                         }
8800                         else if (flagsp == &negflags) {
8801                             goto neg_modifier;
8802                         }
8803                         cs = REGEX_LOCALE_CHARSET;
8804                         has_charset_modifier = LOCALE_PAT_MOD;
8805                         RExC_contains_locale = 1;
8806                         break;
8807                     case UNICODE_PAT_MOD:
8808                         if (has_charset_modifier) {
8809                             goto excess_modifier;
8810                         }
8811                         else if (flagsp == &negflags) {
8812                             goto neg_modifier;
8813                         }
8814                         cs = REGEX_UNICODE_CHARSET;
8815                         has_charset_modifier = UNICODE_PAT_MOD;
8816                         break;
8817                     case ASCII_RESTRICT_PAT_MOD:
8818                         if (flagsp == &negflags) {
8819                             goto neg_modifier;
8820                         }
8821                         if (has_charset_modifier) {
8822                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8823                                 goto excess_modifier;
8824                             }
8825                             /* Doubled modifier implies more restricted */
8826                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8827                         }
8828                         else {
8829                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8830                         }
8831                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8832                         break;
8833                     case DEPENDS_PAT_MOD:
8834                         if (has_use_defaults) {
8835                             goto fail_modifiers;
8836                         }
8837                         else if (flagsp == &negflags) {
8838                             goto neg_modifier;
8839                         }
8840                         else if (has_charset_modifier) {
8841                             goto excess_modifier;
8842                         }
8843
8844                         /* The dual charset means unicode semantics if the
8845                          * pattern (or target, not known until runtime) are
8846                          * utf8, or something in the pattern indicates unicode
8847                          * semantics */
8848                         cs = (RExC_utf8 || RExC_uni_semantics)
8849                              ? REGEX_UNICODE_CHARSET
8850                              : REGEX_DEPENDS_CHARSET;
8851                         has_charset_modifier = DEPENDS_PAT_MOD;
8852                         break;
8853                     excess_modifier:
8854                         RExC_parse++;
8855                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8856                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8857                         }
8858                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8859                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8860                         }
8861                         else {
8862                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8863                         }
8864                         /*NOTREACHED*/
8865                     neg_modifier:
8866                         RExC_parse++;
8867                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8868                         /*NOTREACHED*/
8869                     case ONCE_PAT_MOD: /* 'o' */
8870                     case GLOBAL_PAT_MOD: /* 'g' */
8871                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8872                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8873                             if (! (wastedflags & wflagbit) ) {
8874                                 wastedflags |= wflagbit;
8875                                 vWARN5(
8876                                     RExC_parse + 1,
8877                                     "Useless (%s%c) - %suse /%c modifier",
8878                                     flagsp == &negflags ? "?-" : "?",
8879                                     *RExC_parse,
8880                                     flagsp == &negflags ? "don't " : "",
8881                                     *RExC_parse
8882                                 );
8883                             }
8884                         }
8885                         break;
8886
8887                     case CONTINUE_PAT_MOD: /* 'c' */
8888                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8889                             if (! (wastedflags & WASTED_C) ) {
8890                                 wastedflags |= WASTED_GC;
8891                                 vWARN3(
8892                                     RExC_parse + 1,
8893                                     "Useless (%sc) - %suse /gc modifier",
8894                                     flagsp == &negflags ? "?-" : "?",
8895                                     flagsp == &negflags ? "don't " : ""
8896                                 );
8897                             }
8898                         }
8899                         break;
8900                     case KEEPCOPY_PAT_MOD: /* 'p' */
8901                         if (flagsp == &negflags) {
8902                             if (SIZE_ONLY)
8903                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8904                         } else {
8905                             *flagsp |= RXf_PMf_KEEPCOPY;
8906                         }
8907                         break;
8908                     case '-':
8909                         /* A flag is a default iff it is following a minus, so
8910                          * if there is a minus, it means will be trying to
8911                          * re-specify a default which is an error */
8912                         if (has_use_defaults || flagsp == &negflags) {
8913             fail_modifiers:
8914                             RExC_parse++;
8915                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8916                             /*NOTREACHED*/
8917                         }
8918                         flagsp = &negflags;
8919                         wastedflags = 0;  /* reset so (?g-c) warns twice */
8920                         break;
8921                     case ':':
8922                         paren = ':';
8923                         /*FALLTHROUGH*/
8924                     case ')':
8925                         RExC_flags |= posflags;
8926                         RExC_flags &= ~negflags;
8927                         set_regex_charset(&RExC_flags, cs);
8928                         if (paren != ':') {
8929                             oregflags |= posflags;
8930                             oregflags &= ~negflags;
8931                             set_regex_charset(&oregflags, cs);
8932                         }
8933                         nextchar(pRExC_state);
8934                         if (paren != ':') {
8935                             *flagp = TRYAGAIN;
8936                             return NULL;
8937                         } else {
8938                             ret = NULL;
8939                             goto parse_rest;
8940                         }
8941                         /*NOTREACHED*/
8942                     default:
8943                         RExC_parse++;
8944                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8945                         /*NOTREACHED*/
8946                     }
8947                     ++RExC_parse;
8948                 }
8949             }} /* one for the default block, one for the switch */
8950         }
8951         else {                  /* (...) */
8952           capturing_parens:
8953             parno = RExC_npar;
8954             RExC_npar++;
8955
8956             ret = reganode(pRExC_state, OPEN, parno);
8957             if (!SIZE_ONLY ){
8958                 if (!RExC_nestroot)
8959                     RExC_nestroot = parno;
8960                 if (RExC_seen & REG_SEEN_RECURSE
8961                     && !RExC_open_parens[parno-1])
8962                 {
8963                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8964                         "Setting open paren #%"IVdf" to %d\n",
8965                         (IV)parno, REG_NODE_NUM(ret)));
8966                     RExC_open_parens[parno-1]= ret;
8967                 }
8968             }
8969             Set_Node_Length(ret, 1); /* MJD */
8970             Set_Node_Offset(ret, RExC_parse); /* MJD */
8971             is_open = 1;
8972         }
8973     }
8974     else                        /* ! paren */
8975         ret = NULL;
8976
8977    parse_rest:
8978     /* Pick up the branches, linking them together. */
8979     parse_start = RExC_parse;   /* MJD */
8980     br = regbranch(pRExC_state, &flags, 1,depth+1);
8981
8982     /*     branch_len = (paren != 0); */
8983
8984     if (br == NULL)
8985         return(NULL);
8986     if (*RExC_parse == '|') {
8987         if (!SIZE_ONLY && RExC_extralen) {
8988             reginsert(pRExC_state, BRANCHJ, br, depth+1);
8989         }
8990         else {                  /* MJD */
8991             reginsert(pRExC_state, BRANCH, br, depth+1);
8992             Set_Node_Length(br, paren != 0);
8993             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8994         }
8995         have_branch = 1;
8996         if (SIZE_ONLY)
8997             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
8998     }
8999     else if (paren == ':') {
9000         *flagp |= flags&SIMPLE;
9001     }
9002     if (is_open) {                              /* Starts with OPEN. */
9003         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9004     }
9005     else if (paren != '?')              /* Not Conditional */
9006         ret = br;
9007     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9008     lastbr = br;
9009     while (*RExC_parse == '|') {
9010         if (!SIZE_ONLY && RExC_extralen) {
9011             ender = reganode(pRExC_state, LONGJMP,0);
9012             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9013         }
9014         if (SIZE_ONLY)
9015             RExC_extralen += 2;         /* Account for LONGJMP. */
9016         nextchar(pRExC_state);
9017         if (freeze_paren) {
9018             if (RExC_npar > after_freeze)
9019                 after_freeze = RExC_npar;
9020             RExC_npar = freeze_paren;
9021         }
9022         br = regbranch(pRExC_state, &flags, 0, depth+1);
9023
9024         if (br == NULL)
9025             return(NULL);
9026         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9027         lastbr = br;
9028         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9029     }
9030
9031     if (have_branch || paren != ':') {
9032         /* Make a closing node, and hook it on the end. */
9033         switch (paren) {
9034         case ':':
9035             ender = reg_node(pRExC_state, TAIL);
9036             break;
9037         case 1:
9038             ender = reganode(pRExC_state, CLOSE, parno);
9039             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9040                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9041                         "Setting close paren #%"IVdf" to %d\n",
9042                         (IV)parno, REG_NODE_NUM(ender)));
9043                 RExC_close_parens[parno-1]= ender;
9044                 if (RExC_nestroot == parno)
9045                     RExC_nestroot = 0;
9046             }
9047             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9048             Set_Node_Length(ender,1); /* MJD */
9049             break;
9050         case '<':
9051         case ',':
9052         case '=':
9053         case '!':
9054             *flagp &= ~HASWIDTH;
9055             /* FALL THROUGH */
9056         case '>':
9057             ender = reg_node(pRExC_state, SUCCEED);
9058             break;
9059         case 0:
9060             ender = reg_node(pRExC_state, END);
9061             if (!SIZE_ONLY) {
9062                 assert(!RExC_opend); /* there can only be one! */
9063                 RExC_opend = ender;
9064             }
9065             break;
9066         }
9067         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9068             SV * const mysv_val1=sv_newmortal();
9069             SV * const mysv_val2=sv_newmortal();
9070             DEBUG_PARSE_MSG("lsbr");
9071             regprop(RExC_rx, mysv_val1, lastbr);
9072             regprop(RExC_rx, mysv_val2, ender);
9073             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9074                           SvPV_nolen_const(mysv_val1),
9075                           (IV)REG_NODE_NUM(lastbr),
9076                           SvPV_nolen_const(mysv_val2),
9077                           (IV)REG_NODE_NUM(ender),
9078                           (IV)(ender - lastbr)
9079             );
9080         });
9081         REGTAIL(pRExC_state, lastbr, ender);
9082
9083         if (have_branch && !SIZE_ONLY) {
9084             char is_nothing= 1;
9085             if (depth==1)
9086                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9087
9088             /* Hook the tails of the branches to the closing node. */
9089             for (br = ret; br; br = regnext(br)) {
9090                 const U8 op = PL_regkind[OP(br)];
9091                 if (op == BRANCH) {
9092                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9093                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9094                         is_nothing= 0;
9095                 }
9096                 else if (op == BRANCHJ) {
9097                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9098                     /* for now we always disable this optimisation * /
9099                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9100                     */
9101                         is_nothing= 0;
9102                 }
9103             }
9104             if (is_nothing) {
9105                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9106                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9107                     SV * const mysv_val1=sv_newmortal();
9108                     SV * const mysv_val2=sv_newmortal();
9109                     DEBUG_PARSE_MSG("NADA");
9110                     regprop(RExC_rx, mysv_val1, ret);
9111                     regprop(RExC_rx, mysv_val2, ender);
9112                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9113                                   SvPV_nolen_const(mysv_val1),
9114                                   (IV)REG_NODE_NUM(ret),
9115                                   SvPV_nolen_const(mysv_val2),
9116                                   (IV)REG_NODE_NUM(ender),
9117                                   (IV)(ender - ret)
9118                     );
9119                 });
9120                 OP(br)= NOTHING;
9121                 if (OP(ender) == TAIL) {
9122                     NEXT_OFF(br)= 0;
9123                     RExC_emit= br + 1;
9124                 } else {
9125                     regnode *opt;
9126                     for ( opt= br + 1; opt < ender ; opt++ )
9127                         OP(opt)= OPTIMIZED;
9128                     NEXT_OFF(br)= ender - br;
9129                 }
9130             }
9131         }
9132     }
9133
9134     {
9135         const char *p;
9136         static const char parens[] = "=!<,>";
9137
9138         if (paren && (p = strchr(parens, paren))) {
9139             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9140             int flag = (p - parens) > 1;
9141
9142             if (paren == '>')
9143                 node = SUSPEND, flag = 0;
9144             reginsert(pRExC_state, node,ret, depth+1);
9145             Set_Node_Cur_Length(ret);
9146             Set_Node_Offset(ret, parse_start + 1);
9147             ret->flags = flag;
9148             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9149         }
9150     }
9151
9152     /* Check for proper termination. */
9153     if (paren) {
9154         RExC_flags = oregflags;
9155         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9156             RExC_parse = oregcomp_parse;
9157             vFAIL("Unmatched (");
9158         }
9159     }
9160     else if (!paren && RExC_parse < RExC_end) {
9161         if (*RExC_parse == ')') {
9162             RExC_parse++;
9163             vFAIL("Unmatched )");
9164         }
9165         else
9166             FAIL("Junk on end of regexp");      /* "Can't happen". */
9167         assert(0); /* NOTREACHED */
9168     }
9169
9170     if (RExC_in_lookbehind) {
9171         RExC_in_lookbehind--;
9172     }
9173     if (after_freeze > RExC_npar)
9174         RExC_npar = after_freeze;
9175     return(ret);
9176 }
9177
9178 /*
9179  - regbranch - one alternative of an | operator
9180  *
9181  * Implements the concatenation operator.
9182  */
9183 STATIC regnode *
9184 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9185 {
9186     dVAR;
9187     register regnode *ret;
9188     register regnode *chain = NULL;
9189     register regnode *latest;
9190     I32 flags = 0, c = 0;
9191     GET_RE_DEBUG_FLAGS_DECL;
9192
9193     PERL_ARGS_ASSERT_REGBRANCH;
9194
9195     DEBUG_PARSE("brnc");
9196
9197     if (first)
9198         ret = NULL;
9199     else {
9200         if (!SIZE_ONLY && RExC_extralen)
9201             ret = reganode(pRExC_state, BRANCHJ,0);
9202         else {
9203             ret = reg_node(pRExC_state, BRANCH);
9204             Set_Node_Length(ret, 1);
9205         }
9206     }
9207
9208     if (!first && SIZE_ONLY)
9209         RExC_extralen += 1;                     /* BRANCHJ */
9210
9211     *flagp = WORST;                     /* Tentatively. */
9212
9213     RExC_parse--;
9214     nextchar(pRExC_state);
9215     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9216         flags &= ~TRYAGAIN;
9217         latest = regpiece(pRExC_state, &flags,depth+1);
9218         if (latest == NULL) {
9219             if (flags & TRYAGAIN)
9220                 continue;
9221             return(NULL);
9222         }
9223         else if (ret == NULL)
9224             ret = latest;
9225         *flagp |= flags&(HASWIDTH|POSTPONED);
9226         if (chain == NULL)      /* First piece. */
9227             *flagp |= flags&SPSTART;
9228         else {
9229             RExC_naughty++;
9230             REGTAIL(pRExC_state, chain, latest);
9231         }
9232         chain = latest;
9233         c++;
9234     }
9235     if (chain == NULL) {        /* Loop ran zero times. */
9236         chain = reg_node(pRExC_state, NOTHING);
9237         if (ret == NULL)
9238             ret = chain;
9239     }
9240     if (c == 1) {
9241         *flagp |= flags&SIMPLE;
9242     }
9243
9244     return ret;
9245 }
9246
9247 /*
9248  - regpiece - something followed by possible [*+?]
9249  *
9250  * Note that the branching code sequences used for ? and the general cases
9251  * of * and + are somewhat optimized:  they use the same NOTHING node as
9252  * both the endmarker for their branch list and the body of the last branch.
9253  * It might seem that this node could be dispensed with entirely, but the
9254  * endmarker role is not redundant.
9255  */
9256 STATIC regnode *
9257 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9258 {
9259     dVAR;
9260     register regnode *ret;
9261     register char op;
9262     register char *next;
9263     I32 flags;
9264     const char * const origparse = RExC_parse;
9265     I32 min;
9266     I32 max = REG_INFTY;
9267 #ifdef RE_TRACK_PATTERN_OFFSETS
9268     char *parse_start;
9269 #endif
9270     const char *maxpos = NULL;
9271     GET_RE_DEBUG_FLAGS_DECL;
9272
9273     PERL_ARGS_ASSERT_REGPIECE;
9274
9275     DEBUG_PARSE("piec");
9276
9277     ret = regatom(pRExC_state, &flags,depth+1);
9278     if (ret == NULL) {
9279         if (flags & TRYAGAIN)
9280             *flagp |= TRYAGAIN;
9281         return(NULL);
9282     }
9283
9284     op = *RExC_parse;
9285
9286     if (op == '{' && regcurly(RExC_parse)) {
9287         maxpos = NULL;
9288 #ifdef RE_TRACK_PATTERN_OFFSETS
9289         parse_start = RExC_parse; /* MJD */
9290 #endif
9291         next = RExC_parse + 1;
9292         while (isDIGIT(*next) || *next == ',') {
9293             if (*next == ',') {
9294                 if (maxpos)
9295                     break;
9296                 else
9297                     maxpos = next;
9298             }
9299             next++;
9300         }
9301         if (*next == '}') {             /* got one */
9302             if (!maxpos)
9303                 maxpos = next;
9304             RExC_parse++;
9305             min = atoi(RExC_parse);
9306             if (*maxpos == ',')
9307                 maxpos++;
9308             else
9309                 maxpos = RExC_parse;
9310             max = atoi(maxpos);
9311             if (!max && *maxpos != '0')
9312                 max = REG_INFTY;                /* meaning "infinity" */
9313             else if (max >= REG_INFTY)
9314                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9315             RExC_parse = next;
9316             nextchar(pRExC_state);
9317
9318         do_curly:
9319             if ((flags&SIMPLE)) {
9320                 RExC_naughty += 2 + RExC_naughty / 2;
9321                 reginsert(pRExC_state, CURLY, ret, depth+1);
9322                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9323                 Set_Node_Cur_Length(ret);
9324             }
9325             else {
9326                 regnode * const w = reg_node(pRExC_state, WHILEM);
9327
9328                 w->flags = 0;
9329                 REGTAIL(pRExC_state, ret, w);
9330                 if (!SIZE_ONLY && RExC_extralen) {
9331                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9332                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9333                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9334                 }
9335                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9336                                 /* MJD hk */
9337                 Set_Node_Offset(ret, parse_start+1);
9338                 Set_Node_Length(ret,
9339                                 op == '{' ? (RExC_parse - parse_start) : 1);
9340
9341                 if (!SIZE_ONLY && RExC_extralen)
9342                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9343                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9344                 if (SIZE_ONLY)
9345                     RExC_whilem_seen++, RExC_extralen += 3;
9346                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9347             }
9348             ret->flags = 0;
9349
9350             if (min > 0)
9351                 *flagp = WORST;
9352             if (max > 0)
9353                 *flagp |= HASWIDTH;
9354             if (max < min)
9355                 vFAIL("Can't do {n,m} with n > m");
9356             if (!SIZE_ONLY) {
9357                 ARG1_SET(ret, (U16)min);
9358                 ARG2_SET(ret, (U16)max);
9359             }
9360
9361             goto nest_check;
9362         }
9363     }
9364
9365     if (!ISMULT1(op)) {
9366         *flagp = flags;
9367         return(ret);
9368     }
9369
9370 #if 0                           /* Now runtime fix should be reliable. */
9371
9372     /* if this is reinstated, don't forget to put this back into perldiag:
9373
9374             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9375
9376            (F) The part of the regexp subject to either the * or + quantifier
9377            could match an empty string. The {#} shows in the regular
9378            expression about where the problem was discovered.
9379
9380     */
9381
9382     if (!(flags&HASWIDTH) && op != '?')
9383       vFAIL("Regexp *+ operand could be empty");
9384 #endif
9385
9386 #ifdef RE_TRACK_PATTERN_OFFSETS
9387     parse_start = RExC_parse;
9388 #endif
9389     nextchar(pRExC_state);
9390
9391     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9392
9393     if (op == '*' && (flags&SIMPLE)) {
9394         reginsert(pRExC_state, STAR, ret, depth+1);
9395         ret->flags = 0;
9396         RExC_naughty += 4;
9397     }
9398     else if (op == '*') {
9399         min = 0;
9400         goto do_curly;
9401     }
9402     else if (op == '+' && (flags&SIMPLE)) {
9403         reginsert(pRExC_state, PLUS, ret, depth+1);
9404         ret->flags = 0;
9405         RExC_naughty += 3;
9406     }
9407     else if (op == '+') {
9408         min = 1;
9409         goto do_curly;
9410     }
9411     else if (op == '?') {
9412         min = 0; max = 1;
9413         goto do_curly;
9414     }
9415   nest_check:
9416     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9417         ckWARN3reg(RExC_parse,
9418                    "%.*s matches null string many times",
9419                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9420                    origparse);
9421     }
9422
9423     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9424         nextchar(pRExC_state);
9425         reginsert(pRExC_state, MINMOD, ret, depth+1);
9426         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9427     }
9428 #ifndef REG_ALLOW_MINMOD_SUSPEND
9429     else
9430 #endif
9431     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9432         regnode *ender;
9433         nextchar(pRExC_state);
9434         ender = reg_node(pRExC_state, SUCCEED);
9435         REGTAIL(pRExC_state, ret, ender);
9436         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9437         ret->flags = 0;
9438         ender = reg_node(pRExC_state, TAIL);
9439         REGTAIL(pRExC_state, ret, ender);
9440         /*ret= ender;*/
9441     }
9442
9443     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9444         RExC_parse++;
9445         vFAIL("Nested quantifiers");
9446     }
9447
9448     return(ret);
9449 }
9450
9451
9452 /* reg_namedseq(pRExC_state,UVp, UV depth)
9453
9454    This is expected to be called by a parser routine that has
9455    recognized '\N' and needs to handle the rest. RExC_parse is
9456    expected to point at the first char following the N at the time
9457    of the call.
9458
9459    The \N may be inside (indicated by valuep not being NULL) or outside a
9460    character class.
9461
9462    \N may begin either a named sequence, or if outside a character class, mean
9463    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9464    attempted to decide which, and in the case of a named sequence converted it
9465    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9466    where c1... are the characters in the sequence.  For single-quoted regexes,
9467    the tokenizer passes the \N sequence through unchanged; this code will not
9468    attempt to determine this nor expand those.  The net effect is that if the
9469    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
9470    signals that this \N occurrence means to match a non-newline.
9471
9472    Only the \N{U+...} form should occur in a character class, for the same
9473    reason that '.' inside a character class means to just match a period: it
9474    just doesn't make sense.
9475
9476    If valuep is non-null then it is assumed that we are parsing inside
9477    of a charclass definition and the first codepoint in the resolved
9478    string is returned via *valuep and the routine will return NULL.
9479    In this mode if a multichar string is returned from the charnames
9480    handler, a warning will be issued, and only the first char in the
9481    sequence will be examined. If the string returned is zero length
9482    then the value of *valuep is undefined and NON-NULL will
9483    be returned to indicate failure. (This will NOT be a valid pointer
9484    to a regnode.)
9485
9486    If valuep is null then it is assumed that we are parsing normal text and a
9487    new EXACT node is inserted into the program containing the resolved string,
9488    and a pointer to the new node is returned.  But if the string is zero length
9489    a NOTHING node is emitted instead.
9490
9491    On success RExC_parse is set to the char following the endbrace.
9492    Parsing failures will generate a fatal error via vFAIL(...)
9493  */
9494 STATIC regnode *
9495 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
9496 {
9497     char * endbrace;    /* '}' following the name */
9498     regnode *ret = NULL;
9499     char* p;
9500
9501     GET_RE_DEBUG_FLAGS_DECL;
9502
9503     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
9504
9505     GET_RE_DEBUG_FLAGS;
9506
9507     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9508      * modifier.  The other meaning does not */
9509     p = (RExC_flags & RXf_PMf_EXTENDED)
9510         ? regwhite( pRExC_state, RExC_parse )
9511         : RExC_parse;
9512
9513     /* Disambiguate between \N meaning a named character versus \N meaning
9514      * [^\n].  The former is assumed when it can't be the latter. */
9515     if (*p != '{' || regcurly(p)) {
9516         RExC_parse = p;
9517         if (valuep) {
9518             /* no bare \N in a charclass */
9519             vFAIL("\\N in a character class must be a named character: \\N{...}");
9520         }
9521         nextchar(pRExC_state);
9522         ret = reg_node(pRExC_state, REG_ANY);
9523         *flagp |= HASWIDTH|SIMPLE;
9524         RExC_naughty++;
9525         RExC_parse--;
9526         Set_Node_Length(ret, 1); /* MJD */
9527         return ret;
9528     }
9529
9530     /* Here, we have decided it should be a named sequence */
9531
9532     /* The test above made sure that the next real character is a '{', but
9533      * under the /x modifier, it could be separated by space (or a comment and
9534      * \n) and this is not allowed (for consistency with \x{...} and the
9535      * tokenizer handling of \N{NAME}). */
9536     if (*RExC_parse != '{') {
9537         vFAIL("Missing braces on \\N{}");
9538     }
9539
9540     RExC_parse++;       /* Skip past the '{' */
9541
9542     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9543         || ! (endbrace == RExC_parse            /* nothing between the {} */
9544               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9545                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9546     {
9547         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9548         vFAIL("\\N{NAME} must be resolved by the lexer");
9549     }
9550
9551     if (endbrace == RExC_parse) {   /* empty: \N{} */
9552         if (! valuep) {
9553             RExC_parse = endbrace + 1;
9554             return reg_node(pRExC_state,NOTHING);
9555         }
9556
9557         if (SIZE_ONLY) {
9558             ckWARNreg(RExC_parse,
9559                     "Ignoring zero length \\N{} in character class"
9560             );
9561             RExC_parse = endbrace + 1;
9562         }
9563         *valuep = 0;
9564         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
9565     }
9566
9567     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
9568     RExC_parse += 2;    /* Skip past the 'U+' */
9569
9570     if (valuep) {   /* In a bracketed char class */
9571         /* We only pay attention to the first char of
9572         multichar strings being returned. I kinda wonder
9573         if this makes sense as it does change the behaviour
9574         from earlier versions, OTOH that behaviour was broken
9575         as well. XXX Solution is to recharacterize as
9576         [rest-of-class]|multi1|multi2... */
9577
9578         STRLEN length_of_hex;
9579         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9580             | PERL_SCAN_DISALLOW_PREFIX
9581             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9582
9583         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
9584         if (endchar < endbrace) {
9585             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9586         }
9587
9588         length_of_hex = (STRLEN)(endchar - RExC_parse);
9589         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
9590
9591         /* The tokenizer should have guaranteed validity, but it's possible to
9592          * bypass it by using single quoting, so check */
9593         if (length_of_hex == 0
9594             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9595         {
9596             RExC_parse += length_of_hex;        /* Includes all the valid */
9597             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9598                             ? UTF8SKIP(RExC_parse)
9599                             : 1;
9600             /* Guard against malformed utf8 */
9601             if (RExC_parse >= endchar) RExC_parse = endchar;
9602             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9603         }
9604
9605         RExC_parse = endbrace + 1;
9606         if (endchar == endbrace) return NULL;
9607
9608         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
9609     }
9610     else {      /* Not a char class */
9611
9612         /* What is done here is to convert this to a sub-pattern of the form
9613          * (?:\x{char1}\x{char2}...)
9614          * and then call reg recursively.  That way, it retains its atomicness,
9615          * while not having to worry about special handling that some code
9616          * points may have.  toke.c has converted the original Unicode values
9617          * to native, so that we can just pass on the hex values unchanged.  We
9618          * do have to set a flag to keep recoding from happening in the
9619          * recursion */
9620
9621         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9622         STRLEN len;
9623         char *endchar;      /* Points to '.' or '}' ending cur char in the input
9624                                stream */
9625         char *orig_end = RExC_end;
9626
9627         while (RExC_parse < endbrace) {
9628
9629             /* Code points are separated by dots.  If none, there is only one
9630              * code point, and is terminated by the brace */
9631             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9632
9633             /* Convert to notation the rest of the code understands */
9634             sv_catpv(substitute_parse, "\\x{");
9635             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9636             sv_catpv(substitute_parse, "}");
9637
9638             /* Point to the beginning of the next character in the sequence. */
9639             RExC_parse = endchar + 1;
9640         }
9641         sv_catpv(substitute_parse, ")");
9642
9643         RExC_parse = SvPV(substitute_parse, len);
9644
9645         /* Don't allow empty number */
9646         if (len < 8) {
9647             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9648         }
9649         RExC_end = RExC_parse + len;
9650
9651         /* The values are Unicode, and therefore not subject to recoding */
9652         RExC_override_recoding = 1;
9653
9654         ret = reg(pRExC_state, 1, flagp, depth+1);
9655
9656         RExC_parse = endbrace;
9657         RExC_end = orig_end;
9658         RExC_override_recoding = 0;
9659
9660         nextchar(pRExC_state);
9661     }
9662
9663     return ret;
9664 }
9665
9666
9667 /*
9668  * reg_recode
9669  *
9670  * It returns the code point in utf8 for the value in *encp.
9671  *    value: a code value in the source encoding
9672  *    encp:  a pointer to an Encode object
9673  *
9674  * If the result from Encode is not a single character,
9675  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9676  */
9677 STATIC UV
9678 S_reg_recode(pTHX_ const char value, SV **encp)
9679 {
9680     STRLEN numlen = 1;
9681     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9682     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9683     const STRLEN newlen = SvCUR(sv);
9684     UV uv = UNICODE_REPLACEMENT;
9685
9686     PERL_ARGS_ASSERT_REG_RECODE;
9687
9688     if (newlen)
9689         uv = SvUTF8(sv)
9690              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9691              : *(U8*)s;
9692
9693     if (!newlen || numlen != newlen) {
9694         uv = UNICODE_REPLACEMENT;
9695         *encp = NULL;
9696     }
9697     return uv;
9698 }
9699
9700
9701 /*
9702  - regatom - the lowest level
9703
9704    Try to identify anything special at the start of the pattern. If there
9705    is, then handle it as required. This may involve generating a single regop,
9706    such as for an assertion; or it may involve recursing, such as to
9707    handle a () structure.
9708
9709    If the string doesn't start with something special then we gobble up
9710    as much literal text as we can.
9711
9712    Once we have been able to handle whatever type of thing started the
9713    sequence, we return.
9714
9715    Note: we have to be careful with escapes, as they can be both literal
9716    and special, and in the case of \10 and friends, context determines which.
9717
9718    A summary of the code structure is:
9719
9720    switch (first_byte) {
9721         cases for each special:
9722             handle this special;
9723             break;
9724         case '\\':
9725             switch (2nd byte) {
9726                 cases for each unambiguous special:
9727                     handle this special;
9728                     break;
9729                 cases for each ambigous special/literal:
9730                     disambiguate;
9731                     if (special)  handle here
9732                     else goto defchar;
9733                 default: // unambiguously literal:
9734                     goto defchar;
9735             }
9736         default:  // is a literal char
9737             // FALL THROUGH
9738         defchar:
9739             create EXACTish node for literal;
9740             while (more input and node isn't full) {
9741                 switch (input_byte) {
9742                    cases for each special;
9743                        make sure parse pointer is set so that the next call to
9744                            regatom will see this special first
9745                        goto loopdone; // EXACTish node terminated by prev. char
9746                    default:
9747                        append char to EXACTISH node;
9748                 }
9749                 get next input byte;
9750             }
9751         loopdone:
9752    }
9753    return the generated node;
9754
9755    Specifically there are two separate switches for handling
9756    escape sequences, with the one for handling literal escapes requiring
9757    a dummy entry for all of the special escapes that are actually handled
9758    by the other.
9759 */
9760
9761 STATIC regnode *
9762 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9763 {
9764     dVAR;
9765     register regnode *ret = NULL;
9766     I32 flags;
9767     char *parse_start = RExC_parse;
9768     U8 op;
9769     GET_RE_DEBUG_FLAGS_DECL;
9770     DEBUG_PARSE("atom");
9771     *flagp = WORST;             /* Tentatively. */
9772
9773     PERL_ARGS_ASSERT_REGATOM;
9774
9775 tryagain:
9776     switch ((U8)*RExC_parse) {
9777     case '^':
9778         RExC_seen_zerolen++;
9779         nextchar(pRExC_state);
9780         if (RExC_flags & RXf_PMf_MULTILINE)
9781             ret = reg_node(pRExC_state, MBOL);
9782         else if (RExC_flags & RXf_PMf_SINGLELINE)
9783             ret = reg_node(pRExC_state, SBOL);
9784         else
9785             ret = reg_node(pRExC_state, BOL);
9786         Set_Node_Length(ret, 1); /* MJD */
9787         break;
9788     case '$':
9789         nextchar(pRExC_state);
9790         if (*RExC_parse)
9791             RExC_seen_zerolen++;
9792         if (RExC_flags & RXf_PMf_MULTILINE)
9793             ret = reg_node(pRExC_state, MEOL);
9794         else if (RExC_flags & RXf_PMf_SINGLELINE)
9795             ret = reg_node(pRExC_state, SEOL);
9796         else
9797             ret = reg_node(pRExC_state, EOL);
9798         Set_Node_Length(ret, 1); /* MJD */
9799         break;
9800     case '.':
9801         nextchar(pRExC_state);
9802         if (RExC_flags & RXf_PMf_SINGLELINE)
9803             ret = reg_node(pRExC_state, SANY);
9804         else
9805             ret = reg_node(pRExC_state, REG_ANY);
9806         *flagp |= HASWIDTH|SIMPLE;
9807         RExC_naughty++;
9808         Set_Node_Length(ret, 1); /* MJD */
9809         break;
9810     case '[':
9811     {
9812         char * const oregcomp_parse = ++RExC_parse;
9813         ret = regclass(pRExC_state,depth+1);
9814         if (*RExC_parse != ']') {
9815             RExC_parse = oregcomp_parse;
9816             vFAIL("Unmatched [");
9817         }
9818         nextchar(pRExC_state);
9819         *flagp |= HASWIDTH|SIMPLE;
9820         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9821         break;
9822     }
9823     case '(':
9824         nextchar(pRExC_state);
9825         ret = reg(pRExC_state, 1, &flags,depth+1);
9826         if (ret == NULL) {
9827                 if (flags & TRYAGAIN) {
9828                     if (RExC_parse == RExC_end) {
9829                          /* Make parent create an empty node if needed. */
9830                         *flagp |= TRYAGAIN;
9831                         return(NULL);
9832                     }
9833                     goto tryagain;
9834                 }
9835                 return(NULL);
9836         }
9837         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9838         break;
9839     case '|':
9840     case ')':
9841         if (flags & TRYAGAIN) {
9842             *flagp |= TRYAGAIN;
9843             return NULL;
9844         }
9845         vFAIL("Internal urp");
9846                                 /* Supposed to be caught earlier. */
9847         break;
9848     case '?':
9849     case '+':
9850     case '*':
9851         RExC_parse++;
9852         vFAIL("Quantifier follows nothing");
9853         break;
9854     case '\\':
9855         /* Special Escapes
9856
9857            This switch handles escape sequences that resolve to some kind
9858            of special regop and not to literal text. Escape sequnces that
9859            resolve to literal text are handled below in the switch marked
9860            "Literal Escapes".
9861
9862            Every entry in this switch *must* have a corresponding entry
9863            in the literal escape switch. However, the opposite is not
9864            required, as the default for this switch is to jump to the
9865            literal text handling code.
9866         */
9867         switch ((U8)*++RExC_parse) {
9868         /* Special Escapes */
9869         case 'A':
9870             RExC_seen_zerolen++;
9871             ret = reg_node(pRExC_state, SBOL);
9872             *flagp |= SIMPLE;
9873             goto finish_meta_pat;
9874         case 'G':
9875             ret = reg_node(pRExC_state, GPOS);
9876             RExC_seen |= REG_SEEN_GPOS;
9877             *flagp |= SIMPLE;
9878             goto finish_meta_pat;
9879         case 'K':
9880             RExC_seen_zerolen++;
9881             ret = reg_node(pRExC_state, KEEPS);
9882             *flagp |= SIMPLE;
9883             /* XXX:dmq : disabling in-place substitution seems to
9884              * be necessary here to avoid cases of memory corruption, as
9885              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9886              */
9887             RExC_seen |= REG_SEEN_LOOKBEHIND;
9888             goto finish_meta_pat;
9889         case 'Z':
9890             ret = reg_node(pRExC_state, SEOL);
9891             *flagp |= SIMPLE;
9892             RExC_seen_zerolen++;                /* Do not optimize RE away */
9893             goto finish_meta_pat;
9894         case 'z':
9895             ret = reg_node(pRExC_state, EOS);
9896             *flagp |= SIMPLE;
9897             RExC_seen_zerolen++;                /* Do not optimize RE away */
9898             goto finish_meta_pat;
9899         case 'C':
9900             ret = reg_node(pRExC_state, CANY);
9901             RExC_seen |= REG_SEEN_CANY;
9902             *flagp |= HASWIDTH|SIMPLE;
9903             goto finish_meta_pat;
9904         case 'X':
9905             ret = reg_node(pRExC_state, CLUMP);
9906             *flagp |= HASWIDTH;
9907             goto finish_meta_pat;
9908         case 'w':
9909             switch (get_regex_charset(RExC_flags)) {
9910                 case REGEX_LOCALE_CHARSET:
9911                     op = ALNUML;
9912                     break;
9913                 case REGEX_UNICODE_CHARSET:
9914                     op = ALNUMU;
9915                     break;
9916                 case REGEX_ASCII_RESTRICTED_CHARSET:
9917                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9918                     op = ALNUMA;
9919                     break;
9920                 case REGEX_DEPENDS_CHARSET:
9921                     op = ALNUM;
9922                     break;
9923                 default:
9924                     goto bad_charset;
9925             }
9926             ret = reg_node(pRExC_state, op);
9927             *flagp |= HASWIDTH|SIMPLE;
9928             goto finish_meta_pat;
9929         case 'W':
9930             switch (get_regex_charset(RExC_flags)) {
9931                 case REGEX_LOCALE_CHARSET:
9932                     op = NALNUML;
9933                     break;
9934                 case REGEX_UNICODE_CHARSET:
9935                     op = NALNUMU;
9936                     break;
9937                 case REGEX_ASCII_RESTRICTED_CHARSET:
9938                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9939                     op = NALNUMA;
9940                     break;
9941                 case REGEX_DEPENDS_CHARSET:
9942                     op = NALNUM;
9943                     break;
9944                 default:
9945                     goto bad_charset;
9946             }
9947             ret = reg_node(pRExC_state, op);
9948             *flagp |= HASWIDTH|SIMPLE;
9949             goto finish_meta_pat;
9950         case 'b':
9951             RExC_seen_zerolen++;
9952             RExC_seen |= REG_SEEN_LOOKBEHIND;
9953             switch (get_regex_charset(RExC_flags)) {
9954                 case REGEX_LOCALE_CHARSET:
9955                     op = BOUNDL;
9956                     break;
9957                 case REGEX_UNICODE_CHARSET:
9958                     op = BOUNDU;
9959                     break;
9960                 case REGEX_ASCII_RESTRICTED_CHARSET:
9961                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9962                     op = BOUNDA;
9963                     break;
9964                 case REGEX_DEPENDS_CHARSET:
9965                     op = BOUND;
9966                     break;
9967                 default:
9968                     goto bad_charset;
9969             }
9970             ret = reg_node(pRExC_state, op);
9971             FLAGS(ret) = get_regex_charset(RExC_flags);
9972             *flagp |= SIMPLE;
9973             goto finish_meta_pat;
9974         case 'B':
9975             RExC_seen_zerolen++;
9976             RExC_seen |= REG_SEEN_LOOKBEHIND;
9977             switch (get_regex_charset(RExC_flags)) {
9978                 case REGEX_LOCALE_CHARSET:
9979                     op = NBOUNDL;
9980                     break;
9981                 case REGEX_UNICODE_CHARSET:
9982                     op = NBOUNDU;
9983                     break;
9984                 case REGEX_ASCII_RESTRICTED_CHARSET:
9985                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9986                     op = NBOUNDA;
9987                     break;
9988                 case REGEX_DEPENDS_CHARSET:
9989                     op = NBOUND;
9990                     break;
9991                 default:
9992                     goto bad_charset;
9993             }
9994             ret = reg_node(pRExC_state, op);
9995             FLAGS(ret) = get_regex_charset(RExC_flags);
9996             *flagp |= SIMPLE;
9997             goto finish_meta_pat;
9998         case 's':
9999             switch (get_regex_charset(RExC_flags)) {
10000                 case REGEX_LOCALE_CHARSET:
10001                     op = SPACEL;
10002                     break;
10003                 case REGEX_UNICODE_CHARSET:
10004                     op = SPACEU;
10005                     break;
10006                 case REGEX_ASCII_RESTRICTED_CHARSET:
10007                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10008                     op = SPACEA;
10009                     break;
10010                 case REGEX_DEPENDS_CHARSET:
10011                     op = SPACE;
10012                     break;
10013                 default:
10014                     goto bad_charset;
10015             }
10016             ret = reg_node(pRExC_state, op);
10017             *flagp |= HASWIDTH|SIMPLE;
10018             goto finish_meta_pat;
10019         case 'S':
10020             switch (get_regex_charset(RExC_flags)) {
10021                 case REGEX_LOCALE_CHARSET:
10022                     op = NSPACEL;
10023                     break;
10024                 case REGEX_UNICODE_CHARSET:
10025                     op = NSPACEU;
10026                     break;
10027                 case REGEX_ASCII_RESTRICTED_CHARSET:
10028                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10029                     op = NSPACEA;
10030                     break;
10031                 case REGEX_DEPENDS_CHARSET:
10032                     op = NSPACE;
10033                     break;
10034                 default:
10035                     goto bad_charset;
10036             }
10037             ret = reg_node(pRExC_state, op);
10038             *flagp |= HASWIDTH|SIMPLE;
10039             goto finish_meta_pat;
10040         case 'd':
10041             switch (get_regex_charset(RExC_flags)) {
10042                 case REGEX_LOCALE_CHARSET:
10043                     op = DIGITL;
10044                     break;
10045                 case REGEX_ASCII_RESTRICTED_CHARSET:
10046                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10047                     op = DIGITA;
10048                     break;
10049                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
10050                 case REGEX_UNICODE_CHARSET:
10051                     op = DIGIT;
10052                     break;
10053                 default:
10054                     goto bad_charset;
10055             }
10056             ret = reg_node(pRExC_state, op);
10057             *flagp |= HASWIDTH|SIMPLE;
10058             goto finish_meta_pat;
10059         case 'D':
10060             switch (get_regex_charset(RExC_flags)) {
10061                 case REGEX_LOCALE_CHARSET:
10062                     op = NDIGITL;
10063                     break;
10064                 case REGEX_ASCII_RESTRICTED_CHARSET:
10065                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10066                     op = NDIGITA;
10067                     break;
10068                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
10069                 case REGEX_UNICODE_CHARSET:
10070                     op = NDIGIT;
10071                     break;
10072                 default:
10073                     goto bad_charset;
10074             }
10075             ret = reg_node(pRExC_state, op);
10076             *flagp |= HASWIDTH|SIMPLE;
10077             goto finish_meta_pat;
10078         case 'R':
10079             ret = reg_node(pRExC_state, LNBREAK);
10080             *flagp |= HASWIDTH|SIMPLE;
10081             goto finish_meta_pat;
10082         case 'h':
10083             ret = reg_node(pRExC_state, HORIZWS);
10084             *flagp |= HASWIDTH|SIMPLE;
10085             goto finish_meta_pat;
10086         case 'H':
10087             ret = reg_node(pRExC_state, NHORIZWS);
10088             *flagp |= HASWIDTH|SIMPLE;
10089             goto finish_meta_pat;
10090         case 'v':
10091             ret = reg_node(pRExC_state, VERTWS);
10092             *flagp |= HASWIDTH|SIMPLE;
10093             goto finish_meta_pat;
10094         case 'V':
10095             ret = reg_node(pRExC_state, NVERTWS);
10096             *flagp |= HASWIDTH|SIMPLE;
10097          finish_meta_pat:
10098             nextchar(pRExC_state);
10099             Set_Node_Length(ret, 2); /* MJD */
10100             break;
10101         case 'p':
10102         case 'P':
10103             {
10104                 char* const oldregxend = RExC_end;
10105 #ifdef DEBUGGING
10106                 char* parse_start = RExC_parse - 2;
10107 #endif
10108
10109                 if (RExC_parse[1] == '{') {
10110                   /* a lovely hack--pretend we saw [\pX] instead */
10111                     RExC_end = strchr(RExC_parse, '}');
10112                     if (!RExC_end) {
10113                         const U8 c = (U8)*RExC_parse;
10114                         RExC_parse += 2;
10115                         RExC_end = oldregxend;
10116                         vFAIL2("Missing right brace on \\%c{}", c);
10117                     }
10118                     RExC_end++;
10119                 }
10120                 else {
10121                     RExC_end = RExC_parse + 2;
10122                     if (RExC_end > oldregxend)
10123                         RExC_end = oldregxend;
10124                 }
10125                 RExC_parse--;
10126
10127                 ret = regclass(pRExC_state,depth+1);
10128
10129                 RExC_end = oldregxend;
10130                 RExC_parse--;
10131
10132                 Set_Node_Offset(ret, parse_start + 2);
10133                 Set_Node_Cur_Length(ret);
10134                 nextchar(pRExC_state);
10135                 *flagp |= HASWIDTH|SIMPLE;
10136             }
10137             break;
10138         case 'N':
10139             /* Handle \N and \N{NAME} here and not below because it can be
10140             multicharacter. join_exact() will join them up later on.
10141             Also this makes sure that things like /\N{BLAH}+/ and
10142             \N{BLAH} being multi char Just Happen. dmq*/
10143             ++RExC_parse;
10144             ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
10145             break;
10146         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10147         parse_named_seq:
10148         {
10149             char ch= RExC_parse[1];
10150             if (ch != '<' && ch != '\'' && ch != '{') {
10151                 RExC_parse++;
10152                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10153             } else {
10154                 /* this pretty much dupes the code for (?P=...) in reg(), if
10155                    you change this make sure you change that */
10156                 char* name_start = (RExC_parse += 2);
10157                 U32 num = 0;
10158                 SV *sv_dat = reg_scan_name(pRExC_state,
10159                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10160                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10161                 if (RExC_parse == name_start || *RExC_parse != ch)
10162                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10163
10164                 if (!SIZE_ONLY) {
10165                     num = add_data( pRExC_state, 1, "S" );
10166                     RExC_rxi->data->data[num]=(void*)sv_dat;
10167                     SvREFCNT_inc_simple_void(sv_dat);
10168                 }
10169
10170                 RExC_sawback = 1;
10171                 ret = reganode(pRExC_state,
10172                                ((! FOLD)
10173                                  ? NREF
10174                                  : (MORE_ASCII_RESTRICTED)
10175                                    ? NREFFA
10176                                    : (AT_LEAST_UNI_SEMANTICS)
10177                                      ? NREFFU
10178                                      : (LOC)
10179                                        ? NREFFL
10180                                        : NREFF),
10181                                 num);
10182                 *flagp |= HASWIDTH;
10183
10184                 /* override incorrect value set in reganode MJD */
10185                 Set_Node_Offset(ret, parse_start+1);
10186                 Set_Node_Cur_Length(ret); /* MJD */
10187                 nextchar(pRExC_state);
10188
10189             }
10190             break;
10191         }
10192         case 'g':
10193         case '1': case '2': case '3': case '4':
10194         case '5': case '6': case '7': case '8': case '9':
10195             {
10196                 I32 num;
10197                 bool isg = *RExC_parse == 'g';
10198                 bool isrel = 0;
10199                 bool hasbrace = 0;
10200                 if (isg) {
10201                     RExC_parse++;
10202                     if (*RExC_parse == '{') {
10203                         RExC_parse++;
10204                         hasbrace = 1;
10205                     }
10206                     if (*RExC_parse == '-') {
10207                         RExC_parse++;
10208                         isrel = 1;
10209                     }
10210                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10211                         if (isrel) RExC_parse--;
10212                         RExC_parse -= 2;
10213                         goto parse_named_seq;
10214                 }   }
10215                 num = atoi(RExC_parse);
10216                 if (isg && num == 0)
10217                     vFAIL("Reference to invalid group 0");
10218                 if (isrel) {
10219                     num = RExC_npar - num;
10220                     if (num < 1)
10221                         vFAIL("Reference to nonexistent or unclosed group");
10222                 }
10223                 if (!isg && num > 9 && num >= RExC_npar)
10224                     /* Probably a character specified in octal, e.g. \35 */
10225                     goto defchar;
10226                 else {
10227                     char * const parse_start = RExC_parse - 1; /* MJD */
10228                     while (isDIGIT(*RExC_parse))
10229                         RExC_parse++;
10230                     if (parse_start == RExC_parse - 1)
10231                         vFAIL("Unterminated \\g... pattern");
10232                     if (hasbrace) {
10233                         if (*RExC_parse != '}')
10234                             vFAIL("Unterminated \\g{...} pattern");
10235                         RExC_parse++;
10236                     }
10237                     if (!SIZE_ONLY) {
10238                         if (num > (I32)RExC_rx->nparens)
10239                             vFAIL("Reference to nonexistent group");
10240                     }
10241                     RExC_sawback = 1;
10242                     ret = reganode(pRExC_state,
10243                                    ((! FOLD)
10244                                      ? REF
10245                                      : (MORE_ASCII_RESTRICTED)
10246                                        ? REFFA
10247                                        : (AT_LEAST_UNI_SEMANTICS)
10248                                          ? REFFU
10249                                          : (LOC)
10250                                            ? REFFL
10251                                            : REFF),
10252                                     num);
10253                     *flagp |= HASWIDTH;
10254
10255                     /* override incorrect value set in reganode MJD */
10256                     Set_Node_Offset(ret, parse_start+1);
10257                     Set_Node_Cur_Length(ret); /* MJD */
10258                     RExC_parse--;
10259                     nextchar(pRExC_state);
10260                 }
10261             }
10262             break;
10263         case '\0':
10264             if (RExC_parse >= RExC_end)
10265                 FAIL("Trailing \\");
10266             /* FALL THROUGH */
10267         default:
10268             /* Do not generate "unrecognized" warnings here, we fall
10269                back into the quick-grab loop below */
10270             parse_start--;
10271             goto defchar;
10272         }
10273         break;
10274
10275     case '#':
10276         if (RExC_flags & RXf_PMf_EXTENDED) {
10277             if ( reg_skipcomment( pRExC_state ) )
10278                 goto tryagain;
10279         }
10280         /* FALL THROUGH */
10281
10282     default:
10283
10284             parse_start = RExC_parse - 1;
10285
10286             RExC_parse++;
10287
10288         defchar: {
10289             register STRLEN len;
10290             register UV ender;
10291             register char *p;
10292             char *s;
10293             STRLEN foldlen;
10294             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
10295             U8 node_type;
10296
10297             /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
10298              * it is folded to 'ss' even if not utf8 */
10299             bool is_exactfu_sharp_s;
10300
10301             ender = 0;
10302             node_type = ((! FOLD) ? EXACT
10303                         : (LOC)
10304                           ? EXACTFL
10305                           : (MORE_ASCII_RESTRICTED)
10306                             ? EXACTFA
10307                             : (AT_LEAST_UNI_SEMANTICS)
10308                               ? EXACTFU
10309                               : EXACTF);
10310             ret = reg_node(pRExC_state, node_type);
10311             s = STRING(ret);
10312
10313             /* XXX The node can hold up to 255 bytes, yet this only goes to
10314              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10315              * 255 allows us to not have to worry about overflow due to
10316              * converting to utf8 and fold expansion, but that value is
10317              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10318              * split up by this limit into a single one using the real max of
10319              * 255.  Even at 127, this breaks under rare circumstances.  If
10320              * folding, we do not want to split a node at a character that is a
10321              * non-final in a multi-char fold, as an input string could just
10322              * happen to want to match across the node boundary.  The join
10323              * would solve that problem if the join actually happens.  But a
10324              * series of more than two nodes in a row each of 127 would cause
10325              * the first join to succeed to get to 254, but then there wouldn't
10326              * be room for the next one, which could at be one of those split
10327              * multi-char folds.  I don't know of any fool-proof solution.  One
10328              * could back off to end with only a code point that isn't such a
10329              * non-final, but it is possible for there not to be any in the
10330              * entire node. */
10331             for (len = 0, p = RExC_parse - 1;
10332                  len < 127 && p < RExC_end;
10333                  len++)
10334             {
10335                 char * const oldp = p;
10336
10337                 if (RExC_flags & RXf_PMf_EXTENDED)
10338                     p = regwhite( pRExC_state, p );
10339                 switch ((U8)*p) {
10340                 case '^':
10341                 case '$':
10342                 case '.':
10343                 case '[':
10344                 case '(':
10345                 case ')':
10346                 case '|':
10347                     goto loopdone;
10348                 case '\\':
10349                     /* Literal Escapes Switch
10350
10351                        This switch is meant to handle escape sequences that
10352                        resolve to a literal character.
10353
10354                        Every escape sequence that represents something
10355                        else, like an assertion or a char class, is handled
10356                        in the switch marked 'Special Escapes' above in this
10357                        routine, but also has an entry here as anything that
10358                        isn't explicitly mentioned here will be treated as
10359                        an unescaped equivalent literal.
10360                     */
10361
10362                     switch ((U8)*++p) {
10363                     /* These are all the special escapes. */
10364                     case 'A':             /* Start assertion */
10365                     case 'b': case 'B':   /* Word-boundary assertion*/
10366                     case 'C':             /* Single char !DANGEROUS! */
10367                     case 'd': case 'D':   /* digit class */
10368                     case 'g': case 'G':   /* generic-backref, pos assertion */
10369                     case 'h': case 'H':   /* HORIZWS */
10370                     case 'k': case 'K':   /* named backref, keep marker */
10371                     case 'N':             /* named char sequence */
10372                     case 'p': case 'P':   /* Unicode property */
10373                               case 'R':   /* LNBREAK */
10374                     case 's': case 'S':   /* space class */
10375                     case 'v': case 'V':   /* VERTWS */
10376                     case 'w': case 'W':   /* word class */
10377                     case 'X':             /* eXtended Unicode "combining character sequence" */
10378                     case 'z': case 'Z':   /* End of line/string assertion */
10379                         --p;
10380                         goto loopdone;
10381
10382                     /* Anything after here is an escape that resolves to a
10383                        literal. (Except digits, which may or may not)
10384                      */
10385                     case 'n':
10386                         ender = '\n';
10387                         p++;
10388                         break;
10389                     case 'r':
10390                         ender = '\r';
10391                         p++;
10392                         break;
10393                     case 't':
10394                         ender = '\t';
10395                         p++;
10396                         break;
10397                     case 'f':
10398                         ender = '\f';
10399                         p++;
10400                         break;
10401                     case 'e':
10402                           ender = ASCII_TO_NATIVE('\033');
10403                         p++;
10404                         break;
10405                     case 'a':
10406                           ender = ASCII_TO_NATIVE('\007');
10407                         p++;
10408                         break;
10409                     case 'o':
10410                         {
10411                             STRLEN brace_len = len;
10412                             UV result;
10413                             const char* error_msg;
10414
10415                             bool valid = grok_bslash_o(p,
10416                                                        &result,
10417                                                        &brace_len,
10418                                                        &error_msg,
10419                                                        1);
10420                             p += brace_len;
10421                             if (! valid) {
10422                                 RExC_parse = p; /* going to die anyway; point
10423                                                    to exact spot of failure */
10424                                 vFAIL(error_msg);
10425                             }
10426                             else
10427                             {
10428                                 ender = result;
10429                             }
10430                             if (PL_encoding && ender < 0x100) {
10431                                 goto recode_encoding;
10432                             }
10433                             if (ender > 0xff) {
10434                                 REQUIRE_UTF8;
10435                             }
10436                             break;
10437                         }
10438                     case 'x':
10439                         if (*++p == '{') {
10440                             char* const e = strchr(p, '}');
10441
10442                             if (!e) {
10443                                 RExC_parse = p + 1;
10444                                 vFAIL("Missing right brace on \\x{}");
10445                             }
10446                             else {
10447                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10448                                     | PERL_SCAN_DISALLOW_PREFIX;
10449                                 STRLEN numlen = e - p - 1;
10450                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
10451                                 if (ender > 0xff)
10452                                     REQUIRE_UTF8;
10453                                 p = e + 1;
10454                             }
10455                         }
10456                         else {
10457                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10458                             STRLEN numlen = 2;
10459                             ender = grok_hex(p, &numlen, &flags, NULL);
10460                             p += numlen;
10461                         }
10462                         if (PL_encoding && ender < 0x100)
10463                             goto recode_encoding;
10464                         break;
10465                     case 'c':
10466                         p++;
10467                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10468                         break;
10469                     case '0': case '1': case '2': case '3':case '4':
10470                     case '5': case '6': case '7':
10471                         if (*p == '0' ||
10472                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10473                         {
10474                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10475                             STRLEN numlen = 3;
10476                             ender = grok_oct(p, &numlen, &flags, NULL);
10477                             if (ender > 0xff) {
10478                                 REQUIRE_UTF8;
10479                             }
10480                             p += numlen;
10481                         }
10482                         else {
10483                             --p;
10484                             goto loopdone;
10485                         }
10486                         if (PL_encoding && ender < 0x100)
10487                             goto recode_encoding;
10488                         break;
10489                     recode_encoding:
10490                         if (! RExC_override_recoding) {
10491                             SV* enc = PL_encoding;
10492                             ender = reg_recode((const char)(U8)ender, &enc);
10493                             if (!enc && SIZE_ONLY)
10494                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10495                             REQUIRE_UTF8;
10496                         }
10497                         break;
10498                     case '\0':
10499                         if (p >= RExC_end)
10500                             FAIL("Trailing \\");
10501                         /* FALL THROUGH */
10502                     default:
10503                         if (!SIZE_ONLY&& isALNUMC(*p)) {
10504                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10505                         }
10506                         goto normal_default;
10507                     }
10508                     break;
10509                 case '{':
10510                     /* Currently we don't warn when the lbrace is at the start
10511                      * of a construct.  This catches it in the middle of a
10512                      * literal string, or when its the first thing after
10513                      * something like "\b" */
10514                     if (! SIZE_ONLY
10515                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10516                     {
10517                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10518                     }
10519                     /*FALLTHROUGH*/
10520                 default:
10521                   normal_default:
10522                     if (UTF8_IS_START(*p) && UTF) {
10523                         STRLEN numlen;
10524                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10525                                                &numlen, UTF8_ALLOW_DEFAULT);
10526                         p += numlen;
10527                     }
10528                     else
10529                         ender = (U8) *p++;
10530                     break;
10531                 } /* End of switch on the literal */
10532
10533                 is_exactfu_sharp_s = (node_type == EXACTFU
10534                                       && ender == LATIN_SMALL_LETTER_SHARP_S);
10535                 if ( RExC_flags & RXf_PMf_EXTENDED)
10536                     p = regwhite( pRExC_state, p );
10537                 if ((UTF && FOLD) || is_exactfu_sharp_s) {
10538                     /* Prime the casefolded buffer.  Locale rules, which apply
10539                      * only to code points < 256, aren't known until execution,
10540                      * so for them, just output the original character using
10541                      * utf8.  If we start to fold non-UTF patterns, be sure to
10542                      * update join_exact() */
10543                     if (LOC && ender < 256) {
10544                         if (UNI_IS_INVARIANT(ender)) {
10545                             *tmpbuf = (U8) ender;
10546                             foldlen = 1;
10547                         } else {
10548                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
10549                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
10550                             foldlen = 2;
10551                         }
10552                     }
10553                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
10554                                                  */
10555                         ender = toLOWER(ender);
10556                         *tmpbuf = (U8) ender;
10557                         foldlen = 1;
10558                     }
10559                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
10560
10561                         /* Locale and /aa require more selectivity about the
10562                          * fold, so are handled below.  Otherwise, here, just
10563                          * use the fold */
10564                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
10565                     }
10566                     else {
10567                         /* Under locale rules or /aa we are not to mix,
10568                          * respectively, ords < 256 or ASCII with non-.  So
10569                          * reject folds that mix them, using only the
10570                          * non-folded code point.  So do the fold to a
10571                          * temporary, and inspect each character in it. */
10572                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
10573                         U8* s = trialbuf;
10574                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
10575                         U8* e = s + foldlen;
10576                         bool fold_ok = TRUE;
10577
10578                         while (s < e) {
10579                             if (isASCII(*s)
10580                                 || (LOC && (UTF8_IS_INVARIANT(*s)
10581                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
10582                             {
10583                                 fold_ok = FALSE;
10584                                 break;
10585                             }
10586                             s += UTF8SKIP(s);
10587                         }
10588                         if (fold_ok) {
10589                             Copy(trialbuf, tmpbuf, foldlen, U8);
10590                             ender = tmpender;
10591                         }
10592                         else {
10593                             uvuni_to_utf8(tmpbuf, ender);
10594                             foldlen = UNISKIP(ender);
10595                         }
10596                     }
10597                 }
10598                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
10599                     if (len)
10600                         p = oldp;
10601                     else if (UTF || is_exactfu_sharp_s) {
10602                          if (FOLD) {
10603                               /* Emit all the Unicode characters. */
10604                               STRLEN numlen;
10605                               for (foldbuf = tmpbuf;
10606                                    foldlen;
10607                                    foldlen -= numlen) {
10608
10609                                    /* tmpbuf has been constructed by us, so we
10610                                     * know it is valid utf8 */
10611                                    ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10612                                    if (numlen > 0) {
10613                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
10614                                         s       += unilen;
10615                                         len     += unilen;
10616                                         /* In EBCDIC the numlen
10617                                          * and unilen can differ. */
10618                                         foldbuf += numlen;
10619                                         if (numlen >= foldlen)
10620                                              break;
10621                                    }
10622                                    else
10623                                         break; /* "Can't happen." */
10624                               }
10625                          }
10626                          else {
10627                               const STRLEN unilen = reguni(pRExC_state, ender, s);
10628                               if (unilen > 0) {
10629                                    s   += unilen;
10630                                    len += unilen;
10631                               }
10632                          }
10633                     }
10634                     else {
10635                         len++;
10636                         REGC((char)ender, s++);
10637                     }
10638                     break;
10639                 }
10640                 if (UTF || is_exactfu_sharp_s) {
10641                      if (FOLD) {
10642                           /* Emit all the Unicode characters. */
10643                           STRLEN numlen;
10644                           for (foldbuf = tmpbuf;
10645                                foldlen;
10646                                foldlen -= numlen) {
10647                                ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10648                                if (numlen > 0) {
10649                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10650                                     len     += unilen;
10651                                     s       += unilen;
10652                                     /* In EBCDIC the numlen
10653                                      * and unilen can differ. */
10654                                     foldbuf += numlen;
10655                                     if (numlen >= foldlen)
10656                                          break;
10657                                }
10658                                else
10659                                     break;
10660                           }
10661                      }
10662                      else {
10663                           const STRLEN unilen = reguni(pRExC_state, ender, s);
10664                           if (unilen > 0) {
10665                                s   += unilen;
10666                                len += unilen;
10667                           }
10668                      }
10669                      len--;
10670                 }
10671                 else {
10672                     REGC((char)ender, s++);
10673                 }
10674             }
10675         loopdone:   /* Jumped to when encounters something that shouldn't be in
10676                        the node */
10677             RExC_parse = p - 1;
10678             Set_Node_Cur_Length(ret); /* MJD */
10679             nextchar(pRExC_state);
10680             {
10681                 /* len is STRLEN which is unsigned, need to copy to signed */
10682                 IV iv = len;
10683                 if (iv < 0)
10684                     vFAIL("Internal disaster");
10685             }
10686             if (len > 0)
10687                 *flagp |= HASWIDTH;
10688             if (len == 1 && UNI_IS_INVARIANT(ender))
10689                 *flagp |= SIMPLE;
10690
10691             if (SIZE_ONLY)
10692                 RExC_size += STR_SZ(len);
10693             else {
10694                 STR_LEN(ret) = len;
10695                 RExC_emit += STR_SZ(len);
10696             }
10697         }
10698         break;
10699     }
10700
10701     return(ret);
10702
10703 /* Jumped to when an unrecognized character set is encountered */
10704 bad_charset:
10705     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
10706     return(NULL);
10707 }
10708
10709 STATIC char *
10710 S_regwhite( RExC_state_t *pRExC_state, char *p )
10711 {
10712     const char *e = RExC_end;
10713
10714     PERL_ARGS_ASSERT_REGWHITE;
10715
10716     while (p < e) {
10717         if (isSPACE(*p))
10718             ++p;
10719         else if (*p == '#') {
10720             bool ended = 0;
10721             do {
10722                 if (*p++ == '\n') {
10723                     ended = 1;
10724                     break;
10725                 }
10726             } while (p < e);
10727             if (!ended)
10728                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10729         }
10730         else
10731             break;
10732     }
10733     return p;
10734 }
10735
10736 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
10737    Character classes ([:foo:]) can also be negated ([:^foo:]).
10738    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
10739    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
10740    but trigger failures because they are currently unimplemented. */
10741
10742 #define POSIXCC_DONE(c)   ((c) == ':')
10743 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
10744 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
10745
10746 STATIC I32
10747 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
10748 {
10749     dVAR;
10750     I32 namedclass = OOB_NAMEDCLASS;
10751
10752     PERL_ARGS_ASSERT_REGPPOSIXCC;
10753
10754     if (value == '[' && RExC_parse + 1 < RExC_end &&
10755         /* I smell either [: or [= or [. -- POSIX has been here, right? */
10756         POSIXCC(UCHARAT(RExC_parse))) {
10757         const char c = UCHARAT(RExC_parse);
10758         char* const s = RExC_parse++;
10759
10760         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
10761             RExC_parse++;
10762         if (RExC_parse == RExC_end)
10763             /* Grandfather lone [:, [=, [. */
10764             RExC_parse = s;
10765         else {
10766             const char* const t = RExC_parse++; /* skip over the c */
10767             assert(*t == c);
10768
10769             if (UCHARAT(RExC_parse) == ']') {
10770                 const char *posixcc = s + 1;
10771                 RExC_parse++; /* skip over the ending ] */
10772
10773                 if (*s == ':') {
10774                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10775                     const I32 skip = t - posixcc;
10776
10777                     /* Initially switch on the length of the name.  */
10778                     switch (skip) {
10779                     case 4:
10780                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10781                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
10782                         break;
10783                     case 5:
10784                         /* Names all of length 5.  */
10785                         /* alnum alpha ascii blank cntrl digit graph lower
10786                            print punct space upper  */
10787                         /* Offset 4 gives the best switch position.  */
10788                         switch (posixcc[4]) {
10789                         case 'a':
10790                             if (memEQ(posixcc, "alph", 4)) /* alpha */
10791                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10792                             break;
10793                         case 'e':
10794                             if (memEQ(posixcc, "spac", 4)) /* space */
10795                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10796                             break;
10797                         case 'h':
10798                             if (memEQ(posixcc, "grap", 4)) /* graph */
10799                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10800                             break;
10801                         case 'i':
10802                             if (memEQ(posixcc, "asci", 4)) /* ascii */
10803                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10804                             break;
10805                         case 'k':
10806                             if (memEQ(posixcc, "blan", 4)) /* blank */
10807                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10808                             break;
10809                         case 'l':
10810                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10811                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10812                             break;
10813                         case 'm':
10814                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
10815                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10816                             break;
10817                         case 'r':
10818                             if (memEQ(posixcc, "lowe", 4)) /* lower */
10819                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10820                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
10821                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10822                             break;
10823                         case 't':
10824                             if (memEQ(posixcc, "digi", 4)) /* digit */
10825                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10826                             else if (memEQ(posixcc, "prin", 4)) /* print */
10827                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10828                             else if (memEQ(posixcc, "punc", 4)) /* punct */
10829                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10830                             break;
10831                         }
10832                         break;
10833                     case 6:
10834                         if (memEQ(posixcc, "xdigit", 6))
10835                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10836                         break;
10837                     }
10838
10839                     if (namedclass == OOB_NAMEDCLASS)
10840                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10841                                       t - s - 1, s + 1);
10842                     assert (posixcc[skip] == ':');
10843                     assert (posixcc[skip+1] == ']');
10844                 } else if (!SIZE_ONLY) {
10845                     /* [[=foo=]] and [[.foo.]] are still future. */
10846
10847                     /* adjust RExC_parse so the warning shows after
10848                        the class closes */
10849                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10850                         RExC_parse++;
10851                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10852                 }
10853             } else {
10854                 /* Maternal grandfather:
10855                  * "[:" ending in ":" but not in ":]" */
10856                 RExC_parse = s;
10857             }
10858         }
10859     }
10860
10861     return namedclass;
10862 }
10863
10864 STATIC void
10865 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10866 {
10867     dVAR;
10868
10869     PERL_ARGS_ASSERT_CHECKPOSIXCC;
10870
10871     if (POSIXCC(UCHARAT(RExC_parse))) {
10872         const char *s = RExC_parse;
10873         const char  c = *s++;
10874
10875         while (isALNUM(*s))
10876             s++;
10877         if (*s && c == *s && s[1] == ']') {
10878             ckWARN3reg(s+2,
10879                        "POSIX syntax [%c %c] belongs inside character classes",
10880                        c, c);
10881
10882             /* [[=foo=]] and [[.foo.]] are still future. */
10883             if (POSIXCC_NOTYET(c)) {
10884                 /* adjust RExC_parse so the error shows after
10885                    the class closes */
10886                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10887                     NOOP;
10888                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10889             }
10890         }
10891     }
10892 }
10893
10894 /* Generate the code to add a full posix character <class> to the bracketed
10895  * character class given by <node>.  (<node> is needed only under locale rules)
10896  * destlist     is the inversion list for non-locale rules that this class is
10897  *              to be added to
10898  * sourcelist   is the ASCII-range inversion list to add under /a rules
10899  * Xsourcelist  is the full Unicode range list to use otherwise. */
10900 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
10901     if (LOC) {                                                             \
10902         SV* scratch_list = NULL;                                           \
10903                                                                            \
10904         /* Set this class in the node for runtime matching */              \
10905         ANYOF_CLASS_SET(node, class);                                      \
10906                                                                            \
10907         /* For above Latin1 code points, we use the full Unicode range */  \
10908         _invlist_intersection(PL_AboveLatin1,                              \
10909                               Xsourcelist,                                 \
10910                               &scratch_list);                              \
10911         /* And set the output to it, adding instead if there already is an \
10912          * output.  Checking if <destlist> is NULL first saves an extra    \
10913          * clone.  Its reference count will be decremented at the next     \
10914          * union, etc, or if this is the only instance, at the end of the  \
10915          * routine */                                                      \
10916         if (! destlist) {                                                  \
10917             destlist = scratch_list;                                       \
10918         }                                                                  \
10919         else {                                                             \
10920             _invlist_union(destlist, scratch_list, &destlist);             \
10921             SvREFCNT_dec(scratch_list);                                    \
10922         }                                                                  \
10923     }                                                                      \
10924     else {                                                                 \
10925         /* For non-locale, just add it to any existing list */             \
10926         _invlist_union(destlist,                                           \
10927                        (AT_LEAST_ASCII_RESTRICTED)                         \
10928                            ? sourcelist                                    \
10929                            : Xsourcelist,                                  \
10930                        &destlist);                                         \
10931     }
10932
10933 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10934  */
10935 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
10936     if (LOC) {                                                             \
10937         SV* scratch_list = NULL;                                           \
10938         ANYOF_CLASS_SET(node, class);                                      \
10939         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
10940         if (! destlist) {                                                  \
10941             destlist = scratch_list;                                       \
10942         }                                                                  \
10943         else {                                                             \
10944             _invlist_union(destlist, scratch_list, &destlist);             \
10945             SvREFCNT_dec(scratch_list);                                    \
10946         }                                                                  \
10947     }                                                                      \
10948     else {                                                                 \
10949         _invlist_union_complement_2nd(destlist,                            \
10950                                     (AT_LEAST_ASCII_RESTRICTED)            \
10951                                         ? sourcelist                       \
10952                                         : Xsourcelist,                     \
10953                                     &destlist);                            \
10954         /* Under /d, everything in the upper half of the Latin1 range      \
10955          * matches this complement */                                      \
10956         if (DEPENDS_SEMANTICS) {                                           \
10957             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
10958         }                                                                  \
10959     }
10960
10961 /* Generate the code to add a posix character <class> to the bracketed
10962  * character class given by <node>.  (<node> is needed only under locale rules)
10963  * destlist       is the inversion list for non-locale rules that this class is
10964  *                to be added to
10965  * sourcelist     is the ASCII-range inversion list to add under /a rules
10966  * l1_sourcelist  is the Latin1 range list to use otherwise.
10967  * Xpropertyname  is the name to add to <run_time_list> of the property to
10968  *                specify the code points above Latin1 that will have to be
10969  *                determined at run-time
10970  * run_time_list  is a SV* that contains text names of properties that are to
10971  *                be computed at run time.  This concatenates <Xpropertyname>
10972  *                to it, apppropriately
10973  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10974  * time */
10975 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
10976                               l1_sourcelist, Xpropertyname, run_time_list) \
10977         /* First, resolve whether to use the ASCII-only list or the L1     \
10978          * list */                                                         \
10979         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
10980                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10981                 Xpropertyname, run_time_list)
10982
10983 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10984                 Xpropertyname, run_time_list)                              \
10985     /* If not /a matching, there are going to be code points we will have  \
10986      * to defer to runtime to look-up */                                   \
10987     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
10988         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10989     }                                                                      \
10990     if (LOC) {                                                             \
10991         ANYOF_CLASS_SET(node, class);                                      \
10992     }                                                                      \
10993     else {                                                                 \
10994         _invlist_union(destlist, sourcelist, &destlist);                   \
10995     }
10996
10997 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
10998  * this and DO_N_POSIX */
10999 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
11000                               l1_sourcelist, Xpropertyname, run_time_list) \
11001     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
11002         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
11003     }                                                                      \
11004     else {                                                                 \
11005         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11006         if (LOC) {                                                         \
11007             ANYOF_CLASS_SET(node, namedclass);                             \
11008         }                                                                  \
11009         else {                                                             \
11010             SV* scratch_list = NULL;                                       \
11011             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
11012             if (! destlist) {                                              \
11013                 destlist = scratch_list;                                   \
11014             }                                                              \
11015             else {                                                         \
11016                 _invlist_union(destlist, scratch_list, &destlist);         \
11017                 SvREFCNT_dec(scratch_list);                                \
11018             }                                                              \
11019             if (DEPENDS_SEMANTICS) {                                       \
11020                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
11021             }                                                              \
11022         }                                                                  \
11023     }
11024
11025 STATIC U8
11026 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
11027 {
11028
11029     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
11030      * Locale folding is done at run-time, so this function should not be
11031      * called for nodes that are for locales.
11032      *
11033      * This function sets the bit corresponding to the fold of the input
11034      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
11035      * 'F' is 'f'.
11036      *
11037      * It also knows about the characters that are in the bitmap that have
11038      * folds that are matchable only outside it, and sets the appropriate lists
11039      * and flags.
11040      *
11041      * It returns the number of bits that actually changed from 0 to 1 */
11042
11043     U8 stored = 0;
11044     U8 fold;
11045
11046     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
11047
11048     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
11049                                     : PL_fold[value];
11050
11051     /* It assumes the bit for 'value' has already been set */
11052     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
11053         ANYOF_BITMAP_SET(node, fold);
11054         stored++;
11055     }
11056     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
11057         /* Certain Latin1 characters have matches outside the bitmap.  To get
11058          * here, 'value' is one of those characters.   None of these matches is
11059          * valid for ASCII characters under /aa, which have been excluded by
11060          * the 'if' above.  The matches fall into three categories:
11061          * 1) They are singly folded-to or -from an above 255 character, as
11062          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
11063          *    WITH DIAERESIS;
11064          * 2) They are part of a multi-char fold with another character in the
11065          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
11066          * 3) They are part of a multi-char fold with a character not in the
11067          *    bitmap, such as various ligatures.
11068          * We aren't dealing fully with multi-char folds, except we do deal
11069          * with the pattern containing a character that has a multi-char fold
11070          * (not so much the inverse).
11071          * For types 1) and 3), the matches only happen when the target string
11072          * is utf8; that's not true for 2), and we set a flag for it.
11073          *
11074          * The code below adds to the passed in inversion list the single fold
11075          * closures for 'value'.  The values are hard-coded here so that an
11076          * innocent-looking character class, like /[ks]/i won't have to go out
11077          * to disk to find the possible matches.  XXX It would be better to
11078          * generate these via regen, in case a new version of the Unicode
11079          * standard adds new mappings, though that is not really likely. */
11080         switch (value) {
11081             case 'k':
11082             case 'K':
11083                 /* KELVIN SIGN */
11084                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
11085                 break;
11086             case 's':
11087             case 'S':
11088                 /* LATIN SMALL LETTER LONG S */
11089                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
11090                 break;
11091             case MICRO_SIGN:
11092                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
11093                                                  GREEK_SMALL_LETTER_MU);
11094                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
11095                                                  GREEK_CAPITAL_LETTER_MU);
11096                 break;
11097             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
11098             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
11099                 /* ANGSTROM SIGN */
11100                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
11101                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
11102                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
11103                                                      PL_fold_latin1[value]);
11104                 }
11105                 break;
11106             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
11107                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
11108                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
11109                 break;
11110             case LATIN_SMALL_LETTER_SHARP_S:
11111                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
11112                                         LATIN_CAPITAL_LETTER_SHARP_S);
11113
11114                 /* Under /a, /d, and /u, this can match the two chars "ss" */
11115                 if (! MORE_ASCII_RESTRICTED) {
11116                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
11117
11118                     /* And under /u or /a, it can match even if the target is
11119                      * not utf8 */
11120                     if (AT_LEAST_UNI_SEMANTICS) {
11121                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
11122                     }
11123                 }
11124                 break;
11125             case 'F': case 'f':
11126             case 'I': case 'i':
11127             case 'L': case 'l':
11128             case 'T': case 't':
11129             case 'A': case 'a':
11130             case 'H': case 'h':
11131             case 'J': case 'j':
11132             case 'N': case 'n':
11133             case 'W': case 'w':
11134             case 'Y': case 'y':
11135                 /* These all are targets of multi-character folds from code
11136                  * points that require UTF8 to express, so they can't match
11137                  * unless the target string is in UTF-8, so no action here is
11138                  * necessary, as regexec.c properly handles the general case
11139                  * for UTF-8 matching */
11140                 break;
11141             default:
11142                 /* Use deprecated warning to increase the chances of this
11143                  * being output */
11144                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
11145                 break;
11146         }
11147     }
11148     else if (DEPENDS_SEMANTICS
11149             && ! isASCII(value)
11150             && PL_fold_latin1[value] != value)
11151     {
11152            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
11153             * folds only when the target string is in UTF-8.  We add the fold
11154             * here to the list of things to match outside the bitmap, which
11155             * won't be looked at unless it is UTF8 (or else if something else
11156             * says to look even if not utf8, but those things better not happen
11157             * under DEPENDS semantics. */
11158         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
11159     }
11160
11161     return stored;
11162 }
11163
11164
11165 PERL_STATIC_INLINE U8
11166 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
11167 {
11168     /* This inline function sets a bit in the bitmap if not already set, and if
11169      * appropriate, its fold, returning the number of bits that actually
11170      * changed from 0 to 1 */
11171
11172     U8 stored;
11173
11174     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
11175
11176     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
11177         return 0;
11178     }
11179
11180     ANYOF_BITMAP_SET(node, value);
11181     stored = 1;
11182
11183     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
11184         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
11185     }
11186
11187     return stored;
11188 }
11189
11190 STATIC void
11191 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
11192 {
11193     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
11194      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
11195      * the multi-character folds of characters in the node */
11196     SV *sv;
11197
11198     PERL_ARGS_ASSERT_ADD_ALTERNATE;
11199
11200     if (! *alternate_ptr) {
11201         *alternate_ptr = newAV();
11202     }
11203     sv = newSVpvn_utf8((char*)string, len, TRUE);
11204     av_push(*alternate_ptr, sv);
11205     return;
11206 }
11207
11208 /*
11209    parse a class specification and produce either an ANYOF node that
11210    matches the pattern or perhaps will be optimized into an EXACTish node
11211    instead. The node contains a bit map for the first 256 characters, with the
11212    corresponding bit set if that character is in the list.  For characters
11213    above 255, a range list is used */
11214
11215 STATIC regnode *
11216 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
11217 {
11218     dVAR;
11219     register UV nextvalue;
11220     register IV prevvalue = OOB_UNICODE;
11221     register IV range = 0;
11222     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
11223     register regnode *ret;
11224     STRLEN numlen;
11225     IV namedclass;
11226     char *rangebegin = NULL;
11227     bool need_class = 0;
11228     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
11229     SV *listsv = NULL;
11230     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11231                                       than just initialized.  */
11232     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11233     UV element_count = 0;   /* Number of distinct elements in the class.
11234                                Optimizations may be possible if this is tiny */
11235     UV n;
11236
11237     /* Unicode properties are stored in a swash; this holds the current one
11238      * being parsed.  If this swash is the only above-latin1 component of the
11239      * character class, an optimization is to pass it directly on to the
11240      * execution engine.  Otherwise, it is set to NULL to indicate that there
11241      * are other things in the class that have to be dealt with at execution
11242      * time */
11243     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11244
11245     /* Set if a component of this character class is user-defined; just passed
11246      * on to the engine */
11247     UV has_user_defined_property = 0;
11248
11249     /* code points this node matches that can't be stored in the bitmap */
11250     SV* nonbitmap = NULL;
11251
11252     /* The items that are to match that aren't stored in the bitmap, but are a
11253      * result of things that are stored there.  This is the fold closure of
11254      * such a character, either because it has DEPENDS semantics and shouldn't
11255      * be matched unless the target string is utf8, or is a code point that is
11256      * too large for the bit map, as for example, the fold of the MICRO SIGN is
11257      * above 255.  This all is solely for performance reasons.  By having this
11258      * code know the outside-the-bitmap folds that the bitmapped characters are
11259      * involved with, we don't have to go out to disk to find the list of
11260      * matches, unless the character class includes code points that aren't
11261      * storable in the bit map.  That means that a character class with an 's'
11262      * in it, for example, doesn't need to go out to disk to find everything
11263      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
11264      * empty unless there is something whose fold we don't know about, and will
11265      * have to go out to the disk to find. */
11266     SV* l1_fold_invlist = NULL;
11267
11268     /* List of multi-character folds that are matched by this node */
11269     AV* unicode_alternate  = NULL;
11270 #ifdef EBCDIC
11271     UV literal_endpoint = 0;
11272 #endif
11273     UV stored = 0;  /* how many chars stored in the bitmap */
11274
11275     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11276         case we need to change the emitted regop to an EXACT. */
11277     const char * orig_parse = RExC_parse;
11278     GET_RE_DEBUG_FLAGS_DECL;
11279
11280     PERL_ARGS_ASSERT_REGCLASS;
11281 #ifndef DEBUGGING
11282     PERL_UNUSED_ARG(depth);
11283 #endif
11284
11285     DEBUG_PARSE("clas");
11286
11287     /* Assume we are going to generate an ANYOF node. */
11288     ret = reganode(pRExC_state, ANYOF, 0);
11289
11290
11291     if (!SIZE_ONLY) {
11292         ANYOF_FLAGS(ret) = 0;
11293     }
11294
11295     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11296         RExC_naughty++;
11297         RExC_parse++;
11298         if (!SIZE_ONLY)
11299             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
11300
11301         /* We have decided to not allow multi-char folds in inverted character
11302          * classes, due to the confusion that can happen, especially with
11303          * classes that are designed for a non-Unicode world:  You have the
11304          * peculiar case that:
11305             "s s" =~ /^[^\xDF]+$/i => Y
11306             "ss"  =~ /^[^\xDF]+$/i => N
11307          *
11308          * See [perl #89750] */
11309         allow_full_fold = FALSE;
11310     }
11311
11312     if (SIZE_ONLY) {
11313         RExC_size += ANYOF_SKIP;
11314         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11315     }
11316     else {
11317         RExC_emit += ANYOF_SKIP;
11318         if (LOC) {
11319             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11320         }
11321         ANYOF_BITMAP_ZERO(ret);
11322         listsv = newSVpvs("# comment\n");
11323         initial_listsv_len = SvCUR(listsv);
11324     }
11325
11326     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11327
11328     if (!SIZE_ONLY && POSIXCC(nextvalue))
11329         checkposixcc(pRExC_state);
11330
11331     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11332     if (UCHARAT(RExC_parse) == ']')
11333         goto charclassloop;
11334
11335 parseit:
11336     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11337
11338     charclassloop:
11339
11340         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11341
11342         if (!range) {
11343             rangebegin = RExC_parse;
11344             element_count++;
11345         }
11346         if (UTF) {
11347             value = utf8n_to_uvchr((U8*)RExC_parse,
11348                                    RExC_end - RExC_parse,
11349                                    &numlen, UTF8_ALLOW_DEFAULT);
11350             RExC_parse += numlen;
11351         }
11352         else
11353             value = UCHARAT(RExC_parse++);
11354
11355         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11356         if (value == '[' && POSIXCC(nextvalue))
11357             namedclass = regpposixcc(pRExC_state, value);
11358         else if (value == '\\') {
11359             if (UTF) {
11360                 value = utf8n_to_uvchr((U8*)RExC_parse,
11361                                    RExC_end - RExC_parse,
11362                                    &numlen, UTF8_ALLOW_DEFAULT);
11363                 RExC_parse += numlen;
11364             }
11365             else
11366                 value = UCHARAT(RExC_parse++);
11367             /* Some compilers cannot handle switching on 64-bit integer
11368              * values, therefore value cannot be an UV.  Yes, this will
11369              * be a problem later if we want switch on Unicode.
11370              * A similar issue a little bit later when switching on
11371              * namedclass. --jhi */
11372             switch ((I32)value) {
11373             case 'w':   namedclass = ANYOF_ALNUM;       break;
11374             case 'W':   namedclass = ANYOF_NALNUM;      break;
11375             case 's':   namedclass = ANYOF_SPACE;       break;
11376             case 'S':   namedclass = ANYOF_NSPACE;      break;
11377             case 'd':   namedclass = ANYOF_DIGIT;       break;
11378             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11379             case 'v':   namedclass = ANYOF_VERTWS;      break;
11380             case 'V':   namedclass = ANYOF_NVERTWS;     break;
11381             case 'h':   namedclass = ANYOF_HORIZWS;     break;
11382             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
11383             case 'N':  /* Handle \N{NAME} in class */
11384                 {
11385                     /* We only pay attention to the first char of
11386                     multichar strings being returned. I kinda wonder
11387                     if this makes sense as it does change the behaviour
11388                     from earlier versions, OTOH that behaviour was broken
11389                     as well. */
11390                     UV v; /* value is register so we cant & it /grrr */
11391                     if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
11392                         goto parseit;
11393                     }
11394                     value= v;
11395                 }
11396                 break;
11397             case 'p':
11398             case 'P':
11399                 {
11400                 char *e;
11401                 if (RExC_parse >= RExC_end)
11402                     vFAIL2("Empty \\%c{}", (U8)value);
11403                 if (*RExC_parse == '{') {
11404                     const U8 c = (U8)value;
11405                     e = strchr(RExC_parse++, '}');
11406                     if (!e)
11407                         vFAIL2("Missing right brace on \\%c{}", c);
11408                     while (isSPACE(UCHARAT(RExC_parse)))
11409                         RExC_parse++;
11410                     if (e == RExC_parse)
11411                         vFAIL2("Empty \\%c{}", c);
11412                     n = e - RExC_parse;
11413                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11414                         n--;
11415                 }
11416                 else {
11417                     e = RExC_parse;
11418                     n = 1;
11419                 }
11420                 if (!SIZE_ONLY) {
11421                     SV** invlistsvp;
11422                     SV* invlist;
11423                     char* name;
11424                     if (UCHARAT(RExC_parse) == '^') {
11425                          RExC_parse++;
11426                          n--;
11427                          value = value == 'p' ? 'P' : 'p'; /* toggle */
11428                          while (isSPACE(UCHARAT(RExC_parse))) {
11429                               RExC_parse++;
11430                               n--;
11431                          }
11432                     }
11433                     /* Try to get the definition of the property into
11434                      * <invlist>.  If /i is in effect, the effective property
11435                      * will have its name be <__NAME_i>.  The design is
11436                      * discussed in commit
11437                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11438                     Newx(name, n + sizeof("_i__\n"), char);
11439
11440                     sprintf(name, "%s%.*s%s\n",
11441                                     (FOLD) ? "__" : "",
11442                                     (int)n,
11443                                     RExC_parse,
11444                                     (FOLD) ? "_i" : ""
11445                     );
11446
11447                     /* Look up the property name, and get its swash and
11448                      * inversion list, if the property is found  */
11449                     if (swash) {
11450                         SvREFCNT_dec(swash);
11451                     }
11452                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
11453                                              1, /* binary */
11454                                              0, /* not tr/// */
11455                                              TRUE, /* this routine will handle
11456                                                       undefined properties */
11457                                              NULL, FALSE /* No inversion list */
11458                                             );
11459                     if (   ! swash
11460                         || ! SvROK(swash)
11461                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11462                         || ! (invlistsvp =
11463                                 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11464                                 "INVLIST", FALSE))
11465                         || ! (invlist = *invlistsvp))
11466                     {
11467                         if (swash) {
11468                             SvREFCNT_dec(swash);
11469                             swash = NULL;
11470                         }
11471
11472                         /* Here didn't find it.  It could be a user-defined
11473                          * property that will be available at run-time.  Add it
11474                          * to the list to look up then */
11475                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11476                                         (value == 'p' ? '+' : '!'),
11477                                         name);
11478                         has_user_defined_property = 1;
11479
11480                         /* We don't know yet, so have to assume that the
11481                          * property could match something in the Latin1 range,
11482                          * hence something that isn't utf8 */
11483                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11484                     }
11485                     else {
11486
11487                         /* Here, did get the swash and its inversion list.  If
11488                          * the swash is from a user-defined property, then this
11489                          * whole character class should be regarded as such */
11490                         SV** user_defined_svp =
11491                                             hv_fetchs(MUTABLE_HV(SvRV(swash)),
11492                                                         "USER_DEFINED", FALSE);
11493                         if (user_defined_svp) {
11494                             has_user_defined_property
11495                                                     |= SvUV(*user_defined_svp);
11496                         }
11497
11498                         /* Invert if asking for the complement */
11499                         if (value == 'P') {
11500                             _invlist_union_complement_2nd(properties, invlist, &properties);
11501
11502                             /* The swash can't be used as-is, because we've
11503                              * inverted things; delay removing it to here after
11504                              * have copied its invlist above */
11505                             SvREFCNT_dec(swash);
11506                             swash = NULL;
11507                         }
11508                         else {
11509                             _invlist_union(properties, invlist, &properties);
11510                         }
11511                     }
11512                     Safefree(name);
11513                 }
11514                 RExC_parse = e + 1;
11515                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
11516
11517                 /* \p means they want Unicode semantics */
11518                 RExC_uni_semantics = 1;
11519                 }
11520                 break;
11521             case 'n':   value = '\n';                   break;
11522             case 'r':   value = '\r';                   break;
11523             case 't':   value = '\t';                   break;
11524             case 'f':   value = '\f';                   break;
11525             case 'b':   value = '\b';                   break;
11526             case 'e':   value = ASCII_TO_NATIVE('\033');break;
11527             case 'a':   value = ASCII_TO_NATIVE('\007');break;
11528             case 'o':
11529                 RExC_parse--;   /* function expects to be pointed at the 'o' */
11530                 {
11531                     const char* error_msg;
11532                     bool valid = grok_bslash_o(RExC_parse,
11533                                                &value,
11534                                                &numlen,
11535                                                &error_msg,
11536                                                SIZE_ONLY);
11537                     RExC_parse += numlen;
11538                     if (! valid) {
11539                         vFAIL(error_msg);
11540                     }
11541                 }
11542                 if (PL_encoding && value < 0x100) {
11543                     goto recode_encoding;
11544                 }
11545                 break;
11546             case 'x':
11547                 if (*RExC_parse == '{') {
11548                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
11549                         | PERL_SCAN_DISALLOW_PREFIX;
11550                     char * const e = strchr(RExC_parse++, '}');
11551                     if (!e)
11552                         vFAIL("Missing right brace on \\x{}");
11553
11554                     numlen = e - RExC_parse;
11555                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
11556                     RExC_parse = e + 1;
11557                 }
11558                 else {
11559                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
11560                     numlen = 2;
11561                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
11562                     RExC_parse += numlen;
11563                 }
11564                 if (PL_encoding && value < 0x100)
11565                     goto recode_encoding;
11566                 break;
11567             case 'c':
11568                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11569                 break;
11570             case '0': case '1': case '2': case '3': case '4':
11571             case '5': case '6': case '7':
11572                 {
11573                     /* Take 1-3 octal digits */
11574                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11575                     numlen = 3;
11576                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11577                     RExC_parse += numlen;
11578                     if (PL_encoding && value < 0x100)
11579                         goto recode_encoding;
11580                     break;
11581                 }
11582             recode_encoding:
11583                 if (! RExC_override_recoding) {
11584                     SV* enc = PL_encoding;
11585                     value = reg_recode((const char)(U8)value, &enc);
11586                     if (!enc && SIZE_ONLY)
11587                         ckWARNreg(RExC_parse,
11588                                   "Invalid escape in the specified encoding");
11589                     break;
11590                 }
11591             default:
11592                 /* Allow \_ to not give an error */
11593                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11594                     ckWARN2reg(RExC_parse,
11595                                "Unrecognized escape \\%c in character class passed through",
11596                                (int)value);
11597                 }
11598                 break;
11599             }
11600         } /* end of \blah */
11601 #ifdef EBCDIC
11602         else
11603             literal_endpoint++;
11604 #endif
11605
11606         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11607
11608             /* What matches in a locale is not known until runtime, so need to
11609              * (one time per class) allocate extra space to pass to regexec.
11610              * The space will contain a bit for each named class that is to be
11611              * matched against.  This isn't needed for \p{} and pseudo-classes,
11612              * as they are not affected by locale, and hence are dealt with
11613              * separately */
11614             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
11615                 need_class = 1;
11616                 if (SIZE_ONLY) {
11617                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11618                 }
11619                 else {
11620                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11621                     ANYOF_CLASS_ZERO(ret);
11622                 }
11623                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11624             }
11625
11626             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
11627              * literal, as is the character that began the false range, i.e.
11628              * the 'a' in the examples */
11629             if (range) {
11630                 if (!SIZE_ONLY) {
11631                     const int w =
11632                         RExC_parse >= rangebegin ?
11633                         RExC_parse - rangebegin : 0;
11634                     ckWARN4reg(RExC_parse,
11635                                "False [] range \"%*.*s\"",
11636                                w, w, rangebegin);
11637
11638                     stored +=
11639                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11640                     if (prevvalue < 256) {
11641                         stored +=
11642                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
11643                     }
11644                     else {
11645                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
11646                     }
11647                 }
11648
11649                 range = 0; /* this was not a true range */
11650             }
11651
11652             if (!SIZE_ONLY) {
11653
11654                 /* Possible truncation here but in some 64-bit environments
11655                  * the compiler gets heartburn about switch on 64-bit values.
11656                  * A similar issue a little earlier when switching on value.
11657                  * --jhi */
11658                 switch ((I32)namedclass) {
11659
11660                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11661                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11662                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11663                     break;
11664                 case ANYOF_NALNUMC:
11665                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11666                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11667                     break;
11668                 case ANYOF_ALPHA:
11669                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11670                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11671                     break;
11672                 case ANYOF_NALPHA:
11673                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11674                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11675                     break;
11676                 case ANYOF_ASCII:
11677                     if (LOC) {
11678                         ANYOF_CLASS_SET(ret, namedclass);
11679                     }
11680                     else {
11681                         _invlist_union(properties, PL_ASCII, &properties);
11682                     }
11683                     break;
11684                 case ANYOF_NASCII:
11685                     if (LOC) {
11686                         ANYOF_CLASS_SET(ret, namedclass);
11687                     }
11688                     else {
11689                         _invlist_union_complement_2nd(properties,
11690                                                     PL_ASCII, &properties);
11691                         if (DEPENDS_SEMANTICS) {
11692                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11693                         }
11694                     }
11695                     break;
11696                 case ANYOF_BLANK:
11697                     DO_POSIX(ret, namedclass, properties,
11698                                             PL_PosixBlank, PL_XPosixBlank);
11699                     break;
11700                 case ANYOF_NBLANK:
11701                     DO_N_POSIX(ret, namedclass, properties,
11702                                             PL_PosixBlank, PL_XPosixBlank);
11703                     break;
11704                 case ANYOF_CNTRL:
11705                     DO_POSIX(ret, namedclass, properties,
11706                                             PL_PosixCntrl, PL_XPosixCntrl);
11707                     break;
11708                 case ANYOF_NCNTRL:
11709                     DO_N_POSIX(ret, namedclass, properties,
11710                                             PL_PosixCntrl, PL_XPosixCntrl);
11711                     break;
11712                 case ANYOF_DIGIT:
11713                     /* There are no digits in the Latin1 range outside of
11714                      * ASCII, so call the macro that doesn't have to resolve
11715                      * them */
11716                     DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
11717                         PL_PosixDigit, "XPosixDigit", listsv);
11718                     break;
11719                 case ANYOF_NDIGIT:
11720                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11721                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
11722                     break;
11723                 case ANYOF_GRAPH:
11724                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11725                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11726                     break;
11727                 case ANYOF_NGRAPH:
11728                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11729                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11730                     break;
11731                 case ANYOF_HORIZWS:
11732                     /* For these, we use the nonbitmap, as /d doesn't make a
11733                      * difference in what these match.  There would be problems
11734                      * if these characters had folds other than themselves, as
11735                      * nonbitmap is subject to folding.  It turns out that \h
11736                      * is just a synonym for XPosixBlank */
11737                     _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
11738                     break;
11739                 case ANYOF_NHORIZWS:
11740                     _invlist_union_complement_2nd(nonbitmap,
11741                                                  PL_XPosixBlank, &nonbitmap);
11742                     break;
11743                 case ANYOF_LOWER:
11744                 case ANYOF_NLOWER:
11745                 {   /* These require special handling, as they differ under
11746                        folding, matching Cased there (which in the ASCII range
11747                        is the same as Alpha */
11748
11749                     SV* ascii_source;
11750                     SV* l1_source;
11751                     const char *Xname;
11752
11753                     if (FOLD && ! LOC) {
11754                         ascii_source = PL_PosixAlpha;
11755                         l1_source = PL_L1Cased;
11756                         Xname = "Cased";
11757                     }
11758                     else {
11759                         ascii_source = PL_PosixLower;
11760                         l1_source = PL_L1PosixLower;
11761                         Xname = "XPosixLower";
11762                     }
11763                     if (namedclass == ANYOF_LOWER) {
11764                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11765                                     ascii_source, l1_source, Xname, listsv);
11766                     }
11767                     else {
11768                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11769                             properties, ascii_source, l1_source, Xname, listsv);
11770                     }
11771                     break;
11772                 }
11773                 case ANYOF_PRINT:
11774                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11775                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11776                     break;
11777                 case ANYOF_NPRINT:
11778                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11779                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11780                     break;
11781                 case ANYOF_PUNCT:
11782                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11783                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11784                     break;
11785                 case ANYOF_NPUNCT:
11786                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11787                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11788                     break;
11789                 case ANYOF_PSXSPC:
11790                     DO_POSIX(ret, namedclass, properties,
11791                                             PL_PosixSpace, PL_XPosixSpace);
11792                     break;
11793                 case ANYOF_NPSXSPC:
11794                     DO_N_POSIX(ret, namedclass, properties,
11795                                             PL_PosixSpace, PL_XPosixSpace);
11796                     break;
11797                 case ANYOF_SPACE:
11798                     DO_POSIX(ret, namedclass, properties,
11799                                             PL_PerlSpace, PL_XPerlSpace);
11800                     break;
11801                 case ANYOF_NSPACE:
11802                     DO_N_POSIX(ret, namedclass, properties,
11803                                             PL_PerlSpace, PL_XPerlSpace);
11804                     break;
11805                 case ANYOF_UPPER:   /* Same as LOWER, above */
11806                 case ANYOF_NUPPER:
11807                 {
11808                     SV* ascii_source;
11809                     SV* l1_source;
11810                     const char *Xname;
11811
11812                     if (FOLD && ! LOC) {
11813                         ascii_source = PL_PosixAlpha;
11814                         l1_source = PL_L1Cased;
11815                         Xname = "Cased";
11816                     }
11817                     else {
11818                         ascii_source = PL_PosixUpper;
11819                         l1_source = PL_L1PosixUpper;
11820                         Xname = "XPosixUpper";
11821                     }
11822                     if (namedclass == ANYOF_UPPER) {
11823                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11824                                     ascii_source, l1_source, Xname, listsv);
11825                     }
11826                     else {
11827                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11828                         properties, ascii_source, l1_source, Xname, listsv);
11829                     }
11830                     break;
11831                 }
11832                 case ANYOF_ALNUM:   /* Really is 'Word' */
11833                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11834                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11835                     break;
11836                 case ANYOF_NALNUM:
11837                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11838                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11839                     break;
11840                 case ANYOF_VERTWS:
11841                     /* For these, we use the nonbitmap, as /d doesn't make a
11842                      * difference in what these match.  There would be problems
11843                      * if these characters had folds other than themselves, as
11844                      * nonbitmap is subject to folding */
11845                     _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11846                     break;
11847                 case ANYOF_NVERTWS:
11848                     _invlist_union_complement_2nd(nonbitmap,
11849                                                     PL_VertSpace, &nonbitmap);
11850                     break;
11851                 case ANYOF_XDIGIT:
11852                     DO_POSIX(ret, namedclass, properties,
11853                                             PL_PosixXDigit, PL_XPosixXDigit);
11854                     break;
11855                 case ANYOF_NXDIGIT:
11856                     DO_N_POSIX(ret, namedclass, properties,
11857                                             PL_PosixXDigit, PL_XPosixXDigit);
11858                     break;
11859                 case ANYOF_MAX:
11860                     /* this is to handle \p and \P */
11861                     break;
11862                 default:
11863                     vFAIL("Invalid [::] class");
11864                     break;
11865                 }
11866
11867                 continue;
11868             }
11869         } /* end of namedclass \blah */
11870
11871         if (range) {
11872             if (prevvalue > (IV)value) /* b-a */ {
11873                 const int w = RExC_parse - rangebegin;
11874                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11875                 range = 0; /* not a valid range */
11876             }
11877         }
11878         else {
11879             prevvalue = value; /* save the beginning of the range */
11880             if (RExC_parse+1 < RExC_end
11881                 && *RExC_parse == '-'
11882                 && RExC_parse[1] != ']')
11883             {
11884                 RExC_parse++;
11885
11886                 /* a bad range like \w-, [:word:]- ? */
11887                 if (namedclass > OOB_NAMEDCLASS) {
11888                     if (ckWARN(WARN_REGEXP)) {
11889                         const int w =
11890                             RExC_parse >= rangebegin ?
11891                             RExC_parse - rangebegin : 0;
11892                         vWARN4(RExC_parse,
11893                                "False [] range \"%*.*s\"",
11894                                w, w, rangebegin);
11895                     }
11896                     if (!SIZE_ONLY)
11897                         stored +=
11898                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11899                 } else
11900                     range = 1;  /* yeah, it's a range! */
11901                 continue;       /* but do it the next time */
11902             }
11903         }
11904
11905         /* non-Latin1 code point implies unicode semantics.  Must be set in
11906          * pass1 so is there for the whole of pass 2 */
11907         if (value > 255) {
11908             RExC_uni_semantics = 1;
11909         }
11910
11911         /* now is the next time */
11912         if (!SIZE_ONLY) {
11913             if (prevvalue < 256) {
11914                 const IV ceilvalue = value < 256 ? value : 255;
11915                 IV i;
11916 #ifdef EBCDIC
11917                 /* In EBCDIC [\x89-\x91] should include
11918                  * the \x8e but [i-j] should not. */
11919                 if (literal_endpoint == 2 &&
11920                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11921                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11922                 {
11923                     if (isLOWER(prevvalue)) {
11924                         for (i = prevvalue; i <= ceilvalue; i++)
11925                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11926                                 stored +=
11927                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11928                             }
11929                     } else {
11930                         for (i = prevvalue; i <= ceilvalue; i++)
11931                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11932                                 stored +=
11933                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11934                             }
11935                     }
11936                 }
11937                 else
11938 #endif
11939                       for (i = prevvalue; i <= ceilvalue; i++) {
11940                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11941                       }
11942           }
11943           if (value > 255) {
11944             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
11945             const UV natvalue      = NATIVE_TO_UNI(value);
11946             nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11947         }
11948 #ifdef EBCDIC
11949             literal_endpoint = 0;
11950 #endif
11951         }
11952
11953         range = 0; /* this range (if it was one) is done now */
11954     }
11955
11956
11957
11958     if (SIZE_ONLY)
11959         return ret;
11960     /****** !SIZE_ONLY AFTER HERE *********/
11961
11962     /* If folding and there are code points above 255, we calculate all
11963      * characters that could fold to or from the ones already on the list */
11964     if (FOLD && nonbitmap) {
11965         UV start, end;  /* End points of code point ranges */
11966
11967         SV* fold_intersection = NULL;
11968
11969         /* This is a list of all the characters that participate in folds
11970             * (except marks, etc in multi-char folds */
11971         if (! PL_utf8_foldable) {
11972             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11973             PL_utf8_foldable = _swash_to_invlist(swash);
11974             SvREFCNT_dec(swash);
11975         }
11976
11977         /* This is a hash that for a particular fold gives all characters
11978             * that are involved in it */
11979         if (! PL_utf8_foldclosures) {
11980
11981             /* If we were unable to find any folds, then we likely won't be
11982              * able to find the closures.  So just create an empty list.
11983              * Folding will effectively be restricted to the non-Unicode rules
11984              * hard-coded into Perl.  (This case happens legitimately during
11985              * compilation of Perl itself before the Unicode tables are
11986              * generated) */
11987             if (invlist_len(PL_utf8_foldable) == 0) {
11988                 PL_utf8_foldclosures = newHV();
11989             } else {
11990                 /* If the folds haven't been read in, call a fold function
11991                     * to force that */
11992                 if (! PL_utf8_tofold) {
11993                     U8 dummy[UTF8_MAXBYTES+1];
11994                     STRLEN dummy_len;
11995
11996                     /* This particular string is above \xff in both UTF-8 and
11997                      * UTFEBCDIC */
11998                     to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11999                     assert(PL_utf8_tofold); /* Verify that worked */
12000                 }
12001                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
12002             }
12003         }
12004
12005         /* Only the characters in this class that participate in folds need be
12006          * checked.  Get the intersection of this class and all the possible
12007          * characters that are foldable.  This can quickly narrow down a large
12008          * class */
12009         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
12010
12011         /* Now look at the foldable characters in this class individually */
12012         invlist_iterinit(fold_intersection);
12013         while (invlist_iternext(fold_intersection, &start, &end)) {
12014             UV j;
12015
12016             /* Look at every character in the range */
12017             for (j = start; j <= end; j++) {
12018
12019                 /* Get its fold */
12020                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12021                 STRLEN foldlen;
12022                 const UV f =
12023                     _to_uni_fold_flags(j, foldbuf, &foldlen,
12024                                        (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
12025
12026                 if (foldlen > (STRLEN)UNISKIP(f)) {
12027
12028                     /* Any multicharacter foldings (disallowed in lookbehind
12029                      * patterns) require the following transform: [ABCDEF] ->
12030                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
12031                      * folds into "rst", all other characters fold to single
12032                      * characters.  We save away these multicharacter foldings,
12033                      * to be later saved as part of the additional "s" data. */
12034                     if (! RExC_in_lookbehind) {
12035                         U8* loc = foldbuf;
12036                         U8* e = foldbuf + foldlen;
12037
12038                         /* If any of the folded characters of this are in the
12039                          * Latin1 range, tell the regex engine that this can
12040                          * match a non-utf8 target string.  The only multi-byte
12041                          * fold whose source is in the Latin1 range (U+00DF)
12042                          * applies only when the target string is utf8, or
12043                          * under unicode rules */
12044                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
12045                             while (loc < e) {
12046
12047                                 /* Can't mix ascii with non- under /aa */
12048                                 if (MORE_ASCII_RESTRICTED
12049                                     && (isASCII(*loc) != isASCII(j)))
12050                                 {
12051                                     goto end_multi_fold;
12052                                 }
12053                                 if (UTF8_IS_INVARIANT(*loc)
12054                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
12055                                 {
12056                                     /* Can't mix above and below 256 under LOC
12057                                      */
12058                                     if (LOC) {
12059                                         goto end_multi_fold;
12060                                     }
12061                                     ANYOF_FLAGS(ret)
12062                                             |= ANYOF_NONBITMAP_NON_UTF8;
12063                                     break;
12064                                 }
12065                                 loc += UTF8SKIP(loc);
12066                             }
12067                         }
12068
12069                         add_alternate(&unicode_alternate, foldbuf, foldlen);
12070                     end_multi_fold: ;
12071                     }
12072
12073                     /* This is special-cased, as it is the only letter which
12074                      * has both a multi-fold and single-fold in Latin1.  All
12075                      * the other chars that have single and multi-folds are
12076                      * always in utf8, and the utf8 folding algorithm catches
12077                      * them */
12078                     if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
12079                         stored += set_regclass_bit(pRExC_state,
12080                                         ret,
12081                                         LATIN_SMALL_LETTER_SHARP_S,
12082                                         &l1_fold_invlist, &unicode_alternate);
12083                     }
12084                 }
12085                 else {
12086                     /* Single character fold.  Add everything in its fold
12087                      * closure to the list that this node should match */
12088                     SV** listp;
12089
12090                     /* The fold closures data structure is a hash with the keys
12091                      * being every character that is folded to, like 'k', and
12092                      * the values each an array of everything that folds to its
12093                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
12094                     if ((listp = hv_fetch(PL_utf8_foldclosures,
12095                                     (char *) foldbuf, foldlen, FALSE)))
12096                     {
12097                         AV* list = (AV*) *listp;
12098                         IV k;
12099                         for (k = 0; k <= av_len(list); k++) {
12100                             SV** c_p = av_fetch(list, k, FALSE);
12101                             UV c;
12102                             if (c_p == NULL) {
12103                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12104                             }
12105                             c = SvUV(*c_p);
12106
12107                             /* /aa doesn't allow folds between ASCII and non-;
12108                              * /l doesn't allow them between above and below
12109                              * 256 */
12110                             if ((MORE_ASCII_RESTRICTED
12111                                  && (isASCII(c) != isASCII(j)))
12112                                     || (LOC && ((c < 256) != (j < 256))))
12113                             {
12114                                 continue;
12115                             }
12116
12117                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
12118                                 stored += set_regclass_bit(pRExC_state,
12119                                         ret,
12120                                         (U8) c,
12121                                         &l1_fold_invlist, &unicode_alternate);
12122                             }
12123                                 /* It may be that the code point is already in
12124                                  * this range or already in the bitmap, in
12125                                  * which case we need do nothing */
12126                             else if ((c < start || c > end)
12127                                         && (c > 255
12128                                             || ! ANYOF_BITMAP_TEST(ret, c)))
12129                             {
12130                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
12131                             }
12132                         }
12133                     }
12134                 }
12135             }
12136         }
12137         SvREFCNT_dec(fold_intersection);
12138     }
12139
12140     /* Combine the two lists into one. */
12141     if (l1_fold_invlist) {
12142         if (nonbitmap) {
12143             _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
12144             SvREFCNT_dec(l1_fold_invlist);
12145         }
12146         else {
12147             nonbitmap = l1_fold_invlist;
12148         }
12149     }
12150
12151     /* And combine the result (if any) with any inversion list from properties.
12152      * The lists are kept separate up to now because we don't want to fold the
12153      * properties */
12154     if (properties) {
12155         if (nonbitmap) {
12156             _invlist_union(nonbitmap, properties, &nonbitmap);
12157             SvREFCNT_dec(properties);
12158         }
12159         else {
12160             nonbitmap = properties;
12161         }
12162     }
12163
12164     /* Here, <nonbitmap> contains all the code points we can determine at
12165      * compile time that we haven't put into the bitmap.  Go through it, and
12166      * for things that belong in the bitmap, put them there, and delete from
12167      * <nonbitmap> */
12168     if (nonbitmap) {
12169
12170         /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
12171          * possibly only should match when the target string is UTF-8 */
12172         UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
12173
12174         /* This gets set if we actually need to modify things */
12175         bool change_invlist = FALSE;
12176
12177         UV start, end;
12178
12179         /* Start looking through <nonbitmap> */
12180         invlist_iterinit(nonbitmap);
12181         while (invlist_iternext(nonbitmap, &start, &end)) {
12182             UV high;
12183             int i;
12184
12185             /* Quit if are above what we should change */
12186             if (start > max_cp_to_set) {
12187                 break;
12188             }
12189
12190             change_invlist = TRUE;
12191
12192             /* Set all the bits in the range, up to the max that we are doing */
12193             high = (end < max_cp_to_set) ? end : max_cp_to_set;
12194             for (i = start; i <= (int) high; i++) {
12195                 if (! ANYOF_BITMAP_TEST(ret, i)) {
12196                     ANYOF_BITMAP_SET(ret, i);
12197                     stored++;
12198                     prevvalue = value;
12199                     value = i;
12200                 }
12201             }
12202         }
12203
12204         /* Done with loop; remove any code points that are in the bitmap from
12205          * <nonbitmap> */
12206         if (change_invlist) {
12207             _invlist_subtract(nonbitmap,
12208                               (DEPENDS_SEMANTICS)
12209                                 ? PL_ASCII
12210                                 : PL_Latin1,
12211                               &nonbitmap);
12212         }
12213
12214         /* If have completely emptied it, remove it completely */
12215         if (invlist_len(nonbitmap) == 0) {
12216             SvREFCNT_dec(nonbitmap);
12217             nonbitmap = NULL;
12218         }
12219     }
12220
12221     /* Here, we have calculated what code points should be in the character
12222      * class.  <nonbitmap> does not overlap the bitmap except possibly in the
12223      * case of DEPENDS rules.
12224      *
12225      * Now we can see about various optimizations.  Fold calculation (which we
12226      * did above) needs to take place before inversion.  Otherwise /[^k]/i
12227      * would invert to include K, which under /i would match k, which it
12228      * shouldn't. */
12229
12230     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
12231      * set the FOLD flag yet, so this does optimize those.  It doesn't
12232      * optimize locale.  Doing so perhaps could be done as long as there is
12233      * nothing like \w in it; some thought also would have to be given to the
12234      * interaction with above 0x100 chars */
12235     if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
12236         && ! LOC
12237         && ! unicode_alternate
12238         /* In case of /d, there are some things that should match only when in
12239          * not in the bitmap, i.e., they require UTF8 to match.  These are
12240          * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
12241          * case, they don't require UTF8, so can invert here */
12242         && (! nonbitmap
12243             || ! DEPENDS_SEMANTICS
12244             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12245         && SvCUR(listsv) == initial_listsv_len)
12246     {
12247         int i;
12248         if (! nonbitmap) {
12249             for (i = 0; i < 256; ++i) {
12250                 if (ANYOF_BITMAP_TEST(ret, i)) {
12251                     ANYOF_BITMAP_CLEAR(ret, i);
12252                 }
12253                 else {
12254                     ANYOF_BITMAP_SET(ret, i);
12255                     prevvalue = value;
12256                     value = i;
12257                 }
12258             }
12259             /* The inversion means that everything above 255 is matched */
12260             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12261         }
12262         else {
12263             /* Here, also has things outside the bitmap that may overlap with
12264              * the bitmap.  We have to sync them up, so that they get inverted
12265              * in both places.  Earlier, we removed all overlaps except in the
12266              * case of /d rules, so no syncing is needed except for this case
12267              */
12268             SV *remove_list = NULL;
12269
12270             if (DEPENDS_SEMANTICS) {
12271                 UV start, end;
12272
12273                 /* Set the bits that correspond to the ones that aren't in the
12274                  * bitmap.  Otherwise, when we invert, we'll miss these.
12275                  * Earlier, we removed from the nonbitmap all code points
12276                  * < 128, so there is no extra work here */
12277                 invlist_iterinit(nonbitmap);
12278                 while (invlist_iternext(nonbitmap, &start, &end)) {
12279                     if (start > 255) {  /* The bit map goes to 255 */
12280                         break;
12281                     }
12282                     if (end > 255) {
12283                         end = 255;
12284                     }
12285                     for (i = start; i <= (int) end; ++i) {
12286                         ANYOF_BITMAP_SET(ret, i);
12287                         prevvalue = value;
12288                         value = i;
12289                     }
12290                 }
12291             }
12292
12293             /* Now invert both the bitmap and the nonbitmap.  Anything in the
12294              * bitmap has to also be removed from the non-bitmap, but again,
12295              * there should not be overlap unless is /d rules. */
12296             _invlist_invert(nonbitmap);
12297
12298             /* Any swash can't be used as-is, because we've inverted things */
12299             if (swash) {
12300                 SvREFCNT_dec(swash);
12301                 swash = NULL;
12302             }
12303
12304             for (i = 0; i < 256; ++i) {
12305                 if (ANYOF_BITMAP_TEST(ret, i)) {
12306                     ANYOF_BITMAP_CLEAR(ret, i);
12307                     if (DEPENDS_SEMANTICS) {
12308                         if (! remove_list) {
12309                             remove_list = _new_invlist(2);
12310                         }
12311                         remove_list = add_cp_to_invlist(remove_list, i);
12312                     }
12313                 }
12314                 else {
12315                     ANYOF_BITMAP_SET(ret, i);
12316                     prevvalue = value;
12317                     value = i;
12318                 }
12319             }
12320
12321             /* And do the removal */
12322             if (DEPENDS_SEMANTICS) {
12323                 if (remove_list) {
12324                     _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
12325                     SvREFCNT_dec(remove_list);
12326                 }
12327             }
12328             else {
12329                 /* There is no overlap for non-/d, so just delete anything
12330                  * below 256 */
12331                 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
12332             }
12333         }
12334
12335         stored = 256 - stored;
12336
12337         /* Clear the invert flag since have just done it here */
12338         ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
12339     }
12340
12341     /* Folding in the bitmap is taken care of above, but not for locale (for
12342      * which we have to wait to see what folding is in effect at runtime), and
12343      * for some things not in the bitmap (only the upper latin folds in this
12344      * case, as all other single-char folding has been set above).  Set
12345      * run-time fold flag for these */
12346     if (FOLD && (LOC
12347                 || (DEPENDS_SEMANTICS
12348                     && nonbitmap
12349                     && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12350                 || unicode_alternate))
12351     {
12352         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12353     }
12354
12355     /* A single character class can be "optimized" into an EXACTish node.
12356      * Note that since we don't currently count how many characters there are
12357      * outside the bitmap, we are XXX missing optimization possibilities for
12358      * them.  This optimization can't happen unless this is a truly single
12359      * character class, which means that it can't be an inversion into a
12360      * many-character class, and there must be no possibility of there being
12361      * things outside the bitmap.  'stored' (only) for locales doesn't include
12362      * \w, etc, so have to make a special test that they aren't present
12363      *
12364      * Similarly A 2-character class of the very special form like [bB] can be
12365      * optimized into an EXACTFish node, but only for non-locales, and for
12366      * characters which only have the two folds; so things like 'fF' and 'Ii'
12367      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
12368      * FI'. */
12369     if (! nonbitmap
12370         && ! unicode_alternate
12371         && SvCUR(listsv) == initial_listsv_len
12372         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
12373         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12374                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
12375             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12376                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
12377                                  /* If the latest code point has a fold whose
12378                                   * bit is set, it must be the only other one */
12379                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
12380                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
12381     {
12382         /* Note that the information needed to decide to do this optimization
12383          * is not currently available until the 2nd pass, and that the actually
12384          * used EXACTish node takes less space than the calculated ANYOF node,
12385          * and hence the amount of space calculated in the first pass is larger
12386          * than actually used, so this optimization doesn't gain us any space.
12387          * But an EXACT node is faster than an ANYOF node, and can be combined
12388          * with any adjacent EXACT nodes later by the optimizer for further
12389          * gains.  The speed of executing an EXACTF is similar to an ANYOF
12390          * node, so the optimization advantage comes from the ability to join
12391          * it to adjacent EXACT nodes */
12392
12393         const char * cur_parse= RExC_parse;
12394         U8 op;
12395         RExC_emit = (regnode *)orig_emit;
12396         RExC_parse = (char *)orig_parse;
12397
12398         if (stored == 1) {
12399
12400             /* A locale node with one point can be folded; all the other cases
12401              * with folding will have two points, since we calculate them above
12402              */
12403             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
12404                  op = EXACTFL;
12405             }
12406             else {
12407                 op = EXACT;
12408             }
12409         }
12410         else {   /* else 2 chars in the bit map: the folds of each other */
12411
12412             /* Use the folded value, which for the cases where we get here,
12413              * is just the lower case of the current one (which may resolve to
12414              * itself, or to the other one */
12415             value = toLOWER_LATIN1(value);
12416
12417             /* To join adjacent nodes, they must be the exact EXACTish type.
12418              * Try to use the most likely type, by using EXACTFA if possible,
12419              * then EXACTFU if the regex calls for it, or is required because
12420              * the character is non-ASCII.  (If <value> is ASCII, its fold is
12421              * also ASCII for the cases where we get here.) */
12422             if (MORE_ASCII_RESTRICTED && isASCII(value)) {
12423                 op = EXACTFA;
12424             }
12425             else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
12426                 op = EXACTFU;
12427             }
12428             else {    /* Otherwise, more likely to be EXACTF type */
12429                 op = EXACTF;
12430             }
12431         }
12432
12433         ret = reg_node(pRExC_state, op);
12434         RExC_parse = (char *)cur_parse;
12435         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
12436             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
12437             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
12438             STR_LEN(ret)= 2;
12439             RExC_emit += STR_SZ(2);
12440         }
12441         else {
12442             *STRING(ret)= (char)value;
12443             STR_LEN(ret)= 1;
12444             RExC_emit += STR_SZ(1);
12445         }
12446         SvREFCNT_dec(listsv);
12447         return ret;
12448     }
12449
12450     /* If there is a swash and more than one element, we can't use the swash in
12451      * the optimization below. */
12452     if (swash && element_count > 1) {
12453         SvREFCNT_dec(swash);
12454         swash = NULL;
12455     }
12456     if (! nonbitmap
12457         && SvCUR(listsv) == initial_listsv_len
12458         && ! unicode_alternate)
12459     {
12460         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12461         SvREFCNT_dec(listsv);
12462         SvREFCNT_dec(unicode_alternate);
12463     }
12464     else {
12465         /* av[0] stores the character class description in its textual form:
12466          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
12467          *       appropriate swash, and is also useful for dumping the regnode.
12468          * av[1] if NULL, is a placeholder to later contain the swash computed
12469          *       from av[0].  But if no further computation need be done, the
12470          *       swash is stored there now.
12471          * av[2] stores the multicharacter foldings, used later in
12472          *       regexec.c:S_reginclass().
12473          * av[3] stores the nonbitmap inversion list for use in addition or
12474          *       instead of av[0]; not used if av[1] isn't NULL
12475          * av[4] is set if any component of the class is from a user-defined
12476          *       property; not used if av[1] isn't NULL */
12477         AV * const av = newAV();
12478         SV *rv;
12479
12480         av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
12481                         ? &PL_sv_undef
12482                         : listsv);
12483         if (swash) {
12484             av_store(av, 1, swash);
12485             SvREFCNT_dec(nonbitmap);
12486         }
12487         else {
12488             av_store(av, 1, NULL);
12489             if (nonbitmap) {
12490                 av_store(av, 3, nonbitmap);
12491                 av_store(av, 4, newSVuv(has_user_defined_property));
12492             }
12493         }
12494
12495         /* Store any computed multi-char folds only if we are allowing
12496          * them */
12497         if (allow_full_fold) {
12498             av_store(av, 2, MUTABLE_SV(unicode_alternate));
12499             if (unicode_alternate) { /* This node is variable length */
12500                 OP(ret) = ANYOFV;
12501             }
12502         }
12503         else {
12504             av_store(av, 2, NULL);
12505         }
12506         rv = newRV_noinc(MUTABLE_SV(av));
12507         n = add_data(pRExC_state, 1, "s");
12508         RExC_rxi->data->data[n] = (void*)rv;
12509         ARG_SET(ret, n);
12510     }
12511     return ret;
12512 }
12513
12514
12515 /* reg_skipcomment()
12516
12517    Absorbs an /x style # comments from the input stream.
12518    Returns true if there is more text remaining in the stream.
12519    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12520    terminates the pattern without including a newline.
12521
12522    Note its the callers responsibility to ensure that we are
12523    actually in /x mode
12524
12525 */
12526
12527 STATIC bool
12528 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12529 {
12530     bool ended = 0;
12531
12532     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12533
12534     while (RExC_parse < RExC_end)
12535         if (*RExC_parse++ == '\n') {
12536             ended = 1;
12537             break;
12538         }
12539     if (!ended) {
12540         /* we ran off the end of the pattern without ending
12541            the comment, so we have to add an \n when wrapping */
12542         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12543         return 0;
12544     } else
12545         return 1;
12546 }
12547
12548 /* nextchar()
12549
12550    Advances the parse position, and optionally absorbs
12551    "whitespace" from the inputstream.
12552
12553    Without /x "whitespace" means (?#...) style comments only,
12554    with /x this means (?#...) and # comments and whitespace proper.
12555
12556    Returns the RExC_parse point from BEFORE the scan occurs.
12557
12558    This is the /x friendly way of saying RExC_parse++.
12559 */
12560
12561 STATIC char*
12562 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12563 {
12564     char* const retval = RExC_parse++;
12565
12566     PERL_ARGS_ASSERT_NEXTCHAR;
12567
12568     for (;;) {
12569         if (RExC_end - RExC_parse >= 3
12570             && *RExC_parse == '('
12571             && RExC_parse[1] == '?'
12572             && RExC_parse[2] == '#')
12573         {
12574             while (*RExC_parse != ')') {
12575                 if (RExC_parse == RExC_end)
12576                     FAIL("Sequence (?#... not terminated");
12577                 RExC_parse++;
12578             }
12579             RExC_parse++;
12580             continue;
12581         }
12582         if (RExC_flags & RXf_PMf_EXTENDED) {
12583             if (isSPACE(*RExC_parse)) {
12584                 RExC_parse++;
12585                 continue;
12586             }
12587             else if (*RExC_parse == '#') {
12588                 if ( reg_skipcomment( pRExC_state ) )
12589                     continue;
12590             }
12591         }
12592         return retval;
12593     }
12594 }
12595
12596 /*
12597 - reg_node - emit a node
12598 */
12599 STATIC regnode *                        /* Location. */
12600 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12601 {
12602     dVAR;
12603     register regnode *ptr;
12604     regnode * const ret = RExC_emit;
12605     GET_RE_DEBUG_FLAGS_DECL;
12606
12607     PERL_ARGS_ASSERT_REG_NODE;
12608
12609     if (SIZE_ONLY) {
12610         SIZE_ALIGN(RExC_size);
12611         RExC_size += 1;
12612         return(ret);
12613     }
12614     if (RExC_emit >= RExC_emit_bound)
12615         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12616                    op, RExC_emit, RExC_emit_bound);
12617
12618     NODE_ALIGN_FILL(ret);
12619     ptr = ret;
12620     FILL_ADVANCE_NODE(ptr, op);
12621     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
12622 #ifdef RE_TRACK_PATTERN_OFFSETS
12623     if (RExC_offsets) {         /* MJD */
12624         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
12625               "reg_node", __LINE__,
12626               PL_reg_name[op],
12627               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
12628                 ? "Overwriting end of array!\n" : "OK",
12629               (UV)(RExC_emit - RExC_emit_start),
12630               (UV)(RExC_parse - RExC_start),
12631               (UV)RExC_offsets[0]));
12632         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
12633     }
12634 #endif
12635     RExC_emit = ptr;
12636     return(ret);
12637 }
12638
12639 /*
12640 - reganode - emit a node with an argument
12641 */
12642 STATIC regnode *                        /* Location. */
12643 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12644 {
12645     dVAR;
12646     register regnode *ptr;
12647     regnode * const ret = RExC_emit;
12648     GET_RE_DEBUG_FLAGS_DECL;
12649
12650     PERL_ARGS_ASSERT_REGANODE;
12651
12652     if (SIZE_ONLY) {
12653         SIZE_ALIGN(RExC_size);
12654         RExC_size += 2;
12655         /*
12656            We can't do this:
12657
12658            assert(2==regarglen[op]+1);
12659
12660            Anything larger than this has to allocate the extra amount.
12661            If we changed this to be:
12662
12663            RExC_size += (1 + regarglen[op]);
12664
12665            then it wouldn't matter. Its not clear what side effect
12666            might come from that so its not done so far.
12667            -- dmq
12668         */
12669         return(ret);
12670     }
12671     if (RExC_emit >= RExC_emit_bound)
12672         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12673                    op, RExC_emit, RExC_emit_bound);
12674
12675     NODE_ALIGN_FILL(ret);
12676     ptr = ret;
12677     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
12678     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
12679 #ifdef RE_TRACK_PATTERN_OFFSETS
12680     if (RExC_offsets) {         /* MJD */
12681         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12682               "reganode",
12683               __LINE__,
12684               PL_reg_name[op],
12685               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
12686               "Overwriting end of array!\n" : "OK",
12687               (UV)(RExC_emit - RExC_emit_start),
12688               (UV)(RExC_parse - RExC_start),
12689               (UV)RExC_offsets[0]));
12690         Set_Cur_Node_Offset;
12691     }
12692 #endif
12693     RExC_emit = ptr;
12694     return(ret);
12695 }
12696
12697 /*
12698 - reguni - emit (if appropriate) a Unicode character
12699 */
12700 STATIC STRLEN
12701 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
12702 {
12703     dVAR;
12704
12705     PERL_ARGS_ASSERT_REGUNI;
12706
12707     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
12708 }
12709
12710 /*
12711 - reginsert - insert an operator in front of already-emitted operand
12712 *
12713 * Means relocating the operand.
12714 */
12715 STATIC void
12716 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
12717 {
12718     dVAR;
12719     register regnode *src;
12720     register regnode *dst;
12721     register regnode *place;
12722     const int offset = regarglen[(U8)op];
12723     const int size = NODE_STEP_REGNODE + offset;
12724     GET_RE_DEBUG_FLAGS_DECL;
12725
12726     PERL_ARGS_ASSERT_REGINSERT;
12727     PERL_UNUSED_ARG(depth);
12728 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
12729     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
12730     if (SIZE_ONLY) {
12731         RExC_size += size;
12732         return;
12733     }
12734
12735     src = RExC_emit;
12736     RExC_emit += size;
12737     dst = RExC_emit;
12738     if (RExC_open_parens) {
12739         int paren;
12740         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
12741         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12742             if ( RExC_open_parens[paren] >= opnd ) {
12743                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
12744                 RExC_open_parens[paren] += size;
12745             } else {
12746                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12747             }
12748             if ( RExC_close_parens[paren] >= opnd ) {
12749                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
12750                 RExC_close_parens[paren] += size;
12751             } else {
12752                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12753             }
12754         }
12755     }
12756
12757     while (src > opnd) {
12758         StructCopy(--src, --dst, regnode);
12759 #ifdef RE_TRACK_PATTERN_OFFSETS
12760         if (RExC_offsets) {     /* MJD 20010112 */
12761             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
12762                   "reg_insert",
12763                   __LINE__,
12764                   PL_reg_name[op],
12765                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
12766                     ? "Overwriting end of array!\n" : "OK",
12767                   (UV)(src - RExC_emit_start),
12768                   (UV)(dst - RExC_emit_start),
12769                   (UV)RExC_offsets[0]));
12770             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12771             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
12772         }
12773 #endif
12774     }
12775
12776
12777     place = opnd;               /* Op node, where operand used to be. */
12778 #ifdef RE_TRACK_PATTERN_OFFSETS
12779     if (RExC_offsets) {         /* MJD */
12780         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12781               "reginsert",
12782               __LINE__,
12783               PL_reg_name[op],
12784               (UV)(place - RExC_emit_start) > RExC_offsets[0]
12785               ? "Overwriting end of array!\n" : "OK",
12786               (UV)(place - RExC_emit_start),
12787               (UV)(RExC_parse - RExC_start),
12788               (UV)RExC_offsets[0]));
12789         Set_Node_Offset(place, RExC_parse);
12790         Set_Node_Length(place, 1);
12791     }
12792 #endif
12793     src = NEXTOPER(place);
12794     FILL_ADVANCE_NODE(place, op);
12795     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
12796     Zero(src, offset, regnode);
12797 }
12798
12799 /*
12800 - regtail - set the next-pointer at the end of a node chain of p to val.
12801 - SEE ALSO: regtail_study
12802 */
12803 /* TODO: All three parms should be const */
12804 STATIC void
12805 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12806 {
12807     dVAR;
12808     register regnode *scan;
12809     GET_RE_DEBUG_FLAGS_DECL;
12810
12811     PERL_ARGS_ASSERT_REGTAIL;
12812 #ifndef DEBUGGING
12813     PERL_UNUSED_ARG(depth);
12814 #endif
12815
12816     if (SIZE_ONLY)
12817         return;
12818
12819     /* Find last node. */
12820     scan = p;
12821     for (;;) {
12822         regnode * const temp = regnext(scan);
12823         DEBUG_PARSE_r({
12824             SV * const mysv=sv_newmortal();
12825             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12826             regprop(RExC_rx, mysv, scan);
12827             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12828                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12829                     (temp == NULL ? "->" : ""),
12830                     (temp == NULL ? PL_reg_name[OP(val)] : "")
12831             );
12832         });
12833         if (temp == NULL)
12834             break;
12835         scan = temp;
12836     }
12837
12838     if (reg_off_by_arg[OP(scan)]) {
12839         ARG_SET(scan, val - scan);
12840     }
12841     else {
12842         NEXT_OFF(scan) = val - scan;
12843     }
12844 }
12845
12846 #ifdef DEBUGGING
12847 /*
12848 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12849 - Look for optimizable sequences at the same time.
12850 - currently only looks for EXACT chains.
12851
12852 This is experimental code. The idea is to use this routine to perform
12853 in place optimizations on branches and groups as they are constructed,
12854 with the long term intention of removing optimization from study_chunk so
12855 that it is purely analytical.
12856
12857 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12858 to control which is which.
12859
12860 */
12861 /* TODO: All four parms should be const */
12862
12863 STATIC U8
12864 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12865 {
12866     dVAR;
12867     register regnode *scan;
12868     U8 exact = PSEUDO;
12869 #ifdef EXPERIMENTAL_INPLACESCAN
12870     I32 min = 0;
12871 #endif
12872     GET_RE_DEBUG_FLAGS_DECL;
12873
12874     PERL_ARGS_ASSERT_REGTAIL_STUDY;
12875
12876
12877     if (SIZE_ONLY)
12878         return exact;
12879
12880     /* Find last node. */
12881
12882     scan = p;
12883     for (;;) {
12884         regnode * const temp = regnext(scan);
12885 #ifdef EXPERIMENTAL_INPLACESCAN
12886         if (PL_regkind[OP(scan)] == EXACT) {
12887             bool has_exactf_sharp_s;    /* Unexamined in this routine */
12888             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12889                 return EXACT;
12890         }
12891 #endif
12892         if ( exact ) {
12893             switch (OP(scan)) {
12894                 case EXACT:
12895                 case EXACTF:
12896                 case EXACTFA:
12897                 case EXACTFU:
12898                 case EXACTFU_SS:
12899                 case EXACTFU_TRICKYFOLD:
12900                 case EXACTFL:
12901                         if( exact == PSEUDO )
12902                             exact= OP(scan);
12903                         else if ( exact != OP(scan) )
12904                             exact= 0;
12905                 case NOTHING:
12906                     break;
12907                 default:
12908                     exact= 0;
12909             }
12910         }
12911         DEBUG_PARSE_r({
12912             SV * const mysv=sv_newmortal();
12913             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12914             regprop(RExC_rx, mysv, scan);
12915             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12916                 SvPV_nolen_const(mysv),
12917                 REG_NODE_NUM(scan),
12918                 PL_reg_name[exact]);
12919         });
12920         if (temp == NULL)
12921             break;
12922         scan = temp;
12923     }
12924     DEBUG_PARSE_r({
12925         SV * const mysv_val=sv_newmortal();
12926         DEBUG_PARSE_MSG("");
12927         regprop(RExC_rx, mysv_val, val);
12928         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12929                       SvPV_nolen_const(mysv_val),
12930                       (IV)REG_NODE_NUM(val),
12931                       (IV)(val - scan)
12932         );
12933     });
12934     if (reg_off_by_arg[OP(scan)]) {
12935         ARG_SET(scan, val - scan);
12936     }
12937     else {
12938         NEXT_OFF(scan) = val - scan;
12939     }
12940
12941     return exact;
12942 }
12943 #endif
12944
12945 /*
12946  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12947  */
12948 #ifdef DEBUGGING
12949 static void
12950 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12951 {
12952     int bit;
12953     int set=0;
12954     regex_charset cs;
12955
12956     for (bit=0; bit<32; bit++) {
12957         if (flags & (1<<bit)) {
12958             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
12959                 continue;
12960             }
12961             if (!set++ && lead)
12962                 PerlIO_printf(Perl_debug_log, "%s",lead);
12963             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12964         }
12965     }
12966     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12967             if (!set++ && lead) {
12968                 PerlIO_printf(Perl_debug_log, "%s",lead);
12969             }
12970             switch (cs) {
12971                 case REGEX_UNICODE_CHARSET:
12972                     PerlIO_printf(Perl_debug_log, "UNICODE");
12973                     break;
12974                 case REGEX_LOCALE_CHARSET:
12975                     PerlIO_printf(Perl_debug_log, "LOCALE");
12976                     break;
12977                 case REGEX_ASCII_RESTRICTED_CHARSET:
12978                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12979                     break;
12980                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12981                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12982                     break;
12983                 default:
12984                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12985                     break;
12986             }
12987     }
12988     if (lead)  {
12989         if (set)
12990             PerlIO_printf(Perl_debug_log, "\n");
12991         else
12992             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12993     }
12994 }
12995 #endif
12996
12997 void
12998 Perl_regdump(pTHX_ const regexp *r)
12999 {
13000 #ifdef DEBUGGING
13001     dVAR;
13002     SV * const sv = sv_newmortal();
13003     SV *dsv= sv_newmortal();
13004     RXi_GET_DECL(r,ri);
13005     GET_RE_DEBUG_FLAGS_DECL;
13006
13007     PERL_ARGS_ASSERT_REGDUMP;
13008
13009     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13010
13011     /* Header fields of interest. */
13012     if (r->anchored_substr) {
13013         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13014             RE_SV_DUMPLEN(r->anchored_substr), 30);
13015         PerlIO_printf(Perl_debug_log,
13016                       "anchored %s%s at %"IVdf" ",
13017                       s, RE_SV_TAIL(r->anchored_substr),
13018                       (IV)r->anchored_offset);
13019     } else if (r->anchored_utf8) {
13020         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13021             RE_SV_DUMPLEN(r->anchored_utf8), 30);
13022         PerlIO_printf(Perl_debug_log,
13023                       "anchored utf8 %s%s at %"IVdf" ",
13024                       s, RE_SV_TAIL(r->anchored_utf8),
13025                       (IV)r->anchored_offset);
13026     }
13027     if (r->float_substr) {
13028         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13029             RE_SV_DUMPLEN(r->float_substr), 30);
13030         PerlIO_printf(Perl_debug_log,
13031                       "floating %s%s at %"IVdf"..%"UVuf" ",
13032                       s, RE_SV_TAIL(r->float_substr),
13033                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13034     } else if (r->float_utf8) {
13035         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13036             RE_SV_DUMPLEN(r->float_utf8), 30);
13037         PerlIO_printf(Perl_debug_log,
13038                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13039                       s, RE_SV_TAIL(r->float_utf8),
13040                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13041     }
13042     if (r->check_substr || r->check_utf8)
13043         PerlIO_printf(Perl_debug_log,
13044                       (const char *)
13045                       (r->check_substr == r->float_substr
13046                        && r->check_utf8 == r->float_utf8
13047                        ? "(checking floating" : "(checking anchored"));
13048     if (r->extflags & RXf_NOSCAN)
13049         PerlIO_printf(Perl_debug_log, " noscan");
13050     if (r->extflags & RXf_CHECK_ALL)
13051         PerlIO_printf(Perl_debug_log, " isall");
13052     if (r->check_substr || r->check_utf8)
13053         PerlIO_printf(Perl_debug_log, ") ");
13054
13055     if (ri->regstclass) {
13056         regprop(r, sv, ri->regstclass);
13057         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13058     }
13059     if (r->extflags & RXf_ANCH) {
13060         PerlIO_printf(Perl_debug_log, "anchored");
13061         if (r->extflags & RXf_ANCH_BOL)
13062             PerlIO_printf(Perl_debug_log, "(BOL)");
13063         if (r->extflags & RXf_ANCH_MBOL)
13064             PerlIO_printf(Perl_debug_log, "(MBOL)");
13065         if (r->extflags & RXf_ANCH_SBOL)
13066             PerlIO_printf(Perl_debug_log, "(SBOL)");
13067         if (r->extflags & RXf_ANCH_GPOS)
13068             PerlIO_printf(Perl_debug_log, "(GPOS)");
13069         PerlIO_putc(Perl_debug_log, ' ');
13070     }
13071     if (r->extflags & RXf_GPOS_SEEN)
13072         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13073     if (r->intflags & PREGf_SKIP)
13074         PerlIO_printf(Perl_debug_log, "plus ");
13075     if (r->intflags & PREGf_IMPLICIT)
13076         PerlIO_printf(Perl_debug_log, "implicit ");
13077     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13078     if (r->extflags & RXf_EVAL_SEEN)
13079         PerlIO_printf(Perl_debug_log, "with eval ");
13080     PerlIO_printf(Perl_debug_log, "\n");
13081     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13082 #else
13083     PERL_ARGS_ASSERT_REGDUMP;
13084     PERL_UNUSED_CONTEXT;
13085     PERL_UNUSED_ARG(r);
13086 #endif  /* DEBUGGING */
13087 }
13088
13089 /*
13090 - regprop - printable representation of opcode
13091 */
13092 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13093 STMT_START { \
13094         if (do_sep) {                           \
13095             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13096             if (flags & ANYOF_INVERT)           \
13097                 /*make sure the invert info is in each */ \
13098                 sv_catpvs(sv, "^");             \
13099             do_sep = 0;                         \
13100         }                                       \
13101 } STMT_END
13102
13103 void
13104 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13105 {
13106 #ifdef DEBUGGING
13107     dVAR;
13108     register int k;
13109     RXi_GET_DECL(prog,progi);
13110     GET_RE_DEBUG_FLAGS_DECL;
13111
13112     PERL_ARGS_ASSERT_REGPROP;
13113
13114     sv_setpvs(sv, "");
13115
13116     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
13117         /* It would be nice to FAIL() here, but this may be called from
13118            regexec.c, and it would be hard to supply pRExC_state. */
13119         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13120     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13121
13122     k = PL_regkind[OP(o)];
13123
13124     if (k == EXACT) {
13125         sv_catpvs(sv, " ");
13126         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13127          * is a crude hack but it may be the best for now since
13128          * we have no flag "this EXACTish node was UTF-8"
13129          * --jhi */
13130         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13131                   PERL_PV_ESCAPE_UNI_DETECT |
13132                   PERL_PV_ESCAPE_NONASCII   |
13133                   PERL_PV_PRETTY_ELLIPSES   |
13134                   PERL_PV_PRETTY_LTGT       |
13135                   PERL_PV_PRETTY_NOCLEAR
13136                   );
13137     } else if (k == TRIE) {
13138         /* print the details of the trie in dumpuntil instead, as
13139          * progi->data isn't available here */
13140         const char op = OP(o);
13141         const U32 n = ARG(o);
13142         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13143                (reg_ac_data *)progi->data->data[n] :
13144                NULL;
13145         const reg_trie_data * const trie
13146             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13147
13148         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13149         DEBUG_TRIE_COMPILE_r(
13150             Perl_sv_catpvf(aTHX_ sv,
13151                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13152                 (UV)trie->startstate,
13153                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13154                 (UV)trie->wordcount,
13155                 (UV)trie->minlen,
13156                 (UV)trie->maxlen,
13157                 (UV)TRIE_CHARCOUNT(trie),
13158                 (UV)trie->uniquecharcount
13159             )
13160         );
13161         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13162             int i;
13163             int rangestart = -1;
13164             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13165             sv_catpvs(sv, "[");
13166             for (i = 0; i <= 256; i++) {
13167                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13168                     if (rangestart == -1)
13169                         rangestart = i;
13170                 } else if (rangestart != -1) {
13171                     if (i <= rangestart + 3)
13172                         for (; rangestart < i; rangestart++)
13173                             put_byte(sv, rangestart);
13174                     else {
13175                         put_byte(sv, rangestart);
13176                         sv_catpvs(sv, "-");
13177                         put_byte(sv, i - 1);
13178                     }
13179                     rangestart = -1;
13180                 }
13181             }
13182             sv_catpvs(sv, "]");
13183         }
13184
13185     } else if (k == CURLY) {
13186         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13187             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13188         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13189     }
13190     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
13191         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13192     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13193         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
13194         if ( RXp_PAREN_NAMES(prog) ) {
13195             if ( k != REF || (OP(o) < NREF)) {
13196                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13197                 SV **name= av_fetch(list, ARG(o), 0 );
13198                 if (name)
13199                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13200             }
13201             else {
13202                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13203                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13204                 I32 *nums=(I32*)SvPVX(sv_dat);
13205                 SV **name= av_fetch(list, nums[0], 0 );
13206                 I32 n;
13207                 if (name) {
13208                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
13209                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13210                                     (n ? "," : ""), (IV)nums[n]);
13211                     }
13212                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13213                 }
13214             }
13215         }
13216     } else if (k == GOSUB)
13217         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13218     else if (k == VERB) {
13219         if (!o->flags)
13220             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13221                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13222     } else if (k == LOGICAL)
13223         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
13224     else if (k == ANYOF) {
13225         int i, rangestart = -1;
13226         const U8 flags = ANYOF_FLAGS(o);
13227         int do_sep = 0;
13228
13229         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13230         static const char * const anyofs[] = {
13231             "\\w",
13232             "\\W",
13233             "\\s",
13234             "\\S",
13235             "\\d",
13236             "\\D",
13237             "[:alnum:]",
13238             "[:^alnum:]",
13239             "[:alpha:]",
13240             "[:^alpha:]",
13241             "[:ascii:]",
13242             "[:^ascii:]",
13243             "[:cntrl:]",
13244             "[:^cntrl:]",
13245             "[:graph:]",
13246             "[:^graph:]",
13247             "[:lower:]",
13248             "[:^lower:]",
13249             "[:print:]",
13250             "[:^print:]",
13251             "[:punct:]",
13252             "[:^punct:]",
13253             "[:upper:]",
13254             "[:^upper:]",
13255             "[:xdigit:]",
13256             "[:^xdigit:]",
13257             "[:space:]",
13258             "[:^space:]",
13259             "[:blank:]",
13260             "[:^blank:]"
13261         };
13262
13263         if (flags & ANYOF_LOCALE)
13264             sv_catpvs(sv, "{loc}");
13265         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13266             sv_catpvs(sv, "{i}");
13267         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13268         if (flags & ANYOF_INVERT)
13269             sv_catpvs(sv, "^");
13270
13271         /* output what the standard cp 0-255 bitmap matches */
13272         for (i = 0; i <= 256; i++) {
13273             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13274                 if (rangestart == -1)
13275                     rangestart = i;
13276             } else if (rangestart != -1) {
13277                 if (i <= rangestart + 3)
13278                     for (; rangestart < i; rangestart++)
13279                         put_byte(sv, rangestart);
13280                 else {
13281                     put_byte(sv, rangestart);
13282                     sv_catpvs(sv, "-");
13283                     put_byte(sv, i - 1);
13284                 }
13285                 do_sep = 1;
13286                 rangestart = -1;
13287             }
13288         }
13289
13290         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13291         /* output any special charclass tests (used entirely under use locale) */
13292         if (ANYOF_CLASS_TEST_ANY_SET(o))
13293             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13294                 if (ANYOF_CLASS_TEST(o,i)) {
13295                     sv_catpv(sv, anyofs[i]);
13296                     do_sep = 1;
13297                 }
13298
13299         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13300
13301         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13302             sv_catpvs(sv, "{non-utf8-latin1-all}");
13303         }
13304
13305         /* output information about the unicode matching */
13306         if (flags & ANYOF_UNICODE_ALL)
13307             sv_catpvs(sv, "{unicode_all}");
13308         else if (ANYOF_NONBITMAP(o))
13309             sv_catpvs(sv, "{unicode}");
13310         if (flags & ANYOF_NONBITMAP_NON_UTF8)
13311             sv_catpvs(sv, "{outside bitmap}");
13312
13313         if (ANYOF_NONBITMAP(o)) {
13314             SV *lv; /* Set if there is something outside the bit map */
13315             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13316             bool byte_output = FALSE;   /* If something in the bitmap has been
13317                                            output */
13318
13319             if (lv && lv != &PL_sv_undef) {
13320                 if (sw) {
13321                     U8 s[UTF8_MAXBYTES_CASE+1];
13322
13323                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13324                         uvchr_to_utf8(s, i);
13325
13326                         if (i < 256
13327                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
13328                                                                things already
13329                                                                output as part
13330                                                                of the bitmap */
13331                             && swash_fetch(sw, s, TRUE))
13332                         {
13333                             if (rangestart == -1)
13334                                 rangestart = i;
13335                         } else if (rangestart != -1) {
13336                             byte_output = TRUE;
13337                             if (i <= rangestart + 3)
13338                                 for (; rangestart < i; rangestart++) {
13339                                     put_byte(sv, rangestart);
13340                                 }
13341                             else {
13342                                 put_byte(sv, rangestart);
13343                                 sv_catpvs(sv, "-");
13344                                 put_byte(sv, i-1);
13345                             }
13346                             rangestart = -1;
13347                         }
13348                     }
13349                 }
13350
13351                 {
13352                     char *s = savesvpv(lv);
13353                     char * const origs = s;
13354
13355                     while (*s && *s != '\n')
13356                         s++;
13357
13358                     if (*s == '\n') {
13359                         const char * const t = ++s;
13360
13361                         if (byte_output) {
13362                             sv_catpvs(sv, " ");
13363                         }
13364
13365                         while (*s) {
13366                             if (*s == '\n') {
13367
13368                                 /* Truncate very long output */
13369                                 if (s - origs > 256) {
13370                                     Perl_sv_catpvf(aTHX_ sv,
13371                                                    "%.*s...",
13372                                                    (int) (s - origs - 1),
13373                                                    t);
13374                                     goto out_dump;
13375                                 }
13376                                 *s = ' ';
13377                             }
13378                             else if (*s == '\t') {
13379                                 *s = '-';
13380                             }
13381                             s++;
13382                         }
13383                         if (s[-1] == ' ')
13384                             s[-1] = 0;
13385
13386                         sv_catpv(sv, t);
13387                     }
13388
13389                 out_dump:
13390
13391                     Safefree(origs);
13392                 }
13393                 SvREFCNT_dec(lv);
13394             }
13395         }
13396
13397         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13398     }
13399     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13400         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13401 #else
13402     PERL_UNUSED_CONTEXT;
13403     PERL_UNUSED_ARG(sv);
13404     PERL_UNUSED_ARG(o);
13405     PERL_UNUSED_ARG(prog);
13406 #endif  /* DEBUGGING */
13407 }
13408
13409 SV *
13410 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13411 {                               /* Assume that RE_INTUIT is set */
13412     dVAR;
13413     struct regexp *const prog = (struct regexp *)SvANY(r);
13414     GET_RE_DEBUG_FLAGS_DECL;
13415
13416     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13417     PERL_UNUSED_CONTEXT;
13418
13419     DEBUG_COMPILE_r(
13420         {
13421             const char * const s = SvPV_nolen_const(prog->check_substr
13422                       ? prog->check_substr : prog->check_utf8);
13423
13424             if (!PL_colorset) reginitcolors();
13425             PerlIO_printf(Perl_debug_log,
13426                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13427                       PL_colors[4],
13428                       prog->check_substr ? "" : "utf8 ",
13429                       PL_colors[5],PL_colors[0],
13430                       s,
13431                       PL_colors[1],
13432                       (strlen(s) > 60 ? "..." : ""));
13433         } );
13434
13435     return prog->check_substr ? prog->check_substr : prog->check_utf8;
13436 }
13437
13438 /*
13439    pregfree()
13440
13441    handles refcounting and freeing the perl core regexp structure. When
13442    it is necessary to actually free the structure the first thing it
13443    does is call the 'free' method of the regexp_engine associated to
13444    the regexp, allowing the handling of the void *pprivate; member
13445    first. (This routine is not overridable by extensions, which is why
13446    the extensions free is called first.)
13447
13448    See regdupe and regdupe_internal if you change anything here.
13449 */
13450 #ifndef PERL_IN_XSUB_RE
13451 void
13452 Perl_pregfree(pTHX_ REGEXP *r)
13453 {
13454     SvREFCNT_dec(r);
13455 }
13456
13457 void
13458 Perl_pregfree2(pTHX_ REGEXP *rx)
13459 {
13460     dVAR;
13461     struct regexp *const r = (struct regexp *)SvANY(rx);
13462     GET_RE_DEBUG_FLAGS_DECL;
13463
13464     PERL_ARGS_ASSERT_PREGFREE2;
13465
13466     if (r->mother_re) {
13467         ReREFCNT_dec(r->mother_re);
13468     } else {
13469         CALLREGFREE_PVT(rx); /* free the private data */
13470         SvREFCNT_dec(RXp_PAREN_NAMES(r));
13471     }
13472     if (r->substrs) {
13473         SvREFCNT_dec(r->anchored_substr);
13474         SvREFCNT_dec(r->anchored_utf8);
13475         SvREFCNT_dec(r->float_substr);
13476         SvREFCNT_dec(r->float_utf8);
13477         Safefree(r->substrs);
13478     }
13479     RX_MATCH_COPY_FREE(rx);
13480 #ifdef PERL_OLD_COPY_ON_WRITE
13481     SvREFCNT_dec(r->saved_copy);
13482 #endif
13483     Safefree(r->offs);
13484     SvREFCNT_dec(r->qr_anoncv);
13485 }
13486
13487 /*  reg_temp_copy()
13488
13489     This is a hacky workaround to the structural issue of match results
13490     being stored in the regexp structure which is in turn stored in
13491     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13492     could be PL_curpm in multiple contexts, and could require multiple
13493     result sets being associated with the pattern simultaneously, such
13494     as when doing a recursive match with (??{$qr})
13495
13496     The solution is to make a lightweight copy of the regexp structure
13497     when a qr// is returned from the code executed by (??{$qr}) this
13498     lightweight copy doesn't actually own any of its data except for
13499     the starp/end and the actual regexp structure itself.
13500
13501 */
13502
13503
13504 REGEXP *
13505 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13506 {
13507     struct regexp *ret;
13508     struct regexp *const r = (struct regexp *)SvANY(rx);
13509     register const I32 npar = r->nparens+1;
13510
13511     PERL_ARGS_ASSERT_REG_TEMP_COPY;
13512
13513     if (!ret_x)
13514         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13515     ret = (struct regexp *)SvANY(ret_x);
13516
13517     (void)ReREFCNT_inc(rx);
13518     /* We can take advantage of the existing "copied buffer" mechanism in SVs
13519        by pointing directly at the buffer, but flagging that the allocated
13520        space in the copy is zero. As we've just done a struct copy, it's now
13521        a case of zero-ing that, rather than copying the current length.  */
13522     SvPV_set(ret_x, RX_WRAPPED(rx));
13523     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13524     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13525            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13526     SvLEN_set(ret_x, 0);
13527     SvSTASH_set(ret_x, NULL);
13528     SvMAGIC_set(ret_x, NULL);
13529     Newx(ret->offs, npar, regexp_paren_pair);
13530     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13531     if (r->substrs) {
13532         Newx(ret->substrs, 1, struct reg_substr_data);
13533         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13534
13535         SvREFCNT_inc_void(ret->anchored_substr);
13536         SvREFCNT_inc_void(ret->anchored_utf8);
13537         SvREFCNT_inc_void(ret->float_substr);
13538         SvREFCNT_inc_void(ret->float_utf8);
13539
13540         /* check_substr and check_utf8, if non-NULL, point to either their
13541            anchored or float namesakes, and don't hold a second reference.  */
13542     }
13543     RX_MATCH_COPIED_off(ret_x);
13544 #ifdef PERL_OLD_COPY_ON_WRITE
13545     ret->saved_copy = NULL;
13546 #endif
13547     ret->mother_re = rx;
13548     SvREFCNT_inc_void(ret->qr_anoncv);
13549
13550     return ret_x;
13551 }
13552 #endif
13553
13554 /* regfree_internal()
13555
13556    Free the private data in a regexp. This is overloadable by
13557    extensions. Perl takes care of the regexp structure in pregfree(),
13558    this covers the *pprivate pointer which technically perl doesn't
13559    know about, however of course we have to handle the
13560    regexp_internal structure when no extension is in use.
13561
13562    Note this is called before freeing anything in the regexp
13563    structure.
13564  */
13565
13566 void
13567 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13568 {
13569     dVAR;
13570     struct regexp *const r = (struct regexp *)SvANY(rx);
13571     RXi_GET_DECL(r,ri);
13572     GET_RE_DEBUG_FLAGS_DECL;
13573
13574     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13575
13576     DEBUG_COMPILE_r({
13577         if (!PL_colorset)
13578             reginitcolors();
13579         {
13580             SV *dsv= sv_newmortal();
13581             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13582                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
13583             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
13584                 PL_colors[4],PL_colors[5],s);
13585         }
13586     });
13587 #ifdef RE_TRACK_PATTERN_OFFSETS
13588     if (ri->u.offsets)
13589         Safefree(ri->u.offsets);             /* 20010421 MJD */
13590 #endif
13591     if (ri->code_blocks) {
13592         int n;
13593         for (n = 0; n < ri->num_code_blocks; n++)
13594             SvREFCNT_dec(ri->code_blocks[n].src_regex);
13595         Safefree(ri->code_blocks);
13596     }
13597
13598     if (ri->data) {
13599         int n = ri->data->count;
13600
13601         while (--n >= 0) {
13602           /* If you add a ->what type here, update the comment in regcomp.h */
13603             switch (ri->data->what[n]) {
13604             case 'a':
13605             case 'r':
13606             case 's':
13607             case 'S':
13608             case 'u':
13609                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13610                 break;
13611             case 'f':
13612                 Safefree(ri->data->data[n]);
13613                 break;
13614             case 'l':
13615             case 'L':
13616                 break;
13617             case 'T':
13618                 { /* Aho Corasick add-on structure for a trie node.
13619                      Used in stclass optimization only */
13620                     U32 refcount;
13621                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13622                     OP_REFCNT_LOCK;
13623                     refcount = --aho->refcount;
13624                     OP_REFCNT_UNLOCK;
13625                     if ( !refcount ) {
13626                         PerlMemShared_free(aho->states);
13627                         PerlMemShared_free(aho->fail);
13628                          /* do this last!!!! */
13629                         PerlMemShared_free(ri->data->data[n]);
13630                         PerlMemShared_free(ri->regstclass);
13631                     }
13632                 }
13633                 break;
13634             case 't':
13635                 {
13636                     /* trie structure. */
13637                     U32 refcount;
13638                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13639                     OP_REFCNT_LOCK;
13640                     refcount = --trie->refcount;
13641                     OP_REFCNT_UNLOCK;
13642                     if ( !refcount ) {
13643                         PerlMemShared_free(trie->charmap);
13644                         PerlMemShared_free(trie->states);
13645                         PerlMemShared_free(trie->trans);
13646                         if (trie->bitmap)
13647                             PerlMemShared_free(trie->bitmap);
13648                         if (trie->jump)
13649                             PerlMemShared_free(trie->jump);
13650                         PerlMemShared_free(trie->wordinfo);
13651                         /* do this last!!!! */
13652                         PerlMemShared_free(ri->data->data[n]);
13653                     }
13654                 }
13655                 break;
13656             default:
13657                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
13658             }
13659         }
13660         Safefree(ri->data->what);
13661         Safefree(ri->data);
13662     }
13663
13664     Safefree(ri);
13665 }
13666
13667 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
13668 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
13669 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
13670
13671 /*
13672    re_dup - duplicate a regexp.
13673
13674    This routine is expected to clone a given regexp structure. It is only
13675    compiled under USE_ITHREADS.
13676
13677    After all of the core data stored in struct regexp is duplicated
13678    the regexp_engine.dupe method is used to copy any private data
13679    stored in the *pprivate pointer. This allows extensions to handle
13680    any duplication it needs to do.
13681
13682    See pregfree() and regfree_internal() if you change anything here.
13683 */
13684 #if defined(USE_ITHREADS)
13685 #ifndef PERL_IN_XSUB_RE
13686 void
13687 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13688 {
13689     dVAR;
13690     I32 npar;
13691     const struct regexp *r = (const struct regexp *)SvANY(sstr);
13692     struct regexp *ret = (struct regexp *)SvANY(dstr);
13693
13694     PERL_ARGS_ASSERT_RE_DUP_GUTS;
13695
13696     npar = r->nparens+1;
13697     Newx(ret->offs, npar, regexp_paren_pair);
13698     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13699     if(ret->swap) {
13700         /* no need to copy these */
13701         Newx(ret->swap, npar, regexp_paren_pair);
13702     }
13703
13704     if (ret->substrs) {
13705         /* Do it this way to avoid reading from *r after the StructCopy().
13706            That way, if any of the sv_dup_inc()s dislodge *r from the L1
13707            cache, it doesn't matter.  */
13708         const bool anchored = r->check_substr
13709             ? r->check_substr == r->anchored_substr
13710             : r->check_utf8 == r->anchored_utf8;
13711         Newx(ret->substrs, 1, struct reg_substr_data);
13712         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13713
13714         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
13715         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
13716         ret->float_substr = sv_dup_inc(ret->float_substr, param);
13717         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
13718
13719         /* check_substr and check_utf8, if non-NULL, point to either their
13720            anchored or float namesakes, and don't hold a second reference.  */
13721
13722         if (ret->check_substr) {
13723             if (anchored) {
13724                 assert(r->check_utf8 == r->anchored_utf8);
13725                 ret->check_substr = ret->anchored_substr;
13726                 ret->check_utf8 = ret->anchored_utf8;
13727             } else {
13728                 assert(r->check_substr == r->float_substr);
13729                 assert(r->check_utf8 == r->float_utf8);
13730                 ret->check_substr = ret->float_substr;
13731                 ret->check_utf8 = ret->float_utf8;
13732             }
13733         } else if (ret->check_utf8) {
13734             if (anchored) {
13735                 ret->check_utf8 = ret->anchored_utf8;
13736             } else {
13737                 ret->check_utf8 = ret->float_utf8;
13738             }
13739         }
13740     }
13741
13742     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13743     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13744
13745     if (ret->pprivate)
13746         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
13747
13748     if (RX_MATCH_COPIED(dstr))
13749         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
13750     else
13751         ret->subbeg = NULL;
13752 #ifdef PERL_OLD_COPY_ON_WRITE
13753     ret->saved_copy = NULL;
13754 #endif
13755
13756     if (ret->mother_re) {
13757         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13758             /* Our storage points directly to our mother regexp, but that's
13759                1: a buffer in a different thread
13760                2: something we no longer hold a reference on
13761                so we need to copy it locally.  */
13762             /* Note we need to use SvCUR(), rather than
13763                SvLEN(), on our mother_re, because it, in
13764                turn, may well be pointing to its own mother_re.  */
13765             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13766                                    SvCUR(ret->mother_re)+1));
13767             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13768         }
13769         ret->mother_re      = NULL;
13770     }
13771     ret->gofs = 0;
13772 }
13773 #endif /* PERL_IN_XSUB_RE */
13774
13775 /*
13776    regdupe_internal()
13777
13778    This is the internal complement to regdupe() which is used to copy
13779    the structure pointed to by the *pprivate pointer in the regexp.
13780    This is the core version of the extension overridable cloning hook.
13781    The regexp structure being duplicated will be copied by perl prior
13782    to this and will be provided as the regexp *r argument, however
13783    with the /old/ structures pprivate pointer value. Thus this routine
13784    may override any copying normally done by perl.
13785
13786    It returns a pointer to the new regexp_internal structure.
13787 */
13788
13789 void *
13790 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13791 {
13792     dVAR;
13793     struct regexp *const r = (struct regexp *)SvANY(rx);
13794     regexp_internal *reti;
13795     int len;
13796     RXi_GET_DECL(r,ri);
13797
13798     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13799
13800     len = ProgLen(ri);
13801
13802     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13803     Copy(ri->program, reti->program, len+1, regnode);
13804
13805     reti->num_code_blocks = ri->num_code_blocks;
13806     if (ri->code_blocks) {
13807         int n;
13808         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
13809                 struct reg_code_block);
13810         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
13811                 struct reg_code_block);
13812         for (n = 0; n < ri->num_code_blocks; n++)
13813              reti->code_blocks[n].src_regex = (REGEXP*)
13814                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
13815     }
13816     else
13817         reti->code_blocks = NULL;
13818
13819     reti->regstclass = NULL;
13820
13821     if (ri->data) {
13822         struct reg_data *d;
13823         const int count = ri->data->count;
13824         int i;
13825
13826         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13827                 char, struct reg_data);
13828         Newx(d->what, count, U8);
13829
13830         d->count = count;
13831         for (i = 0; i < count; i++) {
13832             d->what[i] = ri->data->what[i];
13833             switch (d->what[i]) {
13834                 /* see also regcomp.h and regfree_internal() */
13835             case 'a': /* actually an AV, but the dup function is identical.  */
13836             case 'r':
13837             case 's':
13838             case 'S':
13839             case 'u': /* actually an HV, but the dup function is identical.  */
13840                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13841                 break;
13842             case 'f':
13843                 /* This is cheating. */
13844                 Newx(d->data[i], 1, struct regnode_charclass_class);
13845                 StructCopy(ri->data->data[i], d->data[i],
13846                             struct regnode_charclass_class);
13847                 reti->regstclass = (regnode*)d->data[i];
13848                 break;
13849             case 'T':
13850                 /* Trie stclasses are readonly and can thus be shared
13851                  * without duplication. We free the stclass in pregfree
13852                  * when the corresponding reg_ac_data struct is freed.
13853                  */
13854                 reti->regstclass= ri->regstclass;
13855                 /* Fall through */
13856             case 't':
13857                 OP_REFCNT_LOCK;
13858                 ((reg_trie_data*)ri->data->data[i])->refcount++;
13859                 OP_REFCNT_UNLOCK;
13860                 /* Fall through */
13861             case 'l':
13862             case 'L':
13863                 d->data[i] = ri->data->data[i];
13864                 break;
13865             default:
13866                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13867             }
13868         }
13869
13870         reti->data = d;
13871     }
13872     else
13873         reti->data = NULL;
13874
13875     reti->name_list_idx = ri->name_list_idx;
13876
13877 #ifdef RE_TRACK_PATTERN_OFFSETS
13878     if (ri->u.offsets) {
13879         Newx(reti->u.offsets, 2*len+1, U32);
13880         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13881     }
13882 #else
13883     SetProgLen(reti,len);
13884 #endif
13885
13886     return (void*)reti;
13887 }
13888
13889 #endif    /* USE_ITHREADS */
13890
13891 #ifndef PERL_IN_XSUB_RE
13892
13893 /*
13894  - regnext - dig the "next" pointer out of a node
13895  */
13896 regnode *
13897 Perl_regnext(pTHX_ register regnode *p)
13898 {
13899     dVAR;
13900     register I32 offset;
13901
13902     if (!p)
13903         return(NULL);
13904
13905     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
13906         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13907     }
13908
13909     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13910     if (offset == 0)
13911         return(NULL);
13912
13913     return(p+offset);
13914 }
13915 #endif
13916
13917 STATIC void
13918 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13919 {
13920     va_list args;
13921     STRLEN l1 = strlen(pat1);
13922     STRLEN l2 = strlen(pat2);
13923     char buf[512];
13924     SV *msv;
13925     const char *message;
13926
13927     PERL_ARGS_ASSERT_RE_CROAK2;
13928
13929     if (l1 > 510)
13930         l1 = 510;
13931     if (l1 + l2 > 510)
13932         l2 = 510 - l1;
13933     Copy(pat1, buf, l1 , char);
13934     Copy(pat2, buf + l1, l2 , char);
13935     buf[l1 + l2] = '\n';
13936     buf[l1 + l2 + 1] = '\0';
13937 #ifdef I_STDARG
13938     /* ANSI variant takes additional second argument */
13939     va_start(args, pat2);
13940 #else
13941     va_start(args);
13942 #endif
13943     msv = vmess(buf, &args);
13944     va_end(args);
13945     message = SvPV_const(msv,l1);
13946     if (l1 > 512)
13947         l1 = 512;
13948     Copy(message, buf, l1 , char);
13949     buf[l1-1] = '\0';                   /* Overwrite \n */
13950     Perl_croak(aTHX_ "%s", buf);
13951 }
13952
13953 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13954
13955 #ifndef PERL_IN_XSUB_RE
13956 void
13957 Perl_save_re_context(pTHX)
13958 {
13959     dVAR;
13960
13961     struct re_save_state *state;
13962
13963     SAVEVPTR(PL_curcop);
13964     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13965
13966     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13967     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13968     SSPUSHUV(SAVEt_RE_STATE);
13969
13970     Copy(&PL_reg_state, state, 1, struct re_save_state);
13971
13972     PL_reg_oldsaved = NULL;
13973     PL_reg_oldsavedlen = 0;
13974     PL_reg_maxiter = 0;
13975     PL_reg_leftiter = 0;
13976     PL_reg_poscache = NULL;
13977     PL_reg_poscache_size = 0;
13978 #ifdef PERL_OLD_COPY_ON_WRITE
13979     PL_nrs = NULL;
13980 #endif
13981
13982     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13983     if (PL_curpm) {
13984         const REGEXP * const rx = PM_GETRE(PL_curpm);
13985         if (rx) {
13986             U32 i;
13987             for (i = 1; i <= RX_NPARENS(rx); i++) {
13988                 char digits[TYPE_CHARS(long)];
13989                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13990                 GV *const *const gvp
13991                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13992
13993                 if (gvp) {
13994                     GV * const gv = *gvp;
13995                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13996                         save_scalar(gv);
13997                 }
13998             }
13999         }
14000     }
14001 }
14002 #endif
14003
14004 static void
14005 clear_re(pTHX_ void *r)
14006 {
14007     dVAR;
14008     ReREFCNT_dec((REGEXP *)r);
14009 }
14010
14011 #ifdef DEBUGGING
14012
14013 STATIC void
14014 S_put_byte(pTHX_ SV *sv, int c)
14015 {
14016     PERL_ARGS_ASSERT_PUT_BYTE;
14017
14018     /* Our definition of isPRINT() ignores locales, so only bytes that are
14019        not part of UTF-8 are considered printable. I assume that the same
14020        holds for UTF-EBCDIC.
14021        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14022        which Wikipedia says:
14023
14024        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14025        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14026        identical, to the ASCII delete (DEL) or rubout control character.
14027        ) So the old condition can be simplified to !isPRINT(c)  */
14028     if (!isPRINT(c)) {
14029         if (c < 256) {
14030             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14031         }
14032         else {
14033             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14034         }
14035     }
14036     else {
14037         const char string = c;
14038         if (c == '-' || c == ']' || c == '\\' || c == '^')
14039             sv_catpvs(sv, "\\");
14040         sv_catpvn(sv, &string, 1);
14041     }
14042 }
14043
14044
14045 #define CLEAR_OPTSTART \
14046     if (optstart) STMT_START { \
14047             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14048             optstart=NULL; \
14049     } STMT_END
14050
14051 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14052
14053 STATIC const regnode *
14054 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14055             const regnode *last, const regnode *plast,
14056             SV* sv, I32 indent, U32 depth)
14057 {
14058     dVAR;
14059     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
14060     register const regnode *next;
14061     const regnode *optstart= NULL;
14062
14063     RXi_GET_DECL(r,ri);
14064     GET_RE_DEBUG_FLAGS_DECL;
14065
14066     PERL_ARGS_ASSERT_DUMPUNTIL;
14067
14068 #ifdef DEBUG_DUMPUNTIL
14069     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14070         last ? last-start : 0,plast ? plast-start : 0);
14071 #endif
14072
14073     if (plast && plast < last)
14074         last= plast;
14075
14076     while (PL_regkind[op] != END && (!last || node < last)) {
14077         /* While that wasn't END last time... */
14078         NODE_ALIGN(node);
14079         op = OP(node);
14080         if (op == CLOSE || op == WHILEM)
14081             indent--;
14082         next = regnext((regnode *)node);
14083
14084         /* Where, what. */
14085         if (OP(node) == OPTIMIZED) {
14086             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14087                 optstart = node;
14088             else
14089                 goto after_print;
14090         } else
14091             CLEAR_OPTSTART;
14092
14093         regprop(r, sv, node);
14094         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14095                       (int)(2*indent + 1), "", SvPVX_const(sv));
14096
14097         if (OP(node) != OPTIMIZED) {
14098             if (next == NULL)           /* Next ptr. */
14099                 PerlIO_printf(Perl_debug_log, " (0)");
14100             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14101                 PerlIO_printf(Perl_debug_log, " (FAIL)");
14102             else
14103                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14104             (void)PerlIO_putc(Perl_debug_log, '\n');
14105         }
14106
14107       after_print:
14108         if (PL_regkind[(U8)op] == BRANCHJ) {
14109             assert(next);
14110             {
14111                 register const regnode *nnode = (OP(next) == LONGJMP
14112                                              ? regnext((regnode *)next)
14113                                              : next);
14114                 if (last && nnode > last)
14115                     nnode = last;
14116                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14117             }
14118         }
14119         else if (PL_regkind[(U8)op] == BRANCH) {
14120             assert(next);
14121             DUMPUNTIL(NEXTOPER(node), next);
14122         }
14123         else if ( PL_regkind[(U8)op]  == TRIE ) {
14124             const regnode *this_trie = node;
14125             const char op = OP(node);
14126             const U32 n = ARG(node);
14127             const reg_ac_data * const ac = op>=AHOCORASICK ?
14128                (reg_ac_data *)ri->data->data[n] :
14129                NULL;
14130             const reg_trie_data * const trie =
14131                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14132 #ifdef DEBUGGING
14133             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14134 #endif
14135             const regnode *nextbranch= NULL;
14136             I32 word_idx;
14137             sv_setpvs(sv, "");
14138             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14139                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14140
14141                 PerlIO_printf(Perl_debug_log, "%*s%s ",
14142                    (int)(2*(indent+3)), "",
14143                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14144                             PL_colors[0], PL_colors[1],
14145                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14146                             PERL_PV_PRETTY_ELLIPSES    |
14147                             PERL_PV_PRETTY_LTGT
14148                             )
14149                             : "???"
14150                 );
14151                 if (trie->jump) {
14152                     U16 dist= trie->jump[word_idx+1];
14153                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14154                                   (UV)((dist ? this_trie + dist : next) - start));
14155                     if (dist) {
14156                         if (!nextbranch)
14157                             nextbranch= this_trie + trie->jump[0];
14158                         DUMPUNTIL(this_trie + dist, nextbranch);
14159                     }
14160                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14161                         nextbranch= regnext((regnode *)nextbranch);
14162                 } else {
14163                     PerlIO_printf(Perl_debug_log, "\n");
14164                 }
14165             }
14166             if (last && next > last)
14167                 node= last;
14168             else
14169                 node= next;
14170         }
14171         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
14172             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14173                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14174         }
14175         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14176             assert(next);
14177             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14178         }
14179         else if ( op == PLUS || op == STAR) {
14180             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14181         }
14182         else if (PL_regkind[(U8)op] == ANYOF) {
14183             /* arglen 1 + class block */
14184             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14185                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14186             node = NEXTOPER(node);
14187         }
14188         else if (PL_regkind[(U8)op] == EXACT) {
14189             /* Literal string, where present. */
14190             node += NODE_SZ_STR(node) - 1;
14191             node = NEXTOPER(node);
14192         }
14193         else {
14194             node = NEXTOPER(node);
14195             node += regarglen[(U8)op];
14196         }
14197         if (op == CURLYX || op == OPEN)
14198             indent++;
14199     }
14200     CLEAR_OPTSTART;
14201 #ifdef DEBUG_DUMPUNTIL
14202     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14203 #endif
14204     return node;
14205 }
14206
14207 #endif  /* DEBUGGING */
14208
14209 /*
14210  * Local variables:
14211  * c-indentation-style: bsd
14212  * c-basic-offset: 4
14213  * indent-tabs-mode: nil
14214  * End:
14215  *
14216  * ex: set ts=8 sts=4 sw=4 et:
14217  */