src/5015008/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * be confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89 #ifndef PERL_IN_XSUB_RE
  90 #  include "charclass_invlists.h"
  91 #endif
  92
  93 #ifdef op
  94 #undef op
  95 #endif /* op */
  96
  97 #ifdef MSDOS
  98 #  if defined(BUGGY_MSC6)
  99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 100 #    pragma optimize("a",off)
 101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 102 #    pragma optimize("w",on )
 103 #  endif /* BUGGY_MSC6 */
 104 #endif /* MSDOS */
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109 /* State carried while compiling one pattern; most fields are reached through the RExC_* macros. */
 110 typedef struct RExC_state_t {
 111     U32         flags;                  /* are we folding, multilining? */
 112     char        *precomp;               /* uncompiled string. */
 113     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 114     regexp      *rx;                    /* perl core regexp structure */
 115     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 116     char        *start;                 /* Start of input for compile */
 117     char        *end;                   /* End of input for compile */
 118     char        *parse;                 /* Input-scan pointer. */
 119     I32         whilem_seen;            /* number of WHILEM in this expr */
 120     regnode     *emit_start;            /* Start of emitted-code area */
 121     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 122     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 123     I32         naughty;                /* How bad is this pattern? */
 124     I32         sawback;                /* Did we see \1, ...? */
 125     U32         seen;                   /* NOTE(review): presumably flag bits for constructs seen -- confirm */
 126     I32         size;                   /* Code size. */
 127     I32         npar;                   /* Capture buffer count, (OPEN). */
 128     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 129     I32         nestroot;               /* root parens we are in - used by accept */
 130     I32         extralen;
 131     I32         seen_zerolen;
 132     I32         seen_evals;             /* NOTE(review): presumably a count of code evals seen -- confirm */
 133     regnode     **open_parens;          /* pointers to open parens */
 134     regnode     **close_parens;         /* pointers to close parens */
 135     regnode     *opend;                 /* END node in program */
 136     I32         utf8;           /* whether the pattern is utf8 or not */
 137     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 138                                 /* XXX use this for future optimisation of case
 139                                  * where pattern must be upgraded to utf8. */
 140     I32         uni_semantics;  /* If a d charset modifier should use unicode
 141                                    rules, even if the pattern is not in
 142                                    utf8 */
 143     HV          *paren_names;           /* Paren names */
 144
 145     regnode     **recurse;              /* Recurse regops */
 146     I32         recurse_count;          /* Number of recurse regops */
 147     I32         in_lookbehind;
 148     I32         contains_locale;
 149     I32         override_recoding;
 150 #if ADD_TO_REGEXEC
 151     char        *starttry;              /* -Dr: where regtry was called. */
 152 #define RExC_starttry   (pRExC_state->starttry)
 153 #endif
 154 #ifdef DEBUGGING
 155     const char  *lastparse;
 156     I32         lastnum;
 157     AV          *paren_name_list;       /* idx -> name */
 158 #define RExC_lastparse  (pRExC_state->lastparse)
 159 #define RExC_lastnum    (pRExC_state->lastnum)
 160 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 161 #endif
 162 } RExC_state_t;
 163 /* Shorthand for RExC_state_t fields; every macro expects a pRExC_state pointer to be in scope. */
 164 #define RExC_flags      (pRExC_state->flags)
 165 #define RExC_precomp    (pRExC_state->precomp)
 166 #define RExC_rx_sv      (pRExC_state->rx_sv)
 167 #define RExC_rx         (pRExC_state->rx)
 168 #define RExC_rxi        (pRExC_state->rxi)
 169 #define RExC_start      (pRExC_state->start)
 170 #define RExC_end        (pRExC_state->end)
 171 #define RExC_parse      (pRExC_state->parse)
 172 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 173 #ifdef RE_TRACK_PATTERN_OFFSETS
 174 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others: reached through rxi */
 175 #endif
 176 #define RExC_emit       (pRExC_state->emit)
 177 #define RExC_emit_start (pRExC_state->emit_start)
 178 #define RExC_emit_bound (pRExC_state->emit_bound)
 179 #define RExC_naughty    (pRExC_state->naughty)
 180 #define RExC_sawback    (pRExC_state->sawback)
 181 #define RExC_seen       (pRExC_state->seen)
 182 #define RExC_size       (pRExC_state->size)
 183 #define RExC_npar       (pRExC_state->npar)
 184 #define RExC_nestroot   (pRExC_state->nestroot)
 185 #define RExC_extralen   (pRExC_state->extralen)
 186 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 187 #define RExC_seen_evals (pRExC_state->seen_evals)
 188 #define RExC_utf8       (pRExC_state->utf8)
 189 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 190 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 191 #define RExC_open_parens        (pRExC_state->open_parens)
 192 #define RExC_close_parens       (pRExC_state->close_parens)
 193 #define RExC_opend      (pRExC_state->opend)
 194 #define RExC_paren_names        (pRExC_state->paren_names)
 195 #define RExC_recurse    (pRExC_state->recurse)
 196 #define RExC_recurse_count      (pRExC_state->recurse_count)
 197 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 198 #define RExC_contains_locale    (pRExC_state->contains_locale)
 199 #define RExC_override_recoding  (pRExC_state->override_recoding)
 200
 201 /* Quantifier tests: ISMULT1 takes a char, ISMULT2 a pointer (s is evaluated more than once). */
 202 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 203 #define ISMULT2(s)      ((*(s)) == '*' || (*(s)) == '+' || (*(s)) == '?' || \
 204         ((*(s)) == '{' && regcurly(s)))
 205
 206 #ifdef SPSTART
 207 #undef SPSTART          /* dratted cpp namespace... */
 208 #endif
 209 /*
 210  * Flags to be passed up and down.
 211  */
 212 #define WORST           0       /* Worst case. */
 213 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 214
 215 /* Simple enough to be a STAR/PLUS operand; in an EXACT node it must be a single
 216  * character and, if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 217 #define SIMPLE          0x02
 218 #define SPSTART         0x04    /* Starts with * or +. */
 219 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 220 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 221
 222 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 223
 224 /* whether trie related optimizations are enabled */
 225 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 226 #define TRIE_STUDY_OPT
 227 #define FULL_TRIE_STUDY
 228 #define TRIE_STCLASS
 229 #endif
 230
 231
 232 /* Paren bitmap helpers: u8str is indexed as a U8 array holding one bit per paren number. */
 233 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 234 #define PBITVAL(paren) (1 << ((paren) & 7))
 235 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 236 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 237 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 238
 239 /* If not already in utf8, do a longjmp back to the beginning */
 240 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 241 #define REQUIRE_UTF8    STMT_START {                                       \
 242                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 243                         } STMT_END
 244
 245 /* About scan_data_t.
 246
 247   During optimisation we recurse through the regexp program performing
 248   various inplace (keyhole style) optimisations. In addition study_chunk
 249   and scan_commit populate this data structure with information about
 250   what strings MUST appear in the pattern. We look for the longest
 251   string that must appear at a fixed location, and we look for the
 252   longest string that may appear at a floating location. So for instance
 253   in the pattern:
 254
 255     /FOO[xX]A.*B[xX]BAR/
 256
 257   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 258   strings (because they follow a .* construct). study_chunk will identify
 259   both FOO and BAR as being the longest fixed and floating strings respectively.
 260
 261   The strings can be composites, for instance
 262
 263      /(f)(o)(o)/
 264
 265   will result in a composite fixed substring 'foo'.
 266
 267   For each string some basic information is maintained:
 268
 269   - offset or min_offset
 270     This is the position the string must appear at, or not before.
 271     It also implicitly (when combined with minlenp) tells us how many
 272     characters must match before the string we are searching for.
 273     Likewise when combined with minlenp and the length of the string it
 274     tells us how many characters must appear after the string we have
 275     found.
 276
 277   - max_offset
 278     Only used for floating strings. This is the rightmost point that
 279     the string can appear at. If set to I32 max it indicates that the
 280     string can occur infinitely far to the right.
 281
 282   - minlenp
 283     A pointer to the minimum length of the pattern that the string
 284     was found inside. This is important as in the case of positive
 285     lookahead or positive lookbehind we can have multiple patterns
 286     involved. Consider
 287
 288     /(?=FOO).*F/
 289
 290     The minimum length of the pattern overall is 3, the minimum length
 291     of the lookahead part is 3, but the minimum length of the part that
 292     will actually match is 1. So 'FOO's minimum length is 3, but the
 293     minimum length for the F is 1. This is important as the minimum length
 294     is used to determine offsets in front of and behind the string being
 295     looked for.  Since strings can be composites this is the length of the
 296     pattern at the time it was committed with a scan_commit. Note that
 297     the length is calculated by study_chunk, so that the minimum lengths
 298     are not known until the full pattern has been compiled, thus the
 299     pointer to the value.
 300
 301   - lookbehind
 302
 303     In the case of lookbehind the string being searched for can be
 304     offset past the start point of the final matching string.
 305     If this value was just blithely removed from the min_offset it would
 306     invalidate some of the calculations for how many chars must match
 307     before or after (as they are derived from min_offset and minlen and
 308     the length of the string being searched for).
 309     When the final pattern is compiled and the data is moved from the
 310     scan_data_t structure into the regexp structure the information
 311     about lookbehind is factored in, with the information that would
 312     have been lost precalculated in the end_shift field for the
 313     associated string.
 314
 315   The fields pos_min and pos_delta are used to store the minimum offset
 316   and the delta to the maximum offset at the current point in the pattern.
 317
 318 */
 319
 320 typedef struct scan_data_t {
 321     /*I32 len_min;      unused */
 322     /*I32 len_delta;    unused */
 323     I32 pos_min;            /* minimum offset at the current point in the pattern */
 324     I32 pos_delta;          /* delta from pos_min to the maximum offset */
 325     SV *last_found;         /* candidate string; S_scan_commit() may copy it into *longest */
 326     I32 last_end;           /* min value, <0 unless valid. */
 327     I32 last_start_min;     /* becomes the (min) offset when a non-empty last_found is committed */
 328     I32 last_start_max;     /* becomes offset_float_max when a non-empty last_found is committed */
 329     SV **longest;           /* Either &longest_fixed, or &longest_float. */
 330     SV *longest_fixed;      /* longest fixed string found in pattern */
 331     I32 offset_fixed;       /* offset where it starts */
 332     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 333     I32 lookbehind_fixed;   /* is the position of the string modified by LB */
 334     SV *longest_float;      /* longest floating string found in pattern */
 335     I32 offset_float_min;   /* earliest point in string it can appear */
 336     I32 offset_float_max;   /* latest point in string it can appear */
 337     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 338     I32 lookbehind_float;   /* is the position of the string modified by LB */
 339     I32 flags;              /* tested against SF_* bits, e.g. SF_BEFORE_EOL */
 340     I32 whilem_c;
 341     I32 *last_closep;
 342     struct regnode_charclass_class *start_class;
 343 } scan_data_t;
 344
 345 /*
 346  * Forward declarations for pregcomp()'s friends.
 347  */
 348
 349 static const scan_data_t zero_scan_data =
 350   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 351
 352 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 353 #define SF_BEFORE_SEOL          0x0001
 354 #define SF_BEFORE_MEOL          0x0002
 355 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 356 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 357
 358 #ifdef NO_UNARY_PLUS
 359 #  define SF_FIX_SHIFT_EOL      (0+2)
 360 #  define SF_FL_SHIFT_EOL               (0+4)
 361 #else
 362 #  define SF_FIX_SHIFT_EOL      (+2)
 363 #  define SF_FL_SHIFT_EOL               (+4)
 364 #endif
 365
 366 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 367 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 368
 369 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 370 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 371 #define SF_IS_INF               0x0040
 372 #define SF_HAS_PAR              0x0080
 373 #define SF_IN_PAR               0x0100
 374 #define SF_HAS_EVAL             0x0200
 375 #define SCF_DO_SUBSTR           0x0400
 376 #define SCF_DO_STCLASS_AND      0x0800
 377 #define SCF_DO_STCLASS_OR       0x1000
 378 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 379 #define SCF_WHILEM_VISITED_POS  0x2000
 380
 381 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 382 #define SCF_SEEN_ACCEPT         0x8000
 383
 384 #define UTF cBOOL(RExC_utf8)
 385 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 386 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 387 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 388 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 389 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 390 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 391 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 392
 393 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 394
 395 #define OOB_UNICODE             12345678
 396 #define OOB_NAMEDCLASS          -1
 397 /* Length of sv / distance from b to a: in characters when UTF, otherwise in bytes. */
 398 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 399 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : (a) - (b))
 400
 401
 402 /* length of regex to show in messages that don't mark a position within */
 403 #define RegexLengthToShowInErrorMessages 127
 404
 405 /*
 406  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 407  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 408  * op/pragma/warn/regcomp.
 409  */
 410 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 411 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 412
 413 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 414
 415 /*
 416  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 417  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 418  * "...".
 419  */
 420 #define _FAIL(code) STMT_START {                                        \
 421     const char *ellipses = "";                                          \
 422     IV len = RExC_end - RExC_precomp;                                   \
 423                                                                         \
 424     if (!SIZE_ONLY)                                                     \
 425         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 426     if (len > RegexLengthToShowInErrorMessages) {                       \
 427         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 428         len = RegexLengthToShowInErrorMessages - 10;                    \
 429         ellipses = "...";                                               \
 430     }                                                                   \
 431     code;                                                               \
 432 } STMT_END
 433
 434 #define FAIL(msg) _FAIL(                            \
 435     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 436             msg, (int)len, RExC_precomp, ellipses))
 437
 438 #define FAIL2(msg,arg) _FAIL(                       \
 439     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 440             arg, (int)len, RExC_precomp, ellipses))
 441
 442 /*
 443  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 444  */
 445 #define Simple_vFAIL(m) STMT_START {                                    \
 446     const IV offset = RExC_parse - RExC_precomp;                        \
 447     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 448             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 449 } STMT_END
 450
 451 /*
 452  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 453  */
 454 #define vFAIL(m) STMT_START {                           \
 455     if (!SIZE_ONLY)                                     \
 456         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 457     Simple_vFAIL(m);                                    \
 458 } STMT_END
 459
 460 /*
 461  * Like Simple_vFAIL(), but accepts two arguments.
 462  */
 463 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 464     const IV offset = RExC_parse - RExC_precomp;                        \
 465     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 466             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 467 } STMT_END
 468
 469 /*
 470  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 471  */
 472 #define vFAIL2(m,a1) STMT_START {                       \
 473     if (!SIZE_ONLY)                                     \
 474         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 475     Simple_vFAIL2(m, a1);                               \
 476 } STMT_END
 477
 478
 479 /*
 480  * Like Simple_vFAIL(), but accepts three arguments.
 481  */
 482 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 483     const IV offset = RExC_parse - RExC_precomp;                \
 484     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 485             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 486 } STMT_END
 487
 488 /*
 489  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 490  */
 491 #define vFAIL3(m,a1,a2) STMT_START {                    \
 492     if (!SIZE_ONLY)                                     \
 493         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 494     Simple_vFAIL3(m, a1, a2);                           \
 495 } STMT_END
 496
 497 /*
 498  * Like Simple_vFAIL(), but accepts four arguments.
 499  */
 500 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 501     const IV offset = RExC_parse - RExC_precomp;                \
 502     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 503             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 504 } STMT_END
 505
 506 #define ckWARNreg(loc,m) STMT_START {                                   \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 509             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 510 } STMT_END
 511
 512 #define ckWARNregdep(loc,m) STMT_START {                                \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 515             m REPORT_LOCATION,                                          \
 516             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 517 } STMT_END
 518
 519 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 520     const IV offset = loc - RExC_precomp;                               \
 521     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 522             m REPORT_LOCATION,                                          \
 523             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 524 } STMT_END
 525
 526 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 527     const IV offset = loc - RExC_precomp;                               \
 528     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 529             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 530 } STMT_END
 531
 532 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 533     const IV offset = loc - RExC_precomp;                               \
 534     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 535             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 536 } STMT_END
 537
 538 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 539     const IV offset = loc - RExC_precomp;                               \
 540     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 541             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 542 } STMT_END
 543
 544 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 545     const IV offset = loc - RExC_precomp;                               \
 546     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 547             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 548 } STMT_END
 549
 550 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 551     const IV offset = loc - RExC_precomp;                               \
 552     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 553             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 554 } STMT_END
 555
 556 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 557     const IV offset = loc - RExC_precomp;                               \
 558     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 559             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 560 } STMT_END
 561
 562
 563 /* Allow for side effects in s */
 564 #define REGC(c,s) STMT_START {                  \
 565     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 566 } STMT_END
 567
 568 /* Macros for recording node offsets.   20001227 mjd@plover.com
 569  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 570  * element 2*n-1 of the array.  Element #2n holds the byte length of node #n.
 571  * Element 0 holds the number n.
 572  * Position is 1 indexed.
 573  */
 574 #ifndef RE_TRACK_PATTERN_OFFSETS
 575 #define Set_Node_Offset_To_R(node,byte)
 576 #define Set_Node_Offset(node,byte)
 577 #define Set_Cur_Node_Offset
 578 #define Set_Node_Length_To_R(node,len)
 579 #define Set_Node_Length(node,len)
 580 #define Set_Node_Cur_Length(node)
 581 #define Node_Offset(n)
 582 #define Node_Length(n)
 583 #define Set_Node_Offset_Length(node,offset,len)
 584 #define ProgLen(ri) ri->u.proglen
 585 #define SetProgLen(ri,x) ri->u.proglen = x
 586 #else
 587 #define ProgLen(ri) ri->u.offsets[0]
 588 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 589 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 590     if (! SIZE_ONLY) {                                                  \
 591         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 592                     __LINE__, (int)(node), (int)(byte)));               \
 593         if((node) < 0) {                                                \
 594             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 595         } else {                                                        \
 596             RExC_offsets[2*(node)-1] = (byte);                          \
 597         }                                                               \
 598     }                                                                   \
 599 } STMT_END
 600
 601 #define Set_Node_Offset(node,byte) \
 602     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 603 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 604
 605 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 606     if (! SIZE_ONLY) {                                                  \
 607         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 608                 __LINE__, (int)(node), (int)(len)));                    \
 609         if((node) < 0) {                                                \
 610             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 611         } else {                                                        \
 612             RExC_offsets[2*(node)] = (len);                             \
 613         }                                                               \
 614     }                                                                   \
 615 } STMT_END
 616
 617 #define Set_Node_Length(node,len) \
 618     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 619 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 620 #define Set_Node_Cur_Length(node) \
 621     Set_Node_Length(node, RExC_parse - parse_start)
 622
 623 /* Get offsets and lengths */
 624 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 625 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 626
 627 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 628     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 629     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 630 } STMT_END
 631 #endif
 632
 633 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 634 #define EXPERIMENTAL_INPLACESCAN
 635 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 636
/* DEBUG_STUDYDATA(str, data, depth)
 *
 * Debugging aid that dumps the contents of a scan data structure to
 * Perl_debug_log.
 *
 *   str   : label for the line; it is pasted directly into the format
 *           string, so it must be a string literal (and a '%' in it would
 *           be read as a conversion)
 *   data  : pointer to the structure to dump; nothing is printed when it
 *           is NULL
 *   depth : nesting depth; the line is indented by depth*2 columns
 *
 * The first printf always shows pos_min/pos_delta, flags, whilem_c and the
 * value behind last_closep (-1 when that pointer is NULL).  When
 * data->last_found is set, a second printf shows last_found and the
 * longest fixed and floating strings with their offsets; a '*' marks
 * whichever of the two data->longest currently points at.
 *
 * NOTE: the expansion refers to a variable named 'is_inf' that is not a
 * macro parameter, so one has to be in scope wherever the macro is used.
 * NOTE: the definition itself ends in ';', so the usual
 * 'DEBUG_STUDYDATA(...);' call form leaves an extra empty statement.
 * The whole body is handed to DEBUG_OPTIMISE_MORE_r(), which is defined
 * elsewhere (presumably it runs the code only when the matching debug
 * flag is enabled -- confirm there). */
#define DEBUG_STUDYDATA(str,data,depth)                              \
DEBUG_OPTIMISE_MORE_r(if(data){                                      \
    PerlIO_printf(Perl_debug_log,                                    \
        "%*s" str "Pos:%"IVdf"/%"IVdf                                \
        " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
        (int)(depth)*2, "",                                          \
        (IV)((data)->pos_min),                                       \
        (IV)((data)->pos_delta),                                     \
        (UV)((data)->flags),                                         \
        (IV)((data)->whilem_c),                                      \
        (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
        is_inf ? "INF " : ""                                         \
    );                                                               \
    if ((data)->last_found)                                          \
        PerlIO_printf(Perl_debug_log,                                \
            "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
            " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
            SvPVX_const((data)->last_found),                         \
            (IV)((data)->last_end),                                  \
            (IV)((data)->last_start_min),                            \
            (IV)((data)->last_start_max),                            \
            ((data)->longest &&                                      \
             (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
            SvPVX_const((data)->longest_fixed),                      \
            (IV)((data)->offset_fixed),                              \
            ((data)->longest &&                                      \
             (data)->longest==&((data)->longest_float)) ? "*" : "",  \
            SvPVX_const((data)->longest_float),                      \
            (IV)((data)->offset_float_min),                          \
            (IV)((data)->offset_float_max)                           \
        );                                                           \
    PerlIO_printf(Perl_debug_log,"\n");                              \
});
 670
/* Prototype only: the definition of clear_re() is not in this part of the
 * file. */
static void clear_re(pTHX_ void *r);
 672
 673 /* Mark that we cannot extend a found fixed substring at this point.
 674    Update the longest found anchored substring and the longest found
 675    floating substrings if needed. */
 676
 677 STATIC void
 678 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 679 {
 680     const STRLEN l = CHR_SVLEN(data->last_found);
 681     const STRLEN old_l = CHR_SVLEN(*data->longest);
 682     GET_RE_DEBUG_FLAGS_DECL;
 683
 684     PERL_ARGS_ASSERT_SCAN_COMMIT;
 685
 686     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 687         SvSetMagicSV(*data->longest, data->last_found);
 688         if (*data->longest == data->longest_fixed) {
 689             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 690             if (data->flags & SF_BEFORE_EOL)
 691                 data->flags
 692                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 693             else
 694                 data->flags &= ~SF_FIX_BEFORE_EOL;
 695             data->minlen_fixed=minlenp;
 696             data->lookbehind_fixed=0;
 697         }
 698         else { /* *data->longest == data->longest_float */
 699             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 700             data->offset_float_max = (l
 701                                       ? data->last_start_max
 702                                       : data->pos_min + data->pos_delta);
 703             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 704                 data->offset_float_max = I32_MAX;
 705             if (data->flags & SF_BEFORE_EOL)
 706                 data->flags
 707                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 708             else
 709                 data->flags &= ~SF_FL_BEFORE_EOL;
 710             data->minlen_float=minlenp;
 711             data->lookbehind_float=0;
 712         }
 713     }
 714     SvCUR_set(data->last_found, 0);
 715     {
 716         SV * const sv = data->last_found;
 717         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 718             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 719             if (mg)
 720                 mg->mg_len = 0;
 721         }
 722     }
 723     data->last_end = -1;
 724     data->flags &= ~SF_BEFORE_EOL;
 725     DEBUG_STUDYDATA("commit: ",data,0);
 726 }
 727
 728 /* Can match anything (initialization) */
 729 STATIC void
 730 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 731 {
 732     PERL_ARGS_ASSERT_CL_ANYTHING;
 733
 734     ANYOF_BITMAP_SETALL(cl);
 735     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 736                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 737
 738     /* If any portion of the regex is to operate under locale rules,
 739      * initialization includes it.  The reason this isn't done for all regexes
 740      * is that the optimizer was written under the assumption that locale was
 741      * all-or-nothing.  Given the complexity and lack of documentation in the
 742      * optimizer, and that there are inadequate test cases for locale, so many
 743      * parts of it may not work properly, it is safest to avoid locale unless
 744      * necessary. */
 745     if (RExC_contains_locale) {
 746         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 747         cl->flags |= ANYOF_LOCALE;
 748     }
 749     else {
 750         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 751     }
 752 }
 753
 754 /* Can match anything (initialization) */
 755 STATIC int
 756 S_cl_is_anything(const struct regnode_charclass_class *cl)
 757 {
 758     int value;
 759
 760     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 761
 762     for (value = 0; value <= ANYOF_MAX; value += 2)
 763         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 764             return 1;
 765     if (!(cl->flags & ANYOF_UNICODE_ALL))
 766         return 0;
 767     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 768         return 0;
 769     return 1;
 770 }
 771
/* Initialize 'cl' as a fresh class that can match anything: the node is
 * zeroed, typed as ANYOF, filled in by cl_anything(), and finally its
 * argument is set to ANYOF_NONBITMAP_EMPTY.
 *
 * pRExC_state : compilation state, handed on to cl_anything()
 * cl          : the node to initialize; any previous contents are lost */
STATIC void
S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
    PERL_ARGS_ASSERT_CL_INIT;

    /* Clear the whole structure first so no stale field survives */
    Zero(cl, 1, struct regnode_charclass_class);
    cl->type = ANYOF;
    cl_anything(pRExC_state, cl);
    ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
}
 783
/* cl_init_zero is currently just another name for S_cl_init, so the two
 * "functions" do exactly the same thing.  Because the alias expands to
 * S_cl_init itself, it takes the same arguments: (pRExC_state, cl). */
#define cl_init_zero            S_cl_init
 786
/* 'AND' a given class with another one.  Can create false positives.  'cl'
 * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class.
 *
 * cl       : the class that is updated in place to hold the (possibly
 *            over-approximated) intersection
 * and_with : the class to intersect with; it is only read, and is asserted
 *            to be of type ANYOF
 *
 * The work is done in two stages.  First the bitmaps are combined, but only
 * in the simple case where neither node has class bits set, the locale
 * flags agree and neither node uses ANYOF_LOC_NONBITMAP_FOLD.  Then the
 * flags and the outside-the-bitmap argument are merged, with separate
 * handling for an inverted and a non-inverted 'and_with'. */
STATIC void
S_cl_and(struct regnode_charclass_class *cl,
        const struct regnode_charclass_class *and_with)
{
    PERL_ARGS_ASSERT_CL_AND;

    assert(and_with->type == ANYOF);

    /* Stage 1: the bitmaps */
    /* I (khw) am not sure all these restrictions are necessary XXX */
    if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
        && !(ANYOF_CLASS_TEST_ANY_SET(cl))
        && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
        && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
        && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
        int i;

        /* An inverted 'and_with' contributes the complement of its bitmap */
        if (and_with->flags & ANYOF_INVERT)
            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
                cl->bitmap[i] &= ~and_with->bitmap[i];
        else
            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
                cl->bitmap[i] &= and_with->bitmap[i];
    } /* XXXX: logic is complicated otherwise, leave it alone for a moment. */

    /* Stage 2: the flags and the outside-the-bitmap argument */
    if (and_with->flags & ANYOF_INVERT) {

        /* Here, the and'ed node is inverted.  Get the AND of the flags that
         * aren't affected by the inversion.  Those that are affected are
         * handled individually below */
        U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
        cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
        cl->flags |= affected_flags;

        /* We currently don't know how to deal with things that aren't in the
         * bitmap, but we know that the intersection is no greater than what
         * is already in cl, so let there be false positives that get sorted
         * out after the synthetic start class succeeds, and the node is
         * matched for real. */

        /* The inversion of these two flags indicate that the resulting
         * intersection doesn't have them */
        if (and_with->flags & ANYOF_UNICODE_ALL) {
            cl->flags &= ~ANYOF_UNICODE_ALL;
        }
        if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
            cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
        }
    }
    else {   /* and'd node is not inverted */
        U8 outside_bitmap_but_not_utf8; /* Temp variable */

        if (! ANYOF_NONBITMAP(and_with)) {

            /* Here 'and_with' doesn't match anything outside the bitmap
             * (except possibly ANYOF_UNICODE_ALL), which means the
             * intersection can't either, except for ANYOF_UNICODE_ALL, in
             * which case we don't know what the intersection is, but it's no
             * greater than what cl already has, so can just leave it alone,
             * with possible false positives */
            if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
                ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
                cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
            }
        }
        else if (! ANYOF_NONBITMAP(cl)) {

            /* Here, 'and_with' does match something outside the bitmap, and cl
             * doesn't have a list of things to match outside the bitmap.  If
             * cl can match all code points above 255, the intersection will
             * be those above-255 code points that 'and_with' matches.  If cl
             * can't match all Unicode code points, it means that it can't
             * match anything outside the bitmap (since the 'if' that got us
             * into this block tested for that), so we leave the bitmap empty.
             */
            if (cl->flags & ANYOF_UNICODE_ALL) {
                ARG_SET(cl, ARG(and_with));

                /* and_with's ARG may match things that don't require UTF8.
                 * And now cl's will too, in spite of this being an 'and'.  See
                 * the comments below about the kludge */
                cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
            }
        }
        else {
            /* Here, both 'and_with' and cl match something outside the
             * bitmap.  Currently we do not do the intersection, so just match
             * whatever cl had at the beginning.  */
        }


        /* Take the intersection of the two sets of flags.  However, the
         * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
         * kludge around the fact that this flag is not treated like the others
         * which are initialized in cl_anything().  The way the optimizer works
         * is that the synthetic start class (SSC) is initialized to match
         * anything, and then the first time a real node is encountered, its
         * values are AND'd with the SSC's with the result being the values of
         * the real node.  However, there are paths through the optimizer where
         * the AND never gets called, so those initialized bits are set
         * inappropriately, which is not usually a big deal, as they just cause
         * false positives in the SSC, which will just mean a probably
         * imperceptible slow down in execution.  However this bit has a
         * higher false positive consequence in that it can cause utf8.pm,
         * utf8_heavy.pl ... to be loaded when not necessary, which is a much
         * bigger slowdown and also causes significant extra memory to be used.
         * In order to prevent this, the code now takes a different tack.  The
         * bit isn't set unless some part of the regular expression needs it,
         * but once set it won't get cleared.  This means that these extra
         * modules won't get loaded unless there was some path through the
         * pattern that would have required them anyway, and  so any false
         * positives that occur by not ANDing them out when they could be
         * aren't as severe as they would be if we treated this bit like all
         * the others */
        /* Remember the sticky bit from either side before ANDing, then put
         * it back afterwards */
        outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
                                      & ANYOF_NONBITMAP_NON_UTF8;
        cl->flags &= and_with->flags;
        cl->flags |= outside_bitmap_but_not_utf8;
    }
}
 909
/* 'OR' a given class with another one.  Can create false positives.  'cl'
 * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class.
 *
 * pRExC_state : compilation state; only handed on to cl_anything()
 * cl          : the class that is updated in place to hold the (possibly
 *               over-approximated) union
 * or_with     : the class to add; it is only read
 *
 * Whenever a precise union cannot be computed, 'cl' is widened with
 * cl_anything() so that it matches everything. */
STATIC void
S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
{
    PERL_ARGS_ASSERT_CL_OR;

    if (or_with->flags & ANYOF_INVERT) {

        /* Here, the or'd node is to be inverted.  This means we take the
         * complement of everything not in the bitmap, but currently we don't
         * know what that is, so give up and match anything */
        if (ANYOF_NONBITMAP(or_with)) {
            cl_anything(pRExC_state, cl);
        }
        /* We do not use
         * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
         *   <= (B1 | !B2) | (CL1 | !CL2)
         * which is wasteful if CL2 is small, but we ignore CL2:
         *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
         * XXXX Can we handle case-fold?  Unclear:
         *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
         *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
         */
        else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
             && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
             && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
            int i;

            /* OR in the complement of or_with's bitmap */
            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
                cl->bitmap[i] |= ~or_with->bitmap[i];
        } /* XXXX: logic is complicated otherwise */
        else {
            cl_anything(pRExC_state, cl);
        }

        /* And, we can just take the union of the flags that aren't affected
         * by the inversion */
        cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;

        /* For the remaining flags:
            ANYOF_UNICODE_ALL and inverted means to not match anything above
                    255, which means that the union with cl should just be
                    what cl has in it, so can ignore this flag
            ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
                    is 127-255 to match them, but then invert that, so the
                    union with cl should just be what cl has in it, so can
                    ignore this flag
         */
    } else {    /* 'or_with' is not inverted */
        /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
        if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
             && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
                 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
            int i;

            /* OR char bitmap and class bitmap separately */
            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
                cl->bitmap[i] |= or_with->bitmap[i];
            if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
                for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
                    cl->classflags[i] |= or_with->classflags[i];
                cl->flags |= ANYOF_CLASS;
            }
        }
        else { /* XXXX: logic is complicated, leave it alone for a moment. */
            cl_anything(pRExC_state, cl);
        }

        if (ANYOF_NONBITMAP(or_with)) {

            /* Use the added node's outside-the-bit-map match if there isn't a
             * conflict.  If there is a conflict (both nodes match something
             * outside the bitmap, but what they match outside is not the same
             * pointer, and hence not easily compared until XXX we extend
             * inversion lists this far), give up and allow the start class to
             * match everything outside the bitmap.  If that stuff is all above
             * 255, can just set UNICODE_ALL, otherwise could be anything. */
            if (! ANYOF_NONBITMAP(cl)) {
                ARG_SET(cl, ARG(or_with));
            }
            else if (ARG(cl) != ARG(or_with)) {

                if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
                    cl_anything(pRExC_state, cl);
                }
                else {
                    cl->flags |= ANYOF_UNICODE_ALL;
                }
            }
        }

        /* Take the union */
        cl->flags |= or_with->flags;
    }
}
1007
/* Accessors for the per-state transition lists of a trie in list form.
 * All of them expect a variable named 'trie' to be in scope where they are
 * used.
 *
 *   TRIE_LIST_ITEM(state,idx) : element 'idx' of the list of 'state'
 *   TRIE_LIST_CUR(state)      : the 'forid' member of element 0
 *   TRIE_LIST_LEN(state)      : the 'newstate' member of element 0
 *   TRIE_LIST_USED(state)     : TRIE_LIST_CUR(state) - 1, or 0 when the
 *                               state has no list at all
 *
 * Element 0 therefore acts as a header and the real entries start at
 * index 1.  TRIE_LIST_USED() tests the list of the state it is given; it
 * does not depend on the caller having a variable called 'state'. */
#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
#define TRIE_LIST_USED(state)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( state ) - 1) : 0 )
1012
1013
1014 #ifdef DEBUGGING
1015 /*
1016    dump_trie(trie,widecharmap,revcharmap)
1017    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1018    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1019
1020    These routines dump out a trie in a somewhat readable format.
1021    The _interim_ variants are used for debugging the interim
1022    tables that are used to generate the final compressed
1023    representation which is what dump_trie expects.
1024
1025    Part of the reason for their existence is to provide a form
1026    of documentation as to how the different representations function.
1027
1028 */
1029
1030 /*
1031   Dumps the final compressed table form of the trie to Perl_debug_log.
1032   Used for debugging make_trie().
1033 */
1034
1035 STATIC void
1036 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1037             AV *revcharmap, U32 depth)
1038 {
1039     U32 state;
1040     SV *sv=sv_newmortal();
1041     int colwidth= widecharmap ? 6 : 4;
1042     U16 word;
1043     GET_RE_DEBUG_FLAGS_DECL;
1044
1045     PERL_ARGS_ASSERT_DUMP_TRIE;
1046
1047     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1048         (int)depth * 2 + 2,"",
1049         "Match","Base","Ofs" );
1050
1051     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1052         SV ** const tmp = av_fetch( revcharmap, state, 0);
1053         if ( tmp ) {
1054             PerlIO_printf( Perl_debug_log, "%*s",
1055                 colwidth,
1056                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1057                             PL_colors[0], PL_colors[1],
1058                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1059                             PERL_PV_ESCAPE_FIRSTCHAR
1060                 )
1061             );
1062         }
1063     }
1064     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1065         (int)depth * 2 + 2,"");
1066
1067     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1068         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1069     PerlIO_printf( Perl_debug_log, "\n");
1070
1071     for( state = 1 ; state < trie->statecount ; state++ ) {
1072         const U32 base = trie->states[ state ].trans.base;
1073
1074         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1075
1076         if ( trie->states[ state ].wordnum ) {
1077             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1078         } else {
1079             PerlIO_printf( Perl_debug_log, "%6s", "" );
1080         }
1081
1082         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1083
1084         if ( base ) {
1085             U32 ofs = 0;
1086
1087             while( ( base + ofs  < trie->uniquecharcount ) ||
1088                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1089                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1090                     ofs++;
1091
1092             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1093
1094             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1095                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1096                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1097                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1098                 {
1099                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1100                     colwidth,
1101                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1102                 } else {
1103                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1104                 }
1105             }
1106
1107             PerlIO_printf( Perl_debug_log, "]");
1108
1109         }
1110         PerlIO_printf( Perl_debug_log, "\n" );
1111     }
1112     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1113     for (word=1; word <= trie->wordcount; word++) {
1114         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1115             (int)word, (int)(trie->wordinfo[word].prev),
1116             (int)(trie->wordinfo[word].len));
1117     }
1118     PerlIO_printf(Perl_debug_log, "\n" );
1119 }
1120 /*
1121   Dumps a fully constructed but uncompressed trie in list form.
1122   List tries normally only are used for construction when the number of
1123   possible chars (trie->uniquecharcount) is very high.
1124   Used for debugging make_trie().
1125 */
1126 STATIC void
1127 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1128                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1129                          U32 depth)
1130 {
1131     U32 state;
1132     SV *sv=sv_newmortal();
1133     int colwidth= widecharmap ? 6 : 4;
1134     GET_RE_DEBUG_FLAGS_DECL;
1135
1136     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1137
1138     /* print out the table precompression.  */
1139     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1140         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1141         "------:-----+-----------------\n" );
1142
1143     for( state=1 ; state < next_alloc ; state ++ ) {
1144         U16 charid;
1145
1146         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1147             (int)depth * 2 + 2,"", (UV)state  );
1148         if ( ! trie->states[ state ].wordnum ) {
1149             PerlIO_printf( Perl_debug_log, "%5s| ","");
1150         } else {
1151             PerlIO_printf( Perl_debug_log, "W%4x| ",
1152                 trie->states[ state ].wordnum
1153             );
1154         }
1155         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1156             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1157             if ( tmp ) {
1158                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1159                     colwidth,
1160                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1161                             PL_colors[0], PL_colors[1],
1162                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1163                             PERL_PV_ESCAPE_FIRSTCHAR
1164                     ) ,
1165                     TRIE_LIST_ITEM(state,charid).forid,
1166                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1167                 );
1168                 if (!(charid % 10))
1169                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1170                         (int)((depth * 2) + 14), "");
1171             }
1172         }
1173         PerlIO_printf( Perl_debug_log, "\n");
1174     }
1175 }
1176
1177 /*
1178   Dumps a fully constructed but uncompressed trie in table form.
1179   This is the normal DFA style state transition table, with a few
1180   twists to facilitate compression later.
1181   Used for debugging make_trie().
1182 */
1183 STATIC void
1184 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1185                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1186                           U32 depth)
1187 {
1188     U32 state;
1189     U16 charid;
1190     SV *sv=sv_newmortal();
1191     int colwidth= widecharmap ? 6 : 4;
1192     GET_RE_DEBUG_FLAGS_DECL;
1193
1194     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1195
1196     /*
1197        print out the table precompression so that we can do a visual check
1198        that they are identical.
1199      */
1200
1201     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1202
1203     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1205         if ( tmp ) {
1206             PerlIO_printf( Perl_debug_log, "%*s",
1207                 colwidth,
1208                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1209                             PL_colors[0], PL_colors[1],
1210                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1211                             PERL_PV_ESCAPE_FIRSTCHAR
1212                 )
1213             );
1214         }
1215     }
1216
1217     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1218
1219     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1220         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1221     }
1222
1223     PerlIO_printf( Perl_debug_log, "\n" );
1224
1225     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1226
1227         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1228             (int)depth * 2 + 2,"",
1229             (UV)TRIE_NODENUM( state ) );
1230
1231         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1232             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1233             if (v)
1234                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1235             else
1236                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1237         }
1238         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1239             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1240         } else {
1241             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1242             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1243         }
1244     }
1245 }
1246
1247 #endif
1248
1249
1250 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1251   startbranch: the first branch in the whole branch sequence
1252   first      : start branch of sequence of branch-exact nodes.
1253                May be the same as startbranch
1254   last       : Thing following the last branch.
1255                May be the same as tail.
1256   tail       : item following the branch sequence
  word_count : number of words in the sequence
1258   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1259   depth      : indent depth
1260
1261 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1262
1263 A trie is an N'ary tree where the branches are determined by digital
1264 decomposition of the key. IE, at the root node you look up the 1st character and
1265 follow that branch repeat until you find the end of the branches. Nodes can be
1266 marked as "accepting" meaning they represent a complete word. Eg:
1267
1268   /he|she|his|hers/
1269
1270 would convert into the following structure. Numbers represent states, letters
1271 following numbers represent valid transitions on the letter from that state, if
1272 the number is in square brackets it represents an accepting state, otherwise it
1273 will be in parenthesis.
1274
1275       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1276       |    |
1277       |   (2)
1278       |    |
1279      (1)   +-i->(6)-+-s->[7]
1280       |
1281       +-s->(3)-+-h->(4)-+-e->[5]
1282
1283       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1284
1285 This shows that when matching against the string 'hers' we will begin at state 1
1286 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1287 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1288 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1289 single traverse. We store a mapping from accepting to state to which word was
1290 matched, and then when we have multiple possibilities we try to complete the
rest of the regex in the order in which they occurred in the alternation.
1292
1293 The only prior NFA like behaviour that would be changed by the TRIE support is
1294 the silent ignoring of duplicate alternations which are of the form:
1295
1296  / (DUPE|DUPE) X? (?{ ... }) Y /x
1297
1298 Thus EVAL blocks following a trie may be called a different number of times with
1299 and without the optimisation. With the optimisations dupes will be silently
1300 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1301 the following demonstrates:
1302
1303  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1304
1305 which prints out 'word' three times, but
1306
1307  'words'=~/(word|word|word)(?{ print $1 })S/
1308
which doesn't print it out at all. This is due to other optimisations kicking in.
1310
1311 Example of what happens on a structural level:
1312
1313 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1314
1315    1: CURLYM[1] {1,32767}(18)
1316    5:   BRANCH(8)
1317    6:     EXACT <ac>(16)
1318    8:   BRANCH(11)
1319    9:     EXACT <ad>(16)
1320   11:   BRANCH(14)
1321   12:     EXACT <ab>(16)
1322   16:   SUCCEED(0)
1323   17:   NOTHING(18)
1324   18: END(0)
1325
1326 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1327 and should turn into:
1328
1329    1: CURLYM[1] {1,32767}(18)
1330    5:   TRIE(16)
1331         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1332           <ac>
1333           <ad>
1334           <ab>
1335   16:   SUCCEED(0)
1336   17:   NOTHING(18)
1337   18: END(0)
1338
Cases where tail != last would be like /(?:foo|bar)baz/:
1340
1341    1: BRANCH(4)
1342    2:   EXACT <foo>(8)
1343    4: BRANCH(7)
1344    5:   EXACT <bar>(8)
1345    7: TAIL(8)
1346    8: EXACT <baz>(10)
1347   10: END(0)
1348
1349 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1350 and would end up looking like:
1351
1352     1: TRIE(8)
1353       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1354         <foo>
1355         <bar>
1356    7: TAIL(8)
1357    8: EXACT <baz>(10)
1358   10: END(0)
1359
1360     d = uvuni_to_utf8_flags(d, uv, 0);
1361
1362 is the recommended Unicode-aware way of saying
1363
1364     *(d++) = uv;
1365 */
1366
/* TRIE_STORE_REVCHAR
 *
 * Push a new SV holding the current character onto 'revcharmap'.  The
 * macro takes no arguments: it reads 'uvc' and 'revcharmap' from the scope
 * it is expanded in, and tests UTF (defined elsewhere).
 *
 * Under UTF, only the low 8 bits of uvc (uvc & 0xFF) are encoded with
 * uvuni_to_utf8() into the buffer of an SV created with newSV(2); the SV's
 * length is set from the end pointer that call returns, and the SV is
 * flagged as a UTF-8 string.  Otherwise a one-byte SV holding (char)uvc is
 * created with newSVpvn().
 *
 * NOTE(review): the odd local names (zlopp, flrbbbbb, kapow, ooooff) are
 * presumably chosen so they cannot clash with names at the expansion
 * site -- keep them unusual if this is ever edited. */
#define TRIE_STORE_REVCHAR                                                 \
    STMT_START {                                                           \
        if (UTF) {                                                         \
            SV *zlopp = newSV(2);                                          \
            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
            SvPOK_on(zlopp);                                               \
            SvUTF8_on(zlopp);                                              \
            av_push(revcharmap, zlopp);                                    \
        } else {                                                           \
            char ooooff = (char)uvc;                                               \
            av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
        }                                                                  \
        } STMT_END
1382
/* TRIE_READ_CHAR: read the next character of the word being scanned into
 * 'uvc', set 'len', and count it in 'wordlen'.  All names are taken from the
 * expanding scope: uc, uvc, len, wordlen, and under UTF also folder, foldlen,
 * foldbuf, scan and uniflags.
 *
 * Four cases:
 *   - not UTF:            the byte at *uc is the character; len = 1.
 *   - UTF, no folding:    the character is decoded from uc and the decoder
 *                         sets len.
 *   - UTF, fold pending   (foldlen > 0): the character is decoded from
 *                         'scan'; foldlen and scan are moved on by the bytes
 *                         consumed, and len is then reset to 0, so a caller
 *                         that does "uc += len" stays on the same input
 *                         character.
 *   - UTF, fold not pending: len = UTF8SKIP(uc); uvc is what to_utf8_fold()
 *                         returns for uc (the fold is written to foldbuf);
 *                         foldlen and scan are set to describe what is left
 *                         of foldbuf after the UNISKIP(uvc) bytes of uvc.
 */
#define TRIE_READ_CHAR STMT_START {                                           \
    wordlen++;                                                                \
    if ( !UTF ) {                                                             \
        uvc = (U32)*uc;                                                       \
        len = 1;                                                              \
    }                                                                         \
    else if ( !folder ) {                                                     \
        uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);    \
    }                                                                         \
    else if ( foldlen > 0 ) {                                                 \
        /* still working through the fold of an earlier character */         \
        uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );            \
        foldlen -= len;                                                       \
        scan += len;                                                          \
        len = 0;                                                              \
    }                                                                         \
    else {                                                                    \
        /* fold the character at uc and hand back what the folder returns */ \
        len = UTF8SKIP(uc);                                                   \
        uvc = to_utf8_fold( uc, foldbuf, &foldlen);                           \
        foldlen -= UNISKIP( uvc );                                            \
        scan = foldbuf + UNISKIP( uvc );                                      \
    }                                                                         \
} STMT_END
1406
1407
1408
/* TRIE_LIST_PUSH(state,fid,ns): append the record (forid = fid,
 * newstate = ns) to the transition list of 'state' and bump the list's CUR
 * counter.  When CUR has reached LEN, LEN is doubled first and the list is
 * reallocated to the new length.  'trie' comes from the expanding scope.
 * 'state' is expanded several times, so it must be free of side effects.
 */
#define TRIE_LIST_PUSH(state,fid,ns) STMT_START {                           \
    if ( TRIE_LIST_LEN( state ) <= TRIE_LIST_CUR( state ) ) {               \
        /* list is full: double its recorded length, then its storage */   \
        U32 doubled_len = TRIE_LIST_LEN( state ) *= 2;                      \
        Renew( trie->states[ state ].trans.list, doubled_len,               \
               reg_trie_trans_le );                                         \
    }                                                                       \
    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;            \
    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;          \
    TRIE_LIST_CUR( state ) += 1;                                            \
} STMT_END
1418
/* TRIE_LIST_NEW(state): give 'state' a fresh, zero-filled transition list
 * with room for 4 reg_trie_trans_le records, then set its LEN to 4 and its
 * CUR to 1.  'trie' comes from the expanding scope.
 */
#define TRIE_LIST_NEW(state) STMT_START {                                   \
    Newxz( trie->states[ state ].trans.list, 4, reg_trie_trans_le );        \
    TRIE_LIST_LEN( state ) = 4;                                             \
    TRIE_LIST_CUR( state ) = 1;                                             \
} STMT_END
1425
/* TRIE_HANDLE_WORD(state): book-keeping for a word whose last character led
 * to 'state'.  Uses these names from the expanding scope: trie, noper, cur,
 * curword, wordlen, word_count, tail, convert, jumper, nextbranch, and (in
 * the debugging block) trie_words.
 *
 * In order, it:
 *   - under DEBUG_r, pushes a copy of the word's text (or an empty string
 *     when noper is a NOTHING node) onto trie_words;
 *   - advances curword and fills in wordinfo[curword] (prev = 0, len =
 *     wordlen, accept = state);
 *   - if the node after noper lies before 'tail', allocates trie->jump on
 *     first need, stores that node's offset from 'convert' in
 *     jump[curword], and sets jumper / nextbranch if they are still unset;
 *   - if 'state' already has a word number, links the new word into the
 *     wordinfo[].prev chain behind that earlier word; otherwise records
 *     curword as the state's word number.
 */
#define TRIE_HANDLE_WORD(state) STMT_START {                                \
    const U16 earlier_word = trie->states[ state ].wordnum;                 \
    regnode * const after_noper = regnext( noper );                         \
                                                                            \
    DEBUG_r({                                                               \
        /* keep the word's text so that it can be dumped */                 \
        SV * const word_sv = ( OP(noper) == NOTHING )                       \
            ? newSVpvn_utf8( "", 0, UTF )                                   \
            : newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);            \
        av_push( trie_words, word_sv );                                     \
    });                                                                     \
                                                                            \
    curword += 1;                                                           \
    trie->wordinfo[curword].prev   = 0;                                     \
    trie->wordinfo[curword].len    = wordlen;                               \
    trie->wordinfo[curword].accept = state;                                 \
                                                                            \
    if ( after_noper < tail ) {                                             \
        if ( trie->jump == NULL )                                           \
            trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
        trie->jump[curword] = (U16)(after_noper - convert);                 \
        if ( jumper == NULL )                                               \
            jumper = after_noper;                                           \
        if ( nextbranch == NULL )                                           \
            nextbranch = regnext(cur);                                      \
    }                                                                       \
                                                                            \
    if ( earlier_word == 0 ) {                                              \
        /* no word has ended in this state before */                        \
        trie->states[ state ].wordnum = curword;                            \
    } else {                                                                \
        /* A duplicate.  Insert it into the wordinfo[].prev chain now, */   \
        /* directly behind the earlier word, so that the duplicates    */   \
        /* are present when the pieces of chain are linked up later.   */   \
        trie->wordinfo[curword].prev = trie->wordinfo[earlier_word].prev;   \
        trie->wordinfo[earlier_word].prev = curword;                        \
    }                                                                       \
} STMT_END
1466
1467
/* TRIE_TRANS_STATE(state,base,ucharcount,charid,special): look up the
 * transition out of 'state' on 'charid' in the compressed table.
 *
 * Evaluates to trie->trans[ base - ucharcount + charid ].next when
 *   - base + charid lies in [ ucharcount, ubound ),
 *   - that slot's .check equals 'state', and
 *   - that slot's .next is nonzero;
 * otherwise to 'special' if state is 1, and to 0 for any other state.
 *
 * 'trie' and 'ubound' are taken from the expanding scope.
 *
 * Every parameter is parenthesized in the expansion so that an argument
 * containing an operator (for example "cid & 3") is used as one value and is
 * not regrouped by the surrounding +, -, >= and == operators.  Parameters
 * are expanded more than once, so arguments must be free of side effects.
 */
#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)                  \
     ( ( (base) + (charid) >= (ucharcount)                                      \
         && (base) + (charid) < ubound                                          \
         && (state) == trie->trans[ (base) - (ucharcount) + (charid) ].check    \
         && trie->trans[ (base) - (ucharcount) + (charid) ].next )              \
           ? trie->trans[ (base) - (ucharcount) + (charid) ].next               \
           : ( (state) == 1 ? (special) : 0 )                                   \
      )
1476
1477 #define MADE_TRIE       1  /* NOTE(review): presumably result codes of S_make_trie() -- confirm there */
1478 #define MADE_JUMP_TRIE  2  /* the three values are distinct single bits (1, 2, 4), */
1479 #define MADE_EXACT_TRIE 4  /* so each can be tested independently of the others   */
1480
1481 STATIC I32
1482 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1483 {
1484     dVAR;
1485     /* first pass, loop through and scan words */
1486     reg_trie_data *trie;
1487     HV *widecharmap = NULL;
1488     AV *revcharmap = newAV();
1489     regnode *cur;
1490     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1491     STRLEN len = 0;
1492     UV uvc = 0;
1493     U16 curword = 0;
1494     U32 next_alloc = 0;
1495     regnode *jumper = NULL;
1496     regnode *nextbranch = NULL;
1497     regnode *convert = NULL;
1498     U32 *prev_states; /* temp array mapping each state to previous one */
1499     /* we just use folder as a flag in utf8 */
1500     const U8 * folder = NULL;
1501
1502 #ifdef DEBUGGING
1503     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1504     AV *trie_words = NULL;
1505     /* along with revcharmap, this only used during construction but both are
1506      * useful during debugging so we store them in the struct when debugging.
1507      */
1508 #else
1509     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1510     STRLEN trie_charcount=0;
1511 #endif
1512     SV *re_trie_maxbuff;
1513     GET_RE_DEBUG_FLAGS_DECL;
1514
1515     PERL_ARGS_ASSERT_MAKE_TRIE;
1516 #ifndef DEBUGGING
1517     PERL_UNUSED_ARG(depth);
1518 #endif
1519
1520     switch (flags) {
1521         case EXACT: break;
1522         case EXACTFA:
1523         case EXACTFU: folder = PL_fold_latin1; break;
1524         case EXACTF:  folder = PL_fold; break;
1525         case EXACTFL: folder = PL_fold_locale; break;
1526         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
1527     }
1528
1529     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1530     trie->refcount = 1;
1531     trie->startstate = 1;
1532     trie->wordcount = word_count;
1533     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1534     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1535     if (!(UTF && folder))
1536         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1537     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1538                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1539
1540     DEBUG_r({
1541         trie_words = newAV();
1542     });
1543
1544     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1545     if (!SvIOK(re_trie_maxbuff)) {
1546         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1547     }
1548     DEBUG_OPTIMISE_r({
1549                 PerlIO_printf( Perl_debug_log,
1550                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1551                   (int)depth * 2 + 2, "",
1552                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1553                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1554                   (int)depth);
1555     });
1556
1557    /* Find the node we are going to overwrite */
1558     if ( first == startbranch && OP( last ) != BRANCH ) {
1559         /* whole branch chain */
1560         convert = first;
1561     } else {
1562         /* branch sub-chain */
1563         convert = NEXTOPER( first );
1564     }
1565
1566     /*  -- First loop and Setup --
1567
1568        We first traverse the branches and scan each word to determine if it
1569        contains widechars, and how many unique chars there are, this is
1570        important as we have to build a table with at least as many columns as we
1571        have unique chars.
1572
1573        We use an array of integers to represent the character codes 0..255
1574        (trie->charmap) and we use an HV* to store Unicode characters. We use the
1575        native representation of the character value as the key and IV's for the
1576        coded index.
1577
1578        *TODO* If we keep track of how many times each character is used we can
1579        remap the columns so that the table compression later on is more
1580        efficient in terms of memory by ensuring the most common value is in the
1581        middle and the least common are on the outside.  IMO this would be better
1582        than a most to least common mapping as theres a decent chance the most
1583        common letter will share a node with the least common, meaning the node
1584        will not be compressible. With a middle is most common approach the worst
1585        case is when we have the least common nodes twice.
1586
1587      */
1588
1589     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1590         regnode * const noper = NEXTOPER( cur );
1591         const U8 *uc = (U8*)STRING( noper );
1592         const U8 * const e  = uc + STR_LEN( noper );
1593         STRLEN foldlen = 0;
1594         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1595         const U8 *scan = (U8*)NULL;
1596         U32 wordlen      = 0;         /* required init */
1597         STRLEN chars = 0;
1598         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1599
1600         if (OP(noper) == NOTHING) {
1601             trie->minlen= 0;
1602             continue;
1603         }
1604         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1605             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1606                                           regardless of encoding */
1607
1608         for ( ; uc < e ; uc += len ) {
1609             TRIE_CHARCOUNT(trie)++;
1610             TRIE_READ_CHAR;
1611             chars++;
1612             if ( uvc < 256 ) {
1613                 if ( !trie->charmap[ uvc ] ) {
1614                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1615                     if ( folder )
1616                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1617                     TRIE_STORE_REVCHAR;
1618                 }
1619                 if ( set_bit ) {
1620                     /* store the codepoint in the bitmap, and its folded
1621                      * equivalent. */
1622                     TRIE_BITMAP_SET(trie,uvc);
1623
1624                     /* store the folded codepoint */
1625                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1626
1627                     if ( !UTF ) {
1628                         /* store first byte of utf8 representation of
1629                            variant codepoints */
1630                         if (! UNI_IS_INVARIANT(uvc)) {
1631                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1632                         }
1633                     }
1634                     set_bit = 0; /* We've done our bit :-) */
1635                 }
1636             } else {
1637                 SV** svpp;
1638                 if ( !widecharmap )
1639                     widecharmap = newHV();
1640
1641                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1642
1643                 if ( !svpp )
1644                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1645
1646                 if ( !SvTRUE( *svpp ) ) {
1647                     sv_setiv( *svpp, ++trie->uniquecharcount );
1648                     TRIE_STORE_REVCHAR;
1649                 }
1650             }
1651         }
1652         if( cur == first ) {
1653             trie->minlen=chars;
1654             trie->maxlen=chars;
1655         } else if (chars < trie->minlen) {
1656             trie->minlen=chars;
1657         } else if (chars > trie->maxlen) {
1658             trie->maxlen=chars;
1659         }
1660
1661     } /* end first pass */
1662     DEBUG_TRIE_COMPILE_r(
1663         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1664                 (int)depth * 2 + 2,"",
1665                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1666                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1667                 (int)trie->minlen, (int)trie->maxlen )
1668     );
1669
1670     /*
1671         We now know what we are dealing with in terms of unique chars and
1672         string sizes so we can calculate how much memory a naive
1673         representation using a flat table  will take. If it's over a reasonable
1674         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1675         conservative but potentially much slower representation using an array
1676         of lists.
1677
1678         At the end we convert both representations into the same compressed
1679         form that will be used in regexec.c for matching with. The latter
1680         is a form that cannot be used to construct with but has memory
1681         properties similar to the list form and access properties similar
1682         to the table form making it both suitable for fast searches and
1683         small enough that it's feasible to store for the duration of a program.
1684
1685         See the comment in the code where the compressed table is produced
1686         in place from the flat table representation for an explanation of how
1687         the compression works.
1688
1689     */
1690
1691
1692     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1693     prev_states[1] = 0;
1694
1695     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1696         /*
1697             Second Pass -- Array Of Lists Representation
1698
1699             Each state will be represented by a list of charid:state records
1700             (reg_trie_trans_le) the first such element holds the CUR and LEN
1701             points of the allocated array. (See defines above).
1702
1703             We build the initial structure using the lists, and then convert
1704             it into the compressed table form which allows faster lookups
1705             (but cant be modified once converted).
1706         */
1707
1708         STRLEN transcount = 1;
1709
1710         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1711             "%*sCompiling trie using list compiler\n",
1712             (int)depth * 2 + 2, ""));
1713
1714         trie->states = (reg_trie_state *)
1715             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1716                                   sizeof(reg_trie_state) );
1717         TRIE_LIST_NEW(1);
1718         next_alloc = 2;
1719
1720         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1721
1722             regnode * const noper = NEXTOPER( cur );
1723             U8 *uc           = (U8*)STRING( noper );
1724             const U8 * const e = uc + STR_LEN( noper );
1725             U32 state        = 1;         /* required init */
1726             U16 charid       = 0;         /* sanity init */
1727             U8 *scan         = (U8*)NULL; /* sanity init */
1728             STRLEN foldlen   = 0;         /* required init */
1729             U32 wordlen      = 0;         /* required init */
1730             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1731
1732             if (OP(noper) != NOTHING) {
1733                 for ( ; uc < e ; uc += len ) {
1734
1735                     TRIE_READ_CHAR;
1736
1737                     if ( uvc < 256 ) {
1738                         charid = trie->charmap[ uvc ];
1739                     } else {
1740                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1741                         if ( !svpp ) {
1742                             charid = 0;
1743                         } else {
1744                             charid=(U16)SvIV( *svpp );
1745                         }
1746                     }
1747                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1748                     if ( charid ) {
1749
1750                         U16 check;
1751                         U32 newstate = 0;
1752
1753                         charid--;
1754                         if ( !trie->states[ state ].trans.list ) {
1755                             TRIE_LIST_NEW( state );
1756                         }
1757                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1758                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1759                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1760                                 break;
1761                             }
1762                         }
1763                         if ( ! newstate ) {
1764                             newstate = next_alloc++;
1765                             prev_states[newstate] = state;
1766                             TRIE_LIST_PUSH( state, charid, newstate );
1767                             transcount++;
1768                         }
1769                         state = newstate;
1770                     } else {
1771                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1772                     }
1773                 }
1774             }
1775             TRIE_HANDLE_WORD(state);
1776
1777         } /* end second pass */
1778
1779         /* next alloc is the NEXT state to be allocated */
1780         trie->statecount = next_alloc;
1781         trie->states = (reg_trie_state *)
1782             PerlMemShared_realloc( trie->states,
1783                                    next_alloc
1784                                    * sizeof(reg_trie_state) );
1785
1786         /* and now dump it out before we compress it */
1787         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1788                                                          revcharmap, next_alloc,
1789                                                          depth+1)
1790         );
1791
1792         trie->trans = (reg_trie_trans *)
1793             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1794         {
1795             U32 state;
1796             U32 tp = 0;
1797             U32 zp = 0;
1798
1799
1800             for( state=1 ; state < next_alloc ; state ++ ) {
1801                 U32 base=0;
1802
1803                 /*
1804                 DEBUG_TRIE_COMPILE_MORE_r(
1805                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1806                 );
1807                 */
1808
1809                 if (trie->states[state].trans.list) {
1810                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1811                     U16 maxid=minid;
1812                     U16 idx;
1813
1814                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1815                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1816                         if ( forid < minid ) {
1817                             minid=forid;
1818                         } else if ( forid > maxid ) {
1819                             maxid=forid;
1820                         }
1821                     }
1822                     if ( transcount < tp + maxid - minid + 1) {
1823                         transcount *= 2;
1824                         trie->trans = (reg_trie_trans *)
1825                             PerlMemShared_realloc( trie->trans,
1826                                                      transcount
1827                                                      * sizeof(reg_trie_trans) );
1828                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1829                     }
1830                     base = trie->uniquecharcount + tp - minid;
1831                     if ( maxid == minid ) {
1832                         U32 set = 0;
1833                         for ( ; zp < tp ; zp++ ) {
1834                             if ( ! trie->trans[ zp ].next ) {
1835                                 base = trie->uniquecharcount + zp - minid;
1836                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1837                                 trie->trans[ zp ].check = state;
1838                                 set = 1;
1839                                 break;
1840                             }
1841                         }
1842                         if ( !set ) {
1843                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1844                             trie->trans[ tp ].check = state;
1845                             tp++;
1846                             zp = tp;
1847                         }
1848                     } else {
1849                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1850                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1851                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1852                             trie->trans[ tid ].check = state;
1853                         }
1854                         tp += ( maxid - minid + 1 );
1855                     }
1856                     Safefree(trie->states[ state ].trans.list);
1857                 }
1858                 /*
1859                 DEBUG_TRIE_COMPILE_MORE_r(
1860                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1861                 );
1862                 */
1863                 trie->states[ state ].trans.base=base;
1864             }
1865             trie->lasttrans = tp + 1;
1866         }
1867     } else {
1868         /*
1869            Second Pass -- Flat Table Representation.
1870
1871            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1872            We know that we will need Charcount+1 trans at most to store the data
1873            (one row per char at worst case) So we preallocate both structures
1874            assuming worst case.
1875
1876            We then construct the trie using only the .next slots of the entry
1877            structs.
1878
1879            We use the .check field of the first entry of the node temporarily to
1880            make compression both faster and easier by keeping track of how many non
1881            zero fields are in the node.
1882
1883            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1884            transition.
1885
1886            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1887            number representing the first entry of the node, and state as a
1888            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1889            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1890            are 2 entrys per node. eg:
1891
1892              A B       A B
1893           1. 2 4    1. 3 7
1894           2. 0 3    3. 0 5
1895           3. 0 0    5. 0 0
1896           4. 0 0    7. 0 0
1897
1898            The table is internally in the right hand, idx form. However as we also
1899            have to deal with the states array which is indexed by nodenum we have to
1900            use TRIE_NODENUM() to convert.
1901
1902         */
1903         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1904             "%*sCompiling trie using table compiler\n",
1905             (int)depth * 2 + 2, ""));
1906
1907         trie->trans = (reg_trie_trans *)
1908             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1909                                   * trie->uniquecharcount + 1,
1910                                   sizeof(reg_trie_trans) );
1911         trie->states = (reg_trie_state *)
1912             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1913                                   sizeof(reg_trie_state) );
1914         next_alloc = trie->uniquecharcount + 1;
1915
1916
1917         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1918
1919             regnode * const noper   = NEXTOPER( cur );
1920             const U8 *uc     = (U8*)STRING( noper );
1921             const U8 * const e = uc + STR_LEN( noper );
1922
1923             U32 state        = 1;         /* required init */
1924
1925             U16 charid       = 0;         /* sanity init */
1926             U32 accept_state = 0;         /* sanity init */
1927             U8 *scan         = (U8*)NULL; /* sanity init */
1928
1929             STRLEN foldlen   = 0;         /* required init */
1930             U32 wordlen      = 0;         /* required init */
1931             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1932
1933             if ( OP(noper) != NOTHING ) {
1934                 for ( ; uc < e ; uc += len ) {
1935
1936                     TRIE_READ_CHAR;
1937
1938                     if ( uvc < 256 ) {
1939                         charid = trie->charmap[ uvc ];
1940                     } else {
1941                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1942                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1943                     }
1944                     if ( charid ) {
1945                         charid--;
1946                         if ( !trie->trans[ state + charid ].next ) {
1947                             trie->trans[ state + charid ].next = next_alloc;
1948                             trie->trans[ state ].check++;
1949                             prev_states[TRIE_NODENUM(next_alloc)]
1950                                     = TRIE_NODENUM(state);
1951                             next_alloc += trie->uniquecharcount;
1952                         }
1953                         state = trie->trans[ state + charid ].next;
1954                     } else {
1955                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1956                     }
1957                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1958                 }
1959             }
1960             accept_state = TRIE_NODENUM( state );
1961             TRIE_HANDLE_WORD(accept_state);
1962
1963         } /* end second pass */
1964
1965         /* and now dump it out before we compress it */
1966         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1967                                                           revcharmap,
1968                                                           next_alloc, depth+1));
1969
1970         {
1971         /*
1972            * Inplace compress the table.*
1973
1974            For sparse data sets the table constructed by the trie algorithm will
1975            be mostly 0/FAIL transitions or to put it another way mostly empty.
1976            (Note that leaf nodes will not contain any transitions.)
1977
1978            This algorithm compresses the tables by eliminating most such
1979            transitions, at the cost of a modest bit of extra work during lookup:
1980
1981            - Each states[] entry contains a .base field which indicates the
1982            index in the trans[] array where its transition data is stored.
1983
1984            - If .base is 0 there are no valid transitions from that node.
1985
1986            - If .base is nonzero then charid is added to it to find an entry in
1987            the trans array.
1988
1989            -If trans[states[state].base+charid].check!=state then the
1990            transition is taken to be a 0/Fail transition. Thus if there are fail
1991            transitions at the front of the node then the .base offset will point
1992            somewhere inside the previous nodes data (or maybe even into a node
1993            even earlier), but the .check field determines if the transition is
1994            valid.
1995
1996            XXX - wrong maybe?
1997            The following process inplace converts the table to the compressed
1998            table: We first do not compress the root node 1,and mark all its
1999            .check pointers as 1 and set its .base pointer as 1 as well. This
2000            allows us to do a DFA construction from the compressed table later,
2001            and ensures that any .base pointers we calculate later are greater
2002            than 0.
2003
2004            - We set 'pos' to indicate the first entry of the second node.
2005
2006            - We then iterate over the columns of the node, finding the first and
2007            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2008            and set the .check pointers accordingly, and advance pos
2009            appropriately and repeat for the next node. Note that when we copy
2010            the next pointers we have to convert them from the original
2011            NODEIDX form to NODENUM form as the former is not valid post
2012            compression.
2013
2014            - If a node has no transitions used we mark its base as 0 and do not
2015            advance the pos pointer.
2016
2017            - If a node only has one transition we use a second pointer into the
2018            structure to fill in allocated fail transitions from other states.
2019            This pointer is independent of the main pointer and scans forward
2020            looking for null transitions that are allocated to a state. When it
2021            finds one it writes the single transition into the "hole".  If the
2022            pointer doesnt find one the single transition is appended as normal.
2023
2024            - Once compressed we can Renew/realloc the structures to release the
2025            excess space.
2026
2027            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2028            specifically Fig 3.47 and the associated pseudocode.
2029
2030            demq
2031         */
2032         const U32 laststate = TRIE_NODENUM( next_alloc );
2033         U32 state, charid;
2034         U32 pos = 0, zp=0;
2035         trie->statecount = laststate;
2036
2037         for ( state = 1 ; state < laststate ; state++ ) {
2038             U8 flag = 0;
2039             const U32 stateidx = TRIE_NODEIDX( state );
2040             const U32 o_used = trie->trans[ stateidx ].check;
2041             U32 used = trie->trans[ stateidx ].check;
2042             trie->trans[ stateidx ].check = 0;
2043
2044             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2045                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2046                     if ( trie->trans[ stateidx + charid ].next ) {
2047                         if (o_used == 1) {
2048                             for ( ; zp < pos ; zp++ ) {
2049                                 if ( ! trie->trans[ zp ].next ) {
2050                                     break;
2051                                 }
2052                             }
2053                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2054                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2055                             trie->trans[ zp ].check = state;
2056                             if ( ++zp > pos ) pos = zp;
2057                             break;
2058                         }
2059                         used--;
2060                     }
2061                     if ( !flag ) {
2062                         flag = 1;
2063                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2064                     }
2065                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2066                     trie->trans[ pos ].check = state;
2067                     pos++;
2068                 }
2069             }
2070         }
2071         trie->lasttrans = pos + 1;
2072         trie->states = (reg_trie_state *)
2073             PerlMemShared_realloc( trie->states, laststate
2074                                    * sizeof(reg_trie_state) );
2075         DEBUG_TRIE_COMPILE_MORE_r(
2076                 PerlIO_printf( Perl_debug_log,
2077                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2078                     (int)depth * 2 + 2,"",
2079                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2080                     (IV)next_alloc,
2081                     (IV)pos,
2082                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2083             );
2084
2085         } /* end table compress */
2086     }
2087     DEBUG_TRIE_COMPILE_MORE_r(
2088             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2089                 (int)depth * 2 + 2, "",
2090                 (UV)trie->statecount,
2091                 (UV)trie->lasttrans)
2092     );
2093     /* resize the trans array to remove unused space */
2094     trie->trans = (reg_trie_trans *)
2095         PerlMemShared_realloc( trie->trans, trie->lasttrans
2096                                * sizeof(reg_trie_trans) );
2097
2098     {   /* Modify the program and insert the new TRIE node */
2099         U8 nodetype =(U8)(flags & 0xFF);
2100         char *str=NULL;
2101
2102 #ifdef DEBUGGING
2103         regnode *optimize = NULL;
2104 #ifdef RE_TRACK_PATTERN_OFFSETS
2105
2106         U32 mjd_offset = 0;
2107         U32 mjd_nodelen = 0;
2108 #endif /* RE_TRACK_PATTERN_OFFSETS */
2109 #endif /* DEBUGGING */
2110         /*
2111            This means we convert either the first branch or the first Exact,
2112            depending on whether the thing following (in 'last') is a branch
2113            or not and whether first is the startbranch (ie is it a sub part of
2114            the alternation or is it the whole thing.)
2115            Assuming its a sub part we convert the EXACT otherwise we convert
2116            the whole branch sequence, including the first.
2117          */
2118         /* Find the node we are going to overwrite */
2119         if ( first != startbranch || OP( last ) == BRANCH ) {
2120             /* branch sub-chain */
2121             NEXT_OFF( first ) = (U16)(last - first);
2122 #ifdef RE_TRACK_PATTERN_OFFSETS
2123             DEBUG_r({
2124                 mjd_offset= Node_Offset((convert));
2125                 mjd_nodelen= Node_Length((convert));
2126             });
2127 #endif
2128             /* whole branch chain */
2129         }
2130 #ifdef RE_TRACK_PATTERN_OFFSETS
2131         else {
2132             DEBUG_r({
2133                 const  regnode *nop = NEXTOPER( convert );
2134                 mjd_offset= Node_Offset((nop));
2135                 mjd_nodelen= Node_Length((nop));
2136             });
2137         }
2138         DEBUG_OPTIMISE_r(
2139             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2140                 (int)depth * 2 + 2, "",
2141                 (UV)mjd_offset, (UV)mjd_nodelen)
2142         );
2143 #endif
2144         /* But first we check to see if there is a common prefix we can
2145            split out as an EXACT and put in front of the TRIE node.  */
2146         trie->startstate= 1;
2147         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2148             U32 state;
2149             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2150                 U32 ofs = 0;
2151                 I32 idx = -1;
2152                 U32 count = 0;
2153                 const U32 base = trie->states[ state ].trans.base;
2154
2155                 if ( trie->states[state].wordnum )
2156                         count = 1;
2157
2158                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2159                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2160                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2161                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2162                     {
2163                         if ( ++count > 1 ) {
2164                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2165                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2166                             if ( state == 1 ) break;
2167                             if ( count == 2 ) {
2168                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2169                                 DEBUG_OPTIMISE_r(
2170                                     PerlIO_printf(Perl_debug_log,
2171                                         "%*sNew Start State=%"UVuf" Class: [",
2172                                         (int)depth * 2 + 2, "",
2173                                         (UV)state));
2174                                 if (idx >= 0) {
2175                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2176                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2177
2178                                     TRIE_BITMAP_SET(trie,*ch);
2179                                     if ( folder )
2180                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2181                                     DEBUG_OPTIMISE_r(
2182                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2183                                     );
2184                                 }
2185                             }
2186                             TRIE_BITMAP_SET(trie,*ch);
2187                             if ( folder )
2188                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2189                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2190                         }
2191                         idx = ofs;
2192                     }
2193                 }
2194                 if ( count == 1 ) {
2195                     SV **tmp = av_fetch( revcharmap, idx, 0);
2196                     STRLEN len;
2197                     char *ch = SvPV( *tmp, len );
2198                     DEBUG_OPTIMISE_r({
2199                         SV *sv=sv_newmortal();
2200                         PerlIO_printf( Perl_debug_log,
2201                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2202                             (int)depth * 2 + 2, "",
2203                             (UV)state, (UV)idx,
2204                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2205                                 PL_colors[0], PL_colors[1],
2206                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2207                                 PERL_PV_ESCAPE_FIRSTCHAR
2208                             )
2209                         );
2210                     });
2211                     if ( state==1 ) {
2212                         OP( convert ) = nodetype;
2213                         str=STRING(convert);
2214                         STR_LEN(convert)=0;
2215                     }
2216                     STR_LEN(convert) += len;
2217                     while (len--)
2218                         *str++ = *ch++;
2219                 } else {
2220 #ifdef DEBUGGING
2221                     if (state>1)
2222                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2223 #endif
2224                     break;
2225                 }
2226             }
2227             trie->prefixlen = (state-1);
2228             if (str) {
2229                 regnode *n = convert+NODE_SZ_STR(convert);
2230                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2231                 trie->startstate = state;
2232                 trie->minlen -= (state - 1);
2233                 trie->maxlen -= (state - 1);
2234 #ifdef DEBUGGING
2235                /* At least the UNICOS C compiler choked on this
2236                 * being argument to DEBUG_r(), so let's just have
2237                 * it right here. */
2238                if (
2239 #ifdef PERL_EXT_RE_BUILD
2240                    1
2241 #else
2242                    DEBUG_r_TEST
2243 #endif
2244                    ) {
2245                    regnode *fix = convert;
2246                    U32 word = trie->wordcount;
2247                    mjd_nodelen++;
2248                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2249                    while( ++fix < n ) {
2250                        Set_Node_Offset_Length(fix, 0, 0);
2251                    }
2252                    while (word--) {
2253                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2254                        if (tmp) {
2255                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2256                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2257                            else
2258                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2259                        }
2260                    }
2261                }
2262 #endif
2263                 if (trie->maxlen) {
2264                     convert = n;
2265                 } else {
2266                     NEXT_OFF(convert) = (U16)(tail - convert);
2267                     DEBUG_r(optimize= n);
2268                 }
2269             }
2270         }
2271         if (!jumper)
2272             jumper = last;
2273         if ( trie->maxlen ) {
2274             NEXT_OFF( convert ) = (U16)(tail - convert);
2275             ARG_SET( convert, data_slot );
2276             /* Store the offset to the first unabsorbed branch in
2277                jump[0], which is otherwise unused by the jump logic.
2278                We use this when dumping a trie and during optimisation. */
2279             if (trie->jump)
2280                 trie->jump[0] = (U16)(nextbranch - convert);
2281
2282             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2283              *   and there is a bitmap
2284              *   and the first "jump target" node we found leaves enough room
2285              * then convert the TRIE node into a TRIEC node, with the bitmap
2286              * embedded inline in the opcode - this is hypothetically faster.
2287              */
2288             if ( !trie->states[trie->startstate].wordnum
2289                  && trie->bitmap
2290                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2291             {
2292                 OP( convert ) = TRIEC;
2293                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2294                 PerlMemShared_free(trie->bitmap);
2295                 trie->bitmap= NULL;
2296             } else
2297                 OP( convert ) = TRIE;
2298
2299             /* store the type in the flags */
2300             convert->flags = nodetype;
2301             DEBUG_r({
2302             optimize = convert
2303                       + NODE_STEP_REGNODE
2304                       + regarglen[ OP( convert ) ];
2305             });
2306             /* XXX We really should free up the resource in trie now,
2307                    as we won't use them - (which resources?) dmq */
2308         }
2309         /* needed for dumping*/
2310         DEBUG_r(if (optimize) {
2311             regnode *opt = convert;
2312
2313             while ( ++opt < optimize) {
2314                 Set_Node_Offset_Length(opt,0,0);
2315             }
2316             /*
2317                 Try to clean up some of the debris left after the
2318                 optimisation.
2319              */
2320             while( optimize < jumper ) {
2321                 mjd_nodelen += Node_Length((optimize));
2322                 OP( optimize ) = OPTIMIZED;
2323                 Set_Node_Offset_Length(optimize,0,0);
2324                 optimize++;
2325             }
2326             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2327         });
2328     } /* end node insert */
2329
2330     /*  Finish populating the prev field of the wordinfo array.  Walk back
2331      *  from each accept state until we find another accept state, and if
2332      *  so, point the first word's .prev field at the second word. If the
2333      *  second already has a .prev field set, stop now. This will be the
2334      *  case either if we've already processed that word's accept state,
2335      *  or that state had multiple words, and the overspill words were
2336      *  already linked up earlier.
2337      */
2338     {
2339         U16 word;
2340         U32 state;
2341         U16 prev;
2342
2343         for (word=1; word <= trie->wordcount; word++) {
2344             prev = 0;
2345             if (trie->wordinfo[word].prev)
2346                 continue;
2347             state = trie->wordinfo[word].accept;
2348             while (state) {
2349                 state = prev_states[state];
2350                 if (!state)
2351                     break;
2352                 prev = trie->states[state].wordnum;
2353                 if (prev)
2354                     break;
2355             }
2356             trie->wordinfo[word].prev = prev;
2357         }
2358         Safefree(prev_states);
2359     }
2360
2361
2362     /* and now dump out the compressed format */
2363     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2364
2365     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2366 #ifdef DEBUGGING
2367     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2368     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2369 #else
2370     SvREFCNT_dec(revcharmap);
2371 #endif
2372     return trie->jump
2373            ? MADE_JUMP_TRIE
2374            : trie->startstate>1
2375              ? MADE_EXACT_TRIE
2376              : MADE_TRIE;
2377 }
2378
2379 STATIC void
2380 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2381 {
2382 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2383    (ARG(source) locates the trie; ARG(stclass) is set to the data slot of the resulting reg_ac_data.)
2384    This is basically the Aho-Corasick algorithm. It's from exercise 3.31 and 3.32 in the
2385    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2386    ISBN 0-201-10088-6
2387
2388    We find the fail state for each state in the trie, this state is the longest proper
2389    suffix of the current state's 'word' that is also a proper prefix of another word in our
2390    trie. State 1 represents the word '' and is thus the default fail state. This allows
2391    the DFA not to have to restart after it has tried and failed a word at a given point, it
2392    simply continues as though it had been matching the other word in the first place.
2393    Consider
2394       'abcdgu'=~/abcdefg|cdgu/
2395    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2396    fail, which would bring us to the state representing 'd' in the second word where we would
2397    try 'g' and succeed, proceeding to match 'cdgu'.
2398  */
2399  /* build the fail table by walking the trie states breadth-first through the queue q */
2400     const U32 trie_offset = ARG(source); /* data slot that holds the trie */
2401     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2402     U32 *q; /* work queue of state numbers; numstates entries, indexed modulo numstates in the main loop */
2403     const U32 ucharcount = trie->uniquecharcount;
2404     const U32 numstates = trie->statecount;
2405     const U32 ubound = trie->lasttrans + ucharcount; /* NOTE(review): not referenced by name below; presumably read by TRIE_TRANS_STATE -- confirm before removing */
2406     U32 q_read = 0; /* count of entries taken from q */
2407     U32 q_write = 0; /* count of entries put into q */
2408     U32 charid;
2409     U32 base = trie->states[ 1 ].trans.base; /* start from state 1, the state for the empty word */
2410     U32 *fail; /* alias for aho->fail */
2411     reg_ac_data *aho;
2412     const U32 data_slot = add_data( pRExC_state, 1, "T" ); /* slot that will hold aho */
2413     GET_RE_DEBUG_FLAGS_DECL;
2414
2415     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2416 #ifndef DEBUGGING
2417     PERL_UNUSED_ARG(depth); /* depth is only used to indent the debug output at the end */
2418 #endif
2419
2420     /* Allocate the reg_ac_data, register it in data_slot and give it its own copy of the trie states */
2421     ARG_SET( stclass, data_slot );
2422     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2423     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2424     aho->trie=trie_offset; /* remember which data slot holds the trie */
2425     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2426     Copy( trie->states, aho->states, numstates, reg_trie_state ); /* wordnum updates below go to this copy, not to trie->states */
2427     Newxz( q, numstates, U32); /* temporary; released with Safefree(q) before returning */
2428     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2429     aho->refcount = 1;
2430     fail = aho->fail;
2431     /* initialize fail[0..1] to be 1 so that we always have
2432        a valid final fail state */
2433     fail[ 0 ] = fail[ 1 ] = 1;
2434     /* Seed the queue with every state reached directly from state 1 */
2435     for ( charid = 0; charid < ucharcount ; charid++ ) {
2436         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2437         if ( newstate ) {
2438             q[ q_write ] = newstate;
2439             /* set to point at the root */
2440             fail[ q[ q_write++ ] ]=1;
2441         }
2442     }
2443     while ( q_read < q_write) { /* take queued states in the order they were added */
2444         const U32 cur = q[ q_read++ % numstates ];
2445         base = trie->states[ cur ].trans.base;
2446
2447         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2448             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2449             if (ch_state) {
2450                 U32 fail_state = cur;
2451                 U32 fail_base;
2452                 do { /* follow the fail chain from cur until a state has a transition on charid */
2453                     fail_state = fail[ fail_state ];
2454                     fail_base = aho->states[ fail_state ].trans.base;
2455                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); /* NOTE(review): termination presumably relies on the macro's last argument at state 1 -- confirm */
2456
2457                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2458                 fail[ ch_state ] = fail_state;
2459                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2460                 {
2461                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum; /* a state with no word of its own takes the wordnum of its fail state */
2462                 }
2463                 q[ q_write++ % numstates] = ch_state; /* process ch_state's own transitions later */
2464             }
2465         }
2466     }
2467     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2468        when we fail in state 1, this allows us to use the
2469        charclass scan to find a valid start char. This is based on the principle
2470        that there's a good chance the string being searched contains lots of stuff
2471        that can't be a start char.
2472      */
2473     fail[ 0 ] = fail[ 1 ] = 0;
2474     DEBUG_TRIE_COMPILE_r({
2475         PerlIO_printf(Perl_debug_log,
2476                       "%*sStclass Failtable (%"UVuf" states): 0",
2477                       (int)(depth * 2), "", (UV)numstates
2478         );
2479         for( q_read=1; q_read<numstates; q_read++ ) {
2480             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2481         }
2482         PerlIO_printf(Perl_debug_log, "\n");
2483     });
2484     Safefree(q);
2485     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2486 }
2487
2488
2489 /*
2490  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2, so
2491  * SPARC64_GCC_WORKAROUND is forced to 1 for GCC < 2.96.  Revisit with a newer toolchain.
2492  */
2493 #if defined(__sparc64__) && defined(__GNUC__)
2494 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2495 #       undef  SPARC64_GCC_WORKAROUND
2496 #       define SPARC64_GCC_WORKAROUND 1
2497 #   endif /* GCC older than 2.96 */
2498 #endif /* __sparc64__ && __GNUC__ */
2499 /* Debug aid: under DEBUG_OPTIMISE_r, print "<indent><str>>NODE: <regprop text> (NEXT)" for a non-NULL scan. */
2500 #define DEBUG_PEEP(str,scan,depth) \
2501     DEBUG_OPTIMISE_r({if (scan){ \
2502        SV * const mysv=sv_newmortal(); \
2503        regnode *Next = regnext(scan); \
2504        regprop(RExC_rx, mysv, scan); \
2505        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", /* str is pasted into the format, so it must be a string literal */ \
2506        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2507        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2508    }}); /* NB: the expansion already ends with its own ';' */
2509
2510
2511 /* The below joins as many adjacent EXACTish nodes as possible into a single
2512  * one, and looks for problematic sequences of characters whose folds vs.
2513  * non-folds have sufficiently different lengths, that the optimizer would be
2514  * fooled into rejecting legitimate matches of them, and the trie construction
2515  * code can't cope with them.  The joining is only done if:
2516  * 1) there is room in the current conglomerated node to entirely contain the
2517  *    next one.
2518  * 2) they are the exact same node type
2519  *
2520  * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2521  * these get optimized out
2522  *
2523  * If there are problematic code sequences, *min_subtract is set to the delta
2524  * that the minimum size of the node can be less than its actual size.  And,
2525  * the node type of the result is changed to reflect that it contains these
2526  * sequences.
2527  *
2528  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2529  * and contains LATIN SMALL LETTER SHARP S
2530  *
2531  * This is as good a place as any to discuss the design of handling these
2532  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2533  * are three code points in Unicode whose folded lengths differ so much from
2534  * the un-folded lengths that it causes problems for the optimizer and trie
2535  * construction.  Why only these are problematic, and not others where lengths
2536  * also differ is something I (khw) do not understand.  New versions of Unicode
2537  * might add more such code points.  Hopefully the logic in fold_grind.t that
2538  * figures out what to test (in part by verifying that each size-combination
2539  * gets tested) will catch any that do come along, so they can be added to the
2540  * special handling below.  The chances of new ones are actually rather small,
2541  * as most, if not all, of the world's scripts that have casefolding have
2542  * already been encoded by Unicode.  Also, a number of Unicode's decisions were
2543  * made to allow compatibility with pre-existing standards, and almost all of
2544  * those have already been dealt with.  These would otherwise be the most
2545  * likely candidates for generating further tricky sequences.  In other words,
2546  * Unicode by itself is unlikely to add new ones unless it is for compatibility
2547  * with pre-existing standards, and there aren't many of those left.
2548  *
2549  * The previous designs for dealing with these involved assigning a special
2550  * node for them.  This approach doesn't work, as evidenced by this example:
2551  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2552  * Both these fold to "sss", but if the pattern is parsed to create a node
2553  * that would match just the \xDF, it won't be able to handle the case where a
2554  * successful match would have to cross the node's boundary.  The new approach
2555  * that hopefully generally solves the problem generates an EXACTFU_SS node
2556  * that is "sss".
2557  *
2558  * There are a number of components to the approach (a lot of work for just
2559  * three code points!):
2560  * 1)   This routine examines each EXACTFish node that could contain the
2561  *      problematic sequences.  It returns in *min_subtract how much to
2562  *      subtract from the actual length of the string to get a real minimum
2563  *      for one that could match it.  This number is usually 0 except for the
2564  *      problematic sequences.  This delta is used by the caller to adjust the
2565  *      min length of the match, and the delta between min and max, so that the
2566  *      optimizer doesn't reject these possibilities based on size constraints.
2567  * 2)   These sequences are not currently correctly handled by the trie code
2568  *      either, so it changes the joined node type to ops that are not handled
2569  *      by tries, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
2570  * 3)   This is sufficient for the two Greek sequences (described below), but
2571  *      the one involving the Sharp s (\xDF) needs more.  The node type
2572  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2573  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2574  *      case where there is a possible fold length change.  That means that a
2575  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2576  *      itself with length changes, and so can be processed faster.  regexec.c
2577  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2578  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2579  *      However, probably mostly for historical reasons, the pre-folding isn't
2580  *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2581  *      nodes, as what they fold to isn't known until runtime.)  The fold
2582  *      possibilities for the non-UTF8 patterns are quite simple, except for
2583  *      the sharp s.  All the ones that don't involve a UTF-8 target string
2584  *      are members of a fold-pair, and arrays are set up for all of them
2585  *      that quickly find the other member of the pair.  It might actually
2586  *      be faster to pre-fold these, but it isn't currently done, except for
2587  *      the sharp s.  Code elsewhere in this file makes sure that it gets
2588  *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
2589  *      issues described in the next item.
2590  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2591  *      'ss' or not is not knowable at compile time.  It will match iff the
2592  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2593  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2594  *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
2595  *      described in item 3).  An assumption that the optimizer part of
2596  *      regexec.c (probably unwittingly) makes is that a character in the
2597  *      pattern corresponds to at most a single character in the target string.
2598  *      (And I do mean character, and not byte here, unlike other parts of the
2599  *      documentation that have never been updated to account for multibyte
2600  *      Unicode.)  This assumption is wrong only in this case, as all other
2601  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2602  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2603  *      reluctant to try to change this assumption, so instead the code punts.
2604  *      This routine examines EXACTF nodes for the sharp s, and returns a
2605  *      boolean indicating whether or not the node is an EXACTF node that
2606  *      contains a sharp s.  When it is true, the caller sets a flag that later
2607  *      causes the optimizer in this file to not set values for the floating
2608  *      and fixed string lengths, and thus avoids the optimizer code in
2609  *      regexec.c that makes the invalid assumption.  Thus, there is no
2610  *      optimization based on string lengths for EXACTF nodes that contain the
2611  *      sharp s.  This only happens for /id rules (which means the pattern
2612  *      isn't in UTF-8).
2613  */
2614 /* Call join_exact() on scan only when its regkind is EXACT; uses pRExC_state and depth from the caller's scope. */
2615 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2616     if (PL_regkind[OP(scan)] == EXACT) \
2617         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) /* NB: expands to an unbraced 'if'; 'val' is always passed as NULL */
2618
2619 STATIC U32
2620 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2621     /* Merge several consecutive EXACTish nodes into one. */
2622     regnode *n = regnext(scan);
2623     U32 stringok = 1;
2624     regnode *next = scan + NODE_SZ_STR(scan);
2625     U32 merged = 0;
2626     U32 stopnow = 0;
2627 #ifdef DEBUGGING
2628     regnode *stop = scan;
2629     GET_RE_DEBUG_FLAGS_DECL;
2630 #else
2631     PERL_UNUSED_ARG(depth);
2632 #endif
2633
2634     PERL_ARGS_ASSERT_JOIN_EXACT;
2635 #ifndef EXPERIMENTAL_INPLACESCAN
2636     PERL_UNUSED_ARG(flags);
2637     PERL_UNUSED_ARG(val);
2638 #endif
2639     DEBUG_PEEP("join",scan,depth);
2640
2641     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2642      * EXACT ones that are mergeable to the current one. */
2643     while (n
2644            && (PL_regkind[OP(n)] == NOTHING
2645                || (stringok && OP(n) == OP(scan)))
2646            && NEXT_OFF(n)
2647            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2648     {
2649
2650         if (OP(n) == TAIL || n > next)
2651             stringok = 0;
2652         if (PL_regkind[OP(n)] == NOTHING) {
2653             DEBUG_PEEP("skip:",n,depth);
2654             NEXT_OFF(scan) += NEXT_OFF(n);
2655             next = n + NODE_STEP_REGNODE;
2656 #ifdef DEBUGGING
2657             if (stringok)
2658                 stop = n;
2659 #endif
2660             n = regnext(n);
2661         }
2662         else if (stringok) {
2663             const unsigned int oldl = STR_LEN(scan);
2664             regnode * const nnext = regnext(n);
2665
2666             if (oldl + STR_LEN(n) > U8_MAX)
2667                 break;
2668
2669             DEBUG_PEEP("merg",n,depth);
2670             merged++;
2671
2672             NEXT_OFF(scan) += NEXT_OFF(n);
2673             STR_LEN(scan) += STR_LEN(n);
2674             next = n + NODE_SZ_STR(n);
2675             /* Now we can overwrite *n : */
2676             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2677 #ifdef DEBUGGING
2678             stop = next - 1;
2679 #endif
2680             n = nnext;
2681             if (stopnow) break;
2682         }
2683
2684 #ifdef EXPERIMENTAL_INPLACESCAN
2685         if (flags && !NEXT_OFF(n)) {
2686             DEBUG_PEEP("atch", val, depth);
2687             if (reg_off_by_arg[OP(n)]) {
2688                 ARG_SET(n, val - n);
2689             }
2690             else {
2691                 NEXT_OFF(n) = val - n;
2692             }
2693             stopnow = 1;
2694         }
2695 #endif
2696     }
2697
2698     *min_subtract = 0;
2699     *has_exactf_sharp_s = FALSE;
2700
2701     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2702      * can now analyze for sequences of problematic code points.  (Prior to
2703      * this final joining, sequences could have been split over boundaries, and
2704      * hence missed).  The sequences only happen in folding, hence for any
2705      * non-EXACT EXACTish node */
2706     if (OP(scan) != EXACT) {
2707         U8 *s;
2708         U8 * s0 = (U8*) STRING(scan);
2709         U8 * const s_end = s0 + STR_LEN(scan);
2710
2711         /* The below is perhaps overboard, but this allows us to save a test
2712          * each time through the loop at the expense of a mask.  This is
2713          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2714          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2715          * This uses an exclusive 'or' to find that bit and then inverts it to
2716          * form a mask, with just a single 0, in the bit position where 'S' and
2717          * 's' differ. */
2718         const U8 S_or_s_mask = ~ ('S' ^ 's');
2719         const U8 s_masked = 's' & S_or_s_mask;
2720
2721         /* One pass is made over the node's string looking for all the
2722          * possibilities.  to avoid some tests in the loop, there are two main
2723          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2724          * non-UTF-8 */
2725         if (UTF) {
2726
2727             /* There are two problematic Greek code points in Unicode
2728              * casefolding
2729              *
2730              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2731              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2732              *
2733              * which casefold to
2734              *
2735              * Unicode                      UTF-8
2736              *
2737              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2738              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2739              *
2740              * This means that in case-insensitive matching (or "loose
2741              * matching", as Unicode calls it), an EXACTF of length six (the
2742              * UTF-8 encoded byte length of the above casefolded versions) can
2743              * match a target string of length two (the byte length of UTF-8
2744              * encoded U+0390 or U+03B0).  This would rather mess up the
2745              * minimum length computation.  (there are other code points that
2746              * also fold to these two sequences, but the delta is smaller)
2747              *
2748              * If these sequences are found, the minimum length is decreased by
2749              * four (six minus two).
2750              *
2751              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2752              * LETTER SHARP S.  We decrease the min length by 1 for each
2753              * occurrence of 'ss' found */
2754
2755 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2756 #           define U390_first_byte 0xb4
2757             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2758 #           define U3B0_first_byte 0xb5
2759             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2760 #else
2761 #           define U390_first_byte 0xce
2762             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2763 #           define U3B0_first_byte 0xcf
2764             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2765 #endif
2766             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2767                                                  yields a net of 0 */
2768             /* Examine the string for one of the problematic sequences */
2769             for (s = s0;
2770                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2771                                  * sequence we are looking for is 2 */
2772                  s += UTF8SKIP(s))
2773             {
2774
2775                 /* Look for the first byte in each problematic sequence */
2776                 switch (*s) {
2777                     /* We don't have to worry about other things that fold to
2778                      * 's' (such as the long s, U+017F), as all above-latin1
2779                      * code points have been pre-folded */
2780                     case 's':
2781                     case 'S':
2782
2783                         /* Current character is an 's' or 'S'.  If next one is
2784                          * as well, we have the dreaded sequence */
2785                         if (((*(s+1) & S_or_s_mask) == s_masked)
2786                             /* These two node types don't have special handling
2787                              * for 'ss' */
2788                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2789                         {
2790                             *min_subtract += 1;
2791                             OP(scan) = EXACTFU_SS;
2792                             s++;    /* No need to look at this character again */
2793                         }
2794                         break;
2795
2796                     case U390_first_byte:
2797                         if (s_end - s >= len
2798
2799                             /* The 1's are because we are skipping comparing
2800                              * the first byte */
2801                             && memEQ(s + 1, U390_tail, len - 1))
2802                         {
2803                             goto greek_sequence;
2804                         }
2805                         break;
2806
2807                     case U3B0_first_byte:
2808                         if (! (s_end - s >= len
2809                                && memEQ(s + 1, U3B0_tail, len - 1)))
2810                         {
2811                             break;
2812                         }
2813                       greek_sequence:
2814                         *min_subtract += 4;
2815
2816                         /* This can't currently be handled by trie's, so change
2817                          * the node type to indicate this.  If EXACTFA and
2818                          * EXACTFL were ever to be handled by trie's, this
2819                          * would have to be changed.  If this node has already
2820                          * been changed to EXACTFU_SS in this loop, leave it as
2821                          * is.  (I (khw) think it doesn't matter in regexec.c
2822                          * for UTF patterns, but no need to change it */
2823                         if (OP(scan) == EXACTFU) {
2824                             OP(scan) = EXACTFU_NO_TRIE;
2825                         }
2826                         s += 6; /* We already know what this sequence is.  Skip
2827                                    the rest of it */
2828                         break;
2829                 }
2830             }
2831         }
2832         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2833
2834             /* Here, the pattern is not UTF-8.  We need to look only for the
2835              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2836              * in the final position.  Otherwise we can stop looking 1 byte
2837              * earlier because have to find both the first and second 's' */
2838             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2839
2840             for (s = s0; s < upper; s++) {
2841                 switch (*s) {
2842                     case 'S':
2843                     case 's':
2844                         if (s_end - s > 1
2845                             && ((*(s+1) & S_or_s_mask) == s_masked))
2846                         {
2847                             *min_subtract += 1;
2848
2849                             /* EXACTF nodes need to know that the minimum
2850                              * length changed so that a sharp s in the string
2851                              * can match this ss in the pattern, but they
2852                              * remain EXACTF nodes, as they are not trie'able,
2853                              * so don't have to invent a new node type to
2854                              * exclude them from the trie code */
2855                             if (OP(scan) != EXACTF) {
2856                                 OP(scan) = EXACTFU_SS;
2857                             }
2858                             s++;
2859                         }
2860                         break;
2861                     case LATIN_SMALL_LETTER_SHARP_S:
2862                         if (OP(scan) == EXACTF) {
2863                             *has_exactf_sharp_s = TRUE;
2864                         }
2865                         break;
2866                 }
2867             }
2868         }
2869     }
2870
2871 #ifdef DEBUGGING
2872     /* Allow dumping but overwriting the collection of skipped
2873      * ops and/or strings with fake optimized ops */
2874     n = scan + NODE_SZ_STR(scan);
2875     while (n <= stop) {
2876         OP(n) = OPTIMIZED;
2877         FLAGS(n) = 0;
2878         NEXT_OFF(n) = 0;
2879         n++;
2880     }
2881 #endif
2882     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2883     return stopnow;
2884 }
2885
2886 /* REx optimizer (study_chunk(), below).  Converts nodes into quicker
2887    variants "in place".  Finds fixed substrings.  */
2888
2889 /* Stops at a toplevel WHILEM as well as at "last".  At the end, *scanp is
2890    set to the position after the last node scanned, or to NULL. */
2891
2892 #define INIT_AND_WITHP \
2893     assert(!and_withp); \
2894     Newx(and_withp,1,struct regnode_charclass_class); \
2895     SAVEFREEPV(and_withp)
2896
2897 /* Bookkeeping for sub patterns that study_chunk has to process
2898    separately/specially.  Frames are chained through `prev`, so that
2899    recursion can be simulated without losing state.  */
2900 typedef struct scan_frame scan_frame;
2901 struct scan_frame {
2902     regnode *last;              /* final node to handle within this frame */
2903     regnode *next;              /* node to go on with once `last` is reached */
2904     struct scan_frame *prev;    /* the frame before this one */
2905     I32 stop;                   /* which stopparen applies */
2906 };
2907 /* SCAN_COMMIT: shorthand for scan_commit() that appends the `is_inf`
2908  * variable of the expanding scope as the fourth argument. */
2909 #define SCAN_COMMIT(st, sd, ml) scan_commit(st, sd, ml, is_inf)
2910
2911 #define CASE_SYNST_FNC(nAmE)                                       \
2912 case nAmE:                                                         \
2913     if (flags & SCF_DO_STCLASS_AND) {                              \
2914             for (value = 0; value < 256; value++)                  \
2915                 if (!is_ ## nAmE ## _cp(value))                       \
2916                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2917     }                                                              \
2918     else {                                                         \
2919             for (value = 0; value < 256; value++)                  \
2920                 if (is_ ## nAmE ## _cp(value))                        \
2921                     ANYOF_BITMAP_SET(data->start_class, value);    \
2922     }                                                              \
2923     break;                                                         \
2924 case N ## nAmE:                                                    \
2925     if (flags & SCF_DO_STCLASS_AND) {                              \
2926             for (value = 0; value < 256; value++)                   \
2927                 if (is_ ## nAmE ## _cp(value))                         \
2928                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2929     }                                                               \
2930     else {                                                          \
2931             for (value = 0; value < 256; value++)                   \
2932                 if (!is_ ## nAmE ## _cp(value))                        \
2933                     ANYOF_BITMAP_SET(data->start_class, value);     \
2934     }                                                               \
2935     break
2936
2937
2938
2939 STATIC I32
2940 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2941                         I32 *minlenp, I32 *deltap,
2942                         regnode *last,
2943                         scan_data_t *data,
2944                         I32 stopparen,
2945                         U8* recursed,
2946                         struct regnode_charclass_class *and_withp,
2947                         U32 flags, U32 depth)
2948                         /* scanp: Start here (read-write). */
2949                         /* deltap: Write maxlen-minlen here. */
2950                         /* last: Stop before this one. */
2951                         /* data: string data about the pattern */
2952                         /* stopparen: treat close N as END */
2953                         /* recursed: which subroutines have we recursed into */
2954                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2955 {
2956     dVAR;
2957     I32 min = 0, pars = 0, code;
2958     regnode *scan = *scanp, *next;
2959     I32 delta = 0;
2960     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2961     int is_inf_internal = 0;            /* The studied chunk is infinite */
2962     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2963     scan_data_t data_fake;
2964     SV *re_trie_maxbuff = NULL;
2965     regnode *first_non_open = scan;
2966     I32 stopmin = I32_MAX;
2967     scan_frame *frame = NULL;
2968     GET_RE_DEBUG_FLAGS_DECL;
2969
2970     PERL_ARGS_ASSERT_STUDY_CHUNK;
2971
2972 #ifdef DEBUGGING
2973     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2974 #endif
2975
2976     if ( depth == 0 ) {
2977         while (first_non_open && OP(first_non_open) == OPEN)
2978             first_non_open=regnext(first_non_open);
2979     }
2980
2981
2982   fake_study_recurse:
2983     while ( scan && OP(scan) != END && scan < last ){
2984         UV min_subtract = 0;    /* How much to subtract from the minimum node
2985                                    length to get a real minimum (because the
2986                                    folded version may be shorter) */
2987         bool has_exactf_sharp_s = FALSE;
2988         /* Peephole optimizer: */
2989         DEBUG_STUDYDATA("Peep:", data,depth);
2990         DEBUG_PEEP("Peep",scan,depth);
2991
2992         /* It's not clear to khw or hv why this is done here, and not in the
2993          * clauses that deal with EXACT nodes.  khw's guess is that it's
2994          * because of a previous design */
2995         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
2996
2997         /* Follow the next-chain of the current node and optimize
2998            away all the NOTHINGs from it.  */
2999         if (OP(scan) != CURLYX) {
3000             const int max = (reg_off_by_arg[OP(scan)]
3001                        ? I32_MAX
3002                        /* I32 may be smaller than U16 on CRAYs! */
3003                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3004             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3005             int noff;
3006             regnode *n = scan;
3007
3008             /* Skip NOTHING and LONGJMP. */
3009             while ((n = regnext(n))
3010                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3011                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3012                    && off + noff < max)
3013                 off += noff;
3014             if (reg_off_by_arg[OP(scan)])
3015                 ARG(scan) = off;
3016             else
3017                 NEXT_OFF(scan) = off;
3018         }
3019
3020
3021
3022         /* The principal pseudo-switch.  Cannot be a switch, since we
3023            look into several different things.  */
3024         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3025                    || OP(scan) == IFTHEN) {
3026             next = regnext(scan);
3027             code = OP(scan);
3028             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3029
3030             if (OP(next) == code || code == IFTHEN) {
3031                 /* NOTE - There is similar code to this block below for handling
3032                    TRIE nodes on a re-study.  If you change stuff here check there
3033                    too. */
3034                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3035                 struct regnode_charclass_class accum;
3036                 regnode * const startbranch=scan;
3037
3038                 if (flags & SCF_DO_SUBSTR)
3039                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3040                 if (flags & SCF_DO_STCLASS)
3041                     cl_init_zero(pRExC_state, &accum);
3042
3043                 while (OP(scan) == code) {
3044                     I32 deltanext, minnext, f = 0, fake;
3045                     struct regnode_charclass_class this_class;
3046
3047                     num++;
3048                     data_fake.flags = 0;
3049                     if (data) {
3050                         data_fake.whilem_c = data->whilem_c;
3051                         data_fake.last_closep = data->last_closep;
3052                     }
3053                     else
3054                         data_fake.last_closep = &fake;
3055
3056                     data_fake.pos_delta = delta;
3057                     next = regnext(scan);
3058                     scan = NEXTOPER(scan);
3059                     if (code != BRANCH)
3060                         scan = NEXTOPER(scan);
3061                     if (flags & SCF_DO_STCLASS) {
3062                         cl_init(pRExC_state, &this_class);
3063                         data_fake.start_class = &this_class;
3064                         f = SCF_DO_STCLASS_AND;
3065                     }
3066                     if (flags & SCF_WHILEM_VISITED_POS)
3067                         f |= SCF_WHILEM_VISITED_POS;
3068
3069                     /* we suppose the run is continuous, last=next...*/
3070                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3071                                           next, &data_fake,
3072                                           stopparen, recursed, NULL, f,depth+1);
3073                     if (min1 > minnext)
3074                         min1 = minnext;
3075                     if (max1 < minnext + deltanext)
3076                         max1 = minnext + deltanext;
3077                     if (deltanext == I32_MAX)
3078                         is_inf = is_inf_internal = 1;
3079                     scan = next;
3080                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3081                         pars++;
3082                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3083                         if ( stopmin > minnext)
3084                             stopmin = min + min1;
3085                         flags &= ~SCF_DO_SUBSTR;
3086                         if (data)
3087                             data->flags |= SCF_SEEN_ACCEPT;
3088                     }
3089                     if (data) {
3090                         if (data_fake.flags & SF_HAS_EVAL)
3091                             data->flags |= SF_HAS_EVAL;
3092                         data->whilem_c = data_fake.whilem_c;
3093                     }
3094                     if (flags & SCF_DO_STCLASS)
3095                         cl_or(pRExC_state, &accum, &this_class);
3096                 }
3097                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3098                     min1 = 0;
3099                 if (flags & SCF_DO_SUBSTR) {
3100                     data->pos_min += min1;
3101                     data->pos_delta += max1 - min1;
3102                     if (max1 != min1 || is_inf)
3103                         data->longest = &(data->longest_float);
3104                 }
3105                 min += min1;
3106                 delta += max1 - min1;
3107                 if (flags & SCF_DO_STCLASS_OR) {
3108                     cl_or(pRExC_state, data->start_class, &accum);
3109                     if (min1) {
3110                         cl_and(data->start_class, and_withp);
3111                         flags &= ~SCF_DO_STCLASS;
3112                     }
3113                 }
3114                 else if (flags & SCF_DO_STCLASS_AND) {
3115                     if (min1) {
3116                         cl_and(data->start_class, &accum);
3117                         flags &= ~SCF_DO_STCLASS;
3118                     }
3119                     else {
3120                         /* Switch to OR mode: cache the old value of
3121                          * data->start_class */
3122                         INIT_AND_WITHP;
3123                         StructCopy(data->start_class, and_withp,
3124                                    struct regnode_charclass_class);
3125                         flags &= ~SCF_DO_STCLASS_AND;
3126                         StructCopy(&accum, data->start_class,
3127                                    struct regnode_charclass_class);
3128                         flags |= SCF_DO_STCLASS_OR;
3129                         data->start_class->flags |= ANYOF_EOS;
3130                     }
3131                 }
3132
3133                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3134                 /* demq.
3135
3136                    Assuming this was/is a branch we are dealing with: 'scan' now
3137                    points at the item that follows the branch sequence, whatever
3138                    it is. We now start at the beginning of the sequence and look
3139                    for subsequences of
3140
3141                    BRANCH->EXACT=>x1
3142                    BRANCH->EXACT=>x2
3143                    tail
3144
3145                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3146
3147                    If we can find such a subsequence we need to turn the first
3148                    element into a trie and then add the subsequent branch exact
3149                    strings to the trie.
3150
3151                    We have two cases
3152
3153                      1. patterns where the whole set of branches can be converted.
3154
3155                      2. patterns where only a subset can be converted.
3156
3157                    In case 1 we can replace the whole set with a single regop
3158                    for the trie. In case 2 we need to keep the start and end
3159                    branches so
3160
3161                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3162                      becomes BRANCH TRIE; BRANCH X;
3163
3164                   There is an additional case, that being where there is a
3165                   common prefix, which gets split out into an EXACT like node
3166                   preceding the TRIE node.
3167
3168                   If x(1..n)==tail then we can do a simple trie, if not we make
3169                   a "jump" trie, such that when we match the appropriate word
3170                   we "jump" to the appropriate tail node. Essentially we turn
3171                   a nested if into a case structure of sorts.
3172
3173                 */
3174
3175                     int made=0;
3176                     if (!re_trie_maxbuff) {
3177                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3178                         if (!SvIOK(re_trie_maxbuff))
3179                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3180                     }
3181                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3182                         regnode *cur;
3183                         regnode *first = (regnode *)NULL;
3184                         regnode *last = (regnode *)NULL;
3185                         regnode *tail = scan;
3186                         U8 optype = 0;
3187                         U32 count=0;
3188
3189 #ifdef DEBUGGING
3190                         SV * const mysv = sv_newmortal();       /* for dumping */
3191 #endif
3192                         /* var tail is used because there may be a TAIL
3193                            regop in the way. Ie, the exacts will point to the
3194                            thing following the TAIL, but the last branch will
3195                            point at the TAIL. So we advance tail. If we
3196                            have nested (?:) we may have to move through several
3197                            tails.
3198                          */
3199
3200                         while ( OP( tail ) == TAIL ) {
3201                             /* this is the TAIL generated by (?:) */
3202                             tail = regnext( tail );
3203                         }
3204
3205
3206                         DEBUG_OPTIMISE_r({
3207                             regprop(RExC_rx, mysv, tail );
3208                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3209                                 (int)depth * 2 + 2, "",
3210                                 "Looking for TRIE'able sequences. Tail node is: ",
3211                                 SvPV_nolen_const( mysv )
3212                             );
3213                         });
3214
3215                         /*
3216
3217                            step through the branches, cur represents each
3218                            branch, noper is the first thing to be matched
3219                            as part of that branch and noper_next is the
3220                            regnext() of that node. if noper is an EXACT
3221                            and noper_next is the same as scan (our current
3222                            position in the regex) then the EXACT branch is
3223                            a possible optimization target. Once we have
3224                            two or more consecutive such branches we can
3225                            create a trie of the EXACT's contents and stitch
3226                            it in place. If the sequence represents all of
3227                            the branches we eliminate the whole thing and
3228                            replace it with a single TRIE. If it is a
3229                            subsequence then we need to stitch it in. This
3230                            means the first branch has to remain, and needs
3231                            to be repointed at the item on the branch chain
3232                            following the last branch optimized. This could
3233                            be either a BRANCH, in which case the
3234                            subsequence is internal, or it could be the
3235                            item following the branch sequence in which
3236                            case the subsequence is at the end.
3237
3238                         */
3239
3240                         /* dont use tail as the end marker for this traverse */
3241                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3242                             regnode * const noper = NEXTOPER( cur );
3243 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3244                             regnode * const noper_next = regnext( noper );
3245 #endif
3246
3247                             DEBUG_OPTIMISE_r({
3248                                 regprop(RExC_rx, mysv, cur);
3249                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3250                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3251
3252                                 regprop(RExC_rx, mysv, noper);
3253                                 PerlIO_printf( Perl_debug_log, " -> %s",
3254                                     SvPV_nolen_const(mysv));
3255
3256                                 if ( noper_next ) {
3257                                   regprop(RExC_rx, mysv, noper_next );
3258                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3259                                     SvPV_nolen_const(mysv));
3260                                 }
3261                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3262                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3263                             });
3264                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3265                                          : PL_regkind[ OP( noper ) ] == EXACT )
3266                                   || OP(noper) == NOTHING )
3267 #ifdef NOJUMPTRIE
3268                                   && noper_next == tail
3269 #endif
3270                                   && count < U16_MAX)
3271                             {
3272                                 count++;
3273                                 if ( !first || optype == NOTHING ) {
3274                                     if (!first) first = cur;
3275                                     optype = OP( noper );
3276                                 } else {
3277                                     last = cur;
3278                                 }
3279                             } else {
3280 /*
3281     Currently the trie logic handles case insensitive matching properly only
3282     when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
3283     semantics).
3284
3285     If/when this is fixed the following define can be swapped
3286     in below to fully enable trie logic.
3287
3288 #define TRIE_TYPE_IS_SAFE 1
3289
3290 Note that join_exact() assumes that the other types of EXACTFish nodes are not
3291 used in tries, so that would have to be updated if this changed
3292
3293 */
3294 #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
3295
3296                                 if ( last && TRIE_TYPE_IS_SAFE ) {
3297                                     make_trie( pRExC_state,
3298                                             startbranch, first, cur, tail, count,
3299                                             optype, depth+1 );
3300                                 }
3301                                 if ( PL_regkind[ OP( noper ) ] == EXACT
3302 #ifdef NOJUMPTRIE
3303                                      && noper_next == tail
3304 #endif
3305                                 ){
3306                                     count = 1;
3307                                     first = cur;
3308                                     optype = OP( noper );
3309                                 } else {
3310                                     count = 0;
3311                                     first = NULL;
3312                                     optype = 0;
3313                                 }
3314                                 last = NULL;
3315                             }
3316                         }
3317                         DEBUG_OPTIMISE_r({
3318                             regprop(RExC_rx, mysv, cur);
3319                             PerlIO_printf( Perl_debug_log,
3320                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3321                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3322
3323                         });
3324
3325                         if ( last && TRIE_TYPE_IS_SAFE ) {
3326                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3327 #ifdef TRIE_STUDY_OPT
3328                             if ( ((made == MADE_EXACT_TRIE &&
3329                                  startbranch == first)
3330                                  || ( first_non_open == first )) &&
3331                                  depth==0 ) {
3332                                 flags |= SCF_TRIE_RESTUDY;
3333                                 if ( startbranch == first
3334                                      && scan == tail )
3335                                 {
3336                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3337                                 }
3338                             }
3339 #endif
3340                         }
3341                     }
3342
3343                 } /* do trie */
3344
3345             }
3346             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3347                 scan = NEXTOPER(NEXTOPER(scan));
3348             } else                      /* single branch is optimized. */
3349                 scan = NEXTOPER(scan);
3350             continue;
3351         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3352             scan_frame *newframe = NULL;
3353             I32 paren;
3354             regnode *start;
3355             regnode *end;
3356
3357             if (OP(scan) != SUSPEND) {
3358             /* set the pointer */
3359                 if (OP(scan) == GOSUB) {
3360                     paren = ARG(scan);
3361                     RExC_recurse[ARG2L(scan)] = scan;
3362                     start = RExC_open_parens[paren-1];
3363                     end   = RExC_close_parens[paren-1];
3364                 } else {
3365                     paren = 0;
3366                     start = RExC_rxi->program + 1;
3367                     end   = RExC_opend;
3368                 }
3369                 if (!recursed) {
3370                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3371                     SAVEFREEPV(recursed);
3372                 }
3373                 if (!PAREN_TEST(recursed,paren+1)) {
3374                     PAREN_SET(recursed,paren+1);
3375                     Newx(newframe,1,scan_frame);
3376                 } else {
3377                     if (flags & SCF_DO_SUBSTR) {
3378                         SCAN_COMMIT(pRExC_state,data,minlenp);
3379                         data->longest = &(data->longest_float);
3380                     }
3381                     is_inf = is_inf_internal = 1;
3382                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3383                         cl_anything(pRExC_state, data->start_class);
3384                     flags &= ~SCF_DO_STCLASS;
3385                 }
3386             } else {
3387                 Newx(newframe,1,scan_frame);
3388                 paren = stopparen;
3389                 start = scan+2;
3390                 end = regnext(scan);
3391             }
3392             if (newframe) {
3393                 assert(start);
3394                 assert(end);
3395                 SAVEFREEPV(newframe);
3396                 newframe->next = regnext(scan);
3397                 newframe->last = last;
3398                 newframe->stop = stopparen;
3399                 newframe->prev = frame;
3400
3401                 frame = newframe;
3402                 scan =  start;
3403                 stopparen = paren;
3404                 last = end;
3405
3406                 continue;
3407             }
3408         }
3409         else if (OP(scan) == EXACT) {
3410             I32 l = STR_LEN(scan);
3411             UV uc;
3412             if (UTF) {
3413                 const U8 * const s = (U8*)STRING(scan);
3414                 l = utf8_length(s, s + l);
3415                 uc = utf8_to_uvchr(s, NULL);
3416             } else {
3417                 uc = *((U8*)STRING(scan));
3418             }
3419             min += l;
3420             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3421                 /* The code below prefers earlier match for fixed
3422                    offset, later match for variable offset.  */
3423                 if (data->last_end == -1) { /* Update the start info. */
3424                     data->last_start_min = data->pos_min;
3425                     data->last_start_max = is_inf
3426                         ? I32_MAX : data->pos_min + data->pos_delta;
3427                 }
3428                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3429                 if (UTF)
3430                     SvUTF8_on(data->last_found);
3431                 {
3432                     SV * const sv = data->last_found;
3433                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3434                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3435                     if (mg && mg->mg_len >= 0)
3436                         mg->mg_len += utf8_length((U8*)STRING(scan),
3437                                                   (U8*)STRING(scan)+STR_LEN(scan));
3438                 }
3439                 data->last_end = data->pos_min + l;
3440                 data->pos_min += l; /* As in the first entry. */
3441                 data->flags &= ~SF_BEFORE_EOL;
3442             }
3443             if (flags & SCF_DO_STCLASS_AND) {
3444                 /* Check whether it is compatible with what we know already! */
3445                 int compat = 1;
3446
3447
3448                 /* If compatible, we or it in below.  It is compatible if it is
3449                  * in the bitmap and either 1) its bit or its fold is set, or 2)
3450                  * it's for a locale.  Even if there isn't unicode semantics
3451                  * here, at runtime there may be because of matching against a
3452                  * utf8 string, so accept a possible false positive for
3453                  * latin1-range folds */
3454                 if (uc >= 0x100 ||
3455                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3456                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3457                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3458                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3459                     )
3460                 {
3461                     compat = 0;
3462                 }
3463                 ANYOF_CLASS_ZERO(data->start_class);
3464                 ANYOF_BITMAP_ZERO(data->start_class);
3465                 if (compat)
3466                     ANYOF_BITMAP_SET(data->start_class, uc);
3467                 else if (uc >= 0x100) {
3468                     int i;
3469
3470                     /* Some Unicode code points fold to the Latin1 range; as
3471                      * XXX temporary code, instead of figuring out if this is
3472                      * one, just assume it is and set all the start class bits
3473                      * that could be some such above 255 code point's fold
3474                      * which will generate false positives.  As the code
3475                      * elsewhere that does compute the fold settles down, it
3476                      * can be extracted out and re-used here */
3477                     for (i = 0; i < 256; i++){
3478                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3479                             ANYOF_BITMAP_SET(data->start_class, i);
3480                         }
3481                     }
3482                 }
3483                 data->start_class->flags &= ~ANYOF_EOS;
3484                 if (uc < 0x100)
3485                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3486             }
3487             else if (flags & SCF_DO_STCLASS_OR) {
3488                 /* false positive possible if the class is case-folded */
3489                 if (uc < 0x100)
3490                     ANYOF_BITMAP_SET(data->start_class, uc);
3491                 else
3492                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3493                 data->start_class->flags &= ~ANYOF_EOS;
3494                 cl_and(data->start_class, and_withp);
3495             }
3496             flags &= ~SCF_DO_STCLASS;
3497         }
3498         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3499             I32 l = STR_LEN(scan);
3500             UV uc = *((U8*)STRING(scan));
3501
3502             /* Search for fixed substrings supports EXACT only. */
3503             if (flags & SCF_DO_SUBSTR) {
3504                 assert(data);
3505                 SCAN_COMMIT(pRExC_state, data, minlenp);
3506             }
3507             if (UTF) {
3508                 const U8 * const s = (U8 *)STRING(scan);
3509                 l = utf8_length(s, s + l);
3510                 uc = utf8_to_uvchr(s, NULL);
3511             }
3512             else if (has_exactf_sharp_s) {
3513                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3514             }
3515             min += l - min_subtract;
3516             if (min < 0) {
3517                 min = 0;
3518             }
3519             delta += min_subtract;
3520             if (flags & SCF_DO_SUBSTR) {
3521                 data->pos_min += l - min_subtract;
3522                 if (data->pos_min < 0) {
3523                     data->pos_min = 0;
3524                 }
3525                 data->pos_delta += min_subtract;
3526                 if (min_subtract) {
3527                     data->longest = &(data->longest_float);
3528                 }
3529             }
3530             if (flags & SCF_DO_STCLASS_AND) {
3531                 /* Check whether it is compatible with what we know already! */
3532                 int compat = 1;
3533                 if (uc >= 0x100 ||
3534                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3535                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3536                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3537                 {
3538                     compat = 0;
3539                 }
3540                 ANYOF_CLASS_ZERO(data->start_class);
3541                 ANYOF_BITMAP_ZERO(data->start_class);
3542                 if (compat) {
3543                     ANYOF_BITMAP_SET(data->start_class, uc);
3544                     data->start_class->flags &= ~ANYOF_EOS;
3545                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3546                     if (OP(scan) == EXACTFL) {
3547                         /* XXX This set is probably no longer necessary, and
3548                          * probably wrong as LOCALE now is on in the initial
3549                          * state */
3550                         data->start_class->flags |= ANYOF_LOCALE;
3551                     }
3552                     else {
3553
3554                         /* Also set the other member of the fold pair.  In case
3555                          * that unicode semantics is called for at runtime, use
3556                          * the full latin1 fold.  (Can't do this for locale,
3557                          * because not known until runtime) */
3558                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3559
3560                         /* All other (EXACTFL handled above) folds except under
3561                          * /iaa that include s, S, and sharp_s also may include
3562                          * the others */
3563                         if (OP(scan) != EXACTFA) {
3564                             if (uc == 's' || uc == 'S') {
3565                                 ANYOF_BITMAP_SET(data->start_class,
3566                                                  LATIN_SMALL_LETTER_SHARP_S);
3567                             }
3568                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3569                                 ANYOF_BITMAP_SET(data->start_class, 's');
3570                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3571                             }
3572                         }
3573                     }
3574                 }
3575                 else if (uc >= 0x100) {
3576                     int i;
3577                     for (i = 0; i < 256; i++){
3578                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3579                             ANYOF_BITMAP_SET(data->start_class, i);
3580                         }
3581                     }
3582                 }
3583             }
3584             else if (flags & SCF_DO_STCLASS_OR) {
3585                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3586                     /* false positive possible if the class is case-folded.
3587                        Assume that the locale settings are the same... */
3588                     if (uc < 0x100) {
3589                         ANYOF_BITMAP_SET(data->start_class, uc);
3590                         if (OP(scan) != EXACTFL) {
3591
3592                             /* And set the other member of the fold pair, but
3593                              * can't do that in locale because not known until
3594                              * run-time */
3595                             ANYOF_BITMAP_SET(data->start_class,
3596                                              PL_fold_latin1[uc]);
3597
3598                             /* All folds except under /iaa that include s, S,
3599                              * and sharp_s also may include the others */
3600                             if (OP(scan) != EXACTFA) {
3601                                 if (uc == 's' || uc == 'S') {
3602                                     ANYOF_BITMAP_SET(data->start_class,
3603                                                    LATIN_SMALL_LETTER_SHARP_S);
3604                                 }
3605                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3606                                     ANYOF_BITMAP_SET(data->start_class, 's');
3607                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3608                                 }
3609                             }
3610                         }
3611                     }
3612                     data->start_class->flags &= ~ANYOF_EOS;
3613                 }
3614                 cl_and(data->start_class, and_withp);
3615             }
3616             flags &= ~SCF_DO_STCLASS;
3617         }
3618         else if (REGNODE_VARIES(OP(scan))) {
3619             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3620             I32 f = flags, pos_before = 0;
3621             regnode * const oscan = scan;
3622             struct regnode_charclass_class this_class;
3623             struct regnode_charclass_class *oclass = NULL;
3624             I32 next_is_eval = 0;
3625
3626             switch (PL_regkind[OP(scan)]) {
3627             case WHILEM:                /* End of (?:...)* . */
3628                 scan = NEXTOPER(scan);
3629                 goto finish;
3630             case PLUS:
3631                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3632                     next = NEXTOPER(scan);
3633                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3634                         mincount = 1;
3635                         maxcount = REG_INFTY;
3636                         next = regnext(scan);
3637                         scan = NEXTOPER(scan);
3638                         goto do_curly;
3639                     }
3640                 }
3641                 if (flags & SCF_DO_SUBSTR)
3642                     data->pos_min++;
3643                 min++;
3644                 /* Fall through. */
3645             case STAR:
3646                 if (flags & SCF_DO_STCLASS) {
3647                     mincount = 0;
3648                     maxcount = REG_INFTY;
3649                     next = regnext(scan);
3650                     scan = NEXTOPER(scan);
3651                     goto do_curly;
3652                 }
3653                 is_inf = is_inf_internal = 1;
3654                 scan = regnext(scan);
3655                 if (flags & SCF_DO_SUBSTR) {
3656                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3657                     data->longest = &(data->longest_float);
3658                 }
3659                 goto optimize_curly_tail;
3660             case CURLY:
3661                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3662                     && (scan->flags == stopparen))
3663                 {
3664                     mincount = 1;
3665                     maxcount = 1;
3666                 } else {
3667                     mincount = ARG1(scan);
3668                     maxcount = ARG2(scan);
3669                 }
3670                 next = regnext(scan);
3671                 if (OP(scan) == CURLYX) {
3672                     I32 lp = (data ? *(data->last_closep) : 0);
3673                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3674                 }
3675                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3676                 next_is_eval = (OP(scan) == EVAL);
3677               do_curly:
3678                 if (flags & SCF_DO_SUBSTR) {
3679                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3680                     pos_before = data->pos_min;
3681                 }
3682                 if (data) {
3683                     fl = data->flags;
3684                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3685                     if (is_inf)
3686                         data->flags |= SF_IS_INF;
3687                 }
3688                 if (flags & SCF_DO_STCLASS) {
3689                     cl_init(pRExC_state, &this_class);
3690                     oclass = data->start_class;
3691                     data->start_class = &this_class;
3692                     f |= SCF_DO_STCLASS_AND;
3693                     f &= ~SCF_DO_STCLASS_OR;
3694                 }
3695                 /* Exclude from super-linear cache processing any {n,m}
3696                    regops for which the combination of input pos and regex
3697                    pos is not enough information to determine if a match
3698                    will be possible.
3699
3700                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3701                    regex pos at the \s*, the prospects for a match depend not
3702                    only on the input position but also on how many (bar\s*)
3703                    repeats into the {4,8} we are. */
3704                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3705                     f &= ~SCF_WHILEM_VISITED_POS;
3706
3707                 /* This will finish on WHILEM, setting scan, or on NULL: */
3708                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3709                                       last, data, stopparen, recursed, NULL,
3710                                       (mincount == 0
3711                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3712
3713                 if (flags & SCF_DO_STCLASS)
3714                     data->start_class = oclass;
3715                 if (mincount == 0 || minnext == 0) {
3716                     if (flags & SCF_DO_STCLASS_OR) {
3717                         cl_or(pRExC_state, data->start_class, &this_class);
3718                     }
3719                     else if (flags & SCF_DO_STCLASS_AND) {
3720                         /* Switch to OR mode: cache the old value of
3721                          * data->start_class */
3722                         INIT_AND_WITHP;
3723                         StructCopy(data->start_class, and_withp,
3724                                    struct regnode_charclass_class);
3725                         flags &= ~SCF_DO_STCLASS_AND;
3726                         StructCopy(&this_class, data->start_class,
3727                                    struct regnode_charclass_class);
3728                         flags |= SCF_DO_STCLASS_OR;
3729                         data->start_class->flags |= ANYOF_EOS;
3730                     }
3731                 } else {                /* Non-zero len */
3732                     if (flags & SCF_DO_STCLASS_OR) {
3733                         cl_or(pRExC_state, data->start_class, &this_class);
3734                         cl_and(data->start_class, and_withp);
3735                     }
3736                     else if (flags & SCF_DO_STCLASS_AND)
3737                         cl_and(data->start_class, &this_class);
3738                     flags &= ~SCF_DO_STCLASS;
3739                 }
3740                 if (!scan)              /* It was not CURLYX, but CURLY. */
3741                     scan = next;
3742                 if ( /* ? quantifier ok, except for (?{ ... }) */
3743                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3744                     && (minnext == 0) && (deltanext == 0)
3745                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3746                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3747                 {
3748                     ckWARNreg(RExC_parse,
3749                               "Quantifier unexpected on zero-length expression");
3750                 }
3751
3752                 min += minnext * mincount;
3753                 is_inf_internal |= ((maxcount == REG_INFTY
3754                                      && (minnext + deltanext) > 0)
3755                                     || deltanext == I32_MAX);
3756                 is_inf |= is_inf_internal;
3757                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3758
3759                 /* Try powerful optimization CURLYX => CURLYN. */
3760                 if (  OP(oscan) == CURLYX && data
3761                       && data->flags & SF_IN_PAR
3762                       && !(data->flags & SF_HAS_EVAL)
3763                       && !deltanext && minnext == 1 ) {
3764                     /* Try to optimize to CURLYN.  */
3765                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3766                     regnode * const nxt1 = nxt;
3767 #ifdef DEBUGGING
3768                     regnode *nxt2;
3769 #endif
3770
3771                     /* Skip open. */
3772                     nxt = regnext(nxt);
3773                     if (!REGNODE_SIMPLE(OP(nxt))
3774                         && !(PL_regkind[OP(nxt)] == EXACT
3775                              && STR_LEN(nxt) == 1))
3776                         goto nogo;
3777 #ifdef DEBUGGING
3778                     nxt2 = nxt;
3779 #endif
3780                     nxt = regnext(nxt);
3781                     if (OP(nxt) != CLOSE)
3782                         goto nogo;
3783                     if (RExC_open_parens) {
3784                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3785                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3786                     }
3787                     /* Now we know that nxt2 is the only contents: */
3788                     oscan->flags = (U8)ARG(nxt);
3789                     OP(oscan) = CURLYN;
3790                     OP(nxt1) = NOTHING; /* was OPEN. */
3791
3792 #ifdef DEBUGGING
3793                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3794                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3795                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3796                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3797                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3798                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3799 #endif
3800                 }
3801               nogo:
3802
3803                 /* Try optimization CURLYX => CURLYM. */
3804                 if (  OP(oscan) == CURLYX && data
3805                       && !(data->flags & SF_HAS_PAR)
3806                       && !(data->flags & SF_HAS_EVAL)
3807                       && !deltanext     /* atom is fixed width */
3808                       && minnext != 0   /* CURLYM can't handle zero width */
3809                 ) {
3810                     /* XXXX How to optimize if data == 0? */
3811                     /* Optimize to a simpler form.  */
3812                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3813                     regnode *nxt2;
3814
3815                     OP(oscan) = CURLYM;
3816                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3817                             && (OP(nxt2) != WHILEM))
3818                         nxt = nxt2;
3819                     OP(nxt2)  = SUCCEED; /* Was WHILEM */
3820                     /* Need to optimize away parenths. */
3821                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3822                         /* Set the parenth number.  */
3823                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3824
3825                         oscan->flags = (U8)ARG(nxt);
3826                         if (RExC_open_parens) {
3827                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3828                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3829                         }
3830                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3831                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3832
3833 #ifdef DEBUGGING
3834                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3835                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3836                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3837                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3838 #endif
3839 #if 0
3840                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3841                             regnode *nnxt = regnext(nxt1);
3842                             if (nnxt == nxt) {
3843                                 if (reg_off_by_arg[OP(nxt1)])
3844                                     ARG_SET(nxt1, nxt2 - nxt1);
3845                                 else if (nxt2 - nxt1 < U16_MAX)
3846                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3847                                 else
3848                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3849                             }
3850                             nxt1 = nnxt;
3851                         }
3852 #endif
3853                         /* Optimize again: */
3854                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3855                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3856                     }
3857                     else
3858                         oscan->flags = 0;
3859                 }
3860                 else if ((OP(oscan) == CURLYX)
3861                          && (flags & SCF_WHILEM_VISITED_POS)
3862                          /* See the comment on a similar expression above.
3863                             However, this time it's not a subexpression
3864                             we care about, but the expression itself. */
3865                          && (maxcount == REG_INFTY)
3866                          && data && ++data->whilem_c < 16) {
3867                     /* This stays as CURLYX, we can put the count/of pair. */
3868                     /* Find WHILEM (as in regexec.c) */
3869                     regnode *nxt = oscan + NEXT_OFF(oscan);
3870
3871                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3872                         nxt += ARG(nxt);
3873                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3874                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3875                 }
3876                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3877                     pars++;
3878                 if (flags & SCF_DO_SUBSTR) {
3879                     SV *last_str = NULL;
3880                     int counted = mincount != 0;
3881
3882                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3883 #if defined(SPARC64_GCC_WORKAROUND)
3884                         I32 b = 0;
3885                         STRLEN l = 0;
3886                         const char *s = NULL;
3887                         I32 old = 0;
3888
3889                         if (pos_before >= data->last_start_min)
3890                             b = pos_before;
3891                         else
3892                             b = data->last_start_min;
3893
3894                         l = 0;
3895                         s = SvPV_const(data->last_found, l);
3896                         old = b - data->last_start_min;
3897
3898 #else
3899                         I32 b = pos_before >= data->last_start_min
3900                             ? pos_before : data->last_start_min;
3901                         STRLEN l;
3902                         const char * const s = SvPV_const(data->last_found, l);
3903                         I32 old = b - data->last_start_min;
3904 #endif
3905
3906                         if (UTF)
3907                             old = utf8_hop((U8*)s, old) - (U8*)s;
3908                         l -= old;
3909                         /* Get the added string: */
3910                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3911                         if (deltanext == 0 && pos_before == b) {
3912                             /* What was added is a constant string */
3913                             if (mincount > 1) {
3914                                 SvGROW(last_str, (mincount * l) + 1);
3915                                 repeatcpy(SvPVX(last_str) + l,
3916                                           SvPVX_const(last_str), l, mincount - 1);
3917                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3918                                 /* Add additional parts. */
3919                                 SvCUR_set(data->last_found,
3920                                           SvCUR(data->last_found) - l);
3921                                 sv_catsv(data->last_found, last_str);
3922                                 {
3923                                     SV * sv = data->last_found;
3924                                     MAGIC *mg =
3925                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3926                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3927                                     if (mg && mg->mg_len >= 0)
3928                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3929                                 }
3930                                 data->last_end += l * (mincount - 1);
3931                             }
3932                         } else {
3933                             /* start offset must point into the last copy */
3934                             data->last_start_min += minnext * (mincount - 1);
3935                             data->last_start_max += is_inf ? I32_MAX
3936                                 : (maxcount - 1) * (minnext + data->pos_delta);
3937                         }
3938                     }
3939                     /* It is counted once already... */
3940                     data->pos_min += minnext * (mincount - counted);
3941                     data->pos_delta += - counted * deltanext +
3942                         (minnext + deltanext) * maxcount - minnext * mincount;
3943                     if (mincount != maxcount) {
3944                          /* Cannot extend fixed substrings found inside
3945                             the group.  */
3946                         SCAN_COMMIT(pRExC_state,data,minlenp);
3947                         if (mincount && last_str) {
3948                             SV * const sv = data->last_found;
3949                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3950                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3951
3952                             if (mg)
3953                                 mg->mg_len = -1;
3954                             sv_setsv(sv, last_str);
3955                             data->last_end = data->pos_min;
3956                             data->last_start_min =
3957                                 data->pos_min - CHR_SVLEN(last_str);
3958                             data->last_start_max = is_inf
3959                                 ? I32_MAX
3960                                 : data->pos_min + data->pos_delta
3961                                 - CHR_SVLEN(last_str);
3962                         }
3963                         data->longest = &(data->longest_float);
3964                     }
3965                     SvREFCNT_dec(last_str);
3966                 }
3967                 if (data && (fl & SF_HAS_EVAL))
3968                     data->flags |= SF_HAS_EVAL;
3969               optimize_curly_tail:
3970                 if (OP(oscan) != CURLYX) {
3971                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3972                            && NEXT_OFF(next))
3973                         NEXT_OFF(oscan) += NEXT_OFF(next);
3974                 }
3975                 continue;
3976             default:                    /* REF, ANYOFV, and CLUMP only? */
3977                 if (flags & SCF_DO_SUBSTR) {
3978                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3979                     data->longest = &(data->longest_float);
3980                 }
3981                 is_inf = is_inf_internal = 1;
3982                 if (flags & SCF_DO_STCLASS_OR)
3983                     cl_anything(pRExC_state, data->start_class);
3984                 flags &= ~SCF_DO_STCLASS;
3985                 break;
3986             }
3987         }
3988         else if (OP(scan) == LNBREAK) {
3989             if (flags & SCF_DO_STCLASS) {
3990                 int value = 0;
3991                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3992                 if (flags & SCF_DO_STCLASS_AND) {
3993                     for (value = 0; value < 256; value++)
3994                         if (!is_VERTWS_cp(value))
3995                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3996                 }
3997                 else {
3998                     for (value = 0; value < 256; value++)
3999                         if (is_VERTWS_cp(value))
4000                             ANYOF_BITMAP_SET(data->start_class, value);
4001                 }
4002                 if (flags & SCF_DO_STCLASS_OR)
4003                     cl_and(data->start_class, and_withp);
4004                 flags &= ~SCF_DO_STCLASS;
4005             }
4006             min += 1;
4007             delta += 1;
4008             if (flags & SCF_DO_SUBSTR) {
4009                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4010                 data->pos_min += 1;
4011                 data->pos_delta += 1;
4012                 data->longest = &(data->longest_float);
4013             }
4014         }
4015         else if (REGNODE_SIMPLE(OP(scan))) {
4016             int value = 0;
4017
4018             if (flags & SCF_DO_SUBSTR) {
4019                 SCAN_COMMIT(pRExC_state,data,minlenp);
4020                 data->pos_min++;
4021             }
4022             min++;
4023             if (flags & SCF_DO_STCLASS) {
4024                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4025
4026                 /* Some of the logic below assumes that switching
4027                    locale on will only add false positives. */
4028                 switch (PL_regkind[OP(scan)]) {
4029                 case SANY:
4030                 default:
4031                   do_default:
4032                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4033                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4034                         cl_anything(pRExC_state, data->start_class);
4035                     break;
4036                 case REG_ANY:
4037                     if (OP(scan) == SANY)
4038                         goto do_default;
4039                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4040                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4041                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4042                         cl_anything(pRExC_state, data->start_class);
4043                     }
4044                     if (flags & SCF_DO_STCLASS_AND || !value)
4045                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4046                     break;
4047                 case ANYOF:
4048                     if (flags & SCF_DO_STCLASS_AND)
4049                         cl_and(data->start_class,
4050                                (struct regnode_charclass_class*)scan);
4051                     else
4052                         cl_or(pRExC_state, data->start_class,
4053                               (struct regnode_charclass_class*)scan);
4054                     break;
4055                 case ALNUM:
4056                     if (flags & SCF_DO_STCLASS_AND) {
4057                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4058                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4059                             if (OP(scan) == ALNUMU) {
4060                                 for (value = 0; value < 256; value++) {
4061                                     if (!isWORDCHAR_L1(value)) {
4062                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4063                                     }
4064                                 }
4065                             } else {
4066                                 for (value = 0; value < 256; value++) {
4067                                     if (!isALNUM(value)) {
4068                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4069                                     }
4070                                 }
4071                             }
4072                         }
4073                     }
4074                     else {
4075                         if (data->start_class->flags & ANYOF_LOCALE)
4076                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4077
4078                         /* Even if under locale, set the bits for non-locale
4079                          * in case it isn't a true locale-node.  This will
4080                          * create false positives if it truly is locale */
4081                         if (OP(scan) == ALNUMU) {
4082                             for (value = 0; value < 256; value++) {
4083                                 if (isWORDCHAR_L1(value)) {
4084                                     ANYOF_BITMAP_SET(data->start_class, value);
4085                                 }
4086                             }
4087                         } else {
4088                             for (value = 0; value < 256; value++) {
4089                                 if (isALNUM(value)) {
4090                                     ANYOF_BITMAP_SET(data->start_class, value);
4091                                 }
4092                             }
4093                         }
4094                     }
4095                     break;
4096                 case NALNUM:
4097                     if (flags & SCF_DO_STCLASS_AND) {
4098                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4099                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4100                             if (OP(scan) == NALNUMU) {
4101                                 for (value = 0; value < 256; value++) {
4102                                     if (isWORDCHAR_L1(value)) {
4103                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4104                                     }
4105                                 }
4106                             } else {
4107                                 for (value = 0; value < 256; value++) {
4108                                     if (isALNUM(value)) {
4109                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4110                                     }
4111                                 }
4112                             }
4113                         }
4114                     }
4115                     else {
4116                         if (data->start_class->flags & ANYOF_LOCALE)
4117                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4118
4119                         /* Even if under locale, set the bits for non-locale in
4120                          * case it isn't a true locale-node.  This will create
4121                          * false positives if it truly is locale */
4122                         if (OP(scan) == NALNUMU) {
4123                             for (value = 0; value < 256; value++) {
4124                                 if (! isWORDCHAR_L1(value)) {
4125                                     ANYOF_BITMAP_SET(data->start_class, value);
4126                                 }
4127                             }
4128                         } else {
4129                             for (value = 0; value < 256; value++) {
4130                                 if (! isALNUM(value)) {
4131                                     ANYOF_BITMAP_SET(data->start_class, value);
4132                                 }
4133                             }
4134                         }
4135                     }
4136                     break;
4137                 case SPACE:
4138                     if (flags & SCF_DO_STCLASS_AND) {
4139                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4140                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4141                             if (OP(scan) == SPACEU) {
4142                                 for (value = 0; value < 256; value++) {
4143                                     if (!isSPACE_L1(value)) {
4144                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4145                                     }
4146                                 }
4147                             } else {
4148                                 for (value = 0; value < 256; value++) {
4149                                     if (!isSPACE(value)) {
4150                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4151                                     }
4152                                 }
4153                             }
4154                         }
4155                     }
4156                     else {
4157                         if (data->start_class->flags & ANYOF_LOCALE) {
4158                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4159                         }
4160                         if (OP(scan) == SPACEU) {
4161                             for (value = 0; value < 256; value++) {
4162                                 if (isSPACE_L1(value)) {
4163                                     ANYOF_BITMAP_SET(data->start_class, value);
4164                                 }
4165                             }
4166                         } else {
4167                             for (value = 0; value < 256; value++) {
4168                                 if (isSPACE(value)) {
4169                                     ANYOF_BITMAP_SET(data->start_class, value);
4170                                 }
4171                             }
4172                         }
4173                     }
4174                     break;
4175                 case NSPACE:
4176                     if (flags & SCF_DO_STCLASS_AND) {
4177                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4178                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4179                             if (OP(scan) == NSPACEU) {
4180                                 for (value = 0; value < 256; value++) {
4181                                     if (isSPACE_L1(value)) {
4182                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4183                                     }
4184                                 }
4185                             } else {
4186                                 for (value = 0; value < 256; value++) {
4187                                     if (isSPACE(value)) {
4188                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4189                                     }
4190                                 }
4191                             }
4192                         }
4193                     }
4194                     else {
4195                         if (data->start_class->flags & ANYOF_LOCALE)
4196                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4197                         if (OP(scan) == NSPACEU) {
4198                             for (value = 0; value < 256; value++) {
4199                                 if (!isSPACE_L1(value)) {
4200                                     ANYOF_BITMAP_SET(data->start_class, value);
4201                                 }
4202                             }
4203                         }
4204                         else {
4205                             for (value = 0; value < 256; value++) {
4206                                 if (!isSPACE(value)) {
4207                                     ANYOF_BITMAP_SET(data->start_class, value);
4208                                 }
4209                             }
4210                         }
4211                     }
4212                     break;
4213                 case DIGIT:
4214                     if (flags & SCF_DO_STCLASS_AND) {
4215                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4216                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4217                             for (value = 0; value < 256; value++)
4218                                 if (!isDIGIT(value))
4219                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4220                         }
4221                     }
4222                     else {
4223                         if (data->start_class->flags & ANYOF_LOCALE)
4224                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4225                         for (value = 0; value < 256; value++)
4226                             if (isDIGIT(value))
4227                                 ANYOF_BITMAP_SET(data->start_class, value);
4228                     }
4229                     break;
4230                 case NDIGIT:
4231                     if (flags & SCF_DO_STCLASS_AND) {
4232                         if (!(data->start_class->flags & ANYOF_LOCALE))
4233                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4234                         for (value = 0; value < 256; value++)
4235                             if (isDIGIT(value))
4236                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4237                     }
4238                     else {
4239                         if (data->start_class->flags & ANYOF_LOCALE)
4240                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4241                         for (value = 0; value < 256; value++)
4242                             if (!isDIGIT(value))
4243                                 ANYOF_BITMAP_SET(data->start_class, value);
4244                     }
4245                     break;
4246                 CASE_SYNST_FNC(VERTWS);
4247                 CASE_SYNST_FNC(HORIZWS);
4248
4249                 }
4250                 if (flags & SCF_DO_STCLASS_OR)
4251                     cl_and(data->start_class, and_withp);
4252                 flags &= ~SCF_DO_STCLASS;
4253             }
4254         }
4255         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4256             data->flags |= (OP(scan) == MEOL
4257                             ? SF_BEFORE_MEOL
4258                             : SF_BEFORE_SEOL);
4259         }
4260         else if (  PL_regkind[OP(scan)] == BRANCHJ
4261                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4262                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4263                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4264             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4265                 || OP(scan) == UNLESSM )
4266             {
4267                 /* Negative Lookahead/lookbehind
4268                    In this case we can't do fixed string optimisation.
4269                 */
4270
4271                 I32 deltanext, minnext, fake = 0;
4272                 regnode *nscan;
4273                 struct regnode_charclass_class intrnl;
4274                 int f = 0;
4275
4276                 data_fake.flags = 0;
4277                 if (data) {
4278                     data_fake.whilem_c = data->whilem_c;
4279                     data_fake.last_closep = data->last_closep;
4280                 }
4281                 else
4282                     data_fake.last_closep = &fake;
4283                 data_fake.pos_delta = delta;
4284                 if ( flags & SCF_DO_STCLASS && !scan->flags
4285                      && OP(scan) == IFMATCH ) { /* Lookahead */
4286                     cl_init(pRExC_state, &intrnl);
4287                     data_fake.start_class = &intrnl;
4288                     f |= SCF_DO_STCLASS_AND;
4289                 }
4290                 if (flags & SCF_WHILEM_VISITED_POS)
4291                     f |= SCF_WHILEM_VISITED_POS;
4292                 next = regnext(scan);
4293                 nscan = NEXTOPER(NEXTOPER(scan));
4294                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4295                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4296                 if (scan->flags) {
4297                     if (deltanext) {
4298                         FAIL("Variable length lookbehind not implemented");
4299                     }
4300                     else if (minnext > (I32)U8_MAX) {
4301                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4302                     }
4303                     scan->flags = (U8)minnext;
4304                 }
4305                 if (data) {
4306                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4307                         pars++;
4308                     if (data_fake.flags & SF_HAS_EVAL)
4309                         data->flags |= SF_HAS_EVAL;
4310                     data->whilem_c = data_fake.whilem_c;
4311                 }
4312                 if (f & SCF_DO_STCLASS_AND) {
4313                     if (flags & SCF_DO_STCLASS_OR) {
4314                         /* OR before, AND after: ideally we would recurse with
4315                          * data_fake to get the AND applied by study of the
4316                          * remainder of the pattern, and then derecurse;
4317                          * *** HACK *** for now just treat as "no information".
4318                          * See [perl #56690].
4319                          */
4320                         cl_init(pRExC_state, data->start_class);
4321                     }  else {
4322                         /* AND before and after: combine and continue */
4323                         const int was = (data->start_class->flags & ANYOF_EOS);
4324
4325                         cl_and(data->start_class, &intrnl);
4326                         if (was)
4327                             data->start_class->flags |= ANYOF_EOS;
4328                     }
4329                 }
4330             }
4331 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4332             else {
4333                 /* Positive Lookahead/lookbehind
4334                    In this case we can do fixed string optimisation,
4335                    but we must be careful about it. Note in the case of
4336                    lookbehind the positions will be offset by the minimum
4337                    length of the pattern, something we won't know about
4338                    until after the recurse.
4339                 */
4340                 I32 deltanext, fake = 0;
4341                 regnode *nscan;
4342                 struct regnode_charclass_class intrnl;
4343                 int f = 0;
4344                 /* We use SAVEFREEPV so that when the full compile
4345                     is finished perl will clean up the allocated
4346                     minlens when it's all done. This way we don't
4347                     have to worry about freeing them when we know
4348                     they won't be used, which would be a pain.
4349                  */
4350                 I32 *minnextp;
4351                 Newx( minnextp, 1, I32 );
4352                 SAVEFREEPV(minnextp);
4353
4354                 if (data) {
4355                     StructCopy(data, &data_fake, scan_data_t);
4356                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4357                         f |= SCF_DO_SUBSTR;
4358                         if (scan->flags)
4359                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4360                         data_fake.last_found=newSVsv(data->last_found);
4361                     }
4362                 }
4363                 else
4364                     data_fake.last_closep = &fake;
4365                 data_fake.flags = 0;
4366                 data_fake.pos_delta = delta;
4367                 if (is_inf)
4368                     data_fake.flags |= SF_IS_INF;
4369                 if ( flags & SCF_DO_STCLASS && !scan->flags
4370                      && OP(scan) == IFMATCH ) { /* Lookahead */
4371                     cl_init(pRExC_state, &intrnl);
4372                     data_fake.start_class = &intrnl;
4373                     f |= SCF_DO_STCLASS_AND;
4374                 }
4375                 if (flags & SCF_WHILEM_VISITED_POS)
4376                     f |= SCF_WHILEM_VISITED_POS;
4377                 next = regnext(scan);
4378                 nscan = NEXTOPER(NEXTOPER(scan));
4379
4380                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4381                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4382                 if (scan->flags) {
4383                     if (deltanext) {
4384                         FAIL("Variable length lookbehind not implemented");
4385                     }
4386                     else if (*minnextp > (I32)U8_MAX) {
4387                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4388                     }
4389                     scan->flags = (U8)*minnextp;
4390                 }
4391
4392                 *minnextp += min;
4393
4394                 if (f & SCF_DO_STCLASS_AND) {
4395                     const int was = (data->start_class->flags & ANYOF_EOS);
4396
4397                     cl_and(data->start_class, &intrnl);
4398                     if (was)
4399                         data->start_class->flags |= ANYOF_EOS;
4400                 }
4401                 if (data) {
4402                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4403                         pars++;
4404                     if (data_fake.flags & SF_HAS_EVAL)
4405                         data->flags |= SF_HAS_EVAL;
4406                     data->whilem_c = data_fake.whilem_c;
4407                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4408                         if (RExC_rx->minlen<*minnextp)
4409                             RExC_rx->minlen=*minnextp;
4410                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4411                         SvREFCNT_dec(data_fake.last_found);
4412
4413                         if ( data_fake.minlen_fixed != minlenp )
4414                         {
4415                             data->offset_fixed= data_fake.offset_fixed;
4416                             data->minlen_fixed= data_fake.minlen_fixed;
4417                             data->lookbehind_fixed+= scan->flags;
4418                         }
4419                         if ( data_fake.minlen_float != minlenp )
4420                         {
4421                             data->minlen_float= data_fake.minlen_float;
4422                             data->offset_float_min=data_fake.offset_float_min;
4423                             data->offset_float_max=data_fake.offset_float_max;
4424                             data->lookbehind_float+= scan->flags;
4425                         }
4426                     }
4427                 }
4428
4429
4430             }
4431 #endif
4432         }
4433         else if (OP(scan) == OPEN) {
4434             if (stopparen != (I32)ARG(scan))
4435                 pars++;
4436         }
4437         else if (OP(scan) == CLOSE) {
4438             if (stopparen == (I32)ARG(scan)) {
4439                 break;
4440             }
4441             if ((I32)ARG(scan) == is_par) {
4442                 next = regnext(scan);
4443
4444                 if ( next && (OP(next) != WHILEM) && next < last)
4445                     is_par = 0;         /* Disable optimization */
4446             }
4447             if (data)
4448                 *(data->last_closep) = ARG(scan);
4449         }
4450         else if (OP(scan) == EVAL) {
4451                 if (data)
4452                     data->flags |= SF_HAS_EVAL;
4453         }
4454         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4455             if (flags & SCF_DO_SUBSTR) {
4456                 SCAN_COMMIT(pRExC_state,data,minlenp);
4457                 flags &= ~SCF_DO_SUBSTR;
4458             }
4459             if (data && OP(scan)==ACCEPT) {
4460                 data->flags |= SCF_SEEN_ACCEPT;
4461                 if (stopmin > min)
4462                     stopmin = min;
4463             }
4464         }
4465         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4466         {
4467                 if (flags & SCF_DO_SUBSTR) {
4468                     SCAN_COMMIT(pRExC_state,data,minlenp);
4469                     data->longest = &(data->longest_float);
4470                 }
4471                 is_inf = is_inf_internal = 1;
4472                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4473                     cl_anything(pRExC_state, data->start_class);
4474                 flags &= ~SCF_DO_STCLASS;
4475         }
4476         else if (OP(scan) == GPOS) {
4477             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4478                 !(delta || is_inf || (data && data->pos_delta)))
4479             {
4480                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4481                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4482                 if (RExC_rx->gofs < (U32)min)
4483                     RExC_rx->gofs = min;
4484             } else {
4485                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4486                 RExC_rx->gofs = 0;
4487             }
4488         }
4489 #ifdef TRIE_STUDY_OPT
4490 #ifdef FULL_TRIE_STUDY
4491         else if (PL_regkind[OP(scan)] == TRIE) {
4492             /* NOTE - There is similar code to this block above for handling
4493                BRANCH nodes on the initial study.  If you change stuff here
4494                check there too. */
4495             regnode *trie_node= scan;
4496             regnode *tail= regnext(scan);
4497             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4498             I32 max1 = 0, min1 = I32_MAX;
4499             struct regnode_charclass_class accum;
4500
4501             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4502                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4503             if (flags & SCF_DO_STCLASS)
4504                 cl_init_zero(pRExC_state, &accum);
4505
4506             if (!trie->jump) {
4507                 min1= trie->minlen;
4508                 max1= trie->maxlen;
4509             } else {
4510                 const regnode *nextbranch= NULL;
4511                 U32 word;
4512
4513                 for ( word=1 ; word <= trie->wordcount ; word++)
4514                 {
4515                     I32 deltanext=0, minnext=0, f = 0, fake;
4516                     struct regnode_charclass_class this_class;
4517
4518                     data_fake.flags = 0;
4519                     if (data) {
4520                         data_fake.whilem_c = data->whilem_c;
4521                         data_fake.last_closep = data->last_closep;
4522                     }
4523                     else
4524                         data_fake.last_closep = &fake;
4525                     data_fake.pos_delta = delta;
4526                     if (flags & SCF_DO_STCLASS) {
4527                         cl_init(pRExC_state, &this_class);
4528                         data_fake.start_class = &this_class;
4529                         f = SCF_DO_STCLASS_AND;
4530                     }
4531                     if (flags & SCF_WHILEM_VISITED_POS)
4532                         f |= SCF_WHILEM_VISITED_POS;
4533
4534                     if (trie->jump[word]) {
4535                         if (!nextbranch)
4536                             nextbranch = trie_node + trie->jump[0];
4537                         scan= trie_node + trie->jump[word];
4538                         /* We go from the jump point to the branch that follows
4539                            it. Note this means we need the vestigial unused branches
4540                            even though they aren't otherwise used.
4541                          */
4542                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4543                             &deltanext, (regnode *)nextbranch, &data_fake,
4544                             stopparen, recursed, NULL, f,depth+1);
4545                     }
4546                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4547                         nextbranch= regnext((regnode*)nextbranch);
4548
4549                     if (min1 > (I32)(minnext + trie->minlen))
4550                         min1 = minnext + trie->minlen;
4551                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4552                         max1 = minnext + deltanext + trie->maxlen;
4553                     if (deltanext == I32_MAX)
4554                         is_inf = is_inf_internal = 1;
4555
4556                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4557                         pars++;
4558                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4559                         if ( stopmin > min + min1)
4560                             stopmin = min + min1;
4561                         flags &= ~SCF_DO_SUBSTR;
4562                         if (data)
4563                             data->flags |= SCF_SEEN_ACCEPT;
4564                     }
4565                     if (data) {
4566                         if (data_fake.flags & SF_HAS_EVAL)
4567                             data->flags |= SF_HAS_EVAL;
4568                         data->whilem_c = data_fake.whilem_c;
4569                     }
4570                     if (flags & SCF_DO_STCLASS)
4571                         cl_or(pRExC_state, &accum, &this_class);
4572                 }
4573             }
4574             if (flags & SCF_DO_SUBSTR) {
4575                 data->pos_min += min1;
4576                 data->pos_delta += max1 - min1;
4577                 if (max1 != min1 || is_inf)
4578                     data->longest = &(data->longest_float);
4579             }
4580             min += min1;
4581             delta += max1 - min1;
4582             if (flags & SCF_DO_STCLASS_OR) {
4583                 cl_or(pRExC_state, data->start_class, &accum);
4584                 if (min1) {
4585                     cl_and(data->start_class, and_withp);
4586                     flags &= ~SCF_DO_STCLASS;
4587                 }
4588             }
4589             else if (flags & SCF_DO_STCLASS_AND) {
4590                 if (min1) {
4591                     cl_and(data->start_class, &accum);
4592                     flags &= ~SCF_DO_STCLASS;
4593                 }
4594                 else {
4595                     /* Switch to OR mode: cache the old value of
4596                      * data->start_class */
4597                     INIT_AND_WITHP;
4598                     StructCopy(data->start_class, and_withp,
4599                                struct regnode_charclass_class);
4600                     flags &= ~SCF_DO_STCLASS_AND;
4601                     StructCopy(&accum, data->start_class,
4602                                struct regnode_charclass_class);
4603                     flags |= SCF_DO_STCLASS_OR;
4604                     data->start_class->flags |= ANYOF_EOS;
4605                 }
4606             }
4607             scan= tail;
4608             continue;
4609         }
4610 #else
4611         else if (PL_regkind[OP(scan)] == TRIE) {
4612             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4613             U8*bang=NULL;
4614
4615             min += trie->minlen;
4616             delta += (trie->maxlen - trie->minlen);
4617             flags &= ~SCF_DO_STCLASS; /* xxx */
4618             if (flags & SCF_DO_SUBSTR) {
4619                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4620                 data->pos_min += trie->minlen;
4621                 data->pos_delta += (trie->maxlen - trie->minlen);
4622                 if (trie->maxlen != trie->minlen)
4623                     data->longest = &(data->longest_float);
4624             }
4625             if (trie->jump) /* no more substrings -- for now /grr*/
4626                 flags &= ~SCF_DO_SUBSTR;
4627         }
4628 #endif /* old or new */
4629 #endif /* TRIE_STUDY_OPT */
4630
4631         /* Else: zero-length, ignore. */
4632         scan = regnext(scan);
4633     }
4634     if (frame) {
4635         last = frame->last;
4636         scan = frame->next;
4637         stopparen = frame->stop;
4638         frame = frame->prev;
4639         goto fake_study_recurse;
4640     }
4641
4642   finish:
4643     assert(!frame);
4644     DEBUG_STUDYDATA("pre-fin:",data,depth);
4645
4646     *scanp = scan;
4647     *deltap = is_inf_internal ? I32_MAX : delta;
4648     if (flags & SCF_DO_SUBSTR && is_inf)
4649         data->pos_delta = I32_MAX - data->pos_min;
4650     if (is_par > (I32)U8_MAX)
4651         is_par = 0;
4652     if (is_par && pars==1 && data) {
4653         data->flags |= SF_IN_PAR;
4654         data->flags &= ~SF_HAS_PAR;
4655     }
4656     else if (pars && data) {
4657         data->flags |= SF_HAS_PAR;
4658         data->flags &= ~SF_IN_PAR;
4659     }
4660     if (flags & SCF_DO_STCLASS_OR)
4661         cl_and(data->start_class, and_withp);
4662     if (flags & SCF_TRIE_RESTUDY)
4663         data->flags |=  SCF_TRIE_RESTUDY;
4664
4665     DEBUG_STUDYDATA("post-fin:",data,depth);
4666
4667     return min < stopmin ? min : stopmin;
4668 }
4669
/* S_add_data - append n new entries to the regexp's auxiliary data array
 * (RExC_rxi->data).
 *
 * pRExC_state  regex compile state (presumably what the RExC_* accessor
 *              macros used below operate on -- confirm in the macro defs).
 * n            number of entries to add.
 * s            n bytes that are copied verbatim into data->what for the new
 *              entries (presumably one type code per entry -- confirm with
 *              callers).
 *
 * The reg_data block is resized and data->what is grown (or allocated for
 * the first time) to count + n bytes.  The "- 1" in the size computation
 * presumably accounts for one pointer slot already declared inside
 * struct reg_data -- TODO confirm against its definition.  Nothing in this
 * function writes to the new pointer slots; only 'what' and 'count' are set.
 *
 * Returns the previous entry count, i.e. the index of the first new entry.
 */
4670 STATIC U32
4671 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4672 {
4673     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; /* entries so far; 0 if no array yet */
4674
4675     PERL_ARGS_ASSERT_ADD_DATA;
4676
4677     Renewc(RExC_rxi->data,
4678            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4679            char, struct reg_data);
4680     if(count)   /* 'what' already exists: extend it */
4681         Renew(RExC_rxi->data->what, count + n, U8);
4682     else        /* first call: 'what' has never been allocated */
4683         Newx(RExC_rxi->data->what, n, U8);
4684     RExC_rxi->data->count = count + n;
4685     Copy(s, RExC_rxi->data->what + count, n, U8);   /* fill only the new tail */
4686     return count;
4687 }
4688
4689 /* XXX: TODO: exclude this function from non-debugging perl builds */
4690 #ifndef PERL_IN_XSUB_RE
/*
 * Perl_reginitcolors - fill PL_colors[0..5] from the PERL_RE_COLORS
 * environment variable and mark the table as initialised.
 *
 * When the variable is set, a private copy of it is split in place at TAB
 * characters and PL_colors[i] is pointed at the i-th field of that copy.
 * Fields that are missing (fewer than five TABs) become "".  When the
 * variable is not set, all six entries become "".
 *
 * The savepv() copy is not freed in this function; PL_colors[] keeps
 * pointing into it.
 */
4691 void
4692 Perl_reginitcolors(pTHX)
4693 {
4694     dVAR;
4695     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4696     if (s) {
4697         char *t = savepv(s);    /* writable copy: TABs are overwritten below */
4698         int i = 0;
4699         PL_colors[0] = t;
4700         while (++i < 6) {
4701             t = strchr(t, '\t');
4702             if (t) {
4703                 *t = '\0';      /* terminate the previous field */
4704                 PL_colors[i] = ++t;     /* next field starts just past the TAB */
4705             }
4706             else
4707                 PL_colors[i] = t = (char *)"";  /* out of fields; t stays "" so the rest get "" too */
4708         }
4709     } else {
4710         int i = 0;
4711         while (i < 6)
4712             PL_colors[i++] = (char *)"";
4713     }
4714     PL_colorset = 1;    /* checked elsewhere in this file before calling reginitcolors() */
4715 }
4716 #endif
4717
4718
/* CHECK_RESTUDY_GOTO - for use inside a function that has a variable 'data'
 * with a 'flags' member, an integer 'restudied' and a 'reStudy:' label.
 *
 * With TRIE_STUDY_OPT: jumps to reStudy when data.flags has
 * SCF_TRIE_RESTUDY set and 'restudied' is still zero.  'restudied' is
 * post-incremented every time the flag is found set, so later expansions
 * fall through instead of jumping again.
 * Without TRIE_STUDY_OPT: expands to nothing.
 *
 * NOTE(review): the expansion is a bare 'if' with no 'else' and no
 * do/while wrapper, so it is only safe as a complete statement
 * ("CHECK_RESTUDY_GOTO;"), not as the body of another if/else.
 */
4719 #ifdef TRIE_STUDY_OPT
4720 #define CHECK_RESTUDY_GOTO                                  \
4721         if (                                                \
4722               (data.flags & SCF_TRIE_RESTUDY)               \
4723               && ! restudied++                              \
4724         )     goto reStudy
4725 #else
4726 #define CHECK_RESTUDY_GOTO
4727 #endif
4728
4729 /*
4730  - pregcomp - compile a regular expression into internal code
4731  *
4732  * We can't allocate space until we know how big the compiled form will be,
4733  * but we can't compile it (and thus know how big it is) until we've got a
4734  * place to put the code.  So we cheat:  we compile it twice, once with code
4735  * generation turned off and size counting turned on, and once "for real".
4736  * This also means that we don't allocate space until we are sure that the
4737  * thing really will compile successfully, and we never have to move the
4738  * code and thus invalidate pointers into it.  (Note that it has to be in
4739  * one piece because free() must be able to free it all.) [NB: not true in perl]
4740  *
4741  * Beware that the optimization-preparation code in here knows about some
4742  * of the structure of the compiled regexp.  [I'll say.]
4743  */
4744
4745
4746
/* RE_ENGINE_PTR - address of the regexp_engine structure that this build of
 * the file stores into a compiled pattern's 'engine' field:
 * PL_core_reg_engine normally, or my_reg_engine (declared extern here, so
 * defined in some other translation unit) when PERL_IN_XSUB_RE is defined.
 */
4747 #ifndef PERL_IN_XSUB_RE
4748 #define RE_ENGINE_PTR &PL_core_reg_engine
4749 #else
4750 extern const struct regexp_engine my_reg_engine;
4751 #define RE_ENGINE_PTR &my_reg_engine
4752 #endif
4753
4754 #ifndef PERL_IN_XSUB_RE
/*
 * Perl_pregcomp - compile 'pattern' with whichever regexp engine is in
 * effect.
 *
 * If the hints hash (GvHV(PL_hintgv)) has a "regcomp" entry holding a
 * non-zero integer, that integer is taken to be the address of a
 * regexp_engine and compilation is handed to it via CALLREGCOMP_ENG();
 * otherwise the pattern goes to Perl_re_compile().
 *
 * pattern  SV holding the pattern source.
 * flags    passed through unchanged to whichever engine is chosen.
 *
 * Returns whatever the selected compile routine returns.
 *
 * The debug trace prints the engine address as PTR2UV(eng) so that the
 * argument is a UV, matching the %UVxf conversion, and so that SvIV() is
 * not evaluated an extra time just for the message.
 */
4755 REGEXP *
4756 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4757 {
4758     dVAR;
4759     HV * const table = GvHV(PL_hintgv);
4760
4761     PERL_ARGS_ASSERT_PREGCOMP;
4762
4763     /* Dispatch a request to compile a regexp to correct
4764        regexp engine. */
4765     if (table) {
4766         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4767         GET_RE_DEBUG_FLAGS_DECL;
4768         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4769             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4770             DEBUG_COMPILE_r({
4771                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4772                     PTR2UV(eng));
4773             });
4774             return CALLREGCOMP_ENG(eng, pattern, flags);
4775         }
4776     }
4777     return Perl_re_compile(aTHX_ pattern, flags);   /* no override: use this file's compiler */
4778 }
4779 #endif
4780
4781 REGEXP *
4782 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4783 {
4784     dVAR;
4785     REGEXP *rx;
4786     struct regexp *r;
4787     register regexp_internal *ri;
4788     STRLEN plen;
4789     char* VOL exp;
4790     char* xend;
4791     regnode *scan;
4792     I32 flags;
4793     I32 minlen = 0;
4794     U32 pm_flags;
4795
4796     /* these are all flags - maybe they should be turned
4797      * into a single int with different bit masks */
4798     I32 sawlookahead = 0;
4799     I32 sawplus = 0;
4800     I32 sawopen = 0;
4801     bool used_setjump = FALSE;
4802     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4803
4804     U8 jump_ret = 0;
4805     dJMPENV;
4806     scan_data_t data;
4807     RExC_state_t RExC_state;
4808     RExC_state_t * const pRExC_state = &RExC_state;
4809 #ifdef TRIE_STUDY_OPT
4810     int restudied;
4811     RExC_state_t copyRExC_state;
4812 #endif
4813     GET_RE_DEBUG_FLAGS_DECL;
4814
4815     PERL_ARGS_ASSERT_RE_COMPILE;
4816
4817     DEBUG_r(if (!PL_colorset) reginitcolors());
4818
4819 #ifndef PERL_IN_XSUB_RE
4820     /* Initialize these here instead of as-needed, as is quick and avoids
4821      * having to test them each time otherwise */
4822     if (! PL_AboveLatin1) {
4823         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4824         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4825         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
4826
4827         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4828         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4829
4830         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4831         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4832
4833         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4834         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4835
4836         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4837
4838         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4839         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4840
4841         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4842
4843         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4844         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4845
4846         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4847         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4848
4849         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4850         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4851
4852         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4853         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4854
4855         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4856         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4857
4858         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4859         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4860
4861         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4862         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4863
4864         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4865         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4866
4867         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4868
4869         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4870         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4871
4872         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4873         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
4874     }
4875 #endif
4876
4877     exp = SvPV(pattern, plen);
4878
4879     if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4880         RExC_utf8 = RExC_orig_utf8 = 0;
4881     }
4882     else {
4883         RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4884     }
4885     RExC_uni_semantics = 0;
4886     RExC_contains_locale = 0;
4887
4888     /****************** LONG JUMP TARGET HERE***********************/
4889     /* Longjmp back to here if have to switch in midstream to utf8 */
4890     if (! RExC_orig_utf8) {
4891         JMPENV_PUSH(jump_ret);
4892         used_setjump = TRUE;
4893     }
4894
4895     if (jump_ret == 0) {    /* First time through */
4896         xend = exp + plen;
4897
4898         DEBUG_COMPILE_r({
4899             SV *dsv= sv_newmortal();
4900             RE_PV_QUOTED_DECL(s, RExC_utf8,
4901                 dsv, exp, plen, 60);
4902             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4903                            PL_colors[4],PL_colors[5],s);
4904         });
4905     }
4906     else {  /* longjumped back */
4907         STRLEN len = plen;
4908
4909         /* If the cause for the longjmp was other than changing to utf8, pop
4910          * our own setjmp, and longjmp to the correct handler */
4911         if (jump_ret != UTF8_LONGJMP) {
4912             JMPENV_POP;
4913             JMPENV_JUMP(jump_ret);
4914         }
4915
4916         GET_RE_DEBUG_FLAGS;
4917
4918         /* It's possible to write a regexp in ascii that represents Unicode
4919         codepoints outside of the byte range, such as via \x{100}. If we
4920         detect such a sequence we have to convert the entire pattern to utf8
4921         and then recompile, as our sizing calculation will have been based
4922         on 1 byte == 1 character, but we will need to use utf8 to encode
4923         at least some part of the pattern, and therefore must convert the whole
4924         thing.
4925         -- dmq */
4926         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4927             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4928         exp = (char*)Perl_bytes_to_utf8(aTHX_
4929                                         (U8*)SvPV_nomg(pattern, plen),
4930                                         &len);
4931         xend = exp + len;
4932         RExC_orig_utf8 = RExC_utf8 = 1;
4933         SAVEFREEPV(exp);
4934     }
4935
4936 #ifdef TRIE_STUDY_OPT
4937     restudied = 0;
4938 #endif
4939
4940     pm_flags = orig_pm_flags;
4941
4942     if (initial_charset == REGEX_LOCALE_CHARSET) {
4943         RExC_contains_locale = 1;
4944     }
4945     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4946
4947         /* Set to use unicode semantics if the pattern is in utf8 and has the
4948          * 'depends' charset specified, as it means unicode when utf8  */
4949         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4950     }
4951
4952     RExC_precomp = exp;
4953     RExC_flags = pm_flags;
4954     RExC_sawback = 0;
4955
4956     RExC_seen = 0;
4957     RExC_in_lookbehind = 0;
4958     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4959     RExC_seen_evals = 0;
4960     RExC_extralen = 0;
4961     RExC_override_recoding = 0;
4962
4963     /* First pass: determine size, legality. */
4964     RExC_parse = exp;
4965     RExC_start = exp;
4966     RExC_end = xend;
4967     RExC_naughty = 0;
4968     RExC_npar = 1;
4969     RExC_nestroot = 0;
4970     RExC_size = 0L;
4971     RExC_emit = &PL_regdummy;
4972     RExC_whilem_seen = 0;
4973     RExC_open_parens = NULL;
4974     RExC_close_parens = NULL;
4975     RExC_opend = NULL;
4976     RExC_paren_names = NULL;
4977 #ifdef DEBUGGING
4978     RExC_paren_name_list = NULL;
4979 #endif
4980     RExC_recurse = NULL;
4981     RExC_recurse_count = 0;
4982
4983 #if 0 /* REGC() is (currently) a NOP at the first pass.
4984        * Clever compilers notice this and complain. --jhi */
4985     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4986 #endif
4987     DEBUG_PARSE_r(
4988         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
4989         RExC_lastnum=0;
4990         RExC_lastparse=NULL;
4991     );
4992     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4993         RExC_precomp = NULL;
4994         return(NULL);
4995     }
4996
4997     /* Here, finished first pass.  Get rid of any added setjmp */
4998     if (used_setjump) {
4999         JMPENV_POP;
5000     }
5001
5002     DEBUG_PARSE_r({
5003         PerlIO_printf(Perl_debug_log,
5004             "Required size %"IVdf" nodes\n"
5005             "Starting second pass (creation)\n",
5006             (IV)RExC_size);
5007         RExC_lastnum=0;
5008         RExC_lastparse=NULL;
5009     });
5010
5011     /* The first pass could have found things that force Unicode semantics */
5012     if ((RExC_utf8 || RExC_uni_semantics)
5013          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5014     {
5015         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5016     }
5017
5018     /* Small enough for pointer-storage convention?
5019        If extralen==0, this means that we will not need long jumps. */
5020     if (RExC_size >= 0x10000L && RExC_extralen)
5021         RExC_size += RExC_extralen;
5022     else
5023         RExC_extralen = 0;
5024     if (RExC_whilem_seen > 15)
5025         RExC_whilem_seen = 15;
5026
5027     /* Allocate space and zero-initialize. Note, the two step process
5028        of zeroing when in debug mode, thus anything assigned has to
5029        happen after that */
5030     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5031     r = (struct regexp*)SvANY(rx);
5032     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5033          char, regexp_internal);
5034     if ( r == NULL || ri == NULL )
5035         FAIL("Regexp out of space");
5036 #ifdef DEBUGGING
5037     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5038     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5039 #else
5040     /* bulk initialize base fields with 0. */
5041     Zero(ri, sizeof(regexp_internal), char);
5042 #endif
5043
5044     /* non-zero initialization begins here */
5045     RXi_SET( r, ri );
5046     r->engine= RE_ENGINE_PTR;
5047     r->extflags = pm_flags;
5048     {
5049         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5050         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5051
5052         /* The caret is output if there are any defaults: if not all the STD
5053          * flags are set, or if no character set specifier is needed */
5054         bool has_default =
5055                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5056                     || ! has_charset);
5057         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5058         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5059                             >> RXf_PMf_STD_PMMOD_SHIFT);
5060         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5061         char *p;
5062         /* Allocate for the worst case, which is all the std flags are turned
5063          * on.  If more precision is desired, we could do a population count of
5064          * the flags set.  This could be done with a small lookup table, or by
5065          * shifting, masking and adding, or even, when available, assembly
5066          * language for a machine-language population count.
5067          * We never output a minus, as all those are defaults, so are
5068          * covered by the caret */
5069         const STRLEN wraplen = plen + has_p + has_runon
5070             + has_default       /* If needs a caret */
5071
5072                 /* If needs a character set specifier */
5073             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5074             + (sizeof(STD_PAT_MODS) - 1)
5075             + (sizeof("(?:)") - 1);
5076
5077         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5078         SvPOK_on(rx);
5079         SvFLAGS(rx) |= SvUTF8(pattern);
5080         *p++='('; *p++='?';
5081
5082         /* If a default, cover it using the caret */
5083         if (has_default) {
5084             *p++= DEFAULT_PAT_MOD;
5085         }
5086         if (has_charset) {
5087             STRLEN len;
5088             const char* const name = get_regex_charset_name(r->extflags, &len);
5089             Copy(name, p, len, char);
5090             p += len;
5091         }
5092         if (has_p)
5093             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5094         {
5095             char ch;
5096             while((ch = *fptr++)) {
5097                 if(reganch & 1)
5098                     *p++ = ch;
5099                 reganch >>= 1;
5100             }
5101         }
5102
5103         *p++ = ':';
5104         Copy(RExC_precomp, p, plen, char);
5105         assert ((RX_WRAPPED(rx) - p) < 16);
5106         r->pre_prefix = p - RX_WRAPPED(rx);
5107         p += plen;
5108         if (has_runon)
5109             *p++ = '\n';
5110         *p++ = ')';
5111         *p = 0;
5112         SvCUR_set(rx, p - SvPVX_const(rx));
5113     }
5114
5115     r->intflags = 0;
5116     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5117
5118     if (RExC_seen & REG_SEEN_RECURSE) {
5119         Newxz(RExC_open_parens, RExC_npar,regnode *);
5120         SAVEFREEPV(RExC_open_parens);
5121         Newxz(RExC_close_parens,RExC_npar,regnode *);
5122         SAVEFREEPV(RExC_close_parens);
5123     }
5124
5125     /* Useful during FAIL. */
5126 #ifdef RE_TRACK_PATTERN_OFFSETS
5127     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5128     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5129                           "%s %"UVuf" bytes for offset annotations.\n",
5130                           ri->u.offsets ? "Got" : "Couldn't get",
5131                           (UV)((2*RExC_size+1) * sizeof(U32))));
5132 #endif
5133     SetProgLen(ri,RExC_size);
5134     RExC_rx_sv = rx;
5135     RExC_rx = r;
5136     RExC_rxi = ri;
5137     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5138
5139     /* Second pass: emit code. */
5140     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
5141     RExC_parse = exp;
5142     RExC_end = xend;
5143     RExC_naughty = 0;
5144     RExC_npar = 1;
5145     RExC_emit_start = ri->program;
5146     RExC_emit = ri->program;
5147     RExC_emit_bound = ri->program + RExC_size + 1;
5148
5149     /* Store the count of eval-groups for security checks: */
5150     RExC_rx->seen_evals = RExC_seen_evals;
5151     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5152     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5153         ReREFCNT_dec(rx);
5154         return(NULL);
5155     }
5156     /* XXXX To minimize changes to RE engine we always allocate
5157        3-units-long substrs field. */
5158     Newx(r->substrs, 1, struct reg_substr_data);
5159     if (RExC_recurse_count) {
5160         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5161         SAVEFREEPV(RExC_recurse);
5162     }
5163
5164 reStudy:
5165     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5166     Zero(r->substrs, 1, struct reg_substr_data);
5167
5168 #ifdef TRIE_STUDY_OPT
5169     if (!restudied) {
5170         StructCopy(&zero_scan_data, &data, scan_data_t);
5171         copyRExC_state = RExC_state;
5172     } else {
5173         U32 seen=RExC_seen;
5174         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5175
5176         RExC_state = copyRExC_state;
5177         if (seen & REG_TOP_LEVEL_BRANCHES)
5178             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5179         else
5180             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5181         if (data.last_found) {
5182             SvREFCNT_dec(data.longest_fixed);
5183             SvREFCNT_dec(data.longest_float);
5184             SvREFCNT_dec(data.last_found);
5185         }
5186         StructCopy(&zero_scan_data, &data, scan_data_t);
5187     }
5188 #else
5189     StructCopy(&zero_scan_data, &data, scan_data_t);
5190 #endif
5191
5192     /* Dig out information for optimizations. */
5193     r->extflags = RExC_flags; /* was pm_op */
5194     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5195
5196     if (UTF)
5197         SvUTF8_on(rx);  /* Unicode in it? */
5198     ri->regstclass = NULL;
5199     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5200         r->intflags |= PREGf_NAUGHTY;
5201     scan = ri->program + 1;             /* First BRANCH. */
5202
5203     /* testing for BRANCH here tells us whether there is "must appear"
5204        data in the pattern. If there is then we can use it for optimisations */
5205     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5206         I32 fake;
5207         STRLEN longest_float_length, longest_fixed_length;
5208         struct regnode_charclass_class ch_class; /* pointed to by data */
5209         int stclass_flag;
5210         I32 last_close = 0; /* pointed to by data */
5211         regnode *first= scan;
5212         regnode *first_next= regnext(first);
5213         /*
5214          * Skip introductions and multiplicators >= 1
5215          * so that we can extract the 'meat' of the pattern that must
5216          * match in the large if() sequence following.
5217          * NOTE that EXACT is NOT covered here, as it is normally
5218          * picked up by the optimiser separately.
5219          *
5220          * This is unfortunate as the optimiser isn't handling lookahead
5221          * properly currently.
5222          *
5223          */
5224         while ((OP(first) == OPEN && (sawopen = 1)) ||
5225                /* An OR of *one* alternative - should not happen now. */
5226             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5227             /* for now we can't handle lookbehind IFMATCH*/
5228             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5229             (OP(first) == PLUS) ||
5230             (OP(first) == MINMOD) ||
5231                /* An {n,m} with n>0 */
5232             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5233             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5234         {
5235                 /*
5236                  * the only op that could be a regnode is PLUS, all the rest
5237                  * will be regnode_1 or regnode_2.
5238                  *
5239                  */
5240                 if (OP(first) == PLUS)
5241                     sawplus = 1;
5242                 else
5243                     first += regarglen[OP(first)];
5244
5245                 first = NEXTOPER(first);
5246                 first_next= regnext(first);
5247         }
5248
5249         /* Starting-point info. */
5250       again:
5251         DEBUG_PEEP("first:",first,0);
5252         /* Ignore EXACT as we deal with it later. */
5253         if (PL_regkind[OP(first)] == EXACT) {
5254             if (OP(first) == EXACT)
5255                 NOOP;   /* Empty, get anchored substr later. */
5256             else
5257                 ri->regstclass = first;
5258         }
5259 #ifdef TRIE_STCLASS
5260         else if (PL_regkind[OP(first)] == TRIE &&
5261                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5262         {
5263             regnode *trie_op;
5264             /* this can happen only on restudy */
5265             if ( OP(first) == TRIE ) {
5266                 struct regnode_1 *trieop = (struct regnode_1 *)
5267                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
5268                 StructCopy(first,trieop,struct regnode_1);
5269                 trie_op=(regnode *)trieop;
5270             } else {
5271                 struct regnode_charclass *trieop = (struct regnode_charclass *)
5272                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5273                 StructCopy(first,trieop,struct regnode_charclass);
5274                 trie_op=(regnode *)trieop;
5275             }
5276             OP(trie_op)+=2;
5277             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5278             ri->regstclass = trie_op;
5279         }
5280 #endif
5281         else if (REGNODE_SIMPLE(OP(first)))
5282             ri->regstclass = first;
5283         else if (PL_regkind[OP(first)] == BOUND ||
5284                  PL_regkind[OP(first)] == NBOUND)
5285             ri->regstclass = first;
5286         else if (PL_regkind[OP(first)] == BOL) {
5287             r->extflags |= (OP(first) == MBOL
5288                            ? RXf_ANCH_MBOL
5289                            : (OP(first) == SBOL
5290                               ? RXf_ANCH_SBOL
5291                               : RXf_ANCH_BOL));
5292             first = NEXTOPER(first);
5293             goto again;
5294         }
5295         else if (OP(first) == GPOS) {
5296             r->extflags |= RXf_ANCH_GPOS;
5297             first = NEXTOPER(first);
5298             goto again;
5299         }
5300         else if ((!sawopen || !RExC_sawback) &&
5301             (OP(first) == STAR &&
5302             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5303             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5304         {
5305             /* turn .* into ^.* with an implied $*=1 */
5306             const int type =
5307                 (OP(NEXTOPER(first)) == REG_ANY)
5308                     ? RXf_ANCH_MBOL
5309                     : RXf_ANCH_SBOL;
5310             r->extflags |= type;
5311             r->intflags |= PREGf_IMPLICIT;
5312             first = NEXTOPER(first);
5313             goto again;
5314         }
5315         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5316             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5317             /* x+ must match at the 1st pos of run of x's */
5318             r->intflags |= PREGf_SKIP;
5319
5320         /* Scan is after the zeroth branch, first is atomic matcher. */
5321 #ifdef TRIE_STUDY_OPT
5322         DEBUG_PARSE_r(
5323             if (!restudied)
5324                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5325                               (IV)(first - scan + 1))
5326         );
5327 #else
5328         DEBUG_PARSE_r(
5329             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5330                 (IV)(first - scan + 1))
5331         );
5332 #endif
5333
5334
5335         /*
5336         * If there's something expensive in the r.e., find the
5337         * longest literal string that must appear and make it the
5338         * regmust.  Resolve ties in favor of later strings, since
5339         * the regstart check works with the beginning of the r.e.
5340         * and avoiding duplication strengthens checking.  Not a
5341         * strong reason, but sufficient in the absence of others.
5342         * [Now we resolve ties in favor of the earlier string if
5343         * it happens that c_offset_min has been invalidated, since the
5344         * earlier string may buy us something the later one won't.]
5345         */
5346
5347         data.longest_fixed = newSVpvs("");
5348         data.longest_float = newSVpvs("");
5349         data.last_found = newSVpvs("");
5350         data.longest = &(data.longest_fixed);
5351         first = scan;
5352         if (!ri->regstclass) {
5353             cl_init(pRExC_state, &ch_class);
5354             data.start_class = &ch_class;
5355             stclass_flag = SCF_DO_STCLASS_AND;
5356         } else                          /* XXXX Check for BOUND? */
5357             stclass_flag = 0;
5358         data.last_closep = &last_close;
5359
5360         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5361             &data, -1, NULL, NULL,
5362             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5363
5364
5365         CHECK_RESTUDY_GOTO;
5366
5367
5368         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5369              && data.last_start_min == 0 && data.last_end > 0
5370              && !RExC_seen_zerolen
5371              && !(RExC_seen & REG_SEEN_VERBARG)
5372              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5373             r->extflags |= RXf_CHECK_ALL;
5374         scan_commit(pRExC_state, &data,&minlen,0);
5375         SvREFCNT_dec(data.last_found);
5376
5377         /* Note that code very similar to this but for anchored string
5378            follows immediately below, changes may need to be made to both.
5379            Be careful.
5380          */
5381         longest_float_length = CHR_SVLEN(data.longest_float);
5382         if (longest_float_length
5383             || (data.flags & SF_FL_BEFORE_EOL
5384                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5385                     || (RExC_flags & RXf_PMf_MULTILINE))))
5386         {
5387             I32 t,ml;
5388
5389             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5390             if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5391                 || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5392                     && data.offset_fixed == data.offset_float_min
5393                     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5394                     goto remove_float;          /* As in (a)+. */
5395
5396             /* copy the information about the longest float from the reg_scan_data
5397                over to the program. */
5398             if (SvUTF8(data.longest_float)) {
5399                 r->float_utf8 = data.longest_float;
5400                 r->float_substr = NULL;
5401             } else {
5402                 r->float_substr = data.longest_float;
5403                 r->float_utf8 = NULL;
5404             }
5405             /* float_end_shift is how many chars that must be matched that
5406                follow this item. We calculate it ahead of time as once the
5407                lookbehind offset is added in we lose the ability to correctly
5408                calculate it.*/
5409             ml = data.minlen_float ? *(data.minlen_float)
5410                                    : (I32)longest_float_length;
5411             r->float_end_shift = ml - data.offset_float_min
5412                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5413                 + data.lookbehind_float;
5414             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5415             r->float_max_offset = data.offset_float_max;
5416             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5417                 r->float_max_offset -= data.lookbehind_float;
5418
5419             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5420                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5421                            || (RExC_flags & RXf_PMf_MULTILINE)));
5422             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5423         }
5424         else {
5425           remove_float:
5426             r->float_substr = r->float_utf8 = NULL;
5427             SvREFCNT_dec(data.longest_float);
5428             longest_float_length = 0;
5429         }
5430
5431         /* Note that code very similar to this but for floating string
5432            is immediately above, changes may need to be made to both.
5433            Be careful.
5434          */
5435         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5436
5437         /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5438         if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5439             && (longest_fixed_length
5440                 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5441                     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5442                         || (RExC_flags & RXf_PMf_MULTILINE)))) )
5443         {
5444             I32 t,ml;
5445
5446             /* copy the information about the longest fixed
5447                from the reg_scan_data over to the program. */
5448             if (SvUTF8(data.longest_fixed)) {
5449                 r->anchored_utf8 = data.longest_fixed;
5450                 r->anchored_substr = NULL;
5451             } else {
5452                 r->anchored_substr = data.longest_fixed;
5453                 r->anchored_utf8 = NULL;
5454             }
5455             /* fixed_end_shift is how many chars that must be matched that
5456                follow this item. We calculate it ahead of time as once the
5457                lookbehind offset is added in we lose the ability to correctly
5458                calculate it.*/
5459             ml = data.minlen_fixed ? *(data.minlen_fixed)
5460                                    : (I32)longest_fixed_length;
5461             r->anchored_end_shift = ml - data.offset_fixed
5462                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5463                 + data.lookbehind_fixed;
5464             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5465
5466             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5467                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5468                      || (RExC_flags & RXf_PMf_MULTILINE)));
5469             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5470         }
5471         else {
5472             r->anchored_substr = r->anchored_utf8 = NULL;
5473             SvREFCNT_dec(data.longest_fixed);
5474             longest_fixed_length = 0;
5475         }
5476         if (ri->regstclass
5477             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5478             ri->regstclass = NULL;
5479
5480         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5481             && stclass_flag
5482             && !(data.start_class->flags & ANYOF_EOS)
5483             && !cl_is_anything(data.start_class))
5484         {
5485             const U32 n = add_data(pRExC_state, 1, "f");
5486             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5487
5488             Newx(RExC_rxi->data->data[n], 1,
5489                 struct regnode_charclass_class);
5490             StructCopy(data.start_class,
5491                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5492                        struct regnode_charclass_class);
5493             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5494             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5495             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5496                       regprop(r, sv, (regnode*)data.start_class);
5497                       PerlIO_printf(Perl_debug_log,
5498                                     "synthetic stclass \"%s\".\n",
5499                                     SvPVX_const(sv));});
5500         }
5501
5502         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5503         if (longest_fixed_length > longest_float_length) {
5504             r->check_end_shift = r->anchored_end_shift;
5505             r->check_substr = r->anchored_substr;
5506             r->check_utf8 = r->anchored_utf8;
5507             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5508             if (r->extflags & RXf_ANCH_SINGLE)
5509                 r->extflags |= RXf_NOSCAN;
5510         }
5511         else {
5512             r->check_end_shift = r->float_end_shift;
5513             r->check_substr = r->float_substr;
5514             r->check_utf8 = r->float_utf8;
5515             r->check_offset_min = r->float_min_offset;
5516             r->check_offset_max = r->float_max_offset;
5517         }
5518         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5519            This should be changed ASAP!  */
5520         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5521             r->extflags |= RXf_USE_INTUIT;
5522             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5523                 r->extflags |= RXf_INTUIT_TAIL;
5524         }
5525         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5526         if ( (STRLEN)minlen < longest_float_length )
5527             minlen= longest_float_length;
5528         if ( (STRLEN)minlen < longest_fixed_length )
5529             minlen= longest_fixed_length;
5530         */
5531     }
5532     else {
5533         /* Several toplevels. Best we can is to set minlen. */
5534         I32 fake;
5535         struct regnode_charclass_class ch_class;
5536         I32 last_close = 0;
5537
5538         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5539
5540         scan = ri->program + 1;
5541         cl_init(pRExC_state, &ch_class);
5542         data.start_class = &ch_class;
5543         data.last_closep = &last_close;
5544
5545
5546         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5547             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5548
5549         CHECK_RESTUDY_GOTO;
5550
5551         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5552                 = r->float_substr = r->float_utf8 = NULL;
5553
5554         if (!(data.start_class->flags & ANYOF_EOS)
5555             && !cl_is_anything(data.start_class))
5556         {
5557             const U32 n = add_data(pRExC_state, 1, "f");
5558             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5559
5560             Newx(RExC_rxi->data->data[n], 1,
5561                 struct regnode_charclass_class);
5562             StructCopy(data.start_class,
5563                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5564                        struct regnode_charclass_class);
5565             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5566             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5567             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5568                       regprop(r, sv, (regnode*)data.start_class);
5569                       PerlIO_printf(Perl_debug_log,
5570                                     "synthetic stclass \"%s\".\n",
5571                                     SvPVX_const(sv));});
5572         }
5573     }
5574
5575     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5576        the "real" pattern. */
5577     DEBUG_OPTIMISE_r({
5578         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5579                       (IV)minlen, (IV)r->minlen);
5580     });
5581     r->minlenret = minlen;
5582     if (r->minlen < minlen)
5583         r->minlen = minlen;
5584
5585     if (RExC_seen & REG_SEEN_GPOS)
5586         r->extflags |= RXf_GPOS_SEEN;
5587     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5588         r->extflags |= RXf_LOOKBEHIND_SEEN;
5589     if (RExC_seen & REG_SEEN_EVAL)
5590         r->extflags |= RXf_EVAL_SEEN;
5591     if (RExC_seen & REG_SEEN_CANY)
5592         r->extflags |= RXf_CANY_SEEN;
5593     if (RExC_seen & REG_SEEN_VERBARG)
5594         r->intflags |= PREGf_VERBARG_SEEN;
5595     if (RExC_seen & REG_SEEN_CUTGROUP)
5596         r->intflags |= PREGf_CUTGROUP_SEEN;
5597     if (RExC_paren_names)
5598         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5599     else
5600         RXp_PAREN_NAMES(r) = NULL;
5601
5602 #ifdef STUPID_PATTERN_CHECKS
5603     if (RX_PRELEN(rx) == 0)
5604         r->extflags |= RXf_NULL;
5605     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5606         /* XXX: this should happen BEFORE we compile */
5607         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5608     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5609         r->extflags |= RXf_WHITE;
5610     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5611         r->extflags |= RXf_START_ONLY;
5612 #else
5613     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5614             /* XXX: this should happen BEFORE we compile */
5615             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5616     else {
5617         regnode *first = ri->program + 1;
5618         U8 fop = OP(first);
5619
5620         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5621             r->extflags |= RXf_NULL;
5622         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5623             r->extflags |= RXf_START_ONLY;
5624         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5625                              && OP(regnext(first)) == END)
5626             r->extflags |= RXf_WHITE;
5627     }
5628 #endif
5629 #ifdef DEBUGGING
5630     if (RExC_paren_names) {
5631         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5632         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5633     } else
5634 #endif
5635         ri->name_list_idx = 0;
5636
5637     if (RExC_recurse_count) {
5638         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5639             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5640             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5641         }
5642     }
5643     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5644     /* assume we don't need to swap parens around before we match */
5645
5646     DEBUG_DUMP_r({
5647         PerlIO_printf(Perl_debug_log,"Final program:\n");
5648         regdump(r);
5649     });
5650 #ifdef RE_TRACK_PATTERN_OFFSETS
5651     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5652         const U32 len = ri->u.offsets[0];
5653         U32 i;
5654         GET_RE_DEBUG_FLAGS_DECL;
5655         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5656         for (i = 1; i <= len; i++) {
5657             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5658                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5659                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5660             }
5661         PerlIO_printf(Perl_debug_log, "\n");
5662     });
5663 #endif
5664     return rx;
5665 }
5666
5667 #undef RE_ENGINE_PTR
5668
5669
5670 SV*
5671 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5672                     const U32 flags)
5673 {
5674     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5675
5676     PERL_UNUSED_ARG(value);
5677
5678     if (flags & RXapif_FETCH) {
5679         return reg_named_buff_fetch(rx, key, flags);
5680     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5681         Perl_croak_no_modify(aTHX);
5682         return NULL;
5683     } else if (flags & RXapif_EXISTS) {
5684         return reg_named_buff_exists(rx, key, flags)
5685             ? &PL_sv_yes
5686             : &PL_sv_no;
5687     } else if (flags & RXapif_REGNAMES) {
5688         return reg_named_buff_all(rx, flags);
5689     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5690         return reg_named_buff_scalar(rx, flags);
5691     } else {
5692         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5693         return NULL;
5694     }
5695 }
5696
5697 SV*
5698 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5699                          const U32 flags)
5700 {
5701     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5702     PERL_UNUSED_ARG(lastkey);
5703
5704     if (flags & RXapif_FIRSTKEY)
5705         return reg_named_buff_firstkey(rx, flags);
5706     else if (flags & RXapif_NEXTKEY)
5707         return reg_named_buff_nextkey(rx, flags);
5708     else {
5709         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5710         return NULL;
5711     }
5712 }
5713
5714 SV*
5715 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5716                           const U32 flags)
5717 {
5718     AV *retarray = NULL;
5719     SV *ret;
5720     struct regexp *const rx = (struct regexp *)SvANY(r);
5721
5722     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5723
5724     if (flags & RXapif_ALL)
5725         retarray=newAV();
5726
5727     if (rx && RXp_PAREN_NAMES(rx)) {
5728         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5729         if (he_str) {
5730             IV i;
5731             SV* sv_dat=HeVAL(he_str);
5732             I32 *nums=(I32*)SvPVX(sv_dat);
5733             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5734                 if ((I32)(rx->nparens) >= nums[i]
5735                     && rx->offs[nums[i]].start != -1
5736                     && rx->offs[nums[i]].end != -1)
5737                 {
5738                     ret = newSVpvs("");
5739                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5740                     if (!retarray)
5741                         return ret;
5742                 } else {
5743                     if (retarray)
5744                         ret = newSVsv(&PL_sv_undef);
5745                 }
5746                 if (retarray)
5747                     av_push(retarray, ret);
5748             }
5749             if (retarray)
5750                 return newRV_noinc(MUTABLE_SV(retarray));
5751         }
5752     }
5753     return NULL;
5754 }
5755
5756 bool
5757 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5758                            const U32 flags)
5759 {
5760     struct regexp *const rx = (struct regexp *)SvANY(r);
5761
5762     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5763
5764     if (rx && RXp_PAREN_NAMES(rx)) {
5765         if (flags & RXapif_ALL) {
5766             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5767         } else {
5768             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5769             if (sv) {
5770                 SvREFCNT_dec(sv);
5771                 return TRUE;
5772             } else {
5773                 return FALSE;
5774             }
5775         }
5776     } else {
5777         return FALSE;
5778     }
5779 }
5780
5781 SV*
5782 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5783 {
5784     struct regexp *const rx = (struct regexp *)SvANY(r);
5785
5786     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5787
5788     if ( rx && RXp_PAREN_NAMES(rx) ) {
5789         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5790
5791         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5792     } else {
5793         return FALSE;
5794     }
5795 }
5796
5797 SV*
5798 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5799 {
5800     struct regexp *const rx = (struct regexp *)SvANY(r);
5801     GET_RE_DEBUG_FLAGS_DECL;
5802
5803     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5804
5805     if (rx && RXp_PAREN_NAMES(rx)) {
5806         HV *hv = RXp_PAREN_NAMES(rx);
5807         HE *temphe;
5808         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5809             IV i;
5810             IV parno = 0;
5811             SV* sv_dat = HeVAL(temphe);
5812             I32 *nums = (I32*)SvPVX(sv_dat);
5813             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5814                 if ((I32)(rx->lastparen) >= nums[i] &&
5815                     rx->offs[nums[i]].start != -1 &&
5816                     rx->offs[nums[i]].end != -1)
5817                 {
5818                     parno = nums[i];
5819                     break;
5820                 }
5821             }
5822             if (parno || flags & RXapif_ALL) {
5823                 return newSVhek(HeKEY_hek(temphe));
5824             }
5825         }
5826     }
5827     return NULL;
5828 }
5829
5830 SV*
5831 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5832 {
5833     SV *ret;
5834     AV *av;
5835     I32 length;
5836     struct regexp *const rx = (struct regexp *)SvANY(r);
5837
5838     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5839
5840     if (rx && RXp_PAREN_NAMES(rx)) {
5841         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5842             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5843         } else if (flags & RXapif_ONE) {
5844             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5845             av = MUTABLE_AV(SvRV(ret));
5846             length = av_len(av);
5847             SvREFCNT_dec(ret);
5848             return newSViv(length + 1);
5849         } else {
5850             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5851             return NULL;
5852         }
5853     }
5854     return &PL_sv_undef;
5855 }
5856
5857 SV*
5858 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5859 {
5860     struct regexp *const rx = (struct regexp *)SvANY(r);
5861     AV *av = newAV();
5862
5863     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5864
5865     if (rx && RXp_PAREN_NAMES(rx)) {
5866         HV *hv= RXp_PAREN_NAMES(rx);
5867         HE *temphe;
5868         (void)hv_iterinit(hv);
5869         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5870             IV i;
5871             IV parno = 0;
5872             SV* sv_dat = HeVAL(temphe);
5873             I32 *nums = (I32*)SvPVX(sv_dat);
5874             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5875                 if ((I32)(rx->lastparen) >= nums[i] &&
5876                     rx->offs[nums[i]].start != -1 &&
5877                     rx->offs[nums[i]].end != -1)
5878                 {
5879                     parno = nums[i];
5880                     break;
5881                 }
5882             }
5883             if (parno || flags & RXapif_ALL) {
5884                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5885             }
5886         }
5887     }
5888
5889     return newRV_noinc(MUTABLE_SV(av));
5890 }
5891
5892 void
5893 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5894                              SV * const sv)
5895 {
5896     struct regexp *const rx = (struct regexp *)SvANY(r);
5897     char *s = NULL;
5898     I32 i = 0;
5899     I32 s1, t1;
5900
5901     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5902
5903     if (!rx->subbeg) {
5904         sv_setsv(sv,&PL_sv_undef);
5905         return;
5906     }
5907     else
5908     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5909         /* $` */
5910         i = rx->offs[0].start;
5911         s = rx->subbeg;
5912     }
5913     else
5914     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5915         /* $' */
5916         s = rx->subbeg + rx->offs[0].end;
5917         i = rx->sublen - rx->offs[0].end;
5918     }
5919     else
5920     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5921         (s1 = rx->offs[paren].start) != -1 &&
5922         (t1 = rx->offs[paren].end) != -1)
5923     {
5924         /* $& $1 ... */
5925         i = t1 - s1;
5926         s = rx->subbeg + s1;
5927     } else {
5928         sv_setsv(sv,&PL_sv_undef);
5929         return;
5930     }
5931     assert(rx->sublen >= (s - rx->subbeg) + i );
5932     if (i >= 0) {
5933         const int oldtainted = PL_tainted;
5934         TAINT_NOT;
5935         sv_setpvn(sv, s, i);
5936         PL_tainted = oldtainted;
5937         if ( (rx->extflags & RXf_CANY_SEEN)
5938             ? (RXp_MATCH_UTF8(rx)
5939                         && (!i || is_utf8_string((U8*)s, i)))
5940             : (RXp_MATCH_UTF8(rx)) )
5941         {
5942             SvUTF8_on(sv);
5943         }
5944         else
5945             SvUTF8_off(sv);
5946         if (PL_tainting) {
5947             if (RXp_MATCH_TAINTED(rx)) {
5948                 if (SvTYPE(sv) >= SVt_PVMG) {
5949                     MAGIC* const mg = SvMAGIC(sv);
5950                     MAGIC* mgt;
5951                     PL_tainted = 1;
5952                     SvMAGIC_set(sv, mg->mg_moremagic);
5953                     SvTAINT(sv);
5954                     if ((mgt = SvMAGIC(sv))) {
5955                         mg->mg_moremagic = mgt;
5956                         SvMAGIC_set(sv, mg);
5957                     }
5958                 } else {
5959                     PL_tainted = 1;
5960                     SvTAINT(sv);
5961                 }
5962             } else
5963                 SvTAINTED_off(sv);
5964         }
5965     } else {
5966         sv_setsv(sv,&PL_sv_undef);
5967         return;
5968     }
5969 }
5970
5971 void
5972 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5973                                                          SV const * const value)
5974 {
5975     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5976
5977     PERL_UNUSED_ARG(rx);
5978     PERL_UNUSED_ARG(paren);
5979     PERL_UNUSED_ARG(value);
5980
5981     if (!PL_localizing)
5982         Perl_croak_no_modify(aTHX);
5983 }
5984
5985 I32
5986 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5987                               const I32 paren)
5988 {
5989     struct regexp *const rx = (struct regexp *)SvANY(r);
5990     I32 i;
5991     I32 s1, t1;
5992
5993     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5994
5995     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5996         switch (paren) {
5997       /* $` / ${^PREMATCH} */
5998       case RX_BUFF_IDX_PREMATCH:
5999         if (rx->offs[0].start != -1) {
6000                         i = rx->offs[0].start;
6001                         if (i > 0) {
6002                                 s1 = 0;
6003                                 t1 = i;
6004                                 goto getlen;
6005                         }
6006             }
6007         return 0;
6008       /* $' / ${^POSTMATCH} */
6009       case RX_BUFF_IDX_POSTMATCH:
6010             if (rx->offs[0].end != -1) {
6011                         i = rx->sublen - rx->offs[0].end;
6012                         if (i > 0) {
6013                                 s1 = rx->offs[0].end;
6014                                 t1 = rx->sublen;
6015                                 goto getlen;
6016                         }
6017             }
6018         return 0;
6019       /* $& / ${^MATCH}, $1, $2, ... */
6020       default:
6021             if (paren <= (I32)rx->nparens &&
6022             (s1 = rx->offs[paren].start) != -1 &&
6023             (t1 = rx->offs[paren].end) != -1)
6024             {
6025             i = t1 - s1;
6026             goto getlen;
6027         } else {
6028             if (ckWARN(WARN_UNINITIALIZED))
6029                 report_uninit((const SV *)sv);
6030             return 0;
6031         }
6032     }
6033   getlen:
6034     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6035         const char * const s = rx->subbeg + s1;
6036         const U8 *ep;
6037         STRLEN el;
6038
6039         i = t1 - s1;
6040         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6041                         i = el;
6042     }
6043     return i;
6044 }
6045
6046 SV*
6047 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6048 {
6049     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6050         PERL_UNUSED_ARG(rx);
6051         if (0)
6052             return NULL;
6053         else
6054             return newSVpvs("Regexp");
6055 }
6056
/* Scans the name of a named buffer from the pattern.
 * If flags is REG_RSN_RETURN_NULL, returns NULL.
 * If flags is REG_RSN_RETURN_NAME, returns an SV* containing the name.
 * If flags is REG_RSN_RETURN_DATA, returns the data SV* corresponding
 * to the parsed name as looked up in the RExC_paren_names hash.
 * If there is an error, throws a vFAIL()-type exception.
 */
6064
6065 #define REG_RSN_RETURN_NULL    0
6066 #define REG_RSN_RETURN_NAME    1
6067 #define REG_RSN_RETURN_DATA    2
6068
6069 STATIC SV*
6070 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6071 {
6072     char *name_start = RExC_parse;
6073
6074     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6075
6076     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6077          /* skip IDFIRST by using do...while */
6078         if (UTF)
6079             do {
6080                 RExC_parse += UTF8SKIP(RExC_parse);
6081             } while (isALNUM_utf8((U8*)RExC_parse));
6082         else
6083             do {
6084                 RExC_parse++;
6085             } while (isALNUM(*RExC_parse));
6086     }
6087
6088     if ( flags ) {
6089         SV* sv_name
6090             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6091                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6092         if ( flags == REG_RSN_RETURN_NAME)
6093             return sv_name;
6094         else if (flags==REG_RSN_RETURN_DATA) {
6095             HE *he_str = NULL;
6096             SV *sv_dat = NULL;
6097             if ( ! sv_name )      /* should not happen*/
6098                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6099             if (RExC_paren_names)
6100                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6101             if ( he_str )
6102                 sv_dat = HeVAL(he_str);
6103             if ( ! sv_dat )
6104                 vFAIL("Reference to nonexistent named group");
6105             return sv_dat;
6106         }
6107         else {
6108             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6109                        (unsigned long) flags);
6110         }
6111         /* NOT REACHED */
6112     }
6113     return NULL;
6114 }
6115
/* Under DEBUG_PARSE_r, print the leading columns of a parse-trace line:
 *  - up to 10 characters of the unparsed pattern, followed by "..." if
 *    more remain or "<" otherwise (left blank when RExC_parse has not
 *    moved since the previous trace line);
 *  - the current node number (left blank when unchanged);
 *  - funcname, indented by twice the value of "depth", which must be in
 *    scope where the macro is used.
 * RExC_lastnum and RExC_lastparse are then updated.  No newline is
 * printed.
 */
#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
    int rem = (int)(RExC_end - RExC_parse);                     \
    const int iscut = (rem > 10);                               \
    int cut;                                                    \
    int num;                                                    \
                                                                \
    if (iscut)                                                  \
        rem = 10;                                               \
    cut = 10 - rem;                                             \
                                                                \
    if (RExC_lastparse != RExC_parse)                           \
        PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
            rem, RExC_parse,                                    \
            cut + 4,                                            \
            iscut ? "..." : "<"                                 \
        );                                                      \
    else                                                        \
        PerlIO_printf(Perl_debug_log,"%16s","");                \
                                                                \
    if (SIZE_ONLY)                                              \
        num = RExC_size + 1;                                    \
    else                                                        \
        num = REG_NODE_NUM(RExC_emit);                          \
                                                                \
    if (RExC_lastnum != num)                                    \
        PerlIO_printf(Perl_debug_log,"|%4d",num);               \
    else                                                        \
        PerlIO_printf(Perl_debug_log,"|%4s","");                \
                                                                \
    PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
        (int)((depth*2)), "",                                   \
        (funcname)                                              \
    );                                                          \
                                                                \
    RExC_lastnum = num;                                         \
    RExC_lastparse = RExC_parse;                                \
})
6150
6151
6152
/* Emit the DEBUG_PARSE_MSG() columns for funcname and end the line. */
#define DEBUG_PARSE(funcname)                                   \
    DEBUG_PARSE_r({                                             \
        DEBUG_PARSE_MSG((funcname));                            \
        PerlIO_printf(Perl_debug_log,"%4s","\n");               \
    })
/* Emit the DEBUG_PARSE_MSG() columns for funcname, then a formatted
 * message and a newline.  fmt must be a string literal (it is pasted
 * onto "\n"); args supplies its single argument. */
#define DEBUG_PARSE_FMT(funcname,fmt,args)                      \
    DEBUG_PARSE_r({                                             \
        DEBUG_PARSE_MSG((funcname));                            \
        PerlIO_printf(Perl_debug_log,fmt "\n",args);            \
    })
6161
6162 /* This section of code defines the inversion list object and its methods.  The
6163  * interfaces are highly subject to change, so as much as possible is static to
6164  * this file.  An inversion list is here implemented as a malloc'd C UV array
6165  * with some added info that is placed as UVs at the beginning in a header
6166  * portion.  An inversion list for Unicode is an array of code points, sorted
6167  * by ordinal number.  The zeroth element is the first code point in the list.
6168  * Element 1 is the first code point after that which is not in the list.  In other
6169  * words, the first range is
6170  *  invlist[0]..(invlist[1]-1)
6171  * The other ranges follow.  Thus every element whose index is divisible by two
6172  * marks the beginning of a range that is in the list, and every element not
6173  * divisible by two marks the beginning of a range not in the list.  A single
6174  * element inversion list that contains the single code point N generally
6175  * consists of two elements
6176  *  invlist[0] == N
6177  *  invlist[1] == N+1
6178  * (The exception is when N is the highest representable value on the
6179  * machine, in which case the list containing just it would be a single
6180  * element, itself.  By extension, if the last range in the list extends to
6181  * infinity, then the first element of that range will be in the inversion list
6182  * at a position that is divisible by two, and is the final element in the
6183  * list.)
6184  * Taking the complement (inverting) an inversion list is quite simple, if the
6185  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6186  * This implementation reserves an element at the beginning of each inversion list
6187  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6188  * beginning of the list is either that element if 0, or the next one if 1.
6189  *
6190  * More about inversion lists can be found in "Unicode Demystified"
6191  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6192  * More will be coming when functionality is added later.
6193  *
6194  * The inversion list data structure is currently implemented as an SV pointing
6195  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6196  * array of UV whose memory management is automatically handled by the existing
6197  * facilities for SV's.
6198  *
6199  * Some of the methods should always be private to the implementation, and some
6200  * should eventually be made public */
6201
/* Layout of the header that precedes the array proper.  Each header field is
 * one UV; the *_OFFSET values are indices (in UVs) from the start of the
 * buffer. */
#define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
#define INVLIST_ITER_OFFSET 1   /* Current iteration position */

/* This is a combination of a version and data structure type, so that one
 * being passed in can be validated to be an inversion list of the correct
 * vintage.  When the structure of the header is changed, a new random number
 * in the range 2**31-1 should be generated and the new() method changed to
 * insert that at this location.  Then, if an auxiliary program doesn't change
 * correspondingly, it will be discovered immediately */
#define INVLIST_VERSION_ID_OFFSET 2
#define INVLIST_VERSION_ID 1064334010

/* For safety, when adding new elements, remember to #undef them at the end of
 * the inversion list code section */

#define INVLIST_ZERO_OFFSET 3   /* 0 or 1; must be last element in header */
/* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
 * contains the code point U+0000, and begins here.  If 1, the inversion list
 * doesn't contain U+0000, and it begins at the next UV in the array.
 * Inverting an inversion list consists of adding or removing the 0 at the
 * beginning of it.  By reserving a space for that 0, inversion can be made
 * very fast */

#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)

/* Internally things are UVs.  TO_INTERNAL_SIZE converts a number of array
 * elements into the number of bytes needed for the header plus that many
 * elements; FROM_INTERNAL_SIZE converts a byte size back into the number of
 * elements that fit after the header.  The argument is parenthesized so that
 * any expression (a sum, a conditional, ...) can be passed safely. */
#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
#define FROM_INTERNAL_SIZE(x) (((x) / sizeof(UV)) - HEADER_LENGTH)

/* Number of elements the new() method allocates room for when it is asked to
 * pick a default size */
#define INVLIST_INITIAL_LEN 10
6232
6233 PERL_STATIC_INLINE UV*
6234 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6235 {
6236     /* Returns a pointer to the first element in the inversion list's array.
6237      * This is called upon initialization of an inversion list.  Where the
6238      * array begins depends on whether the list has the code point U+0000
6239      * in it or not.  The other parameter tells it whether the code that
6240      * follows this call is about to put a 0 in the inversion list or not.
6241      * The first element is either the element with 0, if 0, or the next one,
6242      * if 1 */
6243
6244     UV* zero = get_invlist_zero_addr(invlist);
6245
6246     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6247
6248     /* Must be empty */
6249     assert(! *get_invlist_len_addr(invlist));
6250
6251     /* 1^1 = 0; 1^0 = 1 */
6252     *zero = 1 ^ will_have_0;
6253     return zero + *zero;
6254 }
6255
6256 PERL_STATIC_INLINE UV*
6257 S_invlist_array(pTHX_ SV* const invlist)
6258 {
6259     /* Returns the pointer to the inversion list's array.  Every time the
6260      * length changes, this needs to be called in case malloc or realloc moved
6261      * it */
6262
6263     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6264
6265     /* Must not be empty.  If these fail, you probably didn't check for <len>
6266      * being non-zero before trying to get the array */
6267     assert(*get_invlist_len_addr(invlist));
6268     assert(*get_invlist_zero_addr(invlist) == 0
6269            || *get_invlist_zero_addr(invlist) == 1);
6270
6271     /* The array begins either at the element reserved for zero if the
6272      * list contains 0 (that element will be set to 0), or otherwise the next
6273      * element (in which case the reserved element will be set to 1). */
6274     return (UV *) (get_invlist_zero_addr(invlist)
6275                    + *get_invlist_zero_addr(invlist));
6276 }
6277
6278 PERL_STATIC_INLINE UV*
6279 S_get_invlist_len_addr(pTHX_ SV* invlist)
6280 {
6281     /* Return the address of the UV that contains the current number
6282      * of used elements in the inversion list */
6283
6284     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6285
6286     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6287 }
6288
6289 PERL_STATIC_INLINE UV
6290 S_invlist_len(pTHX_ SV* const invlist)
6291 {
6292     /* Returns the current number of elements stored in the inversion list's
6293      * array */
6294
6295     PERL_ARGS_ASSERT_INVLIST_LEN;
6296
6297     return *get_invlist_len_addr(invlist);
6298 }
6299
6300 PERL_STATIC_INLINE void
6301 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6302 {
6303     /* Sets the current number of elements stored in the inversion list */
6304
6305     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6306
6307     *get_invlist_len_addr(invlist) = len;
6308
6309     assert(len <= SvLEN(invlist));
6310
6311     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6312     /* If the list contains U+0000, that element is part of the header,
6313      * and should not be counted as part of the array.  It will contain
6314      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
6315      * subtract:
6316      *  SvCUR_set(invlist,
6317      *            TO_INTERNAL_SIZE(len
6318      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
6319      * But, this is only valid if len is not 0.  The consequences of not doing
6320      * this is that the memory allocation code may think that 1 more UV is
6321      * being used than actually is, and so might do an unnecessary grow.  That
6322      * seems worth not bothering to make this the precise amount.
6323      *
6324      * Note that when inverting, SvCUR shouldn't change */
6325 }
6326
6327 PERL_STATIC_INLINE UV
6328 S_invlist_max(pTHX_ SV* const invlist)
6329 {
6330     /* Returns the maximum number of elements storable in the inversion list's
6331      * array, without having to realloc() */
6332
6333     PERL_ARGS_ASSERT_INVLIST_MAX;
6334
6335     return FROM_INTERNAL_SIZE(SvLEN(invlist));
6336 }
6337
6338 PERL_STATIC_INLINE UV*
6339 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6340 {
6341     /* Return the address of the UV that is reserved to hold 0 if the inversion
6342      * list contains 0.  This has to be the last element of the heading, as the
6343      * list proper starts with either it if 0, or the next element if not.
6344      * (But we force it to contain either 0 or 1) */
6345
6346     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6347
6348     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6349 }
6350
6351 #ifndef PERL_IN_XSUB_RE
6352 SV*
6353 Perl__new_invlist(pTHX_ IV initial_size)
6354 {
6355
6356     /* Return a pointer to a newly constructed inversion list, with enough
6357      * space to store 'initial_size' elements.  If that number is negative, a
6358      * system default is used instead */
6359
6360     SV* new_list;
6361
6362     if (initial_size < 0) {
6363         initial_size = INVLIST_INITIAL_LEN;
6364     }
6365
6366     /* Allocate the initial space */
6367     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6368     invlist_set_len(new_list, 0);
6369
6370     /* Force iterinit() to be used to get iteration to work */
6371     *get_invlist_iter_addr(new_list) = UV_MAX;
6372
6373     /* This should force a segfault if a method doesn't initialize this
6374      * properly */
6375     *get_invlist_zero_addr(new_list) = UV_MAX;
6376
6377     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6378 #if HEADER_LENGTH != 4
6379 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6380 #endif
6381
6382     return new_list;
6383 }
6384 #endif
6385
6386 STATIC SV*
6387 S__new_invlist_C_array(pTHX_ UV* list)
6388 {
6389     /* Return a pointer to a newly constructed inversion list, initialized to
6390      * point to <list>, which has to be in the exact correct inversion list
6391      * form, including internal fields.  Thus this is a dangerous routine that
6392      * should not be used in the wrong hands */
6393
6394     SV* invlist = newSV_type(SVt_PV);
6395
6396     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6397
6398     SvPV_set(invlist, (char *) list);
6399     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
6400                                shouldn't touch it */
6401     SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6402
6403     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6404         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6405     }
6406
6407     return invlist;
6408 }
6409
6410 STATIC void
6411 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6412 {
6413     /* Grow the maximum size of an inversion list */
6414
6415     PERL_ARGS_ASSERT_INVLIST_EXTEND;
6416
6417     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6418 }
6419
6420 PERL_STATIC_INLINE void
6421 S_invlist_trim(pTHX_ SV* const invlist)
6422 {
6423     PERL_ARGS_ASSERT_INVLIST_TRIM;
6424
6425     /* Change the length of the inversion list to how many entries it currently
6426      * has */
6427
6428     SvPV_shrink_to_cur((SV *) invlist);
6429 }
6430
/* The array alternates between the start of a range that is in the inversion
 * list and the start of a range that is not, beginning with one that is.  So
 * the range starting at index <i> is in the list iff <i> is even (0, 2, 4,
 * etc), and the range just before it is in the list iff <i> is odd. */
#define ELEMENT_RANGE_MATCHES_INVLIST(i) (((i) & 1) == 0)
#define PREV_RANGE_MATCHES_INVLIST(i) (((i) & 1) != 0)

/* Union of <a> with the complement of <b> */
#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6437
6438 #ifndef PERL_IN_XSUB_RE
6439 void
6440 Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6441 {
6442    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6443     * the end of the inversion list.  The range must be above any existing
6444     * ones. */
6445
6446     UV* array;
6447     UV max = invlist_max(invlist);
6448     UV len = invlist_len(invlist);
6449
6450     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6451
6452     if (len == 0) { /* Empty lists must be initialized */
6453         array = _invlist_array_init(invlist, start == 0);
6454     }
6455     else {
6456         /* Here, the existing list is non-empty. The current max entry in the
6457          * list is generally the first value not in the set, except when the
6458          * set extends to the end of permissible values, in which case it is
6459          * the first entry in that final set, and so this call is an attempt to
6460          * append out-of-order */
6461
6462         UV final_element = len - 1;
6463         array = invlist_array(invlist);
6464         if (array[final_element] > start
6465             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6466         {
6467             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6468                        array[final_element], start,
6469                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6470         }
6471
6472         /* Here, it is a legal append.  If the new range begins with the first
6473          * value not in the set, it is extending the set, so the new first
6474          * value not in the set is one greater than the newly extended range.
6475          * */
6476         if (array[final_element] == start) {
6477             if (end != UV_MAX) {
6478                 array[final_element] = end + 1;
6479             }
6480             else {
6481                 /* But if the end is the maximum representable on the machine,
6482                  * just let the range that this would extend to have no end */
6483                 invlist_set_len(invlist, len - 1);
6484             }
6485             return;
6486         }
6487     }
6488
6489     /* Here the new range doesn't extend any existing set.  Add it */
6490
6491     len += 2;   /* Includes an element each for the start and end of range */
6492
6493     /* If overflows the existing space, extend, which may cause the array to be
6494      * moved */
6495     if (max < len) {
6496         invlist_extend(invlist, len);
6497         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
6498                                            failure in invlist_array() */
6499         array = invlist_array(invlist);
6500     }
6501     else {
6502         invlist_set_len(invlist, len);
6503     }
6504
6505     /* The next item on the list starts the range, the one after that is
6506      * one past the new range.  */
6507     array[len - 2] = start;
6508     if (end != UV_MAX) {
6509         array[len - 1] = end + 1;
6510     }
6511     else {
6512         /* But if the end is the maximum representable on the machine, just let
6513          * the range have no end */
6514         invlist_set_len(invlist, len - 1);
6515     }
6516 }
6517
6518 STATIC IV
6519 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6520 {
6521     /* Searches the inversion list for the entry that contains the input code
6522      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
6523      * return value is the index into the list's array of the range that
6524      * contains <cp> */
6525
6526     IV low = 0;
6527     IV high = invlist_len(invlist);
6528     const UV * const array = invlist_array(invlist);
6529
6530     PERL_ARGS_ASSERT_INVLIST_SEARCH;
6531
6532     /* If list is empty or the code point is before the first element, return
6533      * failure. */
6534     if (high == 0 || cp < array[0]) {
6535         return -1;
6536     }
6537
6538     /* Binary search.  What we are looking for is <i> such that
6539      *  array[i] <= cp < array[i+1]
6540      * The loop below converges on the i+1. */
6541     while (low < high) {
6542         IV mid = (low + high) / 2;
6543         if (array[mid] <= cp) {
6544             low = mid + 1;
6545
6546             /* We could do this extra test to exit the loop early.
6547             if (cp < array[low]) {
6548                 return mid;
6549             }
6550             */
6551         }
6552         else { /* cp < array[mid] */
6553             high = mid;
6554         }
6555     }
6556
6557     return high - 1;
6558 }
6559
6560 void
6561 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6562 {
6563     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6564      * but is used when the swash has an inversion list.  This makes this much
6565      * faster, as it uses a binary search instead of a linear one.  This is
6566      * intimately tied to that function, and perhaps should be in utf8.c,
6567      * except it is intimately tied to inversion lists as well.  It assumes
6568      * that <swatch> is all 0's on input */
6569
6570     UV current = start;
6571     const IV len = invlist_len(invlist);
6572     IV i;
6573     const UV * array;
6574
6575     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6576
6577     if (len == 0) { /* Empty inversion list */
6578         return;
6579     }
6580
6581     array = invlist_array(invlist);
6582
6583     /* Find which element it is */
6584     i = invlist_search(invlist, start);
6585
6586     /* We populate from <start> to <end> */
6587     while (current < end) {
6588         UV upper;
6589
6590         /* The inversion list gives the results for every possible code point
6591          * after the first one in the list.  Only those ranges whose index is
6592          * even are ones that the inversion list matches.  For the odd ones,
6593          * and if the initial code point is not in the list, we have to skip
6594          * forward to the next element */
6595         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6596             i++;
6597             if (i >= len) { /* Finished if beyond the end of the array */
6598                 return;
6599             }
6600             current = array[i];
6601             if (current >= end) {   /* Finished if beyond the end of what we
6602                                        are populating */
6603                 return;
6604             }
6605         }
6606         assert(current >= start);
6607
6608         /* The current range ends one below the next one, except don't go past
6609          * <end> */
6610         i++;
6611         upper = (i < len && array[i] < end) ? array[i] : end;
6612
6613         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
6614          * for each code point in it */
6615         for (; current < upper; current++) {
6616             const STRLEN offset = (STRLEN)(current - start);
6617             swatch[offset >> 3] |= 1 << (offset & 7);
6618         }
6619
6620         /* Quit if at the end of the list */
6621         if (i >= len) {
6622
6623             /* But first, have to deal with the highest possible code point on
6624              * the platform.  The previous code assumes that <end> is one
6625              * beyond where we want to populate, but that is impossible at the
6626              * platform's infinity, so have to handle it specially */
6627             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6628             {
6629                 const STRLEN offset = (STRLEN)(end - start);
6630                 swatch[offset >> 3] |= 1 << (offset & 7);
6631             }
6632             return;
6633         }
6634
6635         /* Advance to the next range, which will be for code points not in the
6636          * inversion list */
6637         current = array[i];
6638     }
6639
6640     return;
6641 }
6642
6643
6644 void
6645 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
6646 {
6647     /* Take the union of two inversion lists and point <output> to it.  *output
6648      * should be defined upon input, and if it points to one of the two lists,
6649      * the reference count to that list will be decremented.  The first list,
6650      * <a>, may be NULL, in which case a copy of the second list is returned.
6651      * If <complement_b> is TRUE, the union is taken of the complement
6652      * (inversion) of <b> instead of b itself.
6653      *
6654      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6655      * Richard Gillam, published by Addison-Wesley, and explained at some
6656      * length there.  The preface says to incorporate its examples into your
6657      * code at your own risk.
6658      *
6659      * The algorithm is like a merge sort.
6660      *
6661      * XXX A potential performance improvement is to keep track as we go along
6662      * if only one of the inputs contributes to the result, meaning the other
6663      * is a subset of that one.  In that case, we can skip the final copy and
6664      * return the larger of the input lists, but then outside code might need
6665      * to keep track of whether to free the input list or not */
6666
6667     UV* array_a;    /* a's array */
6668     UV* array_b;
6669     UV len_a;       /* length of a's array */
6670     UV len_b;
6671
6672     SV* u;                      /* the resulting union */
6673     UV* array_u;
6674     UV len_u;
6675
6676     UV i_a = 0;             /* current index into a's array */
6677     UV i_b = 0;
6678     UV i_u = 0;
6679
6680     /* running count, as explained in the algorithm source book; items are
6681      * stopped accumulating and are output when the count changes to/from 0.
6682      * The count is incremented when we start a range that's in the set, and
6683      * decremented when we start a range that's not in the set.  So its range
6684      * is 0 to 2.  Only when the count is zero is something not in the set.
6685      */
6686     UV count = 0;
6687
6688     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
6689     assert(a != b);
6690
6691     /* If either one is empty, the union is the other one */
6692     if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
6693         if (*output == a) {
6694             if (a != NULL) {
6695                 SvREFCNT_dec(a);
6696             }
6697         }
6698         if (*output != b) {
6699             *output = invlist_clone(b);
6700             if (complement_b) {
6701                 _invlist_invert(*output);
6702             }
6703         } /* else *output already = b; */
6704         return;
6705     }
6706     else if ((len_b = invlist_len(b)) == 0) {
6707         if (*output == b) {
6708             SvREFCNT_dec(b);
6709         }
6710
6711         /* The complement of an empty list is a list that has everything in it,
6712          * so the union with <a> includes everything too */
6713         if (complement_b) {
6714             if (a == *output) {
6715                 SvREFCNT_dec(a);
6716             }
6717             *output = _new_invlist(1);
6718             _append_range_to_invlist(*output, 0, UV_MAX);
6719         }
6720         else if (*output != a) {
6721             *output = invlist_clone(a);
6722         }
6723         /* else *output already = a; */
6724         return;
6725     }
6726
6727     /* Here both lists exist and are non-empty */
6728     array_a = invlist_array(a);
6729     array_b = invlist_array(b);
6730
6731     /* If are to take the union of 'a' with the complement of b, set it
6732      * up so are looking at b's complement. */
6733     if (complement_b) {
6734
6735         /* To complement, we invert: if the first element is 0, remove it.  To
6736          * do this, we just pretend the array starts one later, and clear the
6737          * flag as we don't have to do anything else later */
6738         if (array_b[0] == 0) {
6739             array_b++;
6740             len_b--;
6741             complement_b = FALSE;
6742         }
6743         else {
6744
6745             /* But if the first element is not zero, we unshift a 0 before the
6746              * array.  The data structure reserves a space for that 0 (which
6747              * should be a '1' right now), so physical shifting is unneeded,
6748              * but temporarily change that element to 0.  Before exiting the
6749              * routine, we must restore the element to '1' */
6750             array_b--;
6751             len_b++;
6752             array_b[0] = 0;
6753         }
6754     }
6755
6756     /* Size the union for the worst case: that the sets are completely
6757      * disjoint */
6758     u = _new_invlist(len_a + len_b);
6759
6760     /* Will contain U+0000 if either component does */
6761     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6762                                       || (len_b > 0 && array_b[0] == 0));
6763
6764     /* Go through each list item by item, stopping when exhausted one of
6765      * them */
6766     while (i_a < len_a && i_b < len_b) {
6767         UV cp;      /* The element to potentially add to the union's array */
6768         bool cp_in_set;   /* is it in the the input list's set or not */
6769
6770         /* We need to take one or the other of the two inputs for the union.
6771          * Since we are merging two sorted lists, we take the smaller of the
6772          * next items.  In case of a tie, we take the one that is in its set
6773          * first.  If we took one not in the set first, it would decrement the
6774          * count, possibly to 0 which would cause it to be output as ending the
6775          * range, and the next time through we would take the same number, and
6776          * output it again as beginning the next range.  By doing it the
6777          * opposite way, there is no possibility that the count will be
6778          * momentarily decremented to 0, and thus the two adjoining ranges will
6779          * be seamlessly merged.  (In a tie and both are in the set or both not
6780          * in the set, it doesn't matter which we take first.) */
6781         if (array_a[i_a] < array_b[i_b]
6782             || (array_a[i_a] == array_b[i_b]
6783                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6784         {
6785             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6786             cp= array_a[i_a++];
6787         }
6788         else {
6789             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6790             cp= array_b[i_b++];
6791         }
6792
6793         /* Here, have chosen which of the two inputs to look at.  Only output
6794          * if the running count changes to/from 0, which marks the
6795          * beginning/end of a range in that's in the set */
6796         if (cp_in_set) {
6797             if (count == 0) {
6798                 array_u[i_u++] = cp;
6799             }
6800             count++;
6801         }
6802         else {
6803             count--;
6804             if (count == 0) {
6805                 array_u[i_u++] = cp;
6806             }
6807         }
6808     }
6809
6810     /* Here, we are finished going through at least one of the lists, which
6811      * means there is something remaining in at most one.  We check if the list
6812      * that hasn't been exhausted is positioned such that we are in the middle
6813      * of a range in its set or not.  (i_a and i_b point to the element beyond
6814      * the one we care about.) If in the set, we decrement 'count'; if 0, there
6815      * is potentially more to output.
6816      * There are four cases:
6817      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6818      *     in the union is entirely from the non-exhausted set.
6819      *  2) Both were in their sets, count is 2.  Nothing further should
6820      *     be output, as everything that remains will be in the exhausted
6821      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6822      *     that
6823      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6824      *     Nothing further should be output because the union includes
6825      *     everything from the exhausted set.  Not decrementing ensures that.
6826      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6827      *     decrementing to 0 insures that we look at the remainder of the
6828      *     non-exhausted set */
6829     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6830         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6831     {
6832         count--;
6833     }
6834
6835     /* The final length is what we've output so far, plus what else is about to
6836      * be output.  (If 'count' is non-zero, then the input list we exhausted
6837      * has everything remaining up to the machine's limit in its set, and hence
6838      * in the union, so there will be no further output. */
6839     len_u = i_u;
6840     if (count == 0) {
6841         /* At most one of the subexpressions will be non-zero */
6842         len_u += (len_a - i_a) + (len_b - i_b);
6843     }
6844
6845     /* Set result to final length, which can change the pointer to array_u, so
6846      * re-find it */
6847     if (len_u != invlist_len(u)) {
6848         invlist_set_len(u, len_u);
6849         invlist_trim(u);
6850         array_u = invlist_array(u);
6851     }
6852
6853     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6854      * the other) ended with everything above it not in its set.  That means
6855      * that the remaining part of the union is precisely the same as the
6856      * non-exhausted list, so can just copy it unchanged.  (If both list were
6857      * exhausted at the same time, then the operations below will be both 0.)
6858      */
6859     if (count == 0) {
6860         IV copy_count; /* At most one will have a non-zero copy count */
6861         if ((copy_count = len_a - i_a) > 0) {
6862             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6863         }
6864         else if ((copy_count = len_b - i_b) > 0) {
6865             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6866         }
6867     }
6868
6869     /*  We may be removing a reference to one of the inputs */
6870     if (a == *output || b == *output) {
6871         SvREFCNT_dec(*output);
6872     }
6873
6874     /* If we've changed b, restore it */
6875     if (complement_b) {
6876         array_b[0] = 1;
6877     }
6878
6879     *output = u;
6880     return;
6881 }
6882
6883 void
6884 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
6885 {
6886     /* Take the intersection of two inversion lists and point <i> to it.  *i
6887      * should be defined upon input, and if it points to one of the two lists,
6888      * the reference count to that list will be decremented.
6889      * If <complement_b> is TRUE, the result will be the intersection of <a>
6890      * and the complement (or inversion) of <b> instead of <b> directly.
6891      *
6892      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6893      * Richard Gillam, published by Addison-Wesley, and explained at some
6894      * length there.  The preface says to incorporate its examples into your
6895      * code at your own risk.  In fact, it had bugs
6896      *
6897      * The algorithm is like a merge sort, and is essentially the same as the
6898      * union above
6899      */
6900
6901     UV* array_a;                /* a's array */
6902     UV* array_b;
6903     UV len_a;   /* length of a's array */
6904     UV len_b;
6905
6906     SV* r;                   /* the resulting intersection */
6907     UV* array_r;
6908     UV len_r;
6909
6910     UV i_a = 0;             /* current index into a's array */
6911     UV i_b = 0;
6912     UV i_r = 0;
6913
6914     /* running count, as explained in the algorithm source book; items are
6915      * stopped accumulating and are output when the count changes to/from 2.
6916      * The count is incremented when we start a range that's in the set, and
6917      * decremented when we start a range that's not in the set.  So its range
6918      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6919      */
6920     UV count = 0;
6921
6922     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
6923     assert(a != b);
6924
6925     /* Special case if either one is empty */
6926     len_a = invlist_len(a);
6927     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
6928
6929         if (len_a != 0 && complement_b) {
6930
6931             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
6932              * be empty.  Here, also we are using 'b's complement, which hence
6933              * must be every possible code point.  Thus the intersection is
6934              * simply 'a'. */
6935             if (*i != a) {
6936                 *i = invlist_clone(a);
6937
6938                 if (*i == b) {
6939                     SvREFCNT_dec(b);
6940                 }
6941             }
6942             /* else *i is already 'a' */
6943             return;
6944         }
6945
6946         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
6947          * intersection must be empty */
6948         if (*i == a) {
6949             SvREFCNT_dec(a);
6950         }
6951         else if (*i == b) {
6952             SvREFCNT_dec(b);
6953         }
6954         *i = _new_invlist(0);
6955         return;
6956     }
6957
6958     /* Here both lists exist and are non-empty */
6959     array_a = invlist_array(a);
6960     array_b = invlist_array(b);
6961
6962     /* If are to take the intersection of 'a' with the complement of b, set it
6963      * up so are looking at b's complement. */
6964     if (complement_b) {
6965
6966         /* To complement, we invert: if the first element is 0, remove it.  To
6967          * do this, we just pretend the array starts one later, and clear the
6968          * flag as we don't have to do anything else later */
6969         if (array_b[0] == 0) {
6970             array_b++;
6971             len_b--;
6972             complement_b = FALSE;
6973         }
6974         else {
6975
6976             /* But if the first element is not zero, we unshift a 0 before the
6977              * array.  The data structure reserves a space for that 0 (which
6978              * should be a '1' right now), so physical shifting is unneeded,
6979              * but temporarily change that element to 0.  Before exiting the
6980              * routine, we must restore the element to '1' */
6981             array_b--;
6982             len_b++;
6983             array_b[0] = 0;
6984         }
6985     }
6986
6987     /* Size the intersection for the worst case: that the intersection ends up
6988      * fragmenting everything to be completely disjoint */
6989     r= _new_invlist(len_a + len_b);
6990
6991     /* Will contain U+0000 iff both components do */
6992     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
6993                                      && len_b > 0 && array_b[0] == 0);
6994
6995     /* Go through each list item by item, stopping when exhausted one of
6996      * them */
6997     while (i_a < len_a && i_b < len_b) {
6998         UV cp;      /* The element to potentially add to the intersection's
6999                        array */
7000         bool cp_in_set; /* Is it in the input list's set or not */
7001
7002         /* We need to take one or the other of the two inputs for the
7003          * intersection.  Since we are merging two sorted lists, we take the
7004          * smaller of the next items.  In case of a tie, we take the one that
7005          * is not in its set first (a difference from the union algorithm).  If
7006          * we took one in the set first, it would increment the count, possibly
7007          * to 2 which would cause it to be output as starting a range in the
7008          * intersection, and the next time through we would take that same
7009          * number, and output it again as ending the set.  By doing it the
7010          * opposite of this, there is no possibility that the count will be
7011          * momentarily incremented to 2.  (In a tie and both are in the set or
7012          * both not in the set, it doesn't matter which we take first.) */
7013         if (array_a[i_a] < array_b[i_b]
7014             || (array_a[i_a] == array_b[i_b]
7015                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7016         {
7017             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7018             cp= array_a[i_a++];
7019         }
7020         else {
7021             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7022             cp= array_b[i_b++];
7023         }
7024
7025         /* Here, have chosen which of the two inputs to look at.  Only output
7026          * if the running count changes to/from 2, which marks the
7027          * beginning/end of a range that's in the intersection */
7028         if (cp_in_set) {
7029             count++;
7030             if (count == 2) {
7031                 array_r[i_r++] = cp;
7032             }
7033         }
7034         else {
7035             if (count == 2) {
7036                 array_r[i_r++] = cp;
7037             }
7038             count--;
7039         }
7040     }
7041
7042     /* Here, we are finished going through at least one of the lists, which
7043      * means there is something remaining in at most one.  We check if the list
7044      * that has been exhausted is positioned such that we are in the middle
7045      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7046      * the ones we care about.)  There are four cases:
7047      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7048      *     nothing left in the intersection.
7049      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7050      *     above 2.  What should be output is exactly that which is in the
7051      *     non-exhausted set, as everything it has is also in the intersection
7052      *     set, and everything it doesn't have can't be in the intersection
7053      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7054      *     gets incremented to 2.  Like the previous case, the intersection is
7055      *     everything that remains in the non-exhausted set.
7056      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7057      *     remains 1.  And the intersection has nothing more. */
7058     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7059         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7060     {
7061         count++;
7062     }
7063
7064     /* The final length is what we've output so far plus what else is in the
7065      * intersection.  At most one of the subexpressions below will be non-zero */
7066     len_r = i_r;
7067     if (count >= 2) {
7068         len_r += (len_a - i_a) + (len_b - i_b);
7069     }
7070
7071     /* Set result to final length, which can change the pointer to array_r, so
7072      * re-find it */
7073     if (len_r != invlist_len(r)) {
7074         invlist_set_len(r, len_r);
7075         invlist_trim(r);
7076         array_r = invlist_array(r);
7077     }
7078
7079     /* Finish outputting any remaining */
7080     if (count >= 2) { /* At most one will have a non-zero copy count */
7081         IV copy_count;
7082         if ((copy_count = len_a - i_a) > 0) {
7083             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7084         }
7085         else if ((copy_count = len_b - i_b) > 0) {
7086             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7087         }
7088     }
7089
7090     /*  We may be removing a reference to one of the inputs */
7091     if (a == *i || b == *i) {
7092         SvREFCNT_dec(*i);
7093     }
7094
7095     /* If we've changed b, restore it */
7096     if (complement_b) {
7097         array_b[0] = 1;
7098     }
7099
7100     *i = r;
7101     return;
7102 }
7103
7104 #endif
7105
7106 STATIC SV*
7107 S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7108 {
7109     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7110      * set.  A pointer to the inversion list is returned.  This may actually be
7111      * a new list, in which case the passed in one has been destroyed.  The
7112      * passed in inversion list can be NULL, in which case a new one is created
7113      * with just the one range in it */
7114
7115     SV* range_invlist;
7116     UV len;
7117
7118     if (invlist == NULL) {
7119         invlist = _new_invlist(2);
7120         len = 0;
7121     }
7122     else {
7123         len = invlist_len(invlist);
7124     }
7125
7126     /* If comes after the final entry, can just append it to the end */
7127     if (len == 0
7128         || start >= invlist_array(invlist)
7129                                     [invlist_len(invlist) - 1])
7130     {
7131         _append_range_to_invlist(invlist, start, end);
7132         return invlist;
7133     }
7134
7135     /* Here, can't just append things, create and return a new inversion list
7136      * which is the union of this range and the existing inversion list */
7137     range_invlist = _new_invlist(2);
7138     _append_range_to_invlist(range_invlist, start, end);
7139
7140     _invlist_union(invlist, range_invlist, &invlist);
7141
7142     /* The temporary can be freed */
7143     SvREFCNT_dec(range_invlist);
7144
7145     return invlist;
7146 }
7147
7148 PERL_STATIC_INLINE SV*
7149 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7150     return add_range_to_invlist(invlist, cp, cp);
7151 }
7152
7153 #ifndef PERL_IN_XSUB_RE
7154 void
7155 Perl__invlist_invert(pTHX_ SV* const invlist)
7156 {
7157     /* Complement the input inversion list.  This adds a 0 if the list didn't
7158      * have a zero; removes it otherwise.  As described above, the data
7159      * structure is set up so that this is very efficient */
7160
7161     UV* len_pos = get_invlist_len_addr(invlist);
7162
7163     PERL_ARGS_ASSERT__INVLIST_INVERT;
7164
7165     /* The inverse of matching nothing is matching everything */
7166     if (*len_pos == 0) {
7167         _append_range_to_invlist(invlist, 0, UV_MAX);
7168         return;
7169     }
7170
7171     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7172      * zero element was a 0, so it is being removed, so the length decrements
7173      * by 1; and vice-versa.  SvCUR is unaffected */
7174     if (*get_invlist_zero_addr(invlist) ^= 1) {
7175         (*len_pos)--;
7176     }
7177     else {
7178         (*len_pos)++;
7179     }
7180 }
7181
7182 void
7183 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7184 {
7185     /* Complement the input inversion list (which must be a Unicode property,
7186      * all of which don't match above the Unicode maximum code point.)  And
7187      * Perl has chosen to not have the inversion match above that either.  This
7188      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7189      */
7190
7191     UV len;
7192     UV* array;
7193
7194     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7195
7196     _invlist_invert(invlist);
7197
7198     len = invlist_len(invlist);
7199
7200     if (len != 0) { /* If empty do nothing */
7201         array = invlist_array(invlist);
7202         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7203             /* Add 0x110000.  First, grow if necessary */
7204             len++;
7205             if (invlist_max(invlist) < len) {
7206                 invlist_extend(invlist, len);
7207                 array = invlist_array(invlist);
7208             }
7209             invlist_set_len(invlist, len);
7210             array[len - 1] = PERL_UNICODE_MAX + 1;
7211         }
7212         else {  /* Remove the 0x110000 */
7213             invlist_set_len(invlist, len - 1);
7214         }
7215     }
7216
7217     return;
7218 }
7219 #endif
7220
7221 PERL_STATIC_INLINE SV*
7222 S_invlist_clone(pTHX_ SV* const invlist)
7223 {
7224
7225     /* Return a new inversion list that is a copy of the input one, which is
7226      * unchanged */
7227
7228     /* Need to allocate extra space to accommodate Perl's addition of a
7229      * trailing NUL to SvPV's, since it thinks they are always strings */
7230     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7231     STRLEN length = SvCUR(invlist);
7232
7233     PERL_ARGS_ASSERT_INVLIST_CLONE;
7234
7235     SvCUR_set(new_invlist, length); /* This isn't done automatically */
7236     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7237
7238     return new_invlist;
7239 }
7240
7241 PERL_STATIC_INLINE UV*
7242 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7243 {
7244     /* Return the address of the UV that contains the current iteration
7245      * position */
7246
7247     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7248
7249     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7250 }
7251
7252 PERL_STATIC_INLINE UV*
7253 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7254 {
7255     /* Return the address of the UV that contains the version id. */
7256
7257     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7258
7259     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7260 }
7261
7262 PERL_STATIC_INLINE void
7263 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
7264 {
7265     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7266
7267     *get_invlist_iter_addr(invlist) = 0;
7268 }
7269
7270 STATIC bool
7271 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7272 {
7273     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7274      * This call sets in <*start> and <*end>, the next range in <invlist>.
7275      * Returns <TRUE> if successful and the next call will return the next
7276      * range; <FALSE> if was already at the end of the list.  If the latter,
7277      * <*start> and <*end> are unchanged, and the next call to this function
7278      * will start over at the beginning of the list */
7279
7280     UV* pos = get_invlist_iter_addr(invlist);
7281     UV len = invlist_len(invlist);
7282     UV *array;
7283
7284     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7285
7286     if (*pos >= len) {
7287         *pos = UV_MAX;  /* Force iternit() to be required next time */
7288         return FALSE;
7289     }
7290
7291     array = invlist_array(invlist);
7292
7293     *start = array[(*pos)++];
7294
7295     if (*pos >= len) {
7296         *end = UV_MAX;
7297     }
7298     else {
7299         *end = array[(*pos)++] - 1;
7300     }
7301
7302     return TRUE;
7303 }
7304
7305 #ifndef PERL_IN_XSUB_RE
7306 SV *
7307 Perl__invlist_contents(pTHX_ SV* const invlist)
7308 {
7309     /* Get the contents of an inversion list into a string SV so that they can
7310      * be printed out.  It uses the format traditionally done for debug tracing
7311      */
7312
7313     UV start, end;
7314     SV* output = newSVpvs("\n");
7315
7316     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7317
7318     invlist_iterinit(invlist);
7319     while (invlist_iternext(invlist, &start, &end)) {
7320         if (end == UV_MAX) {
7321             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7322         }
7323         else if (end != start) {
7324             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7325                     start,       end);
7326         }
7327         else {
7328             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7329         }
7330     }
7331
7332     return output;
7333 }
7334 #endif
7335
#if 0
void
S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
{
    /* Debugging aid (currently compiled out): print each range of an
     * inversion list to Perl_debug_log, one per line.  If 'header' is given
     * and is not the empty string, it is printed on its own line first */

    UV lo, hi;

    if (header != NULL && *header != '\0') {
        PerlIO_printf(Perl_debug_log, "%s\n", header);
    }

    invlist_iterinit(invlist);
    while (invlist_iternext(invlist, &lo, &hi)) {
        if (hi != UV_MAX) {
            PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", lo, hi);
        }
        else {
            PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", lo);
        }
    }
}
#endif
7359
/* Retire the helper macros that are private to the inversion list
 * implementation above, so that the code following cannot use them by
 * accident.
 * NOTE(review): INVLIST_VERSION_ID_OFFSET, which
 * S_get_invlist_version_id_addr() uses, is not in this list -- confirm
 * whether leaving it defined is intentional. */
#undef HEADER_LENGTH
#undef INVLIST_INITIAL_LENGTH
#undef TO_INTERNAL_SIZE
#undef FROM_INTERNAL_SIZE
#undef INVLIST_LEN_OFFSET
#undef INVLIST_ZERO_OFFSET
#undef INVLIST_ITER_OFFSET
#undef INVLIST_VERSION_ID
7368
7369 /* End of inversion list object */
7370
7371 /*
7372  - reg - regular expression, i.e. main body or parenthesized thing
7373  *
7374  * Caller must absorb opening parenthesis.
7375  *
7376  * Combining parenthesis handling with the base level of regular expression
7377  * is a trifle forced, but the need to tie the tails of the branches to what
7378  * follows makes it hard to avoid.
7379  */
/* Convenience wrappers that forward their three arguments and append
 * 'depth+1' as the fourth.  They therefore can only be used where a variable
 * named 'depth' is in scope.  REGTAIL always calls regtail(); REGTAIL_STUDY
 * calls regtail_study() when DEBUGGING is defined, and otherwise falls back
 * to regtail() as well. */
#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
#ifdef DEBUGGING
#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
#else
#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
#endif
7386
7387 STATIC regnode *
7388 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7389     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7390 {
7391     dVAR;
7392     register regnode *ret;              /* Will be the head of the group. */
7393     register regnode *br;
7394     register regnode *lastbr;
7395     register regnode *ender = NULL;
7396     register I32 parno = 0;
7397     I32 flags;
7398     U32 oregflags = RExC_flags;
7399     bool have_branch = 0;
7400     bool is_open = 0;
7401     I32 freeze_paren = 0;
7402     I32 after_freeze = 0;
7403
7404     /* for (?g), (?gc), and (?o) warnings; warning
7405        about (?c) will warn about (?g) -- japhy    */
7406
7407 #define WASTED_O  0x01
7408 #define WASTED_G  0x02
7409 #define WASTED_C  0x04
7410 #define WASTED_GC (0x02|0x04)
7411     I32 wastedflags = 0x00;
7412
7413     char * parse_start = RExC_parse; /* MJD */
7414     char * const oregcomp_parse = RExC_parse;
7415
7416     GET_RE_DEBUG_FLAGS_DECL;
7417
7418     PERL_ARGS_ASSERT_REG;
7419     DEBUG_PARSE("reg ");
7420
7421     *flagp = 0;                         /* Tentatively. */
7422
7423
7424     /* Make an OPEN node, if parenthesized. */
7425     if (paren) {
7426         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7427             char *start_verb = RExC_parse;
7428             STRLEN verb_len = 0;
7429             char *start_arg = NULL;
7430             unsigned char op = 0;
7431             int argok = 1;
7432             int internal_argval = 0; /* internal_argval is only useful if !argok */
7433             while ( *RExC_parse && *RExC_parse != ')' ) {
7434                 if ( *RExC_parse == ':' ) {
7435                     start_arg = RExC_parse + 1;
7436                     break;
7437                 }
7438                 RExC_parse++;
7439             }
7440             ++start_verb;
7441             verb_len = RExC_parse - start_verb;
7442             if ( start_arg ) {
7443                 RExC_parse++;
7444                 while ( *RExC_parse && *RExC_parse != ')' )
7445                     RExC_parse++;
7446                 if ( *RExC_parse != ')' )
7447                     vFAIL("Unterminated verb pattern argument");
7448                 if ( RExC_parse == start_arg )
7449                     start_arg = NULL;
7450             } else {
7451                 if ( *RExC_parse != ')' )
7452                     vFAIL("Unterminated verb pattern");
7453             }
7454
7455             switch ( *start_verb ) {
7456             case 'A':  /* (*ACCEPT) */
7457                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7458                     op = ACCEPT;
7459                     internal_argval = RExC_nestroot;
7460                 }
7461                 break;
7462             case 'C':  /* (*COMMIT) */
7463                 if ( memEQs(start_verb,verb_len,"COMMIT") )
7464                     op = COMMIT;
7465                 break;
7466             case 'F':  /* (*FAIL) */
7467                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7468                     op = OPFAIL;
7469                     argok = 0;
7470                 }
7471                 break;
7472             case ':':  /* (*:NAME) */
7473             case 'M':  /* (*MARK:NAME) */
7474                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7475                     op = MARKPOINT;
7476                     argok = -1;
7477                 }
7478                 break;
7479             case 'P':  /* (*PRUNE) */
7480                 if ( memEQs(start_verb,verb_len,"PRUNE") )
7481                     op = PRUNE;
7482                 break;
7483             case 'S':   /* (*SKIP) */
7484                 if ( memEQs(start_verb,verb_len,"SKIP") )
7485                     op = SKIP;
7486                 break;
7487             case 'T':  /* (*THEN) */
7488                 /* [19:06] <TimToady> :: is then */
7489                 if ( memEQs(start_verb,verb_len,"THEN") ) {
7490                     op = CUTGROUP;
7491                     RExC_seen |= REG_SEEN_CUTGROUP;
7492                 }
7493                 break;
7494             }
7495             if ( ! op ) {
7496                 RExC_parse++;
7497                 vFAIL3("Unknown verb pattern '%.*s'",
7498                     verb_len, start_verb);
7499             }
7500             if ( argok ) {
7501                 if ( start_arg && internal_argval ) {
7502                     vFAIL3("Verb pattern '%.*s' may not have an argument",
7503                         verb_len, start_verb);
7504                 } else if ( argok < 0 && !start_arg ) {
7505                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7506                         verb_len, start_verb);
7507                 } else {
7508                     ret = reganode(pRExC_state, op, internal_argval);
7509                     if ( ! internal_argval && ! SIZE_ONLY ) {
7510                         if (start_arg) {
7511                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7512                             ARG(ret) = add_data( pRExC_state, 1, "S" );
7513                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7514                             ret->flags = 0;
7515                         } else {
7516                             ret->flags = 1;
7517                         }
7518                     }
7519                 }
7520                 if (!internal_argval)
7521                     RExC_seen |= REG_SEEN_VERBARG;
7522             } else if ( start_arg ) {
7523                 vFAIL3("Verb pattern '%.*s' may not have an argument",
7524                         verb_len, start_verb);
7525             } else {
7526                 ret = reg_node(pRExC_state, op);
7527             }
7528             nextchar(pRExC_state);
7529             return ret;
7530         } else
7531         if (*RExC_parse == '?') { /* (?...) */
7532             bool is_logical = 0;
7533             const char * const seqstart = RExC_parse;
7534             bool has_use_defaults = FALSE;
7535
7536             RExC_parse++;
7537             paren = *RExC_parse++;
7538             ret = NULL;                 /* For look-ahead/behind. */
7539             switch (paren) {
7540
7541             case 'P':   /* (?P...) variants for those used to PCRE/Python */
7542                 paren = *RExC_parse++;
7543                 if ( paren == '<')         /* (?P<...>) named capture */
7544                     goto named_capture;
7545                 else if (paren == '>') {   /* (?P>name) named recursion */
7546                     goto named_recursion;
7547                 }
7548                 else if (paren == '=') {   /* (?P=...)  named backref */
7549                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
7550                        you change this make sure you change that */
7551                     char* name_start = RExC_parse;
7552                     U32 num = 0;
7553                     SV *sv_dat = reg_scan_name(pRExC_state,
7554                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7555                     if (RExC_parse == name_start || *RExC_parse != ')')
7556                         vFAIL2("Sequence %.3s... not terminated",parse_start);
7557
7558                     if (!SIZE_ONLY) {
7559                         num = add_data( pRExC_state, 1, "S" );
7560                         RExC_rxi->data->data[num]=(void*)sv_dat;
7561                         SvREFCNT_inc_simple_void(sv_dat);
7562                     }
7563                     RExC_sawback = 1;
7564                     ret = reganode(pRExC_state,
7565                                    ((! FOLD)
7566                                      ? NREF
7567                                      : (MORE_ASCII_RESTRICTED)
7568                                        ? NREFFA
7569                                        : (AT_LEAST_UNI_SEMANTICS)
7570                                          ? NREFFU
7571                                          : (LOC)
7572                                            ? NREFFL
7573                                            : NREFF),
7574                                     num);
7575                     *flagp |= HASWIDTH;
7576
7577                     Set_Node_Offset(ret, parse_start+1);
7578                     Set_Node_Cur_Length(ret); /* MJD */
7579
7580                     nextchar(pRExC_state);
7581                     return ret;
7582                 }
7583                 RExC_parse++;
7584                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7585                 /*NOTREACHED*/
7586             case '<':           /* (?<...) */
7587                 if (*RExC_parse == '!')
7588                     paren = ',';
7589                 else if (*RExC_parse != '=')
7590               named_capture:
7591                 {               /* (?<...>) */
7592                     char *name_start;
7593                     SV *svname;
7594                     paren= '>';
7595             case '\'':          /* (?'...') */
7596                     name_start= RExC_parse;
7597                     svname = reg_scan_name(pRExC_state,
7598                         SIZE_ONLY ?  /* reverse test from the others */
7599                         REG_RSN_RETURN_NAME :
7600                         REG_RSN_RETURN_NULL);
7601                     if (RExC_parse == name_start) {
7602                         RExC_parse++;
7603                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7604                         /*NOTREACHED*/
7605                     }
7606                     if (*RExC_parse != paren)
7607                         vFAIL2("Sequence (?%c... not terminated",
7608                             paren=='>' ? '<' : paren);
7609                     if (SIZE_ONLY) {
7610                         HE *he_str;
7611                         SV *sv_dat = NULL;
7612                         if (!svname) /* shouldn't happen */
7613                             Perl_croak(aTHX_
7614                                 "panic: reg_scan_name returned NULL");
7615                         if (!RExC_paren_names) {
7616                             RExC_paren_names= newHV();
7617                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
7618 #ifdef DEBUGGING
7619                             RExC_paren_name_list= newAV();
7620                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7621 #endif
7622                         }
7623                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7624                         if ( he_str )
7625                             sv_dat = HeVAL(he_str);
7626                         if ( ! sv_dat ) {
7627                             /* croak baby croak */
7628                             Perl_croak(aTHX_
7629                                 "panic: paren_name hash element allocation failed");
7630                         } else if ( SvPOK(sv_dat) ) {
7631                             /* (?|...) can mean we have dupes so scan to check
7632                                its already been stored. Maybe a flag indicating
7633                                we are inside such a construct would be useful,
7634                                but the arrays are likely to be quite small, so
7635                                for now we punt -- dmq */
7636                             IV count = SvIV(sv_dat);
7637                             I32 *pv = (I32*)SvPVX(sv_dat);
7638                             IV i;
7639                             for ( i = 0 ; i < count ; i++ ) {
7640                                 if ( pv[i] == RExC_npar ) {
7641                                     count = 0;
7642                                     break;
7643                                 }
7644                             }
7645                             if ( count ) {
7646                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7647                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7648                                 pv[count] = RExC_npar;
7649                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7650                             }
7651                         } else {
7652                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
7653                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7654                             SvIOK_on(sv_dat);
7655                             SvIV_set(sv_dat, 1);
7656                         }
7657 #ifdef DEBUGGING
7658                         /* Yes this does cause a memory leak in debugging Perls */
7659                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7660                             SvREFCNT_dec(svname);
7661 #endif
7662
7663                         /*sv_dump(sv_dat);*/
7664                     }
7665                     nextchar(pRExC_state);
7666                     paren = 1;
7667                     goto capturing_parens;
7668                 }
7669                 RExC_seen |= REG_SEEN_LOOKBEHIND;
7670                 RExC_in_lookbehind++;
7671                 RExC_parse++;
7672             case '=':           /* (?=...) */
7673                 RExC_seen_zerolen++;
7674                 break;
7675             case '!':           /* (?!...) */
7676                 RExC_seen_zerolen++;
7677                 if (*RExC_parse == ')') {
7678                     ret=reg_node(pRExC_state, OPFAIL);
7679                     nextchar(pRExC_state);
7680                     return ret;
7681                 }
7682                 break;
7683             case '|':           /* (?|...) */
7684                 /* branch reset, behave like a (?:...) except that
7685                    buffers in alternations share the same numbers */
7686                 paren = ':';
7687                 after_freeze = freeze_paren = RExC_npar;
7688                 break;
7689             case ':':           /* (?:...) */
7690             case '>':           /* (?>...) */
7691                 break;
7692             case '$':           /* (?$...) */
7693             case '@':           /* (?@...) */
7694                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7695                 break;
7696             case '#':           /* (?#...) */
7697                 while (*RExC_parse && *RExC_parse != ')')
7698                     RExC_parse++;
7699                 if (*RExC_parse != ')')
7700                     FAIL("Sequence (?#... not terminated");
7701                 nextchar(pRExC_state);
7702                 *flagp = TRYAGAIN;
7703                 return NULL;
7704             case '0' :           /* (?0) */
7705             case 'R' :           /* (?R) */
7706                 if (*RExC_parse != ')')
7707                     FAIL("Sequence (?R) not terminated");
7708                 ret = reg_node(pRExC_state, GOSTART);
7709                 *flagp |= POSTPONED;
7710                 nextchar(pRExC_state);
7711                 return ret;
7712                 /*notreached*/
7713             { /* named and numeric backreferences */
7714                 I32 num;
7715             case '&':            /* (?&NAME) */
7716                 parse_start = RExC_parse - 1;
7717               named_recursion:
7718                 {
7719                     SV *sv_dat = reg_scan_name(pRExC_state,
7720                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7721                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7722                 }
7723                 goto gen_recurse_regop;
7724                 /* NOT REACHED */
7725             case '+':
7726                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7727                     RExC_parse++;
7728                     vFAIL("Illegal pattern");
7729                 }
7730                 goto parse_recursion;
7731                 /* NOT REACHED*/
7732             case '-': /* (?-1) */
7733                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7734                     RExC_parse--; /* rewind to let it be handled later */
7735                     goto parse_flags;
7736                 }
7737                 /*FALLTHROUGH */
7738             case '1': case '2': case '3': case '4': /* (?1) */
7739             case '5': case '6': case '7': case '8': case '9':
7740                 RExC_parse--;
7741               parse_recursion:
7742                 num = atoi(RExC_parse);
7743                 parse_start = RExC_parse - 1; /* MJD */
7744                 if (*RExC_parse == '-')
7745                     RExC_parse++;
7746                 while (isDIGIT(*RExC_parse))
7747                         RExC_parse++;
7748                 if (*RExC_parse!=')')
7749                     vFAIL("Expecting close bracket");
7750
7751               gen_recurse_regop:
7752                 if ( paren == '-' ) {
7753                     /*
7754                     Diagram of capture buffer numbering.
7755                     Top line is the normal capture buffer numbers
7756                     Bottom line is the negative indexing as from
7757                     the X (the (?-2))
7758
7759                     +   1 2    3 4 5 X          6 7
7760                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7761                     -   5 4    3 2 1 X          x x
7762
7763                     */
7764                     num = RExC_npar + num;
7765                     if (num < 1)  {
7766                         RExC_parse++;
7767                         vFAIL("Reference to nonexistent group");
7768                     }
7769                 } else if ( paren == '+' ) {
7770                     num = RExC_npar + num - 1;
7771                 }
7772
7773                 ret = reganode(pRExC_state, GOSUB, num);
7774                 if (!SIZE_ONLY) {
7775                     if (num > (I32)RExC_rx->nparens) {
7776                         RExC_parse++;
7777                         vFAIL("Reference to nonexistent group");
7778                     }
7779                     ARG2L_SET( ret, RExC_recurse_count++);
7780                     RExC_emit++;
7781                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7782                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7783                 } else {
7784                     RExC_size++;
7785                 }
7786                 RExC_seen |= REG_SEEN_RECURSE;
7787                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7788                 Set_Node_Offset(ret, parse_start); /* MJD */
7789
7790                 *flagp |= POSTPONED;
7791                 nextchar(pRExC_state);
7792                 return ret;
7793             } /* named and numeric backreferences */
7794             /* NOT REACHED */
7795
7796             case '?':           /* (??...) */
7797                 is_logical = 1;
7798                 if (*RExC_parse != '{') {
7799                     RExC_parse++;
7800                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7801                     /*NOTREACHED*/
7802                 }
7803                 *flagp |= POSTPONED;
7804                 paren = *RExC_parse++;
7805                 /* FALL THROUGH */
7806             case '{':           /* (?{...}) */
7807             {
7808                 I32 count = 1;
7809                 U32 n = 0;
7810                 char c;
7811                 char *s = RExC_parse;
7812
7813                 RExC_seen_zerolen++;
7814                 RExC_seen |= REG_SEEN_EVAL;
7815                 while (count && (c = *RExC_parse)) {
7816                     if (c == '\\') {
7817                         if (RExC_parse[1])
7818                             RExC_parse++;
7819                     }
7820                     else if (c == '{')
7821                         count++;
7822                     else if (c == '}')
7823                         count--;
7824                     RExC_parse++;
7825                 }
7826                 if (*RExC_parse != ')') {
7827                     RExC_parse = s;
7828                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7829                 }
7830                 if (!SIZE_ONLY) {
7831                     PAD *pad;
7832                     OP_4tree *sop, *rop;
7833                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7834
7835                     ENTER;
7836                     Perl_save_re_context(aTHX);
7837                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7838                     sop->op_private |= OPpREFCOUNTED;
7839                     /* re_dup will OpREFCNT_inc */
7840                     OpREFCNT_set(sop, 1);
7841                     LEAVE;
7842
7843                     n = add_data(pRExC_state, 3, "nop");
7844                     RExC_rxi->data->data[n] = (void*)rop;
7845                     RExC_rxi->data->data[n+1] = (void*)sop;
7846                     RExC_rxi->data->data[n+2] = (void*)pad;
7847                     SvREFCNT_dec(sv);
7848                 }
7849                 else {                                          /* First pass */
7850                     if (PL_reginterp_cnt < ++RExC_seen_evals
7851                         && IN_PERL_RUNTIME)
7852                         /* No compiled RE interpolated, has runtime
7853                            components ===> unsafe.  */
7854                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
7855                     if (PL_tainting && PL_tainted)
7856                         FAIL("Eval-group in insecure regular expression");
7857 #if PERL_VERSION > 8
7858                     if (IN_PERL_COMPILETIME)
7859                         PL_cv_has_eval = 1;
7860 #endif
7861                 }
7862
7863                 nextchar(pRExC_state);
7864                 if (is_logical) {
7865                     ret = reg_node(pRExC_state, LOGICAL);
7866                     if (!SIZE_ONLY)
7867                         ret->flags = 2;
7868                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7869                     /* deal with the length of this later - MJD */
7870                     return ret;
7871                 }
7872                 ret = reganode(pRExC_state, EVAL, n);
7873                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7874                 Set_Node_Offset(ret, parse_start);
7875                 return ret;
7876             }
7877             case '(':           /* (?(?{...})...) and (?(?=...)...) */
7878             {
7879                 int is_define= 0;
7880                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
7881                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7882                         || RExC_parse[1] == '<'
7883                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
7884                         I32 flag;
7885
7886                         ret = reg_node(pRExC_state, LOGICAL);
7887                         if (!SIZE_ONLY)
7888                             ret->flags = 1;
7889                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7890                         goto insert_if;
7891                     }
7892                 }
7893                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
7894                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7895                 {
7896                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
7897                     char *name_start= RExC_parse++;
7898                     U32 num = 0;
7899                     SV *sv_dat=reg_scan_name(pRExC_state,
7900                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7901                     if (RExC_parse == name_start || *RExC_parse != ch)
7902                         vFAIL2("Sequence (?(%c... not terminated",
7903                             (ch == '>' ? '<' : ch));
7904                     RExC_parse++;
7905                     if (!SIZE_ONLY) {
7906                         num = add_data( pRExC_state, 1, "S" );
7907                         RExC_rxi->data->data[num]=(void*)sv_dat;
7908                         SvREFCNT_inc_simple_void(sv_dat);
7909                     }
7910                     ret = reganode(pRExC_state,NGROUPP,num);
7911                     goto insert_if_check_paren;
7912                 }
7913                 else if (RExC_parse[0] == 'D' &&
7914                          RExC_parse[1] == 'E' &&
7915                          RExC_parse[2] == 'F' &&
7916                          RExC_parse[3] == 'I' &&
7917                          RExC_parse[4] == 'N' &&
7918                          RExC_parse[5] == 'E')
7919                 {
7920                     ret = reganode(pRExC_state,DEFINEP,0);
7921                     RExC_parse +=6 ;
7922                     is_define = 1;
7923                     goto insert_if_check_paren;
7924                 }
7925                 else if (RExC_parse[0] == 'R') {
7926                     RExC_parse++;
7927                     parno = 0;
7928                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7929                         parno = atoi(RExC_parse++);
7930                         while (isDIGIT(*RExC_parse))
7931                             RExC_parse++;
7932                     } else if (RExC_parse[0] == '&') {
7933                         SV *sv_dat;
7934                         RExC_parse++;
7935                         sv_dat = reg_scan_name(pRExC_state,
7936                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7937                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7938                     }
7939                     ret = reganode(pRExC_state,INSUBP,parno);
7940                     goto insert_if_check_paren;
7941                 }
7942                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7943                     /* (?(1)...) */
7944                     char c;
7945                     parno = atoi(RExC_parse++);
7946
7947                     while (isDIGIT(*RExC_parse))
7948                         RExC_parse++;
7949                     ret = reganode(pRExC_state, GROUPP, parno);
7950
7951                  insert_if_check_paren:
7952                     if ((c = *nextchar(pRExC_state)) != ')')
7953                         vFAIL("Switch condition not recognized");
7954                   insert_if:
7955                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
7956                     br = regbranch(pRExC_state, &flags, 1,depth+1);
7957                     if (br == NULL)
7958                         br = reganode(pRExC_state, LONGJMP, 0);
7959                     else
7960                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
7961                     c = *nextchar(pRExC_state);
7962                     if (flags&HASWIDTH)
7963                         *flagp |= HASWIDTH;
7964                     if (c == '|') {
7965                         if (is_define)
7966                             vFAIL("(?(DEFINE)....) does not allow branches");
7967                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7968                         regbranch(pRExC_state, &flags, 1,depth+1);
7969                         REGTAIL(pRExC_state, ret, lastbr);
7970                         if (flags&HASWIDTH)
7971                             *flagp |= HASWIDTH;
7972                         c = *nextchar(pRExC_state);
7973                     }
7974                     else
7975                         lastbr = NULL;
7976                     if (c != ')')
7977                         vFAIL("Switch (?(condition)... contains too many branches");
7978                     ender = reg_node(pRExC_state, TAIL);
7979                     REGTAIL(pRExC_state, br, ender);
7980                     if (lastbr) {
7981                         REGTAIL(pRExC_state, lastbr, ender);
7982                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7983                     }
7984                     else
7985                         REGTAIL(pRExC_state, ret, ender);
7986                     RExC_size++; /* XXX WHY do we need this?!!
7987                                     For large programs it seems to be required
7988                                     but I can't figure out why. -- dmq*/
7989                     return ret;
7990                 }
7991                 else {
7992                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7993                 }
7994             }
7995             case 0:
7996                 RExC_parse--; /* for vFAIL to print correctly */
7997                 vFAIL("Sequence (? incomplete");
7998                 break;
7999             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8000                                        that follow */
8001                 has_use_defaults = TRUE;
8002                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8003                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8004                                                 ? REGEX_UNICODE_CHARSET
8005                                                 : REGEX_DEPENDS_CHARSET);
8006                 goto parse_flags;
8007             default:
8008                 --RExC_parse;
8009                 parse_flags:      /* (?i) */
8010             {
8011                 U32 posflags = 0, negflags = 0;
8012                 U32 *flagsp = &posflags;
8013                 char has_charset_modifier = '\0';
8014                 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
8015                                     ? REGEX_UNICODE_CHARSET
8016                                     : REGEX_DEPENDS_CHARSET;
8017
8018                 while (*RExC_parse) {
8019                     /* && strchr("iogcmsx", *RExC_parse) */
8020                     /* (?g), (?gc) and (?o) are useless here
8021                        and must be globally applied -- japhy */
8022                     switch (*RExC_parse) {
8023                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8024                     case LOCALE_PAT_MOD:
8025                         if (has_charset_modifier) {
8026                             goto excess_modifier;
8027                         }
8028                         else if (flagsp == &negflags) {
8029                             goto neg_modifier;
8030                         }
8031                         cs = REGEX_LOCALE_CHARSET;
8032                         has_charset_modifier = LOCALE_PAT_MOD;
8033                         RExC_contains_locale = 1;
8034                         break;
8035                     case UNICODE_PAT_MOD:
8036                         if (has_charset_modifier) {
8037                             goto excess_modifier;
8038                         }
8039                         else if (flagsp == &negflags) {
8040                             goto neg_modifier;
8041                         }
8042                         cs = REGEX_UNICODE_CHARSET;
8043                         has_charset_modifier = UNICODE_PAT_MOD;
8044                         break;
8045                     case ASCII_RESTRICT_PAT_MOD:
8046                         if (flagsp == &negflags) {
8047                             goto neg_modifier;
8048                         }
8049                         if (has_charset_modifier) {
8050                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8051                                 goto excess_modifier;
8052                             }
8053                             /* Doubled modifier implies more restricted */
8054                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8055                         }
8056                         else {
8057                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8058                         }
8059                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8060                         break;
8061                     case DEPENDS_PAT_MOD:
8062                         if (has_use_defaults) {
8063                             goto fail_modifiers;
8064                         }
8065                         else if (flagsp == &negflags) {
8066                             goto neg_modifier;
8067                         }
8068                         else if (has_charset_modifier) {
8069                             goto excess_modifier;
8070                         }
8071
8072                         /* The dual charset means unicode semantics if the
8073                          * pattern (or target, not known until runtime) are
8074                          * utf8, or something in the pattern indicates unicode
8075                          * semantics */
8076                         cs = (RExC_utf8 || RExC_uni_semantics)
8077                              ? REGEX_UNICODE_CHARSET
8078                              : REGEX_DEPENDS_CHARSET;
8079                         has_charset_modifier = DEPENDS_PAT_MOD;
8080                         break;
8081                     excess_modifier:
8082                         RExC_parse++;
8083                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8084                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8085                         }
8086                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8087                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8088                         }
8089                         else {
8090                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8091                         }
8092                         /*NOTREACHED*/
8093                     neg_modifier:
8094                         RExC_parse++;
8095                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8096                         /*NOTREACHED*/
8097                     case ONCE_PAT_MOD: /* 'o' */
8098                     case GLOBAL_PAT_MOD: /* 'g' */
8099                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8100                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8101                             if (! (wastedflags & wflagbit) ) {
8102                                 wastedflags |= wflagbit;
8103                                 vWARN5(
8104                                     RExC_parse + 1,
8105                                     "Useless (%s%c) - %suse /%c modifier",
8106                                     flagsp == &negflags ? "?-" : "?",
8107                                     *RExC_parse,
8108                                     flagsp == &negflags ? "don't " : "",
8109                                     *RExC_parse
8110                                 );
8111                             }
8112                         }
8113                         break;
8114
8115                     case CONTINUE_PAT_MOD: /* 'c' */
8116                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8117                             if (! (wastedflags & WASTED_C) ) {
8118                                 wastedflags |= WASTED_GC;
8119                                 vWARN3(
8120                                     RExC_parse + 1,
8121                                     "Useless (%sc) - %suse /gc modifier",
8122                                     flagsp == &negflags ? "?-" : "?",
8123                                     flagsp == &negflags ? "don't " : ""
8124                                 );
8125                             }
8126                         }
8127                         break;
8128                     case KEEPCOPY_PAT_MOD: /* 'p' */
8129                         if (flagsp == &negflags) {
8130                             if (SIZE_ONLY)
8131                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8132                         } else {
8133                             *flagsp |= RXf_PMf_KEEPCOPY;
8134                         }
8135                         break;
8136                     case '-':
8137                         /* A flag is a default iff it is following a minus, so
8138                          * if there is a minus, it means will be trying to
8139                          * re-specify a default which is an error */
8140                         if (has_use_defaults || flagsp == &negflags) {
8141             fail_modifiers:
8142                             RExC_parse++;
8143                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8144                             /*NOTREACHED*/
8145                         }
8146                         flagsp = &negflags;
8147                         wastedflags = 0;  /* reset so (?g-c) warns twice */
8148                         break;
8149                     case ':':
8150                         paren = ':';
8151                         /*FALLTHROUGH*/
8152                     case ')':
8153                         RExC_flags |= posflags;
8154                         RExC_flags &= ~negflags;
8155                         set_regex_charset(&RExC_flags, cs);
8156                         if (paren != ':') {
8157                             oregflags |= posflags;
8158                             oregflags &= ~negflags;
8159                             set_regex_charset(&oregflags, cs);
8160                         }
8161                         nextchar(pRExC_state);
8162                         if (paren != ':') {
8163                             *flagp = TRYAGAIN;
8164                             return NULL;
8165                         } else {
8166                             ret = NULL;
8167                             goto parse_rest;
8168                         }
8169                         /*NOTREACHED*/
8170                     default:
8171                         RExC_parse++;
8172                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8173                         /*NOTREACHED*/
8174                     }
8175                     ++RExC_parse;
8176                 }
8177             }} /* one for the default block, one for the switch */
8178         }
8179         else {                  /* (...) */
8180           capturing_parens:
8181             parno = RExC_npar;
8182             RExC_npar++;
8183
8184             ret = reganode(pRExC_state, OPEN, parno);
8185             if (!SIZE_ONLY ){
8186                 if (!RExC_nestroot)
8187                     RExC_nestroot = parno;
8188                 if (RExC_seen & REG_SEEN_RECURSE
8189                     && !RExC_open_parens[parno-1])
8190                 {
8191                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8192                         "Setting open paren #%"IVdf" to %d\n",
8193                         (IV)parno, REG_NODE_NUM(ret)));
8194                     RExC_open_parens[parno-1]= ret;
8195                 }
8196             }
8197             Set_Node_Length(ret, 1); /* MJD */
8198             Set_Node_Offset(ret, RExC_parse); /* MJD */
8199             is_open = 1;
8200         }
8201     }
8202     else                        /* ! paren */
8203         ret = NULL;
8204
8205    parse_rest:
8206     /* Pick up the branches, linking them together. */
8207     parse_start = RExC_parse;   /* MJD */
8208     br = regbranch(pRExC_state, &flags, 1,depth+1);
8209
8210     /*     branch_len = (paren != 0); */
8211
8212     if (br == NULL)
8213         return(NULL);
8214     if (*RExC_parse == '|') {
8215         if (!SIZE_ONLY && RExC_extralen) {
8216             reginsert(pRExC_state, BRANCHJ, br, depth+1);
8217         }
8218         else {                  /* MJD */
8219             reginsert(pRExC_state, BRANCH, br, depth+1);
8220             Set_Node_Length(br, paren != 0);
8221             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8222         }
8223         have_branch = 1;
8224         if (SIZE_ONLY)
8225             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
8226     }
8227     else if (paren == ':') {
8228         *flagp |= flags&SIMPLE;
8229     }
8230     if (is_open) {                              /* Starts with OPEN. */
8231         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
8232     }
8233     else if (paren != '?')              /* Not Conditional */
8234         ret = br;
8235     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8236     lastbr = br;
8237     while (*RExC_parse == '|') {
8238         if (!SIZE_ONLY && RExC_extralen) {
8239             ender = reganode(pRExC_state, LONGJMP,0);
8240             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8241         }
8242         if (SIZE_ONLY)
8243             RExC_extralen += 2;         /* Account for LONGJMP. */
8244         nextchar(pRExC_state);
8245         if (freeze_paren) {
8246             if (RExC_npar > after_freeze)
8247                 after_freeze = RExC_npar;
8248             RExC_npar = freeze_paren;
8249         }
8250         br = regbranch(pRExC_state, &flags, 0, depth+1);
8251
8252         if (br == NULL)
8253             return(NULL);
8254         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
8255         lastbr = br;
8256         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8257     }
8258
8259     if (have_branch || paren != ':') {
8260         /* Make a closing node, and hook it on the end. */
8261         switch (paren) {
8262         case ':':
8263             ender = reg_node(pRExC_state, TAIL);
8264             break;
8265         case 1:
8266             ender = reganode(pRExC_state, CLOSE, parno);
8267             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8268                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8269                         "Setting close paren #%"IVdf" to %d\n",
8270                         (IV)parno, REG_NODE_NUM(ender)));
8271                 RExC_close_parens[parno-1]= ender;
8272                 if (RExC_nestroot == parno)
8273                     RExC_nestroot = 0;
8274             }
8275             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8276             Set_Node_Length(ender,1); /* MJD */
8277             break;
8278         case '<':
8279         case ',':
8280         case '=':
8281         case '!':
8282             *flagp &= ~HASWIDTH;
8283             /* FALL THROUGH */
8284         case '>':
8285             ender = reg_node(pRExC_state, SUCCEED);
8286             break;
8287         case 0:
8288             ender = reg_node(pRExC_state, END);
8289             if (!SIZE_ONLY) {
8290                 assert(!RExC_opend); /* there can only be one! */
8291                 RExC_opend = ender;
8292             }
8293             break;
8294         }
8295         REGTAIL(pRExC_state, lastbr, ender);
8296
8297         if (have_branch && !SIZE_ONLY) {
8298             if (depth==1)
8299                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8300
8301             /* Hook the tails of the branches to the closing node. */
8302             for (br = ret; br; br = regnext(br)) {
8303                 const U8 op = PL_regkind[OP(br)];
8304                 if (op == BRANCH) {
8305                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8306                 }
8307                 else if (op == BRANCHJ) {
8308                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8309                 }
8310             }
8311         }
8312     }
8313
8314     {
8315         const char *p;
8316         static const char parens[] = "=!<,>";
8317
8318         if (paren && (p = strchr(parens, paren))) {
8319             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8320             int flag = (p - parens) > 1;
8321
8322             if (paren == '>')
8323                 node = SUSPEND, flag = 0;
8324             reginsert(pRExC_state, node,ret, depth+1);
8325             Set_Node_Cur_Length(ret);
8326             Set_Node_Offset(ret, parse_start + 1);
8327             ret->flags = flag;
8328             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8329         }
8330     }
8331
8332     /* Check for proper termination. */
8333     if (paren) {
8334         RExC_flags = oregflags;
8335         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8336             RExC_parse = oregcomp_parse;
8337             vFAIL("Unmatched (");
8338         }
8339     }
8340     else if (!paren && RExC_parse < RExC_end) {
8341         if (*RExC_parse == ')') {
8342             RExC_parse++;
8343             vFAIL("Unmatched )");
8344         }
8345         else
8346             FAIL("Junk on end of regexp");      /* "Can't happen". */
8347         /* NOTREACHED */
8348     }
8349
8350     if (RExC_in_lookbehind) {
8351         RExC_in_lookbehind--;
8352     }
8353     if (after_freeze > RExC_npar)
8354         RExC_npar = after_freeze;
8355     return(ret);
8356 }
8357
8358 /*
8359  - regbranch - one alternative of an | operator
8360  *
8361  * Implements the concatenation operator.
8362  */
8363 STATIC regnode *
8364 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8365 {
8366     dVAR;
8367     register regnode *ret;
8368     register regnode *chain = NULL;
8369     register regnode *latest;
8370     I32 flags = 0, c = 0;
8371     GET_RE_DEBUG_FLAGS_DECL;
8372
8373     PERL_ARGS_ASSERT_REGBRANCH;
8374
8375     DEBUG_PARSE("brnc");
8376
8377     if (first)
8378         ret = NULL;
8379     else {
8380         if (!SIZE_ONLY && RExC_extralen)
8381             ret = reganode(pRExC_state, BRANCHJ,0);
8382         else {
8383             ret = reg_node(pRExC_state, BRANCH);
8384             Set_Node_Length(ret, 1);
8385         }
8386     }
8387
8388     if (!first && SIZE_ONLY)
8389         RExC_extralen += 1;                     /* BRANCHJ */
8390
8391     *flagp = WORST;                     /* Tentatively. */
8392
8393     RExC_parse--;
8394     nextchar(pRExC_state);
8395     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8396         flags &= ~TRYAGAIN;
8397         latest = regpiece(pRExC_state, &flags,depth+1);
8398         if (latest == NULL) {
8399             if (flags & TRYAGAIN)
8400                 continue;
8401             return(NULL);
8402         }
8403         else if (ret == NULL)
8404             ret = latest;
8405         *flagp |= flags&(HASWIDTH|POSTPONED);
8406         if (chain == NULL)      /* First piece. */
8407             *flagp |= flags&SPSTART;
8408         else {
8409             RExC_naughty++;
8410             REGTAIL(pRExC_state, chain, latest);
8411         }
8412         chain = latest;
8413         c++;
8414     }
8415     if (chain == NULL) {        /* Loop ran zero times. */
8416         chain = reg_node(pRExC_state, NOTHING);
8417         if (ret == NULL)
8418             ret = chain;
8419     }
8420     if (c == 1) {
8421         *flagp |= flags&SIMPLE;
8422     }
8423
8424     return ret;
8425 }
8426
8427 /*
8428  - regpiece - something followed by possible [*+?]
8429  *
8430  * Note that the branching code sequences used for ? and the general cases
8431  * of * and + are somewhat optimized:  they use the same NOTHING node as
8432  * both the endmarker for their branch list and the body of the last branch.
8433  * It might seem that this node could be dispensed with entirely, but the
8434  * endmarker role is not redundant.
8435  */
8436 STATIC regnode *
8437 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8438 {
8439     dVAR;
8440     register regnode *ret;
8441     register char op;
8442     register char *next;
8443     I32 flags;
8444     const char * const origparse = RExC_parse;
8445     I32 min;
8446     I32 max = REG_INFTY;
8447 #ifdef RE_TRACK_PATTERN_OFFSETS
8448     char *parse_start;
8449 #endif
8450     const char *maxpos = NULL;
8451     GET_RE_DEBUG_FLAGS_DECL;
8452
8453     PERL_ARGS_ASSERT_REGPIECE;
8454
8455     DEBUG_PARSE("piec");
8456
8457     ret = regatom(pRExC_state, &flags,depth+1);
8458     if (ret == NULL) {
8459         if (flags & TRYAGAIN)
8460             *flagp |= TRYAGAIN;
8461         return(NULL);
8462     }
8463
8464     op = *RExC_parse;
8465
8466     if (op == '{' && regcurly(RExC_parse)) {
8467         maxpos = NULL;
8468 #ifdef RE_TRACK_PATTERN_OFFSETS
8469         parse_start = RExC_parse; /* MJD */
8470 #endif
8471         next = RExC_parse + 1;
8472         while (isDIGIT(*next) || *next == ',') {
8473             if (*next == ',') {
8474                 if (maxpos)
8475                     break;
8476                 else
8477                     maxpos = next;
8478             }
8479             next++;
8480         }
8481         if (*next == '}') {             /* got one */
8482             if (!maxpos)
8483                 maxpos = next;
8484             RExC_parse++;
8485             min = atoi(RExC_parse);
8486             if (*maxpos == ',')
8487                 maxpos++;
8488             else
8489                 maxpos = RExC_parse;
8490             max = atoi(maxpos);
8491             if (!max && *maxpos != '0')
8492                 max = REG_INFTY;                /* meaning "infinity" */
8493             else if (max >= REG_INFTY)
8494                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8495             RExC_parse = next;
8496             nextchar(pRExC_state);
8497
8498         do_curly:
8499             if ((flags&SIMPLE)) {
8500                 RExC_naughty += 2 + RExC_naughty / 2;
8501                 reginsert(pRExC_state, CURLY, ret, depth+1);
8502                 Set_Node_Offset(ret, parse_start+1); /* MJD */
8503                 Set_Node_Cur_Length(ret);
8504             }
8505             else {
8506                 regnode * const w = reg_node(pRExC_state, WHILEM);
8507
8508                 w->flags = 0;
8509                 REGTAIL(pRExC_state, ret, w);
8510                 if (!SIZE_ONLY && RExC_extralen) {
8511                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
8512                     reginsert(pRExC_state, NOTHING,ret, depth+1);
8513                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
8514                 }
8515                 reginsert(pRExC_state, CURLYX,ret, depth+1);
8516                                 /* MJD hk */
8517                 Set_Node_Offset(ret, parse_start+1);
8518                 Set_Node_Length(ret,
8519                                 op == '{' ? (RExC_parse - parse_start) : 1);
8520
8521                 if (!SIZE_ONLY && RExC_extralen)
8522                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
8523                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
8524                 if (SIZE_ONLY)
8525                     RExC_whilem_seen++, RExC_extralen += 3;
8526                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
8527             }
8528             ret->flags = 0;
8529
8530             if (min > 0)
8531                 *flagp = WORST;
8532             if (max > 0)
8533                 *flagp |= HASWIDTH;
8534             if (max < min)
8535                 vFAIL("Can't do {n,m} with n > m");
8536             if (!SIZE_ONLY) {
8537                 ARG1_SET(ret, (U16)min);
8538                 ARG2_SET(ret, (U16)max);
8539             }
8540
8541             goto nest_check;
8542         }
8543     }
8544
8545     if (!ISMULT1(op)) {
8546         *flagp = flags;
8547         return(ret);
8548     }
8549
8550 #if 0                           /* Now runtime fix should be reliable. */
8551
8552     /* if this is reinstated, don't forget to put this back into perldiag:
8553
8554             =item Regexp *+ operand could be empty at {#} in regex m/%s/
8555
8556            (F) The part of the regexp subject to either the * or + quantifier
8557            could match an empty string. The {#} shows in the regular
8558            expression about where the problem was discovered.
8559
8560     */
8561
8562     if (!(flags&HASWIDTH) && op != '?')
8563       vFAIL("Regexp *+ operand could be empty");
8564 #endif
8565
8566 #ifdef RE_TRACK_PATTERN_OFFSETS
8567     parse_start = RExC_parse;
8568 #endif
8569     nextchar(pRExC_state);
8570
8571     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8572
8573     if (op == '*' && (flags&SIMPLE)) {
8574         reginsert(pRExC_state, STAR, ret, depth+1);
8575         ret->flags = 0;
8576         RExC_naughty += 4;
8577     }
8578     else if (op == '*') {
8579         min = 0;
8580         goto do_curly;
8581     }
8582     else if (op == '+' && (flags&SIMPLE)) {
8583         reginsert(pRExC_state, PLUS, ret, depth+1);
8584         ret->flags = 0;
8585         RExC_naughty += 3;
8586     }
8587     else if (op == '+') {
8588         min = 1;
8589         goto do_curly;
8590     }
8591     else if (op == '?') {
8592         min = 0; max = 1;
8593         goto do_curly;
8594     }
8595   nest_check:
8596     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8597         ckWARN3reg(RExC_parse,
8598                    "%.*s matches null string many times",
8599                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8600                    origparse);
8601     }
8602
8603     if (RExC_parse < RExC_end && *RExC_parse == '?') {
8604         nextchar(pRExC_state);
8605         reginsert(pRExC_state, MINMOD, ret, depth+1);
8606         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8607     }
8608 #ifndef REG_ALLOW_MINMOD_SUSPEND
8609     else
8610 #endif
8611     if (RExC_parse < RExC_end && *RExC_parse == '+') {
8612         regnode *ender;
8613         nextchar(pRExC_state);
8614         ender = reg_node(pRExC_state, SUCCEED);
8615         REGTAIL(pRExC_state, ret, ender);
8616         reginsert(pRExC_state, SUSPEND, ret, depth+1);
8617         ret->flags = 0;
8618         ender = reg_node(pRExC_state, TAIL);
8619         REGTAIL(pRExC_state, ret, ender);
8620         /*ret= ender;*/
8621     }
8622
8623     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8624         RExC_parse++;
8625         vFAIL("Nested quantifiers");
8626     }
8627
8628     return(ret);
8629 }
8630
8631
8632 /* reg_namedseq(pRExC_state,UVp, UV depth)
8633
8634    This is expected to be called by a parser routine that has
8635    recognized '\N' and needs to handle the rest. RExC_parse is
8636    expected to point at the first char following the N at the time
8637    of the call.
8638
8639    The \N may be inside (indicated by valuep not being NULL) or outside a
8640    character class.
8641
8642    \N may begin either a named sequence, or if outside a character class, mean
8643    to match a non-newline.  For non single-quoted regexes, the tokenizer has
8644    attempted to decide which, and in the case of a named sequence converted it
8645    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8646    where c1... are the characters in the sequence.  For single-quoted regexes,
8647    the tokenizer passes the \N sequence through unchanged; this code will not
8648    attempt to determine this nor expand those.  The net effect is that if the
8649    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8650    signals that this \N occurrence means to match a non-newline.
8651
8652    Only the \N{U+...} form should occur in a character class, for the same
8653    reason that '.' inside a character class means to just match a period: it
8654    just doesn't make sense.
8655
8656    If valuep is non-null then it is assumed that we are parsing inside
8657    of a charclass definition and the first codepoint in the resolved
8658    string is returned via *valuep and the routine will return NULL.
8659    In this mode if a multichar string is returned from the charnames
8660    handler, a warning will be issued, and only the first char in the
8661    sequence will be examined. If the string returned is zero length
8662    then the value of *valuep is undefined and NON-NULL will
8663    be returned to indicate failure. (This will NOT be a valid pointer
8664    to a regnode.)
8665
8666    If valuep is null then it is assumed that we are parsing normal text and a
8667    new EXACT node is inserted into the program containing the resolved string,
8668    and a pointer to the new node is returned.  But if the string is zero length
8669    a NOTHING node is emitted instead.
8670
8671    On success RExC_parse is set to the char following the endbrace.
8672    Parsing failures will generate a fatal error via vFAIL(...)
8673  */
8674 STATIC regnode *
8675 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8676 {
8677     char * endbrace;    /* '}' following the name */
8678     regnode *ret = NULL;
8679     char* p;
8680
8681     GET_RE_DEBUG_FLAGS_DECL;
8682
8683     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8684
8685     GET_RE_DEBUG_FLAGS;
8686
8687     /* The [^\n] meaning of \N ignores spaces and comments under the /x
8688      * modifier.  The other meaning does not */
8689     p = (RExC_flags & RXf_PMf_EXTENDED)
8690         ? regwhite( pRExC_state, RExC_parse )
8691         : RExC_parse;
8692
8693     /* Disambiguate between \N meaning a named character versus \N meaning
8694      * [^\n].  The former is assumed when it can't be the latter. */
8695     if (*p != '{' || regcurly(p)) {
8696         RExC_parse = p;
8697         if (valuep) {
8698             /* no bare \N in a charclass */
8699             vFAIL("\\N in a character class must be a named character: \\N{...}");
8700         }
8701         nextchar(pRExC_state);
8702         ret = reg_node(pRExC_state, REG_ANY);
8703         *flagp |= HASWIDTH|SIMPLE;
8704         RExC_naughty++;
8705         RExC_parse--;
8706         Set_Node_Length(ret, 1); /* MJD */
8707         return ret;
8708     }
8709
8710     /* Here, we have decided it should be a named sequence */
8711
8712     /* The test above made sure that the next real character is a '{', but
8713      * under the /x modifier, it could be separated by space (or a comment and
8714      * \n) and this is not allowed (for consistency with \x{...} and the
8715      * tokenizer handling of \N{NAME}). */
8716     if (*RExC_parse != '{') {
8717         vFAIL("Missing braces on \\N{}");
8718     }
8719
8720     RExC_parse++;       /* Skip past the '{' */
8721
8722     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8723         || ! (endbrace == RExC_parse            /* nothing between the {} */
8724               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
8725                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8726     {
8727         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
8728         vFAIL("\\N{NAME} must be resolved by the lexer");
8729     }
8730
8731     if (endbrace == RExC_parse) {   /* empty: \N{} */
8732         if (! valuep) {
8733             RExC_parse = endbrace + 1;
8734             return reg_node(pRExC_state,NOTHING);
8735         }
8736
8737         if (SIZE_ONLY) {
8738             ckWARNreg(RExC_parse,
8739                     "Ignoring zero length \\N{} in character class"
8740             );
8741             RExC_parse = endbrace + 1;
8742         }
8743         *valuep = 0;
8744         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8745     }
8746
8747     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
8748     RExC_parse += 2;    /* Skip past the 'U+' */
8749
8750     if (valuep) {   /* In a bracketed char class */
8751         /* We only pay attention to the first char of
8752         multichar strings being returned. I kinda wonder
8753         if this makes sense as it does change the behaviour
8754         from earlier versions, OTOH that behaviour was broken
8755         as well. XXX Solution is to recharacterize as
8756         [rest-of-class]|multi1|multi2... */
8757
8758         STRLEN length_of_hex;
8759         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8760             | PERL_SCAN_DISALLOW_PREFIX
8761             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8762
8763         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8764         if (endchar < endbrace) {
8765             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8766         }
8767
8768         length_of_hex = (STRLEN)(endchar - RExC_parse);
8769         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8770
8771         /* The tokenizer should have guaranteed validity, but it's possible to
8772          * bypass it by using single quoting, so check */
8773         if (length_of_hex == 0
8774             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8775         {
8776             RExC_parse += length_of_hex;        /* Includes all the valid */
8777             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
8778                             ? UTF8SKIP(RExC_parse)
8779                             : 1;
8780             /* Guard against malformed utf8 */
8781             if (RExC_parse >= endchar) RExC_parse = endchar;
8782             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8783         }
8784
8785         RExC_parse = endbrace + 1;
8786         if (endchar == endbrace) return NULL;
8787
8788         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
8789     }
8790     else {      /* Not a char class */
8791
8792         /* What is done here is to convert this to a sub-pattern of the form
8793          * (?:\x{char1}\x{char2}...)
8794          * and then call reg recursively.  That way, it retains its atomicness,
8795          * while not having to worry about special handling that some code
8796          * points may have.  toke.c has converted the original Unicode values
8797          * to native, so that we can just pass on the hex values unchanged.  We
8798          * do have to set a flag to keep recoding from happening in the
8799          * recursion */
8800
8801         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8802         STRLEN len;
8803         char *endchar;      /* Points to '.' or '}' ending cur char in the input
8804                                stream */
8805         char *orig_end = RExC_end;
8806
8807         while (RExC_parse < endbrace) {
8808
8809             /* Code points are separated by dots.  If none, there is only one
8810              * code point, and is terminated by the brace */
8811             endchar = RExC_parse + strcspn(RExC_parse, ".}");
8812
8813             /* Convert to notation the rest of the code understands */
8814             sv_catpv(substitute_parse, "\\x{");
8815             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8816             sv_catpv(substitute_parse, "}");
8817
8818             /* Point to the beginning of the next character in the sequence. */
8819             RExC_parse = endchar + 1;
8820         }
8821         sv_catpv(substitute_parse, ")");
8822
8823         RExC_parse = SvPV(substitute_parse, len);
8824
8825         /* Don't allow empty number */
8826         if (len < 8) {
8827             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8828         }
8829         RExC_end = RExC_parse + len;
8830
8831         /* The values are Unicode, and therefore not subject to recoding */
8832         RExC_override_recoding = 1;
8833
8834         ret = reg(pRExC_state, 1, flagp, depth+1);
8835
8836         RExC_parse = endbrace;
8837         RExC_end = orig_end;
8838         RExC_override_recoding = 0;
8839
8840         nextchar(pRExC_state);
8841     }
8842
8843     return ret;
8844 }
8845
8846
8847 /*
8848  * reg_recode
8849  *
8850  * It returns the code point in utf8 for the value in *encp.
8851  *    value: a code value in the source encoding
8852  *    encp:  a pointer to an Encode object
8853  *
8854  * If the result from Encode is not a single character,
8855  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8856  */
8857 STATIC UV
8858 S_reg_recode(pTHX_ const char value, SV **encp)
8859 {
8860     STRLEN numlen = 1;
8861     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
8862     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
8863     const STRLEN newlen = SvCUR(sv);
8864     UV uv = UNICODE_REPLACEMENT;
8865
8866     PERL_ARGS_ASSERT_REG_RECODE;
8867
8868     if (newlen)
8869         uv = SvUTF8(sv)
8870              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8871              : *(U8*)s;
8872
8873     if (!newlen || numlen != newlen) {
8874         uv = UNICODE_REPLACEMENT;
8875         *encp = NULL;
8876     }
8877     return uv;
8878 }
8879
8880
8881 /*
8882  - regatom - the lowest level
8883
8884    Try to identify anything special at the start of the pattern. If there
8885    is, then handle it as required. This may involve generating a single regop,
8886    such as for an assertion; or it may involve recursing, such as to
8887    handle a () structure.
8888
8889    If the string doesn't start with something special then we gobble up
8890    as much literal text as we can.
8891
8892    Once we have been able to handle whatever type of thing started the
8893    sequence, we return.
8894
8895    Note: we have to be careful with escapes, as they can be both literal
8896    and special, and in the case of \10 and friends can either, depending
8897    on context. Specifically there are two separate switches for handling
8898    escape sequences, with the one for handling literal escapes requiring
8899    a dummy entry for all of the special escapes that are actually handled
8900    by the other.
8901 */
8902
8903 STATIC regnode *
8904 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8905 {
8906     dVAR;
8907     register regnode *ret = NULL;
8908     I32 flags;
8909     char *parse_start = RExC_parse;
8910     U8 op;
8911     GET_RE_DEBUG_FLAGS_DECL;
8912     DEBUG_PARSE("atom");
8913     *flagp = WORST;             /* Tentatively. */
8914
8915     PERL_ARGS_ASSERT_REGATOM;
8916
8917 tryagain:
8918     switch ((U8)*RExC_parse) {
8919     case '^':
8920         RExC_seen_zerolen++;
8921         nextchar(pRExC_state);
8922         if (RExC_flags & RXf_PMf_MULTILINE)
8923             ret = reg_node(pRExC_state, MBOL);
8924         else if (RExC_flags & RXf_PMf_SINGLELINE)
8925             ret = reg_node(pRExC_state, SBOL);
8926         else
8927             ret = reg_node(pRExC_state, BOL);
8928         Set_Node_Length(ret, 1); /* MJD */
8929         break;
8930     case '$':
8931         nextchar(pRExC_state);
8932         if (*RExC_parse)
8933             RExC_seen_zerolen++;
8934         if (RExC_flags & RXf_PMf_MULTILINE)
8935             ret = reg_node(pRExC_state, MEOL);
8936         else if (RExC_flags & RXf_PMf_SINGLELINE)
8937             ret = reg_node(pRExC_state, SEOL);
8938         else
8939             ret = reg_node(pRExC_state, EOL);
8940         Set_Node_Length(ret, 1); /* MJD */
8941         break;
8942     case '.':
8943         nextchar(pRExC_state);
8944         if (RExC_flags & RXf_PMf_SINGLELINE)
8945             ret = reg_node(pRExC_state, SANY);
8946         else
8947             ret = reg_node(pRExC_state, REG_ANY);
8948         *flagp |= HASWIDTH|SIMPLE;
8949         RExC_naughty++;
8950         Set_Node_Length(ret, 1); /* MJD */
8951         break;
8952     case '[':
8953     {
8954         char * const oregcomp_parse = ++RExC_parse;
8955         ret = regclass(pRExC_state,depth+1);
8956         if (*RExC_parse != ']') {
8957             RExC_parse = oregcomp_parse;
8958             vFAIL("Unmatched [");
8959         }
8960         nextchar(pRExC_state);
8961         *flagp |= HASWIDTH|SIMPLE;
8962         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8963         break;
8964     }
8965     case '(':
8966         nextchar(pRExC_state);
8967         ret = reg(pRExC_state, 1, &flags,depth+1);
8968         if (ret == NULL) {
8969                 if (flags & TRYAGAIN) {
8970                     if (RExC_parse == RExC_end) {
8971                          /* Make parent create an empty node if needed. */
8972                         *flagp |= TRYAGAIN;
8973                         return(NULL);
8974                     }
8975                     goto tryagain;
8976                 }
8977                 return(NULL);
8978         }
8979         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8980         break;
8981     case '|':
8982     case ')':
8983         if (flags & TRYAGAIN) {
8984             *flagp |= TRYAGAIN;
8985             return NULL;
8986         }
8987         vFAIL("Internal urp");
8988                                 /* Supposed to be caught earlier. */
8989         break;
8990     case '{':
8991         if (!regcurly(RExC_parse)) {
8992             RExC_parse++;
8993             goto defchar;
8994         }
8995         /* FALL THROUGH */
8996     case '?':
8997     case '+':
8998     case '*':
8999         RExC_parse++;
9000         vFAIL("Quantifier follows nothing");
9001         break;
9002     case '\\':
9003         /* Special Escapes
9004
9005            This switch handles escape sequences that resolve to some kind
9006            of special regop and not to literal text. Escape sequnces that
9007            resolve to literal text are handled below in the switch marked
9008            "Literal Escapes".
9009
9010            Every entry in this switch *must* have a corresponding entry
9011            in the literal escape switch. However, the opposite is not
9012            required, as the default for this switch is to jump to the
9013            literal text handling code.
9014         */
9015         switch ((U8)*++RExC_parse) {
9016         /* Special Escapes */
9017         case 'A':
9018             RExC_seen_zerolen++;
9019             ret = reg_node(pRExC_state, SBOL);
9020             *flagp |= SIMPLE;
9021             goto finish_meta_pat;
9022         case 'G':
9023             ret = reg_node(pRExC_state, GPOS);
9024             RExC_seen |= REG_SEEN_GPOS;
9025             *flagp |= SIMPLE;
9026             goto finish_meta_pat;
9027         case 'K':
9028             RExC_seen_zerolen++;
9029             ret = reg_node(pRExC_state, KEEPS);
9030             *flagp |= SIMPLE;
9031             /* XXX:dmq : disabling in-place substitution seems to
9032              * be necessary here to avoid cases of memory corruption, as
9033              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9034              */
9035             RExC_seen |= REG_SEEN_LOOKBEHIND;
9036             goto finish_meta_pat;
9037         case 'Z':
9038             ret = reg_node(pRExC_state, SEOL);
9039             *flagp |= SIMPLE;
9040             RExC_seen_zerolen++;                /* Do not optimize RE away */
9041             goto finish_meta_pat;
9042         case 'z':
9043             ret = reg_node(pRExC_state, EOS);
9044             *flagp |= SIMPLE;
9045             RExC_seen_zerolen++;                /* Do not optimize RE away */
9046             goto finish_meta_pat;
9047         case 'C':
9048             ret = reg_node(pRExC_state, CANY);
9049             RExC_seen |= REG_SEEN_CANY;
9050             *flagp |= HASWIDTH|SIMPLE;
9051             goto finish_meta_pat;
9052         case 'X':
9053             ret = reg_node(pRExC_state, CLUMP);
9054             *flagp |= HASWIDTH;
9055             goto finish_meta_pat;
9056         case 'w':
9057             switch (get_regex_charset(RExC_flags)) {
9058                 case REGEX_LOCALE_CHARSET:
9059                     op = ALNUML;
9060                     break;
9061                 case REGEX_UNICODE_CHARSET:
9062                     op = ALNUMU;
9063                     break;
9064                 case REGEX_ASCII_RESTRICTED_CHARSET:
9065                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9066                     op = ALNUMA;
9067                     break;
9068                 case REGEX_DEPENDS_CHARSET:
9069                     op = ALNUM;
9070                     break;
9071                 default:
9072                     goto bad_charset;
9073             }
9074             ret = reg_node(pRExC_state, op);
9075             *flagp |= HASWIDTH|SIMPLE;
9076             goto finish_meta_pat;
9077         case 'W':
9078             switch (get_regex_charset(RExC_flags)) {
9079                 case REGEX_LOCALE_CHARSET:
9080                     op = NALNUML;
9081                     break;
9082                 case REGEX_UNICODE_CHARSET:
9083                     op = NALNUMU;
9084                     break;
9085                 case REGEX_ASCII_RESTRICTED_CHARSET:
9086                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9087                     op = NALNUMA;
9088                     break;
9089                 case REGEX_DEPENDS_CHARSET:
9090                     op = NALNUM;
9091                     break;
9092                 default:
9093                     goto bad_charset;
9094             }
9095             ret = reg_node(pRExC_state, op);
9096             *flagp |= HASWIDTH|SIMPLE;
9097             goto finish_meta_pat;
9098         case 'b':
9099             RExC_seen_zerolen++;
9100             RExC_seen |= REG_SEEN_LOOKBEHIND;
9101             switch (get_regex_charset(RExC_flags)) {
9102                 case REGEX_LOCALE_CHARSET:
9103                     op = BOUNDL;
9104                     break;
9105                 case REGEX_UNICODE_CHARSET:
9106                     op = BOUNDU;
9107                     break;
9108                 case REGEX_ASCII_RESTRICTED_CHARSET:
9109                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9110                     op = BOUNDA;
9111                     break;
9112                 case REGEX_DEPENDS_CHARSET:
9113                     op = BOUND;
9114                     break;
9115                 default:
9116                     goto bad_charset;
9117             }
9118             ret = reg_node(pRExC_state, op);
9119             FLAGS(ret) = get_regex_charset(RExC_flags);
9120             *flagp |= SIMPLE;
9121             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9122                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
9123             }
9124             goto finish_meta_pat;
9125         case 'B':
9126             RExC_seen_zerolen++;
9127             RExC_seen |= REG_SEEN_LOOKBEHIND;
9128             switch (get_regex_charset(RExC_flags)) {
9129                 case REGEX_LOCALE_CHARSET:
9130                     op = NBOUNDL;
9131                     break;
9132                 case REGEX_UNICODE_CHARSET:
9133                     op = NBOUNDU;
9134                     break;
9135                 case REGEX_ASCII_RESTRICTED_CHARSET:
9136                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9137                     op = NBOUNDA;
9138                     break;
9139                 case REGEX_DEPENDS_CHARSET:
9140                     op = NBOUND;
9141                     break;
9142                 default:
9143                     goto bad_charset;
9144             }
9145             ret = reg_node(pRExC_state, op);
9146             FLAGS(ret) = get_regex_charset(RExC_flags);
9147             *flagp |= SIMPLE;
9148             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9149                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
9150             }
9151             goto finish_meta_pat;
9152         case 's':
9153             switch (get_regex_charset(RExC_flags)) {
9154                 case REGEX_LOCALE_CHARSET:
9155                     op = SPACEL;
9156                     break;
9157                 case REGEX_UNICODE_CHARSET:
9158                     op = SPACEU;
9159                     break;
9160                 case REGEX_ASCII_RESTRICTED_CHARSET:
9161                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9162                     op = SPACEA;
9163                     break;
9164                 case REGEX_DEPENDS_CHARSET:
9165                     op = SPACE;
9166                     break;
9167                 default:
9168                     goto bad_charset;
9169             }
9170             ret = reg_node(pRExC_state, op);
9171             *flagp |= HASWIDTH|SIMPLE;
9172             goto finish_meta_pat;
9173         case 'S':
9174             switch (get_regex_charset(RExC_flags)) {
9175                 case REGEX_LOCALE_CHARSET:
9176                     op = NSPACEL;
9177                     break;
9178                 case REGEX_UNICODE_CHARSET:
9179                     op = NSPACEU;
9180                     break;
9181                 case REGEX_ASCII_RESTRICTED_CHARSET:
9182                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9183                     op = NSPACEA;
9184                     break;
9185                 case REGEX_DEPENDS_CHARSET:
9186                     op = NSPACE;
9187                     break;
9188                 default:
9189                     goto bad_charset;
9190             }
9191             ret = reg_node(pRExC_state, op);
9192             *flagp |= HASWIDTH|SIMPLE;
9193             goto finish_meta_pat;
9194         case 'd':
9195             switch (get_regex_charset(RExC_flags)) {
9196                 case REGEX_LOCALE_CHARSET:
9197                     op = DIGITL;
9198                     break;
9199                 case REGEX_ASCII_RESTRICTED_CHARSET:
9200                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9201                     op = DIGITA;
9202                     break;
9203                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9204                 case REGEX_UNICODE_CHARSET:
9205                     op = DIGIT;
9206                     break;
9207                 default:
9208                     goto bad_charset;
9209             }
9210             ret = reg_node(pRExC_state, op);
9211             *flagp |= HASWIDTH|SIMPLE;
9212             goto finish_meta_pat;
9213         case 'D':
9214             switch (get_regex_charset(RExC_flags)) {
9215                 case REGEX_LOCALE_CHARSET:
9216                     op = NDIGITL;
9217                     break;
9218                 case REGEX_ASCII_RESTRICTED_CHARSET:
9219                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9220                     op = NDIGITA;
9221                     break;
9222                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9223                 case REGEX_UNICODE_CHARSET:
9224                     op = NDIGIT;
9225                     break;
9226                 default:
9227                     goto bad_charset;
9228             }
9229             ret = reg_node(pRExC_state, op);
9230             *flagp |= HASWIDTH|SIMPLE;
9231             goto finish_meta_pat;
9232         case 'R':
9233             ret = reg_node(pRExC_state, LNBREAK);
9234             *flagp |= HASWIDTH|SIMPLE;
9235             goto finish_meta_pat;
9236         case 'h':
9237             ret = reg_node(pRExC_state, HORIZWS);
9238             *flagp |= HASWIDTH|SIMPLE;
9239             goto finish_meta_pat;
9240         case 'H':
9241             ret = reg_node(pRExC_state, NHORIZWS);
9242             *flagp |= HASWIDTH|SIMPLE;
9243             goto finish_meta_pat;
9244         case 'v':
9245             ret = reg_node(pRExC_state, VERTWS);
9246             *flagp |= HASWIDTH|SIMPLE;
9247             goto finish_meta_pat;
9248         case 'V':
9249             ret = reg_node(pRExC_state, NVERTWS);
9250             *flagp |= HASWIDTH|SIMPLE;
9251          finish_meta_pat:
9252             nextchar(pRExC_state);
9253             Set_Node_Length(ret, 2); /* MJD */
9254             break;
9255         case 'p':
9256         case 'P':
9257             {
9258                 char* const oldregxend = RExC_end;
9259 #ifdef DEBUGGING
9260                 char* parse_start = RExC_parse - 2;
9261 #endif
9262
9263                 if (RExC_parse[1] == '{') {
9264                   /* a lovely hack--pretend we saw [\pX] instead */
9265                     RExC_end = strchr(RExC_parse, '}');
9266                     if (!RExC_end) {
9267                         const U8 c = (U8)*RExC_parse;
9268                         RExC_parse += 2;
9269                         RExC_end = oldregxend;
9270                         vFAIL2("Missing right brace on \\%c{}", c);
9271                     }
9272                     RExC_end++;
9273                 }
9274                 else {
9275                     RExC_end = RExC_parse + 2;
9276                     if (RExC_end > oldregxend)
9277                         RExC_end = oldregxend;
9278                 }
9279                 RExC_parse--;
9280
9281                 ret = regclass(pRExC_state,depth+1);
9282
9283                 RExC_end = oldregxend;
9284                 RExC_parse--;
9285
9286                 Set_Node_Offset(ret, parse_start + 2);
9287                 Set_Node_Cur_Length(ret);
9288                 nextchar(pRExC_state);
9289                 *flagp |= HASWIDTH|SIMPLE;
9290             }
9291             break;
9292         case 'N':
9293             /* Handle \N and \N{NAME} here and not below because it can be
9294             multicharacter. join_exact() will join them up later on.
9295             Also this makes sure that things like /\N{BLAH}+/ and
9296             \N{BLAH} being multi char Just Happen. dmq*/
9297             ++RExC_parse;
9298             ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9299             break;
9300         case 'k':    /* Handle \k<NAME> and \k'NAME' */
9301         parse_named_seq:
9302         {
9303             char ch= RExC_parse[1];
9304             if (ch != '<' && ch != '\'' && ch != '{') {
9305                 RExC_parse++;
9306                 vFAIL2("Sequence %.2s... not terminated",parse_start);
9307             } else {
9308                 /* this pretty much dupes the code for (?P=...) in reg(), if
9309                    you change this make sure you change that */
9310                 char* name_start = (RExC_parse += 2);
9311                 U32 num = 0;
9312                 SV *sv_dat = reg_scan_name(pRExC_state,
9313                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9314                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9315                 if (RExC_parse == name_start || *RExC_parse != ch)
9316                     vFAIL2("Sequence %.3s... not terminated",parse_start);
9317
9318                 if (!SIZE_ONLY) {
9319                     num = add_data( pRExC_state, 1, "S" );
9320                     RExC_rxi->data->data[num]=(void*)sv_dat;
9321                     SvREFCNT_inc_simple_void(sv_dat);
9322                 }
9323
9324                 RExC_sawback = 1;
9325                 ret = reganode(pRExC_state,
9326                                ((! FOLD)
9327                                  ? NREF
9328                                  : (MORE_ASCII_RESTRICTED)
9329                                    ? NREFFA
9330                                    : (AT_LEAST_UNI_SEMANTICS)
9331                                      ? NREFFU
9332                                      : (LOC)
9333                                        ? NREFFL
9334                                        : NREFF),
9335                                 num);
9336                 *flagp |= HASWIDTH;
9337
9338                 /* override incorrect value set in reganode MJD */
9339                 Set_Node_Offset(ret, parse_start+1);
9340                 Set_Node_Cur_Length(ret); /* MJD */
9341                 nextchar(pRExC_state);
9342
9343             }
9344             break;
9345         }
9346         case 'g':
9347         case '1': case '2': case '3': case '4':
9348         case '5': case '6': case '7': case '8': case '9':
9349             {
9350                 I32 num;
9351                 bool isg = *RExC_parse == 'g';
9352                 bool isrel = 0;
9353                 bool hasbrace = 0;
9354                 if (isg) {
9355                     RExC_parse++;
9356                     if (*RExC_parse == '{') {
9357                         RExC_parse++;
9358                         hasbrace = 1;
9359                     }
9360                     if (*RExC_parse == '-') {
9361                         RExC_parse++;
9362                         isrel = 1;
9363                     }
9364                     if (hasbrace && !isDIGIT(*RExC_parse)) {
9365                         if (isrel) RExC_parse--;
9366                         RExC_parse -= 2;
9367                         goto parse_named_seq;
9368                 }   }
9369                 num = atoi(RExC_parse);
9370                 if (isg && num == 0)
9371                     vFAIL("Reference to invalid group 0");
9372                 if (isrel) {
9373                     num = RExC_npar - num;
9374                     if (num < 1)
9375                         vFAIL("Reference to nonexistent or unclosed group");
9376                 }
9377                 if (!isg && num > 9 && num >= RExC_npar)
9378                     goto defchar;
9379                 else {
9380                     char * const parse_start = RExC_parse - 1; /* MJD */
9381                     while (isDIGIT(*RExC_parse))
9382                         RExC_parse++;
9383                     if (parse_start == RExC_parse - 1)
9384                         vFAIL("Unterminated \\g... pattern");
9385                     if (hasbrace) {
9386                         if (*RExC_parse != '}')
9387                             vFAIL("Unterminated \\g{...} pattern");
9388                         RExC_parse++;
9389                     }
9390                     if (!SIZE_ONLY) {
9391                         if (num > (I32)RExC_rx->nparens)
9392                             vFAIL("Reference to nonexistent group");
9393                     }
9394                     RExC_sawback = 1;
9395                     ret = reganode(pRExC_state,
9396                                    ((! FOLD)
9397                                      ? REF
9398                                      : (MORE_ASCII_RESTRICTED)
9399                                        ? REFFA
9400                                        : (AT_LEAST_UNI_SEMANTICS)
9401                                          ? REFFU
9402                                          : (LOC)
9403                                            ? REFFL
9404                                            : REFF),
9405                                     num);
9406                     *flagp |= HASWIDTH;
9407
9408                     /* override incorrect value set in reganode MJD */
9409                     Set_Node_Offset(ret, parse_start+1);
9410                     Set_Node_Cur_Length(ret); /* MJD */
9411                     RExC_parse--;
9412                     nextchar(pRExC_state);
9413                 }
9414             }
9415             break;
9416         case '\0':
9417             if (RExC_parse >= RExC_end)
9418                 FAIL("Trailing \\");
9419             /* FALL THROUGH */
9420         default:
9421             /* Do not generate "unrecognized" warnings here, we fall
9422                back into the quick-grab loop below */
9423             parse_start--;
9424             goto defchar;
9425         }
9426         break;
9427
9428     case '#':
9429         if (RExC_flags & RXf_PMf_EXTENDED) {
9430             if ( reg_skipcomment( pRExC_state ) )
9431                 goto tryagain;
9432         }
9433         /* FALL THROUGH */
9434
9435     default:
9436
9437             parse_start = RExC_parse - 1;
9438
9439             RExC_parse++;
9440
9441         defchar: {
9442             register STRLEN len;
9443             register UV ender;
9444             register char *p;
9445             char *s;
9446             STRLEN foldlen;
9447             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9448             U8 node_type;
9449
9450             /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
9451              * it is folded to 'ss' even if not utf8 */
9452             bool is_exactfu_sharp_s;
9453
9454             ender = 0;
9455             node_type = ((! FOLD) ? EXACT
9456                         : (LOC)
9457                           ? EXACTFL
9458                           : (MORE_ASCII_RESTRICTED)
9459                             ? EXACTFA
9460                             : (AT_LEAST_UNI_SEMANTICS)
9461                               ? EXACTFU
9462                               : EXACTF);
9463             ret = reg_node(pRExC_state, node_type);
9464             s = STRING(ret);
9465
9466             /* XXX The node can hold up to 255 bytes, yet this only goes to
9467              * 127.  I (khw) do not know why.  Keeping it somewhat less than
9468              * 255 allows us to not have to worry about overflow due to
9469              * converting to utf8 and fold expansion, but that value is
9470              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
9471              * split up by this limit into a single one using the real max of
9472              * 255.  Even at 127, this breaks under rare circumstances.  If
9473              * folding, we do not want to split a node at a character that is a
9474              * non-final in a multi-char fold, as an input string could just
9475              * happen to want to match across the node boundary.  The join
9476              * would solve that problem if the join actually happens.  But a
9477              * series of more than two nodes in a row each of 127 would cause
9478              * the first join to succeed to get to 254, but then there wouldn't
9479              * be room for the next one, which could at be one of those split
9480              * multi-char folds.  I don't know of any fool-proof solution.  One
9481              * could back off to end with only a code point that isn't such a
9482              * non-final, but it is possible for there not to be any in the
9483              * entire node. */
9484             for (len = 0, p = RExC_parse - 1;
9485                  len < 127 && p < RExC_end;
9486                  len++)
9487             {
9488                 char * const oldp = p;
9489
9490                 if (RExC_flags & RXf_PMf_EXTENDED)
9491                     p = regwhite( pRExC_state, p );
9492                 switch ((U8)*p) {
9493                 case '^':
9494                 case '$':
9495                 case '.':
9496                 case '[':
9497                 case '(':
9498                 case ')':
9499                 case '|':
9500                     goto loopdone;
9501                 case '\\':
9502                     /* Literal Escapes Switch
9503
9504                        This switch is meant to handle escape sequences that
9505                        resolve to a literal character.
9506
9507                        Every escape sequence that represents something
9508                        else, like an assertion or a char class, is handled
9509                        in the switch marked 'Special Escapes' above in this
9510                        routine, but also has an entry here as anything that
9511                        isn't explicitly mentioned here will be treated as
9512                        an unescaped equivalent literal.
9513                     */
9514
9515                     switch ((U8)*++p) {
9516                     /* These are all the special escapes. */
9517                     case 'A':             /* Start assertion */
9518                     case 'b': case 'B':   /* Word-boundary assertion*/
9519                     case 'C':             /* Single char !DANGEROUS! */
9520                     case 'd': case 'D':   /* digit class */
9521                     case 'g': case 'G':   /* generic-backref, pos assertion */
9522                     case 'h': case 'H':   /* HORIZWS */
9523                     case 'k': case 'K':   /* named backref, keep marker */
9524                     case 'N':             /* named char sequence */
9525                     case 'p': case 'P':   /* Unicode property */
9526                               case 'R':   /* LNBREAK */
9527                     case 's': case 'S':   /* space class */
9528                     case 'v': case 'V':   /* VERTWS */
9529                     case 'w': case 'W':   /* word class */
9530                     case 'X':             /* eXtended Unicode "combining character sequence" */
9531                     case 'z': case 'Z':   /* End of line/string assertion */
9532                         --p;
9533                         goto loopdone;
9534
9535                     /* Anything after here is an escape that resolves to a
9536                        literal. (Except digits, which may or may not)
9537                      */
9538                     case 'n':
9539                         ender = '\n';
9540                         p++;
9541                         break;
9542                     case 'r':
9543                         ender = '\r';
9544                         p++;
9545                         break;
9546                     case 't':
9547                         ender = '\t';
9548                         p++;
9549                         break;
9550                     case 'f':
9551                         ender = '\f';
9552                         p++;
9553                         break;
9554                     case 'e':
9555                           ender = ASCII_TO_NATIVE('\033');
9556                         p++;
9557                         break;
9558                     case 'a':
9559                           ender = ASCII_TO_NATIVE('\007');
9560                         p++;
9561                         break;
9562                     case 'o':
9563                         {
9564                             STRLEN brace_len = len;
9565                             UV result;
9566                             const char* error_msg;
9567
9568                             bool valid = grok_bslash_o(p,
9569                                                        &result,
9570                                                        &brace_len,
9571                                                        &error_msg,
9572                                                        1);
9573                             p += brace_len;
9574                             if (! valid) {
9575                                 RExC_parse = p; /* going to die anyway; point
9576                                                    to exact spot of failure */
9577                                 vFAIL(error_msg);
9578                             }
9579                             else
9580                             {
9581                                 ender = result;
9582                             }
9583                             if (PL_encoding && ender < 0x100) {
9584                                 goto recode_encoding;
9585                             }
9586                             if (ender > 0xff) {
9587                                 REQUIRE_UTF8;
9588                             }
9589                             break;
9590                         }
9591                     case 'x':
9592                         if (*++p == '{') {
9593                             char* const e = strchr(p, '}');
9594
9595                             if (!e) {
9596                                 RExC_parse = p + 1;
9597                                 vFAIL("Missing right brace on \\x{}");
9598                             }
9599                             else {
9600                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9601                                     | PERL_SCAN_DISALLOW_PREFIX;
9602                                 STRLEN numlen = e - p - 1;
9603                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9604                                 if (ender > 0xff)
9605                                     REQUIRE_UTF8;
9606                                 p = e + 1;
9607                             }
9608                         }
9609                         else {
9610                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9611                             STRLEN numlen = 2;
9612                             ender = grok_hex(p, &numlen, &flags, NULL);
9613                             p += numlen;
9614                         }
9615                         if (PL_encoding && ender < 0x100)
9616                             goto recode_encoding;
9617                         break;
9618                     case 'c':
9619                         p++;
9620                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9621                         break;
9622                     case '0': case '1': case '2': case '3':case '4':
9623                     case '5': case '6': case '7': case '8':case '9':
9624                         if (*p == '0' ||
9625                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9626                         {
9627                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9628                             STRLEN numlen = 3;
9629                             ender = grok_oct(p, &numlen, &flags, NULL);
9630                             if (ender > 0xff) {
9631                                 REQUIRE_UTF8;
9632                             }
9633                             p += numlen;
9634                         }
9635                         else {
9636                             --p;
9637                             goto loopdone;
9638                         }
9639                         if (PL_encoding && ender < 0x100)
9640                             goto recode_encoding;
9641                         break;
9642                     recode_encoding:
9643                         if (! RExC_override_recoding) {
9644                             SV* enc = PL_encoding;
9645                             ender = reg_recode((const char)(U8)ender, &enc);
9646                             if (!enc && SIZE_ONLY)
9647                                 ckWARNreg(p, "Invalid escape in the specified encoding");
9648                             REQUIRE_UTF8;
9649                         }
9650                         break;
9651                     case '\0':
9652                         if (p >= RExC_end)
9653                             FAIL("Trailing \\");
9654                         /* FALL THROUGH */
9655                     default:
9656                         if (!SIZE_ONLY&& isALPHA(*p)) {
9657                             /* Include any { following the alpha to emphasize
9658                              * that it could be part of an escape at some point
9659                              * in the future */
9660                             int len = (*(p + 1) == '{') ? 2 : 1;
9661                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
9662                         }
9663                         goto normal_default;
9664                     }
9665                     break;
9666                 default:
9667                   normal_default:
9668                     if (UTF8_IS_START(*p) && UTF) {
9669                         STRLEN numlen;
9670                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9671                                                &numlen, UTF8_ALLOW_DEFAULT);
9672                         p += numlen;
9673                     }
9674                     else
9675                         ender = (U8) *p++;
9676                     break;
9677                 } /* End of switch on the literal */
9678
9679                 is_exactfu_sharp_s = (node_type == EXACTFU
9680                                       && ender == LATIN_SMALL_LETTER_SHARP_S);
9681                 if ( RExC_flags & RXf_PMf_EXTENDED)
9682                     p = regwhite( pRExC_state, p );
9683                 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9684                     /* Prime the casefolded buffer.  Locale rules, which apply
9685                      * only to code points < 256, aren't known until execution,
9686                      * so for them, just output the original character using
9687                      * utf8.  If we start to fold non-UTF patterns, be sure to
9688                      * update join_exact() */
9689                     if (LOC && ender < 256) {
9690                         if (UNI_IS_INVARIANT(ender)) {
9691                             *tmpbuf = (U8) ender;
9692                             foldlen = 1;
9693                         } else {
9694                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9695                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9696                             foldlen = 2;
9697                         }
9698                     }
9699                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
9700                                                  */
9701                         ender = toLOWER(ender);
9702                         *tmpbuf = (U8) ender;
9703                         foldlen = 1;
9704                     }
9705                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9706
9707                         /* Locale and /aa require more selectivity about the
9708                          * fold, so are handled below.  Otherwise, here, just
9709                          * use the fold */
9710                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9711                     }
9712                     else {
9713                         /* Under locale rules or /aa we are not to mix,
9714                          * respectively, ords < 256 or ASCII with non-.  So
9715                          * reject folds that mix them, using only the
9716                          * non-folded code point.  So do the fold to a
9717                          * temporary, and inspect each character in it. */
9718                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9719                         U8* s = trialbuf;
9720                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9721                         U8* e = s + foldlen;
9722                         bool fold_ok = TRUE;
9723
9724                         while (s < e) {
9725                             if (isASCII(*s)
9726                                 || (LOC && (UTF8_IS_INVARIANT(*s)
9727                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
9728                             {
9729                                 fold_ok = FALSE;
9730                                 break;
9731                             }
9732                             s += UTF8SKIP(s);
9733                         }
9734                         if (fold_ok) {
9735                             Copy(trialbuf, tmpbuf, foldlen, U8);
9736                             ender = tmpender;
9737                         }
9738                         else {
9739                             uvuni_to_utf8(tmpbuf, ender);
9740                             foldlen = UNISKIP(ender);
9741                         }
9742                     }
9743                 }
9744                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9745                     if (len)
9746                         p = oldp;
9747                     else if (UTF || is_exactfu_sharp_s) {
9748                          if (FOLD) {
9749                               /* Emit all the Unicode characters. */
9750                               STRLEN numlen;
9751                               for (foldbuf = tmpbuf;
9752                                    foldlen;
9753                                    foldlen -= numlen) {
9754                                    ender = utf8_to_uvchr(foldbuf, &numlen);
9755                                    if (numlen > 0) {
9756                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
9757                                         s       += unilen;
9758                                         len     += unilen;
9759                                         /* In EBCDIC the numlen
9760                                          * and unilen can differ. */
9761                                         foldbuf += numlen;
9762                                         if (numlen >= foldlen)
9763                                              break;
9764                                    }
9765                                    else
9766                                         break; /* "Can't happen." */
9767                               }
9768                          }
9769                          else {
9770                               const STRLEN unilen = reguni(pRExC_state, ender, s);
9771                               if (unilen > 0) {
9772                                    s   += unilen;
9773                                    len += unilen;
9774                               }
9775                          }
9776                     }
9777                     else {
9778                         len++;
9779                         REGC((char)ender, s++);
9780                     }
9781                     break;
9782                 }
9783                 if (UTF || is_exactfu_sharp_s) {
9784                      if (FOLD) {
9785                           /* Emit all the Unicode characters. */
9786                           STRLEN numlen;
9787                           for (foldbuf = tmpbuf;
9788                                foldlen;
9789                                foldlen -= numlen) {
9790                                ender = utf8_to_uvchr(foldbuf, &numlen);
9791                                if (numlen > 0) {
9792                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
9793                                     len     += unilen;
9794                                     s       += unilen;
9795                                     /* In EBCDIC the numlen
9796                                      * and unilen can differ. */
9797                                     foldbuf += numlen;
9798                                     if (numlen >= foldlen)
9799                                          break;
9800                                }
9801                                else
9802                                     break;
9803                           }
9804                      }
9805                      else {
9806                           const STRLEN unilen = reguni(pRExC_state, ender, s);
9807                           if (unilen > 0) {
9808                                s   += unilen;
9809                                len += unilen;
9810                           }
9811                      }
9812                      len--;
9813                 }
9814                 else {
9815                     REGC((char)ender, s++);
9816                 }
9817             }
9818         loopdone:   /* Jumped to when encounters something that shouldn't be in
9819                        the node */
9820             RExC_parse = p - 1;
9821             Set_Node_Cur_Length(ret); /* MJD */
9822             nextchar(pRExC_state);
9823             {
9824                 /* len is STRLEN which is unsigned, need to copy to signed */
9825                 IV iv = len;
9826                 if (iv < 0)
9827                     vFAIL("Internal disaster");
9828             }
9829             if (len > 0)
9830                 *flagp |= HASWIDTH;
9831             if (len == 1 && UNI_IS_INVARIANT(ender))
9832                 *flagp |= SIMPLE;
9833
9834             if (SIZE_ONLY)
9835                 RExC_size += STR_SZ(len);
9836             else {
9837                 STR_LEN(ret) = len;
9838                 RExC_emit += STR_SZ(len);
9839             }
9840         }
9841         break;
9842     }
9843
9844     return(ret);
9845
9846 /* Jumped to when an unrecognized character set is encountered */
9847 bad_charset:
9848     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9849     return(NULL);
9850 }
9851
9852 /* Step over text with no meaning of its own: runs of whitespace and '#'
9853  * comments, each comment extending through its '\n'.  Scanning starts at p
9854  * and never passes RExC_end; returns the first position that begins neither.
9855  * A comment that reaches RExC_end without a '\n' sets REG_SEEN_RUN_ON_COMMENT
9856  * in RExC_seen.  Callers seen in this file use it only under RXf_PMf_EXTENDED. */
9857 STATIC char *
9858 S_regwhite( RExC_state_t *pRExC_state, char *p )
9859 {
9860     const char * const limit = RExC_end;
9861
9862     PERL_ARGS_ASSERT_REGWHITE;
9863
9864     for (;;) {
9865         bool saw_newline = 0;
9866
9867         while (p < limit && isSPACE(*p))    /* run of whitespace */
9868             p++;
9869         if (p >= limit || *p != '#')        /* nothing further to skip */
9870             return p;
9871
9872         while (p < limit && !saw_newline)   /* comment, through its '\n' */
9873             saw_newline = (*p++ == '\n');
9874         if (!saw_newline)                   /* pattern ran out mid-comment */
9875             RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9876     }
9877 }
9878
9879 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9880    Character classes ([:foo:]) can also be negated ([:^foo:]).
9881    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9882    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9883    but trigger failures because they are currently unimplemented. */
9884
/* True if <c> is ':', the delimiter used by POSIX character classes
 * ([:foo:]) */
#define POSIXCC_DONE(c)   ((c) == ':')
/* True if <c> is '=' or '.', the delimiters of the POSIX equivalence class
 * ([=foo=]) and collating element ([.foo.]) forms */
#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
/* True if <c> is any of the three POSIX bracket delimiters.  Note that <c>
 * is evaluated more than once. */
#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9888
9889 STATIC I32
9890 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9891 {
9892     dVAR;
9893     I32 namedclass = OOB_NAMEDCLASS;
9894
9895     PERL_ARGS_ASSERT_REGPPOSIXCC;
9896
9897     if (value == '[' && RExC_parse + 1 < RExC_end &&
9898         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9899         POSIXCC(UCHARAT(RExC_parse))) {
9900         const char c = UCHARAT(RExC_parse);
9901         char* const s = RExC_parse++;
9902
9903         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9904             RExC_parse++;
9905         if (RExC_parse == RExC_end)
9906             /* Grandfather lone [:, [=, [. */
9907             RExC_parse = s;
9908         else {
9909             const char* const t = RExC_parse++; /* skip over the c */
9910             assert(*t == c);
9911
9912             if (UCHARAT(RExC_parse) == ']') {
9913                 const char *posixcc = s + 1;
9914                 RExC_parse++; /* skip over the ending ] */
9915
9916                 if (*s == ':') {
9917                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9918                     const I32 skip = t - posixcc;
9919
9920                     /* Initially switch on the length of the name.  */
9921                     switch (skip) {
9922                     case 4:
9923                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9924                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9925                         break;
9926                     case 5:
9927                         /* Names all of length 5.  */
9928                         /* alnum alpha ascii blank cntrl digit graph lower
9929                            print punct space upper  */
9930                         /* Offset 4 gives the best switch position.  */
9931                         switch (posixcc[4]) {
9932                         case 'a':
9933                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9934                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9935                             break;
9936                         case 'e':
9937                             if (memEQ(posixcc, "spac", 4)) /* space */
9938                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9939                             break;
9940                         case 'h':
9941                             if (memEQ(posixcc, "grap", 4)) /* graph */
9942                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9943                             break;
9944                         case 'i':
9945                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9946                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9947                             break;
9948                         case 'k':
9949                             if (memEQ(posixcc, "blan", 4)) /* blank */
9950                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9951                             break;
9952                         case 'l':
9953                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9954                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9955                             break;
9956                         case 'm':
9957                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9958                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9959                             break;
9960                         case 'r':
9961                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9962                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9963                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9964                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9965                             break;
9966                         case 't':
9967                             if (memEQ(posixcc, "digi", 4)) /* digit */
9968                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9969                             else if (memEQ(posixcc, "prin", 4)) /* print */
9970                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9971                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9972                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9973                             break;
9974                         }
9975                         break;
9976                     case 6:
9977                         if (memEQ(posixcc, "xdigit", 6))
9978                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9979                         break;
9980                     }
9981
9982                     if (namedclass == OOB_NAMEDCLASS)
9983                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9984                                       t - s - 1, s + 1);
9985                     assert (posixcc[skip] == ':');
9986                     assert (posixcc[skip+1] == ']');
9987                 } else if (!SIZE_ONLY) {
9988                     /* [[=foo=]] and [[.foo.]] are still future. */
9989
9990                     /* adjust RExC_parse so the warning shows after
9991                        the class closes */
9992                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9993                         RExC_parse++;
9994                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9995                 }
9996             } else {
9997                 /* Maternal grandfather:
9998                  * "[:" ending in ":" but not in ":]" */
9999                 RExC_parse = s;
10000             }
10001         }
10002     }
10003
10004     return namedclass;
10005 }
10006
10007 STATIC void
10008 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10009 {
10010     dVAR;
10011
10012     PERL_ARGS_ASSERT_CHECKPOSIXCC;
10013
10014     if (POSIXCC(UCHARAT(RExC_parse))) {
10015         const char *s = RExC_parse;
10016         const char  c = *s++;
10017
10018         while (isALNUM(*s))
10019             s++;
10020         if (*s && c == *s && s[1] == ']') {
10021             ckWARN3reg(s+2,
10022                        "POSIX syntax [%c %c] belongs inside character classes",
10023                        c, c);
10024
10025             /* [[=foo=]] and [[.foo.]] are still future. */
10026             if (POSIXCC_NOTYET(c)) {
10027                 /* adjust RExC_parse so the error shows after
10028                    the class closes */
10029                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10030                     NOOP;
10031                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10032             }
10033         }
10034     }
10035 }
10036
/* Generate the code to add a full posix character <class> to the bracketed
 * character class given by <node>.  (<node> is needed only under locale rules)
 * destlist     is the inversion list that this class is to be added to; it
 *              is both read and assigned to
 * sourcelist   is the ASCII-range inversion list to add under /a rules
 * Xsourcelist  is the full Unicode range list to use otherwise.
 *
 * Under locale rules (LOC), <class> is recorded in <node> with
 * ANYOF_CLASS_SET, and only the part of <Xsourcelist> that intersects
 * PL_AboveLatin1 is added to <destlist>.  Otherwise <destlist> becomes the
 * union of itself with <sourcelist> (when AT_LEAST_ASCII_RESTRICTED) or with
 * <Xsourcelist>.
 *
 * Note that this expands to a bare if/else statement (it is not wrapped in a
 * do/while), and that <destlist> appears several times in the expansion. */
#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
    if (LOC) {                                                             \
        SV* scratch_list = NULL;                                           \
                                                                           \
        /* Set this class in the node for runtime matching */              \
        ANYOF_CLASS_SET(node, class);                                      \
                                                                           \
        /* For above Latin1 code points, we use the full Unicode range */  \
        _invlist_intersection(PL_AboveLatin1,                              \
                              Xsourcelist,                                 \
                              &scratch_list);                              \
        /* And set the output to it, adding instead if there already is an \
         * output.  Checking if <destlist> is NULL first saves an extra    \
         * clone.  Its reference count will be decremented at the next     \
         * union, etc, or if this is the only instance, at the end of the  \
         * routine */                                                      \
        if (! destlist) {                                                  \
            destlist = scratch_list;                                       \
        }                                                                  \
        else {                                                             \
            _invlist_union(destlist, scratch_list, &destlist);             \
            SvREFCNT_dec(scratch_list);                                    \
        }                                                                  \
    }                                                                      \
    else {                                                                 \
        /* For non-locale, just add it to any existing list */             \
        _invlist_union(destlist,                                           \
                       (AT_LEAST_ASCII_RESTRICTED)                         \
                           ? sourcelist                                    \
                           : Xsourcelist,                                  \
                       &destlist);                                         \
    }
10075
/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
 *
 * Under locale rules (LOC), <class> is recorded in <node> and the code points
 * of PL_AboveLatin1 that are not in <Xsourcelist> are added to <destlist>.
 * Otherwise <destlist> is unioned with the complement of <sourcelist> (when
 * AT_LEAST_ASCII_RESTRICTED) or of <Xsourcelist>, and under
 * DEPENDS_SEMANTICS the ANYOF_NON_UTF8_LATIN1_ALL flag is also set on <node>.
 *
 * As with DO_POSIX, the expansion is a bare if/else statement.
 */
#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
    if (LOC) {                                                             \
        SV* scratch_list = NULL;                                           \
        ANYOF_CLASS_SET(node, class);                                      \
        _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
        if (! destlist) {                                                  \
            destlist = scratch_list;                                       \
        }                                                                  \
        else {                                                             \
            _invlist_union(destlist, scratch_list, &destlist);             \
            SvREFCNT_dec(scratch_list);                                    \
        }                                                                  \
    }                                                                      \
    else {                                                                 \
        _invlist_union_complement_2nd(destlist,                            \
                                    (AT_LEAST_ASCII_RESTRICTED)            \
                                        ? sourcelist                       \
                                        : Xsourcelist,                     \
                                    &destlist);                            \
        /* Under /d, everything in the upper half of the Latin1 range      \
         * matches this complement */                                      \
        if (DEPENDS_SEMANTICS) {                                           \
            ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
        }                                                                  \
    }
10103
/* Generate the code to add a posix character <class> to the bracketed
 * character class given by <node>.  (<node> is needed only under locale rules)
 * destlist       is the inversion list for non-locale rules that this class is
 *                to be added to
 * sourcelist     is the ASCII-range inversion list to add under /a rules
 * l1_sourcelist  is the Latin1 range list to use otherwise.
 * Xpropertyname  is the name to add to <run_time_list> of the property to
 *                specify the code points above Latin1 that will have to be
 *                determined at run-time
 * run_time_list  is a SV* that contains text names of properties that are to
 *                be computed at run time.  This concatenates <Xpropertyname>
 *                to it, appropriately, as a "+utf8::<Xpropertyname>\n" line
 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
 * time.  Under locale rules (LOC) only <class> is recorded in <node>;
 * <destlist> is left untouched in that case. */
#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
                              l1_sourcelist, Xpropertyname, run_time_list) \
    /* If not /a matching, there are going to be code points we will have  \
     * to defer to runtime to look-up */                                   \
    if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
        Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
    }                                                                      \
    if (LOC) {                                                             \
        ANYOF_CLASS_SET(node, class);                                      \
    }                                                                      \
    else {                                                                 \
        _invlist_union(destlist,                                           \
                       (AT_LEAST_ASCII_RESTRICTED)                         \
                           ? sourcelist                                    \
                           : l1_sourcelist,                                \
                       &destlist);                                         \
    }
10135
/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
 * this and DO_N_POSIX.
 * node           is the ANYOF node; used under locale rules and to set flags
 * class          is the named class to record in <node> under locale rules
 * destlist       is the inversion list this class is to be added to; it is
 *                both read and assigned to
 * sourcelist     is the ASCII-range inversion list whose complement is added
 *                under /a rules
 * l1_sourcelist  is the Latin1 range list whose complement (within
 *                PL_Latin1) is added otherwise
 * Xpropertyname  is appended to <run_time_list> as "!utf8::<name>\n" when not
 *                under /a rules, so that code points above Latin1 are
 *                resolved at run time
 * run_time_list  is the SV* collecting those run-time property names
 *
 * The locale branch uses the <class> parameter itself rather than relying on
 * a variable of a particular name existing in the caller's scope.
 * The expansion is a bare if/else statement. */
#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
                              l1_sourcelist, Xpropertyname, run_time_list) \
    if (AT_LEAST_ASCII_RESTRICTED) {                                       \
        _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
    }                                                                      \
    else {                                                                 \
        Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
        if (LOC) {                                                         \
            ANYOF_CLASS_SET(node, class);                                  \
        }                                                                  \
        else {                                                             \
            SV* scratch_list = NULL;                                       \
            _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
            if (! destlist) {                                              \
                destlist = scratch_list;                                   \
            }                                                              \
            else {                                                         \
                _invlist_union(destlist, scratch_list, &destlist);         \
                SvREFCNT_dec(scratch_list);                                \
            }                                                              \
            if (DEPENDS_SEMANTICS) {                                       \
                ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
            }                                                              \
        }                                                                  \
    }
10163
10164 STATIC U8
10165 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10166 {
10167
10168     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10169      * Locale folding is done at run-time, so this function should not be
10170      * called for nodes that are for locales.
10171      *
10172      * This function sets the bit corresponding to the fold of the input
10173      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
10174      * 'F' is 'f'.
10175      *
10176      * It also knows about the characters that are in the bitmap that have
10177      * folds that are matchable only outside it, and sets the appropriate lists
10178      * and flags.
10179      *
10180      * It returns the number of bits that actually changed from 0 to 1 */
10181
10182     U8 stored = 0;
10183     U8 fold;
10184
10185     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10186
10187     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10188                                     : PL_fold[value];
10189
10190     /* It assumes the bit for 'value' has already been set */
10191     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10192         ANYOF_BITMAP_SET(node, fold);
10193         stored++;
10194     }
10195     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10196         /* Certain Latin1 characters have matches outside the bitmap.  To get
10197          * here, 'value' is one of those characters.   None of these matches is
10198          * valid for ASCII characters under /aa, which have been excluded by
10199          * the 'if' above.  The matches fall into three categories:
10200          * 1) They are singly folded-to or -from an above 255 character, as
10201          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10202          *    WITH DIAERESIS;
10203          * 2) They are part of a multi-char fold with another character in the
10204          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10205          * 3) They are part of a multi-char fold with a character not in the
10206          *    bitmap, such as various ligatures.
10207          * We aren't dealing fully with multi-char folds, except we do deal
10208          * with the pattern containing a character that has a multi-char fold
10209          * (not so much the inverse).
10210          * For types 1) and 3), the matches only happen when the target string
10211          * is utf8; that's not true for 2), and we set a flag for it.
10212          *
10213          * The code below adds to the passed in inversion list the single fold
10214          * closures for 'value'.  The values are hard-coded here so that an
10215          * innocent-looking character class, like /[ks]/i won't have to go out
10216          * to disk to find the possible matches.  XXX It would be better to
10217          * generate these via regen, in case a new version of the Unicode
10218          * standard adds new mappings, though that is not really likely. */
10219         switch (value) {
10220             case 'k':
10221             case 'K':
10222                 /* KELVIN SIGN */
10223                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10224                 break;
10225             case 's':
10226             case 'S':
10227                 /* LATIN SMALL LETTER LONG S */
10228                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10229                 break;
10230             case MICRO_SIGN:
10231                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10232                                                  GREEK_SMALL_LETTER_MU);
10233                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10234                                                  GREEK_CAPITAL_LETTER_MU);
10235                 break;
10236             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10237             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10238                 /* ANGSTROM SIGN */
10239                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10240                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
10241                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10242                                                      PL_fold_latin1[value]);
10243                 }
10244                 break;
10245             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10246                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10247                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10248                 break;
10249             case LATIN_SMALL_LETTER_SHARP_S:
10250                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10251                                         LATIN_CAPITAL_LETTER_SHARP_S);
10252
10253                 /* Under /a, /d, and /u, this can match the two chars "ss" */
10254                 if (! MORE_ASCII_RESTRICTED) {
10255                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
10256
10257                     /* And under /u or /a, it can match even if the target is
10258                      * not utf8 */
10259                     if (AT_LEAST_UNI_SEMANTICS) {
10260                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10261                     }
10262                 }
10263                 break;
10264             case 'F': case 'f':
10265             case 'I': case 'i':
10266             case 'L': case 'l':
10267             case 'T': case 't':
10268             case 'A': case 'a':
10269             case 'H': case 'h':
10270             case 'J': case 'j':
10271             case 'N': case 'n':
10272             case 'W': case 'w':
10273             case 'Y': case 'y':
10274                 /* These all are targets of multi-character folds from code
10275                  * points that require UTF8 to express, so they can't match
10276                  * unless the target string is in UTF-8, so no action here is
10277                  * necessary, as regexec.c properly handles the general case
10278                  * for UTF-8 matching */
10279                 break;
10280             default:
10281                 /* Use deprecated warning to increase the chances of this
10282                  * being output */
10283                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10284                 break;
10285         }
10286     }
10287     else if (DEPENDS_SEMANTICS
10288             && ! isASCII(value)
10289             && PL_fold_latin1[value] != value)
10290     {
10291            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10292             * folds only when the target string is in UTF-8.  We add the fold
10293             * here to the list of things to match outside the bitmap, which
10294             * won't be looked at unless it is UTF8 (or else if something else
10295             * says to look even if not utf8, but those things better not happen
10296             * under DEPENDS semantics. */
10297         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10298     }
10299
10300     return stored;
10301 }
10302
10303
10304 PERL_STATIC_INLINE U8
10305 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10306 {
10307     /* This inline function sets a bit in the bitmap if not already set, and if
10308      * appropriate, its fold, returning the number of bits that actually
10309      * changed from 0 to 1 */
10310
10311     U8 stored;
10312
10313     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10314
10315     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
10316         return 0;
10317     }
10318
10319     ANYOF_BITMAP_SET(node, value);
10320     stored = 1;
10321
10322     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
10323         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10324     }
10325
10326     return stored;
10327 }
10328
10329 STATIC void
10330 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10331 {
10332     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10333      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
10334      * the multi-character folds of characters in the node */
10335     SV *sv;
10336
10337     PERL_ARGS_ASSERT_ADD_ALTERNATE;
10338
10339     if (! *alternate_ptr) {
10340         *alternate_ptr = newAV();
10341     }
10342     sv = newSVpvn_utf8((char*)string, len, TRUE);
10343     av_push(*alternate_ptr, sv);
10344     return;
10345 }
10346
10347 /*
10348    parse a class specification and produce either an ANYOF node that
10349    matches the pattern or perhaps will be optimized into an EXACTish node
10350    instead. The node contains a bit map for the first 256 characters, with the
10351    corresponding bit set if that character is in the list.  For characters
10352    above 255, a range list is used */
10353
10354 STATIC regnode *
10355 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10356 {
10357     dVAR;
10358     register UV nextvalue;
10359     register IV prevvalue = OOB_UNICODE;
10360     register IV range = 0;
10361     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10362     register regnode *ret;
10363     STRLEN numlen;
10364     IV namedclass;
10365     char *rangebegin = NULL;
10366     bool need_class = 0;
10367     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
10368     SV *listsv = NULL;
10369     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10370                                       than just initialized.  */
10371     SV* properties = NULL;    /* Code points that match \p{} \P{} */
10372     UV element_count = 0;   /* Number of distinct elements in the class.
10373                                Optimizations may be possible if this is tiny */
10374     UV n;
10375
10376     /* Unicode properties are stored in a swash; this holds the current one
10377      * being parsed.  If this swash is the only above-latin1 component of the
10378      * character class, an optimization is to pass it directly on to the
10379      * execution engine.  Otherwise, it is set to NULL to indicate that there
10380      * are other things in the class that have to be dealt with at execution
10381      * time */
10382     SV* swash = NULL;           /* Code points that match \p{} \P{} */
10383
10384     /* Set if a component of this character class is user-defined; just passed
10385      * on to the engine */
10386     UV has_user_defined_property = 0;
10387
10388     /* code points this node matches that can't be stored in the bitmap */
10389     SV* nonbitmap = NULL;
10390
10391     /* The items that are to match that aren't stored in the bitmap, but are a
10392      * result of things that are stored there.  This is the fold closure of
10393      * such a character, either because it has DEPENDS semantics and shouldn't
10394      * be matched unless the target string is utf8, or is a code point that is
10395      * too large for the bit map, as for example, the fold of the MICRO SIGN is
10396      * above 255.  This all is solely for performance reasons.  By having this
10397      * code know the outside-the-bitmap folds that the bitmapped characters are
10398      * involved with, we don't have to go out to disk to find the list of
10399      * matches, unless the character class includes code points that aren't
10400      * storable in the bit map.  That means that a character class with an 's'
10401      * in it, for example, doesn't need to go out to disk to find everything
10402      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
10403      * empty unless there is something whose fold we don't know about, and will
10404      * have to go out to the disk to find. */
10405     SV* l1_fold_invlist = NULL;
10406
10407     /* List of multi-character folds that are matched by this node */
10408     AV* unicode_alternate  = NULL;
10409 #ifdef EBCDIC
10410     UV literal_endpoint = 0;
10411 #endif
10412     UV stored = 0;  /* how many chars stored in the bitmap */
10413
10414     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10415         case we need to change the emitted regop to an EXACT. */
10416     const char * orig_parse = RExC_parse;
10417     GET_RE_DEBUG_FLAGS_DECL;
10418
10419     PERL_ARGS_ASSERT_REGCLASS;
10420 #ifndef DEBUGGING
10421     PERL_UNUSED_ARG(depth);
10422 #endif
10423
10424     DEBUG_PARSE("clas");
10425
10426     /* Assume we are going to generate an ANYOF node. */
10427     ret = reganode(pRExC_state, ANYOF, 0);
10428
10429
10430     if (!SIZE_ONLY) {
10431         ANYOF_FLAGS(ret) = 0;
10432     }
10433
10434     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
10435         RExC_naughty++;
10436         RExC_parse++;
10437         if (!SIZE_ONLY)
10438             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10439
10440         /* We have decided to not allow multi-char folds in inverted character
10441          * classes, due to the confusion that can happen, especially with
10442          * classes that are designed for a non-Unicode world:  You have the
10443          * peculiar case that:
10444             "s s" =~ /^[^\xDF]+$/i => Y
10445             "ss"  =~ /^[^\xDF]+$/i => N
10446          *
10447          * See [perl #89750] */
10448         allow_full_fold = FALSE;
10449     }
10450
10451     if (SIZE_ONLY) {
10452         RExC_size += ANYOF_SKIP;
10453         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10454     }
10455     else {
10456         RExC_emit += ANYOF_SKIP;
10457         if (LOC) {
10458             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10459         }
10460         ANYOF_BITMAP_ZERO(ret);
10461         listsv = newSVpvs("# comment\n");
10462         initial_listsv_len = SvCUR(listsv);
10463     }
10464
10465     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10466
10467     if (!SIZE_ONLY && POSIXCC(nextvalue))
10468         checkposixcc(pRExC_state);
10469
10470     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10471     if (UCHARAT(RExC_parse) == ']')
10472         goto charclassloop;
10473
10474 parseit:
10475     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10476
10477     charclassloop:
10478
10479         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10480
10481         if (!range) {
10482             rangebegin = RExC_parse;
10483             element_count++;
10484         }
10485         if (UTF) {
10486             value = utf8n_to_uvchr((U8*)RExC_parse,
10487                                    RExC_end - RExC_parse,
10488                                    &numlen, UTF8_ALLOW_DEFAULT);
10489             RExC_parse += numlen;
10490         }
10491         else
10492             value = UCHARAT(RExC_parse++);
10493
10494         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10495         if (value == '[' && POSIXCC(nextvalue))
10496             namedclass = regpposixcc(pRExC_state, value);
10497         else if (value == '\\') {
10498             if (UTF) {
10499                 value = utf8n_to_uvchr((U8*)RExC_parse,
10500                                    RExC_end - RExC_parse,
10501                                    &numlen, UTF8_ALLOW_DEFAULT);
10502                 RExC_parse += numlen;
10503             }
10504             else
10505                 value = UCHARAT(RExC_parse++);
10506             /* Some compilers cannot handle switching on 64-bit integer
10507              * values, therefore value cannot be a UV.  Yes, this will
10508              * be a problem later if we want to switch on Unicode.
10509              * A similar issue arises a little bit later when switching on
10510              * namedclass. --jhi */
10511             switch ((I32)value) {
10512             case 'w':   namedclass = ANYOF_ALNUM;       break;
10513             case 'W':   namedclass = ANYOF_NALNUM;      break;
10514             case 's':   namedclass = ANYOF_SPACE;       break;
10515             case 'S':   namedclass = ANYOF_NSPACE;      break;
10516             case 'd':   namedclass = ANYOF_DIGIT;       break;
10517             case 'D':   namedclass = ANYOF_NDIGIT;      break;
10518             case 'v':   namedclass = ANYOF_VERTWS;      break;
10519             case 'V':   namedclass = ANYOF_NVERTWS;     break;
10520             case 'h':   namedclass = ANYOF_HORIZWS;     break;
10521             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
10522             case 'N':  /* Handle \N{NAME} in class */
10523                 {
10524                     /* We only pay attention to the first char of
10525                     multichar strings being returned. I kinda wonder
10526                     if this makes sense as it does change the behaviour
10527                     from earlier versions, OTOH that behaviour was broken
10528                     as well. */
10529                     UV v; /* value is register so we cant & it /grrr */
10530                     if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10531                         goto parseit;
10532                     }
10533                     value= v;
10534                 }
10535                 break;
10536             case 'p':
10537             case 'P':
10538                 {
10539                 char *e;
10540                 if (RExC_parse >= RExC_end)
10541                     vFAIL2("Empty \\%c{}", (U8)value);
10542                 if (*RExC_parse == '{') {
10543                     const U8 c = (U8)value;
10544                     e = strchr(RExC_parse++, '}');
10545                     if (!e)
10546                         vFAIL2("Missing right brace on \\%c{}", c);
10547                     while (isSPACE(UCHARAT(RExC_parse)))
10548                         RExC_parse++;
10549                     if (e == RExC_parse)
10550                         vFAIL2("Empty \\%c{}", c);
10551                     n = e - RExC_parse;
10552                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10553                         n--;
10554                 }
10555                 else {
10556                     e = RExC_parse;
10557                     n = 1;
10558                 }
10559                 if (!SIZE_ONLY) {
10560                     SV** invlistsvp;
10561                     SV* invlist;
10562                     char* name;
10563                     if (UCHARAT(RExC_parse) == '^') {
10564                          RExC_parse++;
10565                          n--;
10566                          value = value == 'p' ? 'P' : 'p'; /* toggle */
10567                          while (isSPACE(UCHARAT(RExC_parse))) {
10568                               RExC_parse++;
10569                               n--;
10570                          }
10571                     }
10572                     /* Try to get the definition of the property into
10573                      * <invlist>.  If /i is in effect, the effective property
10574                      * will have its name be <__NAME_i>.  The design is
10575                      * discussed in commit
10576                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10577                     Newx(name, n + sizeof("_i__\n"), char);
10578
10579                     sprintf(name, "%s%.*s%s\n",
10580                                     (FOLD) ? "__" : "",
10581                                     (int)n,
10582                                     RExC_parse,
10583                                     (FOLD) ? "_i" : ""
10584                     );
10585
10586                     /* Look up the property name, and get its swash and
10587                      * inversion list, if the property is found  */
10588                     if (swash) {
10589                         SvREFCNT_dec(swash);
10590                     }
10591                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
10592                                              1, /* binary */
10593                                              0, /* not tr/// */
10594                                              TRUE, /* this routine will handle
10595                                                       undefined properties */
10596                                              NULL, FALSE /* No inversion list */
10597                                             );
10598                     if (   ! swash
10599                         || ! SvROK(swash)
10600                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10601                         || ! (invlistsvp =
10602                                 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10603                                 "INVLIST", FALSE))
10604                         || ! (invlist = *invlistsvp))
10605                     {
10606                         if (swash) {
10607                             SvREFCNT_dec(swash);
10608                             swash = NULL;
10609                         }
10610
10611                         /* Here didn't find it.  It could be a user-defined
10612                          * property that will be available at run-time.  Add it
10613                          * to the list to look up then */
10614                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10615                                         (value == 'p' ? '+' : '!'),
10616                                         name);
10617                         has_user_defined_property = 1;
10618
10619                         /* We don't know yet, so have to assume that the
10620                          * property could match something in the Latin1 range,
10621                          * hence something that isn't utf8 */
10622                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10623                     }
10624                     else {
10625
10626                         /* Here, did get the swash and its inversion list.  If
10627                          * the swash is from a user-defined property, then this
10628                          * whole character class should be regarded as such */
10629                         SV** user_defined_svp =
10630                                             hv_fetchs(MUTABLE_HV(SvRV(swash)),
10631                                                         "USER_DEFINED", FALSE);
10632                         if (user_defined_svp) {
10633                             has_user_defined_property
10634                                                     |= SvUV(*user_defined_svp);
10635                         }
10636
10637                         /* Invert if asking for the complement */
10638                         if (value == 'P') {
10639                             _invlist_union_complement_2nd(properties, invlist, &properties);
10640
10641                             /* The swash can't be used as-is, because we've
10642                              * inverted things; delay removing it until here,
10643                              * after having copied its invlist above */
10644                             SvREFCNT_dec(swash);
10645                             swash = NULL;
10646                         }
10647                         else {
10648                             _invlist_union(properties, invlist, &properties);
10649                         }
10650                     }
10651                     Safefree(name);
10652                 }
10653                 RExC_parse = e + 1;
10654                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
10655
10656                 /* \p means they want Unicode semantics */
10657                 RExC_uni_semantics = 1;
10658                 }
10659                 break;
10660             case 'n':   value = '\n';                   break;
10661             case 'r':   value = '\r';                   break;
10662             case 't':   value = '\t';                   break;
10663             case 'f':   value = '\f';                   break;
10664             case 'b':   value = '\b';                   break;
10665             case 'e':   value = ASCII_TO_NATIVE('\033');break;
10666             case 'a':   value = ASCII_TO_NATIVE('\007');break;
10667             case 'o':
10668                 RExC_parse--;   /* function expects to be pointed at the 'o' */
10669                 {
10670                     const char* error_msg;
10671                     bool valid = grok_bslash_o(RExC_parse,
10672                                                &value,
10673                                                &numlen,
10674                                                &error_msg,
10675                                                SIZE_ONLY);
10676                     RExC_parse += numlen;
10677                     if (! valid) {
10678                         vFAIL(error_msg);
10679                     }
10680                 }
10681                 if (PL_encoding && value < 0x100) {
10682                     goto recode_encoding;
10683                 }
10684                 break;
10685             case 'x':
10686                 if (*RExC_parse == '{') {
10687                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10688                         | PERL_SCAN_DISALLOW_PREFIX;
10689                     char * const e = strchr(RExC_parse++, '}');
10690                     if (!e)
10691                         vFAIL("Missing right brace on \\x{}");
10692
10693                     numlen = e - RExC_parse;
10694                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10695                     RExC_parse = e + 1;
10696                 }
10697                 else {
10698                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10699                     numlen = 2;
10700                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10701                     RExC_parse += numlen;
10702                 }
10703                 if (PL_encoding && value < 0x100)
10704                     goto recode_encoding;
10705                 break;
10706             case 'c':
10707                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10708                 break;
10709             case '0': case '1': case '2': case '3': case '4':
10710             case '5': case '6': case '7':
10711                 {
10712                     /* Take 1-3 octal digits */
10713                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10714                     numlen = 3;
10715                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10716                     RExC_parse += numlen;
10717                     if (PL_encoding && value < 0x100)
10718                         goto recode_encoding;
10719                     break;
10720                 }
10721             recode_encoding:
10722                 if (! RExC_override_recoding) {
10723                     SV* enc = PL_encoding;
10724                     value = reg_recode((const char)(U8)value, &enc);
10725                     if (!enc && SIZE_ONLY)
10726                         ckWARNreg(RExC_parse,
10727                                   "Invalid escape in the specified encoding");
10728                     break;
10729                 }
10730             default:
10731                 /* Allow \_ to not give an error */
10732                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10733                     ckWARN2reg(RExC_parse,
10734                                "Unrecognized escape \\%c in character class passed through",
10735                                (int)value);
10736                 }
10737                 break;
10738             }
10739         } /* end of \blah */
10740 #ifdef EBCDIC
10741         else
10742             literal_endpoint++;
10743 #endif
10744
10745         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10746
10747             /* What matches in a locale is not known until runtime, so need to
10748              * (one time per class) allocate extra space to pass to regexec.
10749              * The space will contain a bit for each named class that is to be
10750              * matched against.  This isn't needed for \p{} and pseudo-classes,
10751              * as they are not affected by locale, and hence are dealt with
10752              * separately */
10753             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10754                 need_class = 1;
10755                 if (SIZE_ONLY) {
10756                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10757                 }
10758                 else {
10759                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10760                     ANYOF_CLASS_ZERO(ret);
10761                 }
10762                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10763             }
10764
10765             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
10766              * literal, as is the character that began the false range, i.e.
10767              * the 'a' in the examples */
10768             if (range) {
10769                 if (!SIZE_ONLY) {
10770                     const int w =
10771                         RExC_parse >= rangebegin ?
10772                         RExC_parse - rangebegin : 0;
10773                     ckWARN4reg(RExC_parse,
10774                                "False [] range \"%*.*s\"",
10775                                w, w, rangebegin);
10776
10777                     stored +=
10778                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10779                     if (prevvalue < 256) {
10780                         stored +=
10781                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10782                     }
10783                     else {
10784                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10785                     }
10786                 }
10787
10788                 range = 0; /* this was not a true range */
10789             }
10790
10791             if (!SIZE_ONLY) {
10792
10793                 /* Possible truncation here but in some 64-bit environments
10794                  * the compiler gets heartburn about switch on 64-bit values.
10795                  * A similar issue a little earlier when switching on value.
10796                  * --jhi */
10797                 switch ((I32)namedclass) {
10798
10799                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10800                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10801                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10802                     break;
10803                 case ANYOF_NALNUMC:
10804                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10805                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10806                     break;
10807                 case ANYOF_ALPHA:
10808                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10809                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10810                     break;
10811                 case ANYOF_NALPHA:
10812                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10813                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10814                     break;
10815                 case ANYOF_ASCII:
10816                     if (LOC) {
10817                         ANYOF_CLASS_SET(ret, namedclass);
10818                     }
10819                     else {
10820                         _invlist_union(properties, PL_ASCII, &properties);
10821                     }
10822                     break;
10823                 case ANYOF_NASCII:
10824                     if (LOC) {
10825                         ANYOF_CLASS_SET(ret, namedclass);
10826                     }
10827                     else {
10828                         _invlist_union_complement_2nd(properties,
10829                                                     PL_ASCII, &properties);
10830                         if (DEPENDS_SEMANTICS) {
10831                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10832                         }
10833                     }
10834                     break;
10835                 case ANYOF_BLANK:
10836                     DO_POSIX(ret, namedclass, properties,
10837                                             PL_PosixBlank, PL_XPosixBlank);
10838                     break;
10839                 case ANYOF_NBLANK:
10840                     DO_N_POSIX(ret, namedclass, properties,
10841                                             PL_PosixBlank, PL_XPosixBlank);
10842                     break;
10843                 case ANYOF_CNTRL:
10844                     DO_POSIX(ret, namedclass, properties,
10845                                             PL_PosixCntrl, PL_XPosixCntrl);
10846                     break;
10847                 case ANYOF_NCNTRL:
10848                     DO_N_POSIX(ret, namedclass, properties,
10849                                             PL_PosixCntrl, PL_XPosixCntrl);
10850                     break;
10851                 case ANYOF_DIGIT:
10852                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10853                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10854                     break;
10855                 case ANYOF_NDIGIT:
10856                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10857                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10858                     break;
10859                 case ANYOF_GRAPH:
10860                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10861                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10862                     break;
10863                 case ANYOF_NGRAPH:
10864                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10865                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10866                     break;
10867                 case ANYOF_HORIZWS:
10868                     /* For these, we use the nonbitmap, as /d doesn't make a
10869                      * difference in what these match.  There would be problems
10870                      * if these characters had folds other than themselves, as
10871                      * nonbitmap is subject to folding.  It turns out that \h
10872                      * is just a synonym for XPosixBlank */
10873                     _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
10874                     break;
10875                 case ANYOF_NHORIZWS:
10876                     _invlist_union_complement_2nd(nonbitmap,
10877                                                  PL_XPosixBlank, &nonbitmap);
10878                     break;
10879                 case ANYOF_LOWER:
10880                 case ANYOF_NLOWER:
10881                 {   /* These require special handling, as they differ under
10882                        folding, matching Cased there (which in the ASCII range
10883                        is the same as Alpha) */
10884
10885                     SV* ascii_source;
10886                     SV* l1_source;
10887                     const char *Xname;
10888
10889                     if (FOLD && ! LOC) {
10890                         ascii_source = PL_PosixAlpha;
10891                         l1_source = PL_L1Cased;
10892                         Xname = "Cased";
10893                     }
10894                     else {
10895                         ascii_source = PL_PosixLower;
10896                         l1_source = PL_L1PosixLower;
10897                         Xname = "XPosixLower";
10898                     }
10899                     if (namedclass == ANYOF_LOWER) {
10900                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10901                                     ascii_source, l1_source, Xname, listsv);
10902                     }
10903                     else {
10904                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10905                             properties, ascii_source, l1_source, Xname, listsv);
10906                     }
10907                     break;
10908                 }
10909                 case ANYOF_PRINT:
10910                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10911                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10912                     break;
10913                 case ANYOF_NPRINT:
10914                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10915                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10916                     break;
10917                 case ANYOF_PUNCT:
10918                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10919                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10920                     break;
10921                 case ANYOF_NPUNCT:
10922                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10923                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10924                     break;
10925                 case ANYOF_PSXSPC:
10926                     DO_POSIX(ret, namedclass, properties,
10927                                             PL_PosixSpace, PL_XPosixSpace);
10928                     break;
10929                 case ANYOF_NPSXSPC:
10930                     DO_N_POSIX(ret, namedclass, properties,
10931                                             PL_PosixSpace, PL_XPosixSpace);
10932                     break;
10933                 case ANYOF_SPACE:
10934                     DO_POSIX(ret, namedclass, properties,
10935                                             PL_PerlSpace, PL_XPerlSpace);
10936                     break;
10937                 case ANYOF_NSPACE:
10938                     DO_N_POSIX(ret, namedclass, properties,
10939                                             PL_PerlSpace, PL_XPerlSpace);
10940                     break;
10941                 case ANYOF_UPPER:   /* Same as LOWER, above */
10942                 case ANYOF_NUPPER:
10943                 {
10944                     SV* ascii_source;
10945                     SV* l1_source;
10946                     const char *Xname;
10947
10948                     if (FOLD && ! LOC) {
10949                         ascii_source = PL_PosixAlpha;
10950                         l1_source = PL_L1Cased;
10951                         Xname = "Cased";
10952                     }
10953                     else {
10954                         ascii_source = PL_PosixUpper;
10955                         l1_source = PL_L1PosixUpper;
10956                         Xname = "XPosixUpper";
10957                     }
10958                     if (namedclass == ANYOF_UPPER) {
10959                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10960                                     ascii_source, l1_source, Xname, listsv);
10961                     }
10962                     else {
10963                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10964                         properties, ascii_source, l1_source, Xname, listsv);
10965                     }
10966                     break;
10967                 }
10968                 case ANYOF_ALNUM:   /* Really is 'Word' */
10969                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10970                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10971                     break;
10972                 case ANYOF_NALNUM:
10973                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10974                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10975                     break;
10976                 case ANYOF_VERTWS:
10977                     /* For these, we use the nonbitmap, as /d doesn't make a
10978                      * difference in what these match.  There would be problems
10979                      * if these characters had folds other than themselves, as
10980                      * nonbitmap is subject to folding */
10981                     _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
10982                     break;
10983                 case ANYOF_NVERTWS:
10984                     _invlist_union_complement_2nd(nonbitmap,
10985                                                     PL_VertSpace, &nonbitmap);
10986                     break;
10987                 case ANYOF_XDIGIT:
10988                     DO_POSIX(ret, namedclass, properties,
10989                                             PL_PosixXDigit, PL_XPosixXDigit);
10990                     break;
10991                 case ANYOF_NXDIGIT:
10992                     DO_N_POSIX(ret, namedclass, properties,
10993                                             PL_PosixXDigit, PL_XPosixXDigit);
10994                     break;
10995                 case ANYOF_MAX:
10996                     /* this is to handle \p and \P */
10997                     break;
10998                 default:
10999                     vFAIL("Invalid [::] class");
11000                     break;
11001                 }
11002
11003                 continue;
11004             }
11005         } /* end of namedclass \blah */
11006
11007         if (range) {
11008             if (prevvalue > (IV)value) /* b-a */ {
11009                 const int w = RExC_parse - rangebegin;
11010                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11011                 range = 0; /* not a valid range */
11012             }
11013         }
11014         else {
11015             prevvalue = value; /* save the beginning of the range */
11016             if (RExC_parse+1 < RExC_end
11017                 && *RExC_parse == '-'
11018                 && RExC_parse[1] != ']')
11019             {
11020                 RExC_parse++;
11021
11022                 /* a bad range like \w-, [:word:]- ? */
11023                 if (namedclass > OOB_NAMEDCLASS) {
11024                     if (ckWARN(WARN_REGEXP)) {
11025                         const int w =
11026                             RExC_parse >= rangebegin ?
11027                             RExC_parse - rangebegin : 0;
11028                         vWARN4(RExC_parse,
11029                                "False [] range \"%*.*s\"",
11030                                w, w, rangebegin);
11031                     }
11032                     if (!SIZE_ONLY)
11033                         stored +=
11034                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11035                 } else
11036                     range = 1;  /* yeah, it's a range! */
11037                 continue;       /* but do it the next time */
11038             }
11039         }
11040
11041         /* non-Latin1 code point implies unicode semantics.  Must be set in
11042          * pass1 so is there for the whole of pass 2 */
11043         if (value > 255) {
11044             RExC_uni_semantics = 1;
11045         }
11046
11047         /* now is the next time */
11048         if (!SIZE_ONLY) {
11049             if (prevvalue < 256) {
11050                 const IV ceilvalue = value < 256 ? value : 255;
11051                 IV i;
11052 #ifdef EBCDIC
11053                 /* In EBCDIC [\x89-\x91] should include
11054                  * the \x8e but [i-j] should not. */
11055                 if (literal_endpoint == 2 &&
11056                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11057                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11058                 {
11059                     if (isLOWER(prevvalue)) {
11060                         for (i = prevvalue; i <= ceilvalue; i++)
11061                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11062                                 stored +=
11063                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11064                             }
11065                     } else {
11066                         for (i = prevvalue; i <= ceilvalue; i++)
11067                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11068                                 stored +=
11069                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11070                             }
11071                     }
11072                 }
11073                 else
11074 #endif
11075                       for (i = prevvalue; i <= ceilvalue; i++) {
11076                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11077                       }
11078           }
11079           if (value > 255) {
11080             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
11081             const UV natvalue      = NATIVE_TO_UNI(value);
11082             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11083         }
11084 #ifdef EBCDIC
11085             literal_endpoint = 0;
11086 #endif
11087         }
11088
11089         range = 0; /* this range (if it was one) is done now */
11090     }
11091
11092
11093
11094     if (SIZE_ONLY)
11095         return ret;
11096     /****** !SIZE_ONLY AFTER HERE *********/
11097
11098     /* If folding and there are code points above 255, we calculate all
11099      * characters that could fold to or from the ones already on the list */
11100     if (FOLD && nonbitmap) {
11101         UV start, end;  /* End points of code point ranges */
11102
11103         SV* fold_intersection = NULL;
11104
11105         /* This is a list of all the characters that participate in folds
11106             * (except marks, etc. in multi-char folds) */
11107         if (! PL_utf8_foldable) {
11108             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11109             PL_utf8_foldable = _swash_to_invlist(swash);
11110             SvREFCNT_dec(swash);
11111         }
11112
11113         /* This is a hash that for a particular fold gives all characters
11114             * that are involved in it */
11115         if (! PL_utf8_foldclosures) {
11116
11117             /* If we were unable to find any folds, then we likely won't be
11118              * able to find the closures.  So just create an empty list.
11119              * Folding will effectively be restricted to the non-Unicode rules
11120              * hard-coded into Perl.  (This case happens legitimately during
11121              * compilation of Perl itself before the Unicode tables are
11122              * generated) */
11123             if (invlist_len(PL_utf8_foldable) == 0) {
11124                 PL_utf8_foldclosures = newHV();
11125             } else {
11126                 /* If the folds haven't been read in, call a fold function
11127                     * to force that */
11128                 if (! PL_utf8_tofold) {
11129                     U8 dummy[UTF8_MAXBYTES+1];
11130                     STRLEN dummy_len;
11131
11132                     /* This particular string is above \xff in both UTF-8 and
11133                      * UTFEBCDIC */
11134                     to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11135                     assert(PL_utf8_tofold); /* Verify that worked */
11136                 }
11137                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11138             }
11139         }
11140
11141         /* Only the characters in this class that participate in folds need be
11142          * checked.  Get the intersection of this class and all the possible
11143          * characters that are foldable.  This can quickly narrow down a large
11144          * class */
11145         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11146
11147         /* Now look at the foldable characters in this class individually */
11148         invlist_iterinit(fold_intersection);
11149         while (invlist_iternext(fold_intersection, &start, &end)) {
11150             UV j;
11151
11152             /* Look at every character in the range */
11153             for (j = start; j <= end; j++) {
11154
11155                 /* Get its fold */
11156                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11157                 STRLEN foldlen;
11158                 const UV f =
11159                     _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
11160
11161                 if (foldlen > (STRLEN)UNISKIP(f)) {
11162
11163                     /* Any multicharacter foldings (disallowed in lookbehind
11164                      * patterns) require the following transform: [ABCDEF] ->
11165                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11166                      * folds into "rst", all other characters fold to single
11167                      * characters.  We save away these multicharacter foldings,
11168                      * to be later saved as part of the additional "s" data. */
11169                     if (! RExC_in_lookbehind) {
11170                         U8* loc = foldbuf;
11171                         U8* e = foldbuf + foldlen;
11172
11173                         /* If any of the folded characters of this are in the
11174                          * Latin1 range, tell the regex engine that this can
11175                          * match a non-utf8 target string.  The only multi-byte
11176                          * fold whose source is in the Latin1 range (U+00DF)
11177                          * applies only when the target string is utf8, or
11178                          * under unicode rules */
11179                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11180                             while (loc < e) {
11181
11182                                 /* Can't mix ascii with non- under /aa */
11183                                 if (MORE_ASCII_RESTRICTED
11184                                     && (isASCII(*loc) != isASCII(j)))
11185                                 {
11186                                     goto end_multi_fold;
11187                                 }
11188                                 if (UTF8_IS_INVARIANT(*loc)
11189                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
11190                                 {
11191                                     /* Can't mix above and below 256 under LOC
11192                                      */
11193                                     if (LOC) {
11194                                         goto end_multi_fold;
11195                                     }
11196                                     ANYOF_FLAGS(ret)
11197                                             |= ANYOF_NONBITMAP_NON_UTF8;
11198                                     break;
11199                                 }
11200                                 loc += UTF8SKIP(loc);
11201                             }
11202                         }
11203
11204                         add_alternate(&unicode_alternate, foldbuf, foldlen);
11205                     end_multi_fold: ;
11206                     }
11207
11208                     /* This is special-cased, as it is the only letter which
11209                      * has both a multi-fold and single-fold in Latin1.  All
11210                      * the other chars that have single and multi-folds are
11211                      * always in utf8, and the utf8 folding algorithm catches
11212                      * them */
11213                     if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11214                         stored += set_regclass_bit(pRExC_state,
11215                                         ret,
11216                                         LATIN_SMALL_LETTER_SHARP_S,
11217                                         &l1_fold_invlist, &unicode_alternate);
11218                     }
11219                 }
11220                 else {
11221                     /* Single character fold.  Add everything in its fold
11222                      * closure to the list that this node should match */
11223                     SV** listp;
11224
11225                     /* The fold closures data structure is a hash with the keys
11226                      * being every character that is folded to, like 'k', and
11227                      * the values each an array of everything that folds to its
11228                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
11229                     if ((listp = hv_fetch(PL_utf8_foldclosures,
11230                                     (char *) foldbuf, foldlen, FALSE)))
11231                     {
11232                         AV* list = (AV*) *listp;
11233                         IV k;
11234                         for (k = 0; k <= av_len(list); k++) {
11235                             SV** c_p = av_fetch(list, k, FALSE);
11236                             UV c;
11237                             if (c_p == NULL) {
11238                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11239                             }
11240                             c = SvUV(*c_p);
11241
11242                             /* /aa doesn't allow folds between ASCII and non-;
11243                              * /l doesn't allow them between above and below
11244                              * 256 */
11245                             if ((MORE_ASCII_RESTRICTED
11246                                  && (isASCII(c) != isASCII(j)))
11247                                     || (LOC && ((c < 256) != (j < 256))))
11248                             {
11249                                 continue;
11250                             }
11251
11252                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11253                                 stored += set_regclass_bit(pRExC_state,
11254                                         ret,
11255                                         (U8) c,
11256                                         &l1_fold_invlist, &unicode_alternate);
11257                             }
11258                                 /* It may be that the code point is already in
11259                                  * this range or already in the bitmap, in
11260                                  * which case we need do nothing */
11261                             else if ((c < start || c > end)
11262                                         && (c > 255
11263                                             || ! ANYOF_BITMAP_TEST(ret, c)))
11264                             {
11265                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11266                             }
11267                         }
11268                     }
11269                 }
11270             }
11271         }
11272         SvREFCNT_dec(fold_intersection);
11273     }
11274
11275     /* Combine the two lists into one. */
11276     if (l1_fold_invlist) {
11277         if (nonbitmap) {
11278             _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11279             SvREFCNT_dec(l1_fold_invlist);
11280         }
11281         else {
11282             nonbitmap = l1_fold_invlist;
11283         }
11284     }
11285
11286     /* And combine the result (if any) with any inversion list from properties.
11287      * The lists are kept separate up to now because we don't want to fold the
11288      * properties */
11289     if (properties) {
11290         if (nonbitmap) {
11291             _invlist_union(nonbitmap, properties, &nonbitmap);
11292             SvREFCNT_dec(properties);
11293         }
11294         else {
11295             nonbitmap = properties;
11296         }
11297     }
11298
11299     /* Here, <nonbitmap> contains all the code points we can determine at
11300      * compile time that we haven't put into the bitmap.  Go through it, and
11301      * for things that belong in the bitmap, put them there, and delete from
11302      * <nonbitmap> */
11303     if (nonbitmap) {
11304
11305         /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11306          * possibly only should match when the target string is UTF-8 */
11307         UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11308
11309         /* This gets set if we actually need to modify things */
11310         bool change_invlist = FALSE;
11311
11312         UV start, end;
11313
11314         /* Start looking through <nonbitmap> */
11315         invlist_iterinit(nonbitmap);
11316         while (invlist_iternext(nonbitmap, &start, &end)) {
11317             UV high;
11318             int i;
11319
11320             /* Quit if are above what we should change */
11321             if (start > max_cp_to_set) {
11322                 break;
11323             }
11324
11325             change_invlist = TRUE;
11326
11327             /* Set all the bits in the range, up to the max that we are doing */
11328             high = (end < max_cp_to_set) ? end : max_cp_to_set;
11329             for (i = start; i <= (int) high; i++) {
11330                 if (! ANYOF_BITMAP_TEST(ret, i)) {
11331                     ANYOF_BITMAP_SET(ret, i);
11332                     stored++;
11333                     prevvalue = value;
11334                     value = i;
11335                 }
11336             }
11337         }
11338
11339         /* Done with loop; remove any code points that are in the bitmap from
11340          * <nonbitmap> */
11341         if (change_invlist) {
11342             _invlist_subtract(nonbitmap,
11343                               (DEPENDS_SEMANTICS)
11344                                 ? PL_ASCII
11345                                 : PL_Latin1,
11346                               &nonbitmap);
11347         }
11348
11349         /* If have completely emptied it, remove it completely */
11350         if (invlist_len(nonbitmap) == 0) {
11351             SvREFCNT_dec(nonbitmap);
11352             nonbitmap = NULL;
11353         }
11354     }
11355
11356     /* Here, we have calculated what code points should be in the character
11357      * class.  <nonbitmap> does not overlap the bitmap except possibly in the
11358      * case of DEPENDS rules.
11359      *
11360      * Now we can see about various optimizations.  Fold calculation (which we
11361      * did above) needs to take place before inversion.  Otherwise /[^k]/i
11362      * would invert to include K, which under /i would match k, which it
11363      * shouldn't. */
11364
11365     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
11366      * set the FOLD flag yet, so this does optimize those.  It doesn't
11367      * optimize locale.  Doing so perhaps could be done as long as there is
11368      * nothing like \w in it; some thought also would have to be given to the
11369      * interaction with above 0x100 chars */
11370     if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11371         && ! LOC
11372         && ! unicode_alternate
11373         /* In case of /d, there are some things that should match only when in
11374          * not in the bitmap, i.e., they require UTF8 to match.  These are
11375          * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11376          * case, they don't require UTF8, so can invert here */
11377         && (! nonbitmap
11378             || ! DEPENDS_SEMANTICS
11379             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11380         && SvCUR(listsv) == initial_listsv_len)
11381     {
11382         int i;
11383         if (! nonbitmap) {
11384             for (i = 0; i < 256; ++i) {
11385                 if (ANYOF_BITMAP_TEST(ret, i)) {
11386                     ANYOF_BITMAP_CLEAR(ret, i);
11387                 }
11388                 else {
11389                     ANYOF_BITMAP_SET(ret, i);
11390                     prevvalue = value;
11391                     value = i;
11392                 }
11393             }
11394             /* The inversion means that everything above 255 is matched */
11395             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11396         }
11397         else {
11398             /* Here, also has things outside the bitmap that may overlap with
11399              * the bitmap.  We have to sync them up, so that they get inverted
11400              * in both places.  Earlier, we removed all overlaps except in the
11401              * case of /d rules, so no syncing is needed except for this case
11402              */
11403             SV *remove_list = NULL;
11404
11405             if (DEPENDS_SEMANTICS) {
11406                 UV start, end;
11407
11408                 /* Set the bits that correspond to the ones that aren't in the
11409                  * bitmap.  Otherwise, when we invert, we'll miss these.
11410                  * Earlier, we removed from the nonbitmap all code points
11411                  * < 128, so there is no extra work here */
11412                 invlist_iterinit(nonbitmap);
11413                 while (invlist_iternext(nonbitmap, &start, &end)) {
11414                     if (start > 255) {  /* The bit map goes to 255 */
11415                         break;
11416                     }
11417                     if (end > 255) {
11418                         end = 255;
11419                     }
11420                     for (i = start; i <= (int) end; ++i) {
11421                         ANYOF_BITMAP_SET(ret, i);
11422                         prevvalue = value;
11423                         value = i;
11424                     }
11425                 }
11426             }
11427
11428             /* Now invert both the bitmap and the nonbitmap.  Anything in the
11429              * bitmap has to also be removed from the non-bitmap, but again,
11430              * there should not be overlap unless is /d rules. */
11431             _invlist_invert(nonbitmap);
11432
11433             /* Any swash can't be used as-is, because we've inverted things */
11434             if (swash) {
11435                 SvREFCNT_dec(swash);
11436                 swash = NULL;
11437             }
11438
11439             for (i = 0; i < 256; ++i) {
11440                 if (ANYOF_BITMAP_TEST(ret, i)) {
11441                     ANYOF_BITMAP_CLEAR(ret, i);
11442                     if (DEPENDS_SEMANTICS) {
11443                         if (! remove_list) {
11444                             remove_list = _new_invlist(2);
11445                         }
11446                         remove_list = add_cp_to_invlist(remove_list, i);
11447                     }
11448                 }
11449                 else {
11450                     ANYOF_BITMAP_SET(ret, i);
11451                     prevvalue = value;
11452                     value = i;
11453                 }
11454             }
11455
11456             /* And do the removal */
11457             if (DEPENDS_SEMANTICS) {
11458                 if (remove_list) {
11459                     _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11460                     SvREFCNT_dec(remove_list);
11461                 }
11462             }
11463             else {
11464                 /* There is no overlap for non-/d, so just delete anything
11465                  * below 256 */
11466                 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
11467             }
11468         }
11469
11470         stored = 256 - stored;
11471
11472         /* Clear the invert flag since have just done it here */
11473         ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11474     }
11475
11476     /* Folding in the bitmap is taken care of above, but not for locale (for
11477      * which we have to wait to see what folding is in effect at runtime), and
11478      * for some things not in the bitmap (only the upper latin folds in this
11479      * case, as all other single-char folding has been set above).  Set
11480      * run-time fold flag for these */
11481     if (FOLD && (LOC
11482                 || (DEPENDS_SEMANTICS
11483                     && nonbitmap
11484                     && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11485                 || unicode_alternate))
11486     {
11487         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11488     }
11489
11490     /* A single character class can be "optimized" into an EXACTish node.
11491      * Note that since we don't currently count how many characters there are
11492      * outside the bitmap, we are XXX missing optimization possibilities for
11493      * them.  This optimization can't happen unless this is a truly single
11494      * character class, which means that it can't be an inversion into a
11495      * many-character class, and there must be no possibility of there being
11496      * things outside the bitmap.  'stored' (only) for locales doesn't include
11497      * \w, etc, so have to make a special test that they aren't present
11498      *
11499      * Similarly A 2-character class of the very special form like [bB] can be
11500      * optimized into an EXACTFish node, but only for non-locales, and for
11501      * characters which only have the two folds; so things like 'fF' and 'Ii'
11502      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11503      * FI'. */
11504     if (! nonbitmap
11505         && ! unicode_alternate
11506         && SvCUR(listsv) == initial_listsv_len
11507         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11508         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11509                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11510             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11511                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11512                                  /* If the latest code point has a fold whose
11513                                   * bit is set, it must be the only other one */
11514                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11515                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11516     {
11517         /* Note that the information needed to decide to do this optimization
11518          * is not currently available until the 2nd pass, and that the actually
11519          * used EXACTish node takes less space than the calculated ANYOF node,
11520          * and hence the amount of space calculated in the first pass is larger
11521          * than actually used, so this optimization doesn't gain us any space.
11522          * But an EXACT node is faster than an ANYOF node, and can be combined
11523          * with any adjacent EXACT nodes later by the optimizer for further
11524          * gains.  The speed of executing an EXACTF is similar to an ANYOF
11525          * node, so the optimization advantage comes from the ability to join
11526          * it to adjacent EXACT nodes */
11527
11528         const char * cur_parse= RExC_parse;
11529         U8 op;
11530         RExC_emit = (regnode *)orig_emit;
11531         RExC_parse = (char *)orig_parse;
11532
11533         if (stored == 1) {
11534
11535             /* A locale node with one point can be folded; all the other cases
11536              * with folding will have two points, since we calculate them above
11537              */
11538             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11539                  op = EXACTFL;
11540             }
11541             else {
11542                 op = EXACT;
11543             }
11544         }
11545         else {   /* else 2 chars in the bit map: the folds of each other */
11546
11547             /* Use the folded value, which for the cases where we get here,
11548              * is just the lower case of the current one (which may resolve to
11549              * itself, or to the other one */
11550             value = toLOWER_LATIN1(value);
11551
11552             /* To join adjacent nodes, they must be the exact EXACTish type.
11553              * Try to use the most likely type, by using EXACTFA if possible,
11554              * then EXACTFU if the regex calls for it, or is required because
11555              * the character is non-ASCII.  (If <value> is ASCII, its fold is
11556              * also ASCII for the cases where we get here.) */
11557             if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11558                 op = EXACTFA;
11559             }
11560             else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11561                 op = EXACTFU;
11562             }
11563             else {    /* Otherwise, more likely to be EXACTF type */
11564                 op = EXACTF;
11565             }
11566         }
11567
11568         ret = reg_node(pRExC_state, op);
11569         RExC_parse = (char *)cur_parse;
11570         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11571             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11572             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11573             STR_LEN(ret)= 2;
11574             RExC_emit += STR_SZ(2);
11575         }
11576         else {
11577             *STRING(ret)= (char)value;
11578             STR_LEN(ret)= 1;
11579             RExC_emit += STR_SZ(1);
11580         }
11581         SvREFCNT_dec(listsv);
11582         return ret;
11583     }
11584
11585     /* If there is a swash and more than one element, we can't use the swash in
11586      * the optimization below. */
11587     if (swash && element_count > 1) {
11588         SvREFCNT_dec(swash);
11589         swash = NULL;
11590     }
11591     if (! nonbitmap
11592         && SvCUR(listsv) == initial_listsv_len
11593         && ! unicode_alternate)
11594     {
11595         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11596         SvREFCNT_dec(listsv);
11597         SvREFCNT_dec(unicode_alternate);
11598     }
11599     else {
11600         /* av[0] stores the character class description in its textual form:
11601          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
11602          *       appropriate swash, and is also useful for dumping the regnode.
11603          * av[1] if NULL, is a placeholder to later contain the swash computed
11604          *       from av[0].  But if no further computation need be done, the
11605          *       swash is stored there now.
11606          * av[2] stores the multicharacter foldings, used later in
11607          *       regexec.c:S_reginclass().
11608          * av[3] stores the nonbitmap inversion list for use in addition or
11609          *       instead of av[0]; not used if av[1] isn't NULL
11610          * av[4] is set if any component of the class is from a user-defined
11611          *       property; not used if av[1] isn't NULL */
11612         AV * const av = newAV();
11613         SV *rv;
11614
11615         av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11616                         ? &PL_sv_undef
11617                         : listsv);
11618         if (swash) {
11619             av_store(av, 1, swash);
11620             SvREFCNT_dec(nonbitmap);
11621         }
11622         else {
11623             av_store(av, 1, NULL);
11624             if (nonbitmap) {
11625                 av_store(av, 3, nonbitmap);
11626                 av_store(av, 4, newSVuv(has_user_defined_property));
11627             }
11628         }
11629
11630         /* Store any computed multi-char folds only if we are allowing
11631          * them */
11632         if (allow_full_fold) {
11633             av_store(av, 2, MUTABLE_SV(unicode_alternate));
11634             if (unicode_alternate) { /* This node is variable length */
11635                 OP(ret) = ANYOFV;
11636             }
11637         }
11638         else {
11639             av_store(av, 2, NULL);
11640         }
11641         rv = newRV_noinc(MUTABLE_SV(av));
11642         n = add_data(pRExC_state, 1, "s");
11643         RExC_rxi->data->data[n] = (void*)rv;
11644         ARG_SET(ret, n);
11645     }
11646     return ret;
11647 }
11648
11649
11650 /* reg_skipcomment()
11651
11652    Absorbs an /x style # comments from the input stream.
11653    Returns true if there is more text remaining in the stream.
11654    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11655    terminates the pattern without including a newline.
11656
11657    Note its the callers responsibility to ensure that we are
11658    actually in /x mode
11659
11660 */
11661
11662 STATIC bool
11663 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11664 {
11665     bool ended = 0;
11666
11667     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11668
11669     while (RExC_parse < RExC_end)
11670         if (*RExC_parse++ == '\n') {
11671             ended = 1;
11672             break;
11673         }
11674     if (!ended) {
11675         /* we ran off the end of the pattern without ending
11676            the comment, so we have to add an \n when wrapping */
11677         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11678         return 0;
11679     } else
11680         return 1;
11681 }
11682
11683 /* nextchar()
11684
11685    Advances the parse position, and optionally absorbs
11686    "whitespace" from the inputstream.
11687
11688    Without /x "whitespace" means (?#...) style comments only,
11689    with /x this means (?#...) and # comments and whitespace proper.
11690
11691    Returns the RExC_parse point from BEFORE the scan occurs.
11692
11693    This is the /x friendly way of saying RExC_parse++.
11694 */
11695
11696 STATIC char*
11697 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11698 {
11699     char* const retval = RExC_parse++;
11700
11701     PERL_ARGS_ASSERT_NEXTCHAR;
11702
11703     for (;;) {
11704         if (RExC_end - RExC_parse >= 3
11705             && *RExC_parse == '('
11706             && RExC_parse[1] == '?'
11707             && RExC_parse[2] == '#')
11708         {
11709             while (*RExC_parse != ')') {
11710                 if (RExC_parse == RExC_end)
11711                     FAIL("Sequence (?#... not terminated");
11712                 RExC_parse++;
11713             }
11714             RExC_parse++;
11715             continue;
11716         }
11717         if (RExC_flags & RXf_PMf_EXTENDED) {
11718             if (isSPACE(*RExC_parse)) {
11719                 RExC_parse++;
11720                 continue;
11721             }
11722             else if (*RExC_parse == '#') {
11723                 if ( reg_skipcomment( pRExC_state ) )
11724                     continue;
11725             }
11726         }
11727         return retval;
11728     }
11729 }
11730
11731 /*
11732 - reg_node - emit a node
11733 */
11734 STATIC regnode *                        /* Location. */
11735 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11736 {
11737     dVAR;
11738     register regnode *ptr;
11739     regnode * const ret = RExC_emit;
11740     GET_RE_DEBUG_FLAGS_DECL;
11741
11742     PERL_ARGS_ASSERT_REG_NODE;
11743
11744     if (SIZE_ONLY) {
11745         SIZE_ALIGN(RExC_size);
11746         RExC_size += 1;
11747         return(ret);
11748     }
11749     if (RExC_emit >= RExC_emit_bound)
11750         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11751                    op, RExC_emit, RExC_emit_bound);
11752
11753     NODE_ALIGN_FILL(ret);
11754     ptr = ret;
11755     FILL_ADVANCE_NODE(ptr, op);
11756     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 1);
11757 #ifdef RE_TRACK_PATTERN_OFFSETS
11758     if (RExC_offsets) {         /* MJD */
11759         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11760               "reg_node", __LINE__,
11761               PL_reg_name[op],
11762               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11763                 ? "Overwriting end of array!\n" : "OK",
11764               (UV)(RExC_emit - RExC_emit_start),
11765               (UV)(RExC_parse - RExC_start),
11766               (UV)RExC_offsets[0]));
11767         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11768     }
11769 #endif
11770     RExC_emit = ptr;
11771     return(ret);
11772 }
11773
11774 /*
11775 - reganode - emit a node with an argument
11776 */
11777 STATIC regnode *                        /* Location. */
11778 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11779 {
11780     dVAR;
11781     register regnode *ptr;
11782     regnode * const ret = RExC_emit;
11783     GET_RE_DEBUG_FLAGS_DECL;
11784
11785     PERL_ARGS_ASSERT_REGANODE;
11786
11787     if (SIZE_ONLY) {
11788         SIZE_ALIGN(RExC_size);
11789         RExC_size += 2;
11790         /*
11791            We can't do this:
11792
11793            assert(2==regarglen[op]+1);
11794
11795            Anything larger than this has to allocate the extra amount.
11796            If we changed this to be:
11797
11798            RExC_size += (1 + regarglen[op]);
11799
11800            then it wouldn't matter. Its not clear what side effect
11801            might come from that so its not done so far.
11802            -- dmq
11803         */
11804         return(ret);
11805     }
11806     if (RExC_emit >= RExC_emit_bound)
11807         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11808                    op, RExC_emit, RExC_emit_bound);
11809
11810     NODE_ALIGN_FILL(ret);
11811     ptr = ret;
11812     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11813     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 2);
11814 #ifdef RE_TRACK_PATTERN_OFFSETS
11815     if (RExC_offsets) {         /* MJD */
11816         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11817               "reganode",
11818               __LINE__,
11819               PL_reg_name[op],
11820               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11821               "Overwriting end of array!\n" : "OK",
11822               (UV)(RExC_emit - RExC_emit_start),
11823               (UV)(RExC_parse - RExC_start),
11824               (UV)RExC_offsets[0]));
11825         Set_Cur_Node_Offset;
11826     }
11827 #endif
11828     RExC_emit = ptr;
11829     return(ret);
11830 }
11831
11832 /*
11833 - reguni - emit (if appropriate) a Unicode character
11834 */
11835 STATIC STRLEN
11836 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11837 {
11838     dVAR;
11839
11840     PERL_ARGS_ASSERT_REGUNI;
11841
11842     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11843 }
11844
11845 /*
11846 - reginsert - insert an operator in front of already-emitted operand
11847 *
11848 * Means relocating the operand.
11849 */
11850 STATIC void
11851 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11852 {
11853     dVAR;
11854     register regnode *src;
11855     register regnode *dst;
11856     register regnode *place;
11857     const int offset = regarglen[(U8)op];
11858     const int size = NODE_STEP_REGNODE + offset;
11859     GET_RE_DEBUG_FLAGS_DECL;
11860
11861     PERL_ARGS_ASSERT_REGINSERT;
11862     PERL_UNUSED_ARG(depth);
11863 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11864     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11865     if (SIZE_ONLY) {
11866         RExC_size += size;
11867         return;
11868     }
11869
11870     src = RExC_emit;
11871     RExC_emit += size;
11872     dst = RExC_emit;
11873     if (RExC_open_parens) {
11874         int paren;
11875         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11876         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11877             if ( RExC_open_parens[paren] >= opnd ) {
11878                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11879                 RExC_open_parens[paren] += size;
11880             } else {
11881                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11882             }
11883             if ( RExC_close_parens[paren] >= opnd ) {
11884                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11885                 RExC_close_parens[paren] += size;
11886             } else {
11887                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11888             }
11889         }
11890     }
11891
11892     while (src > opnd) {
11893         StructCopy(--src, --dst, regnode);
11894 #ifdef RE_TRACK_PATTERN_OFFSETS
11895         if (RExC_offsets) {     /* MJD 20010112 */
11896             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11897                   "reg_insert",
11898                   __LINE__,
11899                   PL_reg_name[op],
11900                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11901                     ? "Overwriting end of array!\n" : "OK",
11902                   (UV)(src - RExC_emit_start),
11903                   (UV)(dst - RExC_emit_start),
11904                   (UV)RExC_offsets[0]));
11905             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
11906             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
11907         }
11908 #endif
11909     }
11910
11911
11912     place = opnd;               /* Op node, where operand used to be. */
11913 #ifdef RE_TRACK_PATTERN_OFFSETS
11914     if (RExC_offsets) {         /* MJD */
11915         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11916               "reginsert",
11917               __LINE__,
11918               PL_reg_name[op],
11919               (UV)(place - RExC_emit_start) > RExC_offsets[0]
11920               ? "Overwriting end of array!\n" : "OK",
11921               (UV)(place - RExC_emit_start),
11922               (UV)(RExC_parse - RExC_start),
11923               (UV)RExC_offsets[0]));
11924         Set_Node_Offset(place, RExC_parse);
11925         Set_Node_Length(place, 1);
11926     }
11927 #endif
11928     src = NEXTOPER(place);
11929     FILL_ADVANCE_NODE(place, op);
11930     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (place) - 1);
11931     Zero(src, offset, regnode);
11932 }
11933
11934 /*
11935 - regtail - set the next-pointer at the end of a node chain of p to val.
11936 - SEE ALSO: regtail_study
11937 */
11938 /* TODO: All three parms should be const */
11939 STATIC void
11940 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11941 {
11942     dVAR;
11943     register regnode *scan;
11944     GET_RE_DEBUG_FLAGS_DECL;
11945
11946     PERL_ARGS_ASSERT_REGTAIL;
11947 #ifndef DEBUGGING
11948     PERL_UNUSED_ARG(depth);
11949 #endif
11950
11951     if (SIZE_ONLY)
11952         return;
11953
11954     /* Find last node. */
11955     scan = p;
11956     for (;;) {
11957         regnode * const temp = regnext(scan);
11958         DEBUG_PARSE_r({
11959             SV * const mysv=sv_newmortal();
11960             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
11961             regprop(RExC_rx, mysv, scan);
11962             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
11963                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
11964                     (temp == NULL ? "->" : ""),
11965                     (temp == NULL ? PL_reg_name[OP(val)] : "")
11966             );
11967         });
11968         if (temp == NULL)
11969             break;
11970         scan = temp;
11971     }
11972
11973     if (reg_off_by_arg[OP(scan)]) {
11974         ARG_SET(scan, val - scan);
11975     }
11976     else {
11977         NEXT_OFF(scan) = val - scan;
11978     }
11979 }
11980
#ifdef DEBUGGING
/*
- regtail_study - set the next-pointer at the end of a node chain of p to val.
- Look for optimizable sequences at the same time.
- currently only looks for EXACT chains.

This is experimental code. The idea is to use this routine to perform
in place optimizations on branches and groups as they are constructed,
with the long term intention of removing optimization from study_chunk so
that it is purely analytical.

Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
to control which is which.

The return value starts out as PSEUDO.  It becomes the opcode of the
EXACT-type nodes in the chain when they all share one opcode (NOTHING nodes
are ignored), and 0 as soon as two different EXACT-type opcodes or any other
kind of node is seen.

*/
/* TODO: All four parms should be const */

STATIC U8
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
{
    dVAR;
    register regnode *scan;
    regnode *next;
    U8 exact = PSEUDO;
#ifdef EXPERIMENTAL_INPLACESCAN
    I32 min = 0;
#endif
    GET_RE_DEBUG_FLAGS_DECL;

    PERL_ARGS_ASSERT_REGTAIL_STUDY;

    if (SIZE_ONLY)
        return exact;

    /* Walk to the last node of the chain, classifying nodes on the way;
     * 'scan' is left pointing at the final node. */
    for (scan = p; ; scan = next) {
        next = regnext(scan);
#ifdef EXPERIMENTAL_INPLACESCAN
        if (PL_regkind[OP(scan)] == EXACT) {
            bool has_exactf_sharp_s;    /* Unexamined in this routine */
            if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
                return EXACT;
        }
#endif
        /* Once 'exact' has dropped to 0 there is nothing left to learn. */
        if ( exact ) {
            switch (OP(scan)) {
                case NOTHING:
                    /* Neutral: does not affect the verdict. */
                    break;
                case EXACT:
                case EXACTF:
                case EXACTFA:
                case EXACTFU:
                case EXACTFU_SS:
                case EXACTFU_NO_TRIE:
                case EXACTFL:
                    if ( exact == PSEUDO )
                        exact = OP(scan);       /* first one seen */
                    else if ( exact != OP(scan) )
                        exact = 0;              /* mixed types */
                    break;
                default:
                    exact = 0;
                    break;
            }
        }
        DEBUG_PARSE_r({
            SV * const mysv=sv_newmortal();
            DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
            regprop(RExC_rx, mysv, scan);
            PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
                SvPV_nolen_const(mysv),
                REG_NODE_NUM(scan),
                PL_reg_name[exact]);
        });
        if (next == NULL)
            break;
    }
    DEBUG_PARSE_r({
        SV * const mysv_val=sv_newmortal();
        DEBUG_PARSE_MSG("");
        regprop(RExC_rx, mysv_val, val);
        PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
                      SvPV_nolen_const(mysv_val),
                      (IV)REG_NODE_NUM(val),
                      (IV)(val - scan)
        );
    });

    /* Hook val onto the end of the chain. */
    if (!reg_off_by_arg[OP(scan)]) {
        NEXT_OFF(scan) = val - scan;
    }
    else {
        ARG_SET(scan, val - scan);
    }

    return exact;
}
#endif
12079
12080 /*
12081  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12082  */
#ifdef DEBUGGING
/* Print to Perl_debug_log the name (from PL_reg_extflags_name) of every bit
 * set in 'flags', followed by the character set the flags select unless that
 * is REGEX_DEPENDS_CHARSET.  The charset bits themselves are skipped in the
 * per-bit loop.
 *
 * 'lead', if non-NULL, is printed once before the first item; in that case
 * the line is finished with a newline, or with "[none-set]" when nothing
 * was printed. */
static void
S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
{
    int bit;
    int set=0;
    regex_charset cs;

    for (bit=0; bit<32; bit++) {
        /* Build the mask in an unsigned type: shifting a signed 1 into the
         * sign bit (bit 31) is undefined behaviour. */
        const U32 mask = (U32)1 << bit;

        if (flags & mask) {
            if (mask & RXf_PMf_CHARSET) {       /* Output separately, below */
                continue;
            }
            if (!set++ && lead)
                PerlIO_printf(Perl_debug_log, "%s",lead);
            PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
        }
    }
    if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
            if (!set++ && lead) {
                PerlIO_printf(Perl_debug_log, "%s",lead);
            }
            switch (cs) {
                case REGEX_UNICODE_CHARSET:
                    PerlIO_printf(Perl_debug_log, "UNICODE");
                    break;
                case REGEX_LOCALE_CHARSET:
                    PerlIO_printf(Perl_debug_log, "LOCALE");
                    break;
                case REGEX_ASCII_RESTRICTED_CHARSET:
                    PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
                    break;
                case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
                    PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
                    break;
                default:
                    PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
                    break;
            }
    }
    if (lead)  {
        if (set)
            PerlIO_printf(Perl_debug_log, "\n");
        else
            PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
    }
}
#endif
12131
12132 void
12133 Perl_regdump(pTHX_ const regexp *r)
12134 {
12135 #ifdef DEBUGGING
12136     dVAR;
12137     SV * const sv = sv_newmortal();
12138     SV *dsv= sv_newmortal();
12139     RXi_GET_DECL(r,ri);
12140     GET_RE_DEBUG_FLAGS_DECL;
12141
12142     PERL_ARGS_ASSERT_REGDUMP;
12143
12144     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12145
12146     /* Header fields of interest. */
12147     if (r->anchored_substr) {
12148         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12149             RE_SV_DUMPLEN(r->anchored_substr), 30);
12150         PerlIO_printf(Perl_debug_log,
12151                       "anchored %s%s at %"IVdf" ",
12152                       s, RE_SV_TAIL(r->anchored_substr),
12153                       (IV)r->anchored_offset);
12154     } else if (r->anchored_utf8) {
12155         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12156             RE_SV_DUMPLEN(r->anchored_utf8), 30);
12157         PerlIO_printf(Perl_debug_log,
12158                       "anchored utf8 %s%s at %"IVdf" ",
12159                       s, RE_SV_TAIL(r->anchored_utf8),
12160                       (IV)r->anchored_offset);
12161     }
12162     if (r->float_substr) {
12163         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12164             RE_SV_DUMPLEN(r->float_substr), 30);
12165         PerlIO_printf(Perl_debug_log,
12166                       "floating %s%s at %"IVdf"..%"UVuf" ",
12167                       s, RE_SV_TAIL(r->float_substr),
12168                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12169     } else if (r->float_utf8) {
12170         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12171             RE_SV_DUMPLEN(r->float_utf8), 30);
12172         PerlIO_printf(Perl_debug_log,
12173                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12174                       s, RE_SV_TAIL(r->float_utf8),
12175                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12176     }
12177     if (r->check_substr || r->check_utf8)
12178         PerlIO_printf(Perl_debug_log,
12179                       (const char *)
12180                       (r->check_substr == r->float_substr
12181                        && r->check_utf8 == r->float_utf8
12182                        ? "(checking floating" : "(checking anchored"));
12183     if (r->extflags & RXf_NOSCAN)
12184         PerlIO_printf(Perl_debug_log, " noscan");
12185     if (r->extflags & RXf_CHECK_ALL)
12186         PerlIO_printf(Perl_debug_log, " isall");
12187     if (r->check_substr || r->check_utf8)
12188         PerlIO_printf(Perl_debug_log, ") ");
12189
12190     if (ri->regstclass) {
12191         regprop(r, sv, ri->regstclass);
12192         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12193     }
12194     if (r->extflags & RXf_ANCH) {
12195         PerlIO_printf(Perl_debug_log, "anchored");
12196         if (r->extflags & RXf_ANCH_BOL)
12197             PerlIO_printf(Perl_debug_log, "(BOL)");
12198         if (r->extflags & RXf_ANCH_MBOL)
12199             PerlIO_printf(Perl_debug_log, "(MBOL)");
12200         if (r->extflags & RXf_ANCH_SBOL)
12201             PerlIO_printf(Perl_debug_log, "(SBOL)");
12202         if (r->extflags & RXf_ANCH_GPOS)
12203             PerlIO_printf(Perl_debug_log, "(GPOS)");
12204         PerlIO_putc(Perl_debug_log, ' ');
12205     }
12206     if (r->extflags & RXf_GPOS_SEEN)
12207         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12208     if (r->intflags & PREGf_SKIP)
12209         PerlIO_printf(Perl_debug_log, "plus ");
12210     if (r->intflags & PREGf_IMPLICIT)
12211         PerlIO_printf(Perl_debug_log, "implicit ");
12212     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12213     if (r->extflags & RXf_EVAL_SEEN)
12214         PerlIO_printf(Perl_debug_log, "with eval ");
12215     PerlIO_printf(Perl_debug_log, "\n");
12216     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12217 #else
12218     PERL_ARGS_ASSERT_REGDUMP;
12219     PERL_UNUSED_CONTEXT;
12220     PERL_UNUSED_ARG(r);
12221 #endif  /* DEBUGGING */
12222 }
12223
12224 /*
12225 - regprop - printable representation of opcode
12226 */
/* When do_sep is true, append "][" (wrapped in the PL_colors[1]/PL_colors[0]
 * strings) to sv, repeat the "^" if flags has ANYOF_INVERT set, and clear
 * do_sep.  do_sep must be a modifiable lvalue.  Arguments are parenthesized
 * so that expressions may be passed safely. */
#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
STMT_START { \
        if (do_sep) {                           \
            Perl_sv_catpvf(aTHX_ (sv),"%s][%s",PL_colors[1],PL_colors[0]); \
            if ((flags) & ANYOF_INVERT)         \
                /*make sure the invert info is in each */ \
                sv_catpvs((sv), "^");           \
            (do_sep) = 0;                       \
        }                                       \
} STMT_END
12237
12238 void
12239 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12240 {
12241 #ifdef DEBUGGING
12242     dVAR;
12243     register int k;
12244     RXi_GET_DECL(prog,progi);
12245     GET_RE_DEBUG_FLAGS_DECL;
12246
12247     PERL_ARGS_ASSERT_REGPROP;
12248
12249     sv_setpvs(sv, "");
12250
12251     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
12252         /* It would be nice to FAIL() here, but this may be called from
12253            regexec.c, and it would be hard to supply pRExC_state. */
12254         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12255     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12256
12257     k = PL_regkind[OP(o)];
12258
12259     if (k == EXACT) {
12260         sv_catpvs(sv, " ");
12261         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12262          * is a crude hack but it may be the best for now since
12263          * we have no flag "this EXACTish node was UTF-8"
12264          * --jhi */
12265         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12266                   PERL_PV_ESCAPE_UNI_DETECT |
12267                   PERL_PV_ESCAPE_NONASCII   |
12268                   PERL_PV_PRETTY_ELLIPSES   |
12269                   PERL_PV_PRETTY_LTGT       |
12270                   PERL_PV_PRETTY_NOCLEAR
12271                   );
12272     } else if (k == TRIE) {
12273         /* print the details of the trie in dumpuntil instead, as
12274          * progi->data isn't available here */
12275         const char op = OP(o);
12276         const U32 n = ARG(o);
12277         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12278                (reg_ac_data *)progi->data->data[n] :
12279                NULL;
12280         const reg_trie_data * const trie
12281             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12282
12283         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12284         DEBUG_TRIE_COMPILE_r(
12285             Perl_sv_catpvf(aTHX_ sv,
12286                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12287                 (UV)trie->startstate,
12288                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12289                 (UV)trie->wordcount,
12290                 (UV)trie->minlen,
12291                 (UV)trie->maxlen,
12292                 (UV)TRIE_CHARCOUNT(trie),
12293                 (UV)trie->uniquecharcount
12294             )
12295         );
12296         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12297             int i;
12298             int rangestart = -1;
12299             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12300             sv_catpvs(sv, "[");
12301             for (i = 0; i <= 256; i++) {
12302                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12303                     if (rangestart == -1)
12304                         rangestart = i;
12305                 } else if (rangestart != -1) {
12306                     if (i <= rangestart + 3)
12307                         for (; rangestart < i; rangestart++)
12308                             put_byte(sv, rangestart);
12309                     else {
12310                         put_byte(sv, rangestart);
12311                         sv_catpvs(sv, "-");
12312                         put_byte(sv, i - 1);
12313                     }
12314                     rangestart = -1;
12315                 }
12316             }
12317             sv_catpvs(sv, "]");
12318         }
12319
12320     } else if (k == CURLY) {
12321         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12322             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12323         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12324     }
12325     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
12326         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12327     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12328         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
12329         if ( RXp_PAREN_NAMES(prog) ) {
12330             if ( k != REF || (OP(o) < NREF)) {
12331                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12332                 SV **name= av_fetch(list, ARG(o), 0 );
12333                 if (name)
12334                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12335             }
12336             else {
12337                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12338                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12339                 I32 *nums=(I32*)SvPVX(sv_dat);
12340                 SV **name= av_fetch(list, nums[0], 0 );
12341                 I32 n;
12342                 if (name) {
12343                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
12344                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12345                                     (n ? "," : ""), (IV)nums[n]);
12346                     }
12347                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12348                 }
12349             }
12350         }
12351     } else if (k == GOSUB)
12352         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12353     else if (k == VERB) {
12354         if (!o->flags)
12355             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12356                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12357     } else if (k == LOGICAL)
12358         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
12359     else if (k == ANYOF) {
12360         int i, rangestart = -1;
12361         const U8 flags = ANYOF_FLAGS(o);
12362         int do_sep = 0;
12363
12364         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12365         static const char * const anyofs[] = {
12366             "\\w",
12367             "\\W",
12368             "\\s",
12369             "\\S",
12370             "\\d",
12371             "\\D",
12372             "[:alnum:]",
12373             "[:^alnum:]",
12374             "[:alpha:]",
12375             "[:^alpha:]",
12376             "[:ascii:]",
12377             "[:^ascii:]",
12378             "[:cntrl:]",
12379             "[:^cntrl:]",
12380             "[:graph:]",
12381             "[:^graph:]",
12382             "[:lower:]",
12383             "[:^lower:]",
12384             "[:print:]",
12385             "[:^print:]",
12386             "[:punct:]",
12387             "[:^punct:]",
12388             "[:upper:]",
12389             "[:^upper:]",
12390             "[:xdigit:]",
12391             "[:^xdigit:]",
12392             "[:space:]",
12393             "[:^space:]",
12394             "[:blank:]",
12395             "[:^blank:]"
12396         };
12397
12398         if (flags & ANYOF_LOCALE)
12399             sv_catpvs(sv, "{loc}");
12400         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12401             sv_catpvs(sv, "{i}");
12402         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12403         if (flags & ANYOF_INVERT)
12404             sv_catpvs(sv, "^");
12405
12406         /* output what the standard cp 0-255 bitmap matches */
12407         for (i = 0; i <= 256; i++) {
12408             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12409                 if (rangestart == -1)
12410                     rangestart = i;
12411             } else if (rangestart != -1) {
12412                 if (i <= rangestart + 3)
12413                     for (; rangestart < i; rangestart++)
12414                         put_byte(sv, rangestart);
12415                 else {
12416                     put_byte(sv, rangestart);
12417                     sv_catpvs(sv, "-");
12418                     put_byte(sv, i - 1);
12419                 }
12420                 do_sep = 1;
12421                 rangestart = -1;
12422             }
12423         }
12424
12425         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12426         /* output any special charclass tests (used entirely under use locale) */
12427         if (ANYOF_CLASS_TEST_ANY_SET(o))
12428             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12429                 if (ANYOF_CLASS_TEST(o,i)) {
12430                     sv_catpv(sv, anyofs[i]);
12431                     do_sep = 1;
12432                 }
12433
12434         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12435
12436         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12437             sv_catpvs(sv, "{non-utf8-latin1-all}");
12438         }
12439
12440         /* output information about the unicode matching */
12441         if (flags & ANYOF_UNICODE_ALL)
12442             sv_catpvs(sv, "{unicode_all}");
12443         else if (ANYOF_NONBITMAP(o))
12444             sv_catpvs(sv, "{unicode}");
12445         if (flags & ANYOF_NONBITMAP_NON_UTF8)
12446             sv_catpvs(sv, "{outside bitmap}");
12447
12448         if (ANYOF_NONBITMAP(o)) {
12449             SV *lv; /* Set if there is something outside the bit map */
12450             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12451             bool byte_output = FALSE;   /* If something in the bitmap has been
12452                                            output */
12453
12454             if (lv && lv != &PL_sv_undef) {
12455                 if (sw) {
12456                     U8 s[UTF8_MAXBYTES_CASE+1];
12457
12458                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12459                         uvchr_to_utf8(s, i);
12460
12461                         if (i < 256
12462                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
12463                                                                things already
12464                                                                output as part
12465                                                                of the bitmap */
12466                             && swash_fetch(sw, s, TRUE))
12467                         {
12468                             if (rangestart == -1)
12469                                 rangestart = i;
12470                         } else if (rangestart != -1) {
12471                             byte_output = TRUE;
12472                             if (i <= rangestart + 3)
12473                                 for (; rangestart < i; rangestart++) {
12474                                     put_byte(sv, rangestart);
12475                                 }
12476                             else {
12477                                 put_byte(sv, rangestart);
12478                                 sv_catpvs(sv, "-");
12479                                 put_byte(sv, i-1);
12480                             }
12481                             rangestart = -1;
12482                         }
12483                     }
12484                 }
12485
12486                 {
12487                     char *s = savesvpv(lv);
12488                     char * const origs = s;
12489
12490                     while (*s && *s != '\n')
12491                         s++;
12492
12493                     if (*s == '\n') {
12494                         const char * const t = ++s;
12495
12496                         if (byte_output) {
12497                             sv_catpvs(sv, " ");
12498                         }
12499
12500                         while (*s) {
12501                             if (*s == '\n') {
12502
12503                                 /* Truncate very long output */
12504                                 if (s - origs > 256) {
12505                                     Perl_sv_catpvf(aTHX_ sv,
12506                                                    "%.*s...",
12507                                                    (int) (s - origs - 1),
12508                                                    t);
12509                                     goto out_dump;
12510                                 }
12511                                 *s = ' ';
12512                             }
12513                             else if (*s == '\t') {
12514                                 *s = '-';
12515                             }
12516                             s++;
12517                         }
12518                         if (s[-1] == ' ')
12519                             s[-1] = 0;
12520
12521                         sv_catpv(sv, t);
12522                     }
12523
12524                 out_dump:
12525
12526                     Safefree(origs);
12527                 }
12528                 SvREFCNT_dec(lv);
12529             }
12530         }
12531
12532         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12533     }
12534     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12535         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12536 #else
12537     PERL_UNUSED_CONTEXT;
12538     PERL_UNUSED_ARG(sv);
12539     PERL_UNUSED_ARG(o);
12540     PERL_UNUSED_ARG(prog);
12541 #endif  /* DEBUGGING */
12542 }
12543
12544 SV *
12545 Perl_re_intuit_string(pTHX_ REGEXP * const r)
12546 {                               /* Assume that RE_INTUIT is set */
12547     dVAR;
12548     struct regexp *const prog = (struct regexp *)SvANY(r);
12549     GET_RE_DEBUG_FLAGS_DECL;
12550
12551     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12552     PERL_UNUSED_CONTEXT;
12553
12554     DEBUG_COMPILE_r(
12555         {
12556             const char * const s = SvPV_nolen_const(prog->check_substr
12557                       ? prog->check_substr : prog->check_utf8);
12558
12559             if (!PL_colorset) reginitcolors();
12560             PerlIO_printf(Perl_debug_log,
12561                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12562                       PL_colors[4],
12563                       prog->check_substr ? "" : "utf8 ",
12564                       PL_colors[5],PL_colors[0],
12565                       s,
12566                       PL_colors[1],
12567                       (strlen(s) > 60 ? "..." : ""));
12568         } );
12569
12570     return prog->check_substr ? prog->check_substr : prog->check_utf8;
12571 }
12572
12573 /*
12574    pregfree()
12575
12576    handles refcounting and freeing the perl core regexp structure. When
12577    it is necessary to actually free the structure the first thing it
12578    does is call the 'free' method of the regexp_engine associated to
12579    the regexp, allowing the handling of the void *pprivate; member
12580    first. (This routine is not overridable by extensions, which is why
12581    the extensions free is called first.)
12582
12583    See regdupe and regdupe_internal if you change anything here.
12584 */
12585 #ifndef PERL_IN_XSUB_RE
/* Release one reference to the regexp r.  This is only a wrapper around
 * SvREFCNT_dec(); no fields of the regexp are touched here. */
void
Perl_pregfree(pTHX_ REGEXP *r)
{
    SvREFCNT_dec(r);
}
12591
12592 void
12593 Perl_pregfree2(pTHX_ REGEXP *rx)
12594 {
12595     dVAR;
12596     struct regexp *const r = (struct regexp *)SvANY(rx);
12597     GET_RE_DEBUG_FLAGS_DECL;
12598
12599     PERL_ARGS_ASSERT_PREGFREE2;
12600
12601     if (r->mother_re) {
12602         ReREFCNT_dec(r->mother_re);
12603     } else {
12604         CALLREGFREE_PVT(rx); /* free the private data */
12605         SvREFCNT_dec(RXp_PAREN_NAMES(r));
12606     }
12607     if (r->substrs) {
12608         SvREFCNT_dec(r->anchored_substr);
12609         SvREFCNT_dec(r->anchored_utf8);
12610         SvREFCNT_dec(r->float_substr);
12611         SvREFCNT_dec(r->float_utf8);
12612         Safefree(r->substrs);
12613     }
12614     RX_MATCH_COPY_FREE(rx);
12615 #ifdef PERL_OLD_COPY_ON_WRITE
12616     SvREFCNT_dec(r->saved_copy);
12617 #endif
12618     Safefree(r->offs);
12619 }
12620
12621 /*  reg_temp_copy()
12622
12623     This is a hacky workaround to the structural issue of match results
12624     being stored in the regexp structure which is in turn stored in
12625     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12626     could be PL_curpm in multiple contexts, and could require multiple
12627     result sets being associated with the pattern simultaneously, such
12628     as when doing a recursive match with (??{$qr})
12629
    The solution is to make a lightweight copy of the regexp structure
    when a qr// is returned from the code executed by (??{$qr}).  This
    lightweight copy doesn't actually own any of its data except for
    the start/end offsets and the actual regexp structure itself.
12634
12635 */
12636
12637
12638 REGEXP *
12639 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12640 {
12641     struct regexp *ret;
12642     struct regexp *const r = (struct regexp *)SvANY(rx);
12643     register const I32 npar = r->nparens+1;
12644
12645     PERL_ARGS_ASSERT_REG_TEMP_COPY;
12646
12647     if (!ret_x)
12648         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12649     ret = (struct regexp *)SvANY(ret_x);
12650
12651     (void)ReREFCNT_inc(rx);
12652     /* We can take advantage of the existing "copied buffer" mechanism in SVs
12653        by pointing directly at the buffer, but flagging that the allocated
12654        space in the copy is zero. As we've just done a struct copy, it's now
12655        a case of zero-ing that, rather than copying the current length.  */
12656     SvPV_set(ret_x, RX_WRAPPED(rx));
12657     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12658     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12659            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12660     SvLEN_set(ret_x, 0);
12661     SvSTASH_set(ret_x, NULL);
12662     SvMAGIC_set(ret_x, NULL);
12663     Newx(ret->offs, npar, regexp_paren_pair);
12664     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12665     if (r->substrs) {
12666         Newx(ret->substrs, 1, struct reg_substr_data);
12667         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12668
12669         SvREFCNT_inc_void(ret->anchored_substr);
12670         SvREFCNT_inc_void(ret->anchored_utf8);
12671         SvREFCNT_inc_void(ret->float_substr);
12672         SvREFCNT_inc_void(ret->float_utf8);
12673
12674         /* check_substr and check_utf8, if non-NULL, point to either their
12675            anchored or float namesakes, and don't hold a second reference.  */
12676     }
12677     RX_MATCH_COPIED_off(ret_x);
12678 #ifdef PERL_OLD_COPY_ON_WRITE
12679     ret->saved_copy = NULL;
12680 #endif
12681     ret->mother_re = rx;
12682
12683     return ret_x;
12684 }
12685 #endif
12686
12687 /* regfree_internal()
12688
12689    Free the private data in a regexp. This is overloadable by
12690    extensions. Perl takes care of the regexp structure in pregfree(),
12691    this covers the *pprivate pointer which technically perl doesn't
12692    know about, however of course we have to handle the
12693    regexp_internal structure when no extension is in use.
12694
12695    Note this is called before freeing anything in the regexp
12696    structure.
12697  */
12698
12699 void
12700 Perl_regfree_internal(pTHX_ REGEXP * const rx)
12701 {
12702     dVAR;
12703     struct regexp *const r = (struct regexp *)SvANY(rx);
12704     RXi_GET_DECL(r,ri);
12705     GET_RE_DEBUG_FLAGS_DECL;
12706
12707     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12708
12709     DEBUG_COMPILE_r({
12710         if (!PL_colorset)
12711             reginitcolors();
12712         {
12713             SV *dsv= sv_newmortal();
12714             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12715                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12716             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12717                 PL_colors[4],PL_colors[5],s);
12718         }
12719     });
12720 #ifdef RE_TRACK_PATTERN_OFFSETS
12721     if (ri->u.offsets)
12722         Safefree(ri->u.offsets);             /* 20010421 MJD */
12723 #endif
12724     if (ri->data) {
12725         int n = ri->data->count;
12726         PAD* new_comppad = NULL;
12727         PAD* old_comppad;
12728         PADOFFSET refcnt;
12729
12730         while (--n >= 0) {
12731           /* If you add a ->what type here, update the comment in regcomp.h */
12732             switch (ri->data->what[n]) {
12733             case 'a':
12734             case 's':
12735             case 'S':
12736             case 'u':
12737                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12738                 break;
12739             case 'f':
12740                 Safefree(ri->data->data[n]);
12741                 break;
12742             case 'p':
12743                 new_comppad = MUTABLE_AV(ri->data->data[n]);
12744                 break;
12745             case 'o':
12746                 if (new_comppad == NULL)
12747                     Perl_croak(aTHX_ "panic: pregfree comppad");
12748                 PAD_SAVE_LOCAL(old_comppad,
12749                     /* Watch out for global destruction's random ordering. */
12750                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12751                 );
12752                 OP_REFCNT_LOCK;
12753                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12754                 OP_REFCNT_UNLOCK;
12755                 if (!refcnt)
12756                     op_free((OP_4tree*)ri->data->data[n]);
12757
12758                 PAD_RESTORE_LOCAL(old_comppad);
12759                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12760                 new_comppad = NULL;
12761                 break;
12762             case 'n':
12763                 break;
12764             case 'T':
12765                 { /* Aho Corasick add-on structure for a trie node.
12766                      Used in stclass optimization only */
12767                     U32 refcount;
12768                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12769                     OP_REFCNT_LOCK;
12770                     refcount = --aho->refcount;
12771                     OP_REFCNT_UNLOCK;
12772                     if ( !refcount ) {
12773                         PerlMemShared_free(aho->states);
12774                         PerlMemShared_free(aho->fail);
12775                          /* do this last!!!! */
12776                         PerlMemShared_free(ri->data->data[n]);
12777                         PerlMemShared_free(ri->regstclass);
12778                     }
12779                 }
12780                 break;
12781             case 't':
12782                 {
12783                     /* trie structure. */
12784                     U32 refcount;
12785                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12786                     OP_REFCNT_LOCK;
12787                     refcount = --trie->refcount;
12788                     OP_REFCNT_UNLOCK;
12789                     if ( !refcount ) {
12790                         PerlMemShared_free(trie->charmap);
12791                         PerlMemShared_free(trie->states);
12792                         PerlMemShared_free(trie->trans);
12793                         if (trie->bitmap)
12794                             PerlMemShared_free(trie->bitmap);
12795                         if (trie->jump)
12796                             PerlMemShared_free(trie->jump);
12797                         PerlMemShared_free(trie->wordinfo);
12798                         /* do this last!!!! */
12799                         PerlMemShared_free(ri->data->data[n]);
12800                     }
12801                 }
12802                 break;
12803             default:
12804                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12805             }
12806         }
12807         Safefree(ri->data->what);
12808         Safefree(ri->data);
12809     }
12810
12811     Safefree(ri);
12812 }
12813
/* Typed wrappers around sv_dup_inc() for AVs and HVs, plus a NULL-tolerant
 * savepvn().  Arguments are parenthesized so that any expression may be
 * passed; note that SAVEPVN evaluates p twice. */
#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)(s),(t)))
#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)(s),(t)))
#define SAVEPVN(p,n)    ((p) ? savepvn((p),(n)) : NULL)
12817
12818 /*
12819    re_dup - duplicate a regexp.
12820
12821    This routine is expected to clone a given regexp structure. It is only
12822    compiled under USE_ITHREADS.
12823
12824    After all of the core data stored in struct regexp is duplicated
12825    the regexp_engine.dupe method is used to copy any private data
12826    stored in the *pprivate pointer. This allows extensions to handle
12827    any duplication it needs to do.
12828
12829    See pregfree() and regfree_internal() if you change anything here.
12830 */
12831 #if defined(USE_ITHREADS)
12832 #ifndef PERL_IN_XSUB_RE
12833 void
12834 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12835 {
12836     dVAR;
12837     I32 npar;
12838     const struct regexp *r = (const struct regexp *)SvANY(sstr);
12839     struct regexp *ret = (struct regexp *)SvANY(dstr);
12840
12841     PERL_ARGS_ASSERT_RE_DUP_GUTS;
12842
12843     npar = r->nparens+1;
12844     Newx(ret->offs, npar, regexp_paren_pair);
12845     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12846     if(ret->swap) {
12847         /* no need to copy these */
12848         Newx(ret->swap, npar, regexp_paren_pair);
12849     }
12850
12851     if (ret->substrs) {
12852         /* Do it this way to avoid reading from *r after the StructCopy().
12853            That way, if any of the sv_dup_inc()s dislodge *r from the L1
12854            cache, it doesn't matter.  */
12855         const bool anchored = r->check_substr
12856             ? r->check_substr == r->anchored_substr
12857             : r->check_utf8 == r->anchored_utf8;
12858         Newx(ret->substrs, 1, struct reg_substr_data);
12859         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12860
12861         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12862         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12863         ret->float_substr = sv_dup_inc(ret->float_substr, param);
12864         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12865
12866         /* check_substr and check_utf8, if non-NULL, point to either their
12867            anchored or float namesakes, and don't hold a second reference.  */
12868
12869         if (ret->check_substr) {
12870             if (anchored) {
12871                 assert(r->check_utf8 == r->anchored_utf8);
12872                 ret->check_substr = ret->anchored_substr;
12873                 ret->check_utf8 = ret->anchored_utf8;
12874             } else {
12875                 assert(r->check_substr == r->float_substr);
12876                 assert(r->check_utf8 == r->float_utf8);
12877                 ret->check_substr = ret->float_substr;
12878                 ret->check_utf8 = ret->float_utf8;
12879             }
12880         } else if (ret->check_utf8) {
12881             if (anchored) {
12882                 ret->check_utf8 = ret->anchored_utf8;
12883             } else {
12884                 ret->check_utf8 = ret->float_utf8;
12885             }
12886         }
12887     }
12888
12889     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12890
12891     if (ret->pprivate)
12892         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12893
12894     if (RX_MATCH_COPIED(dstr))
12895         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
12896     else
12897         ret->subbeg = NULL;
12898 #ifdef PERL_OLD_COPY_ON_WRITE
12899     ret->saved_copy = NULL;
12900 #endif
12901
12902     if (ret->mother_re) {
12903         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12904             /* Our storage points directly to our mother regexp, but that's
12905                1: a buffer in a different thread
12906                2: something we no longer hold a reference on
12907                so we need to copy it locally.  */
12908             /* Note we need to use SvCUR(), rather than
12909                SvLEN(), on our mother_re, because it, in
12910                turn, may well be pointing to its own mother_re.  */
12911             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
12912                                    SvCUR(ret->mother_re)+1));
12913             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
12914         }
12915         ret->mother_re      = NULL;
12916     }
12917     ret->gofs = 0;
12918 }
12919 #endif /* PERL_IN_XSUB_RE */
12920
12921 /*
12922    regdupe_internal()
12923
12924    This is the internal complement to regdupe() which is used to copy
12925    the structure pointed to by the *pprivate pointer in the regexp.
12926    This is the core version of the extension overridable cloning hook.
12927    The regexp structure being duplicated will be copied by perl prior
12928    to this and will be provided as the regexp *r argument, however
12929    with the /old/ structures pprivate pointer value. Thus this routine
12930    may override any copying normally done by perl.
12931
12932    It returns a pointer to the new regexp_internal structure.
12933 */
12934
12935 void *
12936 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
12937 {
12938     dVAR;
12939     struct regexp *const r = (struct regexp *)SvANY(rx);
12940     regexp_internal *reti;
12941     int len;
12942     RXi_GET_DECL(r,ri);
12943
12944     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
12945
12946     len = ProgLen(ri);
12947
12948     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
12949     Copy(ri->program, reti->program, len+1, regnode);
12950
12951
12952     reti->regstclass = NULL;
12953
12954     if (ri->data) {
12955         struct reg_data *d;
12956         const int count = ri->data->count;
12957         int i;
12958
12959         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
12960                 char, struct reg_data);
12961         Newx(d->what, count, U8);
12962
12963         d->count = count;
12964         for (i = 0; i < count; i++) {
12965             d->what[i] = ri->data->what[i];
12966             switch (d->what[i]) {
12967                 /* legal options are one of: sSfpontTua
12968                    see also regcomp.h and pregfree() */
12969             case 'a': /* actually an AV, but the dup function is identical.  */
12970             case 's':
12971             case 'S':
12972             case 'p': /* actually an AV, but the dup function is identical.  */
12973             case 'u': /* actually an HV, but the dup function is identical.  */
12974                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
12975                 break;
12976             case 'f':
12977                 /* This is cheating. */
12978                 Newx(d->data[i], 1, struct regnode_charclass_class);
12979                 StructCopy(ri->data->data[i], d->data[i],
12980                             struct regnode_charclass_class);
12981                 reti->regstclass = (regnode*)d->data[i];
12982                 break;
12983             case 'o':
12984                 /* Compiled op trees are readonly and in shared memory,
12985                    and can thus be shared without duplication. */
12986                 OP_REFCNT_LOCK;
12987                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
12988                 OP_REFCNT_UNLOCK;
12989                 break;
12990             case 'T':
12991                 /* Trie stclasses are readonly and can thus be shared
12992                  * without duplication. We free the stclass in pregfree
12993                  * when the corresponding reg_ac_data struct is freed.
12994                  */
12995                 reti->regstclass= ri->regstclass;
12996                 /* Fall through */
12997             case 't':
12998                 OP_REFCNT_LOCK;
12999                 ((reg_trie_data*)ri->data->data[i])->refcount++;
13000                 OP_REFCNT_UNLOCK;
13001                 /* Fall through */
13002             case 'n':
13003                 d->data[i] = ri->data->data[i];
13004                 break;
13005             default:
13006                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13007             }
13008         }
13009
13010         reti->data = d;
13011     }
13012     else
13013         reti->data = NULL;
13014
13015     reti->name_list_idx = ri->name_list_idx;
13016
13017 #ifdef RE_TRACK_PATTERN_OFFSETS
13018     if (ri->u.offsets) {
13019         Newx(reti->u.offsets, 2*len+1, U32);
13020         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13021     }
13022 #else
13023     SetProgLen(reti,len);
13024 #endif
13025
13026     return (void*)reti;
13027 }
13028
13029 #endif    /* USE_ITHREADS */
13030
13031 #ifndef PERL_IN_XSUB_RE
13032
13033 /*
13034  - regnext - dig the "next" pointer out of a node
13035  */
13036 regnode *
13037 Perl_regnext(pTHX_ register regnode *p)
13038 {
13039     dVAR;
13040     register I32 offset;
13041
13042     if (!p)
13043         return(NULL);
13044
13045     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
13046         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13047     }
13048
13049     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13050     if (offset == 0)
13051         return(NULL);
13052
13053     return(p+offset);
13054 }
13055 #endif
13056
13057 STATIC void
13058 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13059 {
13060     va_list args;
13061     STRLEN l1 = strlen(pat1);
13062     STRLEN l2 = strlen(pat2);
13063     char buf[512];
13064     SV *msv;
13065     const char *message;
13066
13067     PERL_ARGS_ASSERT_RE_CROAK2;
13068
13069     if (l1 > 510)
13070         l1 = 510;
13071     if (l1 + l2 > 510)
13072         l2 = 510 - l1;
13073     Copy(pat1, buf, l1 , char);
13074     Copy(pat2, buf + l1, l2 , char);
13075     buf[l1 + l2] = '\n';
13076     buf[l1 + l2 + 1] = '\0';
13077 #ifdef I_STDARG
13078     /* ANSI variant takes additional second argument */
13079     va_start(args, pat2);
13080 #else
13081     va_start(args);
13082 #endif
13083     msv = vmess(buf, &args);
13084     va_end(args);
13085     message = SvPV_const(msv,l1);
13086     if (l1 > 512)
13087         l1 = 512;
13088     Copy(message, buf, l1 , char);
13089     buf[l1-1] = '\0';                   /* Overwrite \n */
13090     Perl_croak(aTHX_ "%s", buf);
13091 }
13092
13093 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13094
13095 #ifndef PERL_IN_XSUB_RE
13096 void
13097 Perl_save_re_context(pTHX)
13098 {
13099     dVAR;
13100
13101     struct re_save_state *state;
13102
13103     SAVEVPTR(PL_curcop);
13104     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13105
13106     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13107     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13108     SSPUSHUV(SAVEt_RE_STATE);
13109
13110     Copy(&PL_reg_state, state, 1, struct re_save_state);
13111
13112     PL_reg_start_tmp = 0;
13113     PL_reg_start_tmpl = 0;
13114     PL_reg_oldsaved = NULL;
13115     PL_reg_oldsavedlen = 0;
13116     PL_reg_maxiter = 0;
13117     PL_reg_leftiter = 0;
13118     PL_reg_poscache = NULL;
13119     PL_reg_poscache_size = 0;
13120 #ifdef PERL_OLD_COPY_ON_WRITE
13121     PL_nrs = NULL;
13122 #endif
13123
13124     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13125     if (PL_curpm) {
13126         const REGEXP * const rx = PM_GETRE(PL_curpm);
13127         if (rx) {
13128             U32 i;
13129             for (i = 1; i <= RX_NPARENS(rx); i++) {
13130                 char digits[TYPE_CHARS(long)];
13131                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13132                 GV *const *const gvp
13133                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13134
13135                 if (gvp) {
13136                     GV * const gv = *gvp;
13137                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13138                         save_scalar(gv);
13139                 }
13140             }
13141         }
13142     }
13143 }
13144 #endif
13145
/* Drop one reference on the regexp passed as an untyped pointer; r is
 * cast to REGEXP * and handed to ReREFCNT_dec(). */
static void
clear_re(pTHX_ void *r)
{
    dVAR;
    ReREFCNT_dec((REGEXP *)r);
}
13152
13153 #ifdef DEBUGGING
13154
13155 STATIC void
13156 S_put_byte(pTHX_ SV *sv, int c)
13157 {
13158     PERL_ARGS_ASSERT_PUT_BYTE;
13159
13160     /* Our definition of isPRINT() ignores locales, so only bytes that are
13161        not part of UTF-8 are considered printable. I assume that the same
13162        holds for UTF-EBCDIC.
13163        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13164        which Wikipedia says:
13165
13166        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13167        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13168        identical, to the ASCII delete (DEL) or rubout control character.
13169        ) So the old condition can be simplified to !isPRINT(c)  */
13170     if (!isPRINT(c)) {
13171         if (c < 256) {
13172             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13173         }
13174         else {
13175             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13176         }
13177     }
13178     else {
13179         const char string = c;
13180         if (c == '-' || c == ']' || c == '\\' || c == '^')
13181             sv_catpvs(sv, "\\");
13182         sv_catpvn(sv, &string, 1);
13183     }
13184 }
13185
13186
/* Helper macros for S_dumpuntil(); they use its locals by name
 * (optstart, node, r, start, last, sv, indent, depth).
 *
 * CLEAR_OPTSTART: if a run of OPTIMIZED nodes is pending (optstart set),
 * print how many nodes it spanned and forget it.
 *
 * DUMPUNTIL(b,e): flush any pending run, then recurse over [b, e) one
 * indent/depth level deeper and continue from the node returned.
 *
 * Both expand to a single STMT_START/STMT_END statement, so they are safe
 * in unbraced if/else arms and must be followed by a semicolon. */
#define CLEAR_OPTSTART \
    STMT_START { \
        if (optstart) { \
            DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
            optstart=NULL; \
        } \
    } STMT_END

#define DUMPUNTIL(b,e) \
    STMT_START { \
        CLEAR_OPTSTART; \
        node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); \
    } STMT_END
13194
13195 STATIC const regnode *
13196 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13197             const regnode *last, const regnode *plast,
13198             SV* sv, I32 indent, U32 depth)
13199 {
13200     dVAR;
13201     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
13202     register const regnode *next;
13203     const regnode *optstart= NULL;
13204
13205     RXi_GET_DECL(r,ri);
13206     GET_RE_DEBUG_FLAGS_DECL;
13207
13208     PERL_ARGS_ASSERT_DUMPUNTIL;
13209
13210 #ifdef DEBUG_DUMPUNTIL
13211     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13212         last ? last-start : 0,plast ? plast-start : 0);
13213 #endif
13214
13215     if (plast && plast < last)
13216         last= plast;
13217
13218     while (PL_regkind[op] != END && (!last || node < last)) {
13219         /* While that wasn't END last time... */
13220         NODE_ALIGN(node);
13221         op = OP(node);
13222         if (op == CLOSE || op == WHILEM)
13223             indent--;
13224         next = regnext((regnode *)node);
13225
13226         /* Where, what. */
13227         if (OP(node) == OPTIMIZED) {
13228             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13229                 optstart = node;
13230             else
13231                 goto after_print;
13232         } else
13233             CLEAR_OPTSTART;
13234
13235         regprop(r, sv, node);
13236         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13237                       (int)(2*indent + 1), "", SvPVX_const(sv));
13238
13239         if (OP(node) != OPTIMIZED) {
13240             if (next == NULL)           /* Next ptr. */
13241                 PerlIO_printf(Perl_debug_log, " (0)");
13242             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13243                 PerlIO_printf(Perl_debug_log, " (FAIL)");
13244             else
13245                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13246             (void)PerlIO_putc(Perl_debug_log, '\n');
13247         }
13248
13249       after_print:
13250         if (PL_regkind[(U8)op] == BRANCHJ) {
13251             assert(next);
13252             {
13253                 register const regnode *nnode = (OP(next) == LONGJMP
13254                                              ? regnext((regnode *)next)
13255                                              : next);
13256                 if (last && nnode > last)
13257                     nnode = last;
13258                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13259             }
13260         }
13261         else if (PL_regkind[(U8)op] == BRANCH) {
13262             assert(next);
13263             DUMPUNTIL(NEXTOPER(node), next);
13264         }
13265         else if ( PL_regkind[(U8)op]  == TRIE ) {
13266             const regnode *this_trie = node;
13267             const char op = OP(node);
13268             const U32 n = ARG(node);
13269             const reg_ac_data * const ac = op>=AHOCORASICK ?
13270                (reg_ac_data *)ri->data->data[n] :
13271                NULL;
13272             const reg_trie_data * const trie =
13273                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13274 #ifdef DEBUGGING
13275             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13276 #endif
13277             const regnode *nextbranch= NULL;
13278             I32 word_idx;
13279             sv_setpvs(sv, "");
13280             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13281                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13282
13283                 PerlIO_printf(Perl_debug_log, "%*s%s ",
13284                    (int)(2*(indent+3)), "",
13285                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13286                             PL_colors[0], PL_colors[1],
13287                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13288                             PERL_PV_PRETTY_ELLIPSES    |
13289                             PERL_PV_PRETTY_LTGT
13290                             )
13291                             : "???"
13292                 );
13293                 if (trie->jump) {
13294                     U16 dist= trie->jump[word_idx+1];
13295                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13296                                   (UV)((dist ? this_trie + dist : next) - start));
13297                     if (dist) {
13298                         if (!nextbranch)
13299                             nextbranch= this_trie + trie->jump[0];
13300                         DUMPUNTIL(this_trie + dist, nextbranch);
13301                     }
13302                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13303                         nextbranch= regnext((regnode *)nextbranch);
13304                 } else {
13305                     PerlIO_printf(Perl_debug_log, "\n");
13306                 }
13307             }
13308             if (last && next > last)
13309                 node= last;
13310             else
13311                 node= next;
13312         }
13313         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
13314             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13315                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13316         }
13317         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13318             assert(next);
13319             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13320         }
13321         else if ( op == PLUS || op == STAR) {
13322             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13323         }
13324         else if (PL_regkind[(U8)op] == ANYOF) {
13325             /* arglen 1 + class block */
13326             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13327                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13328             node = NEXTOPER(node);
13329         }
13330         else if (PL_regkind[(U8)op] == EXACT) {
13331             /* Literal string, where present. */
13332             node += NODE_SZ_STR(node) - 1;
13333             node = NEXTOPER(node);
13334         }
13335         else {
13336             node = NEXTOPER(node);
13337             node += regarglen[(U8)op];
13338         }
13339         if (op == CURLYX || op == OPEN)
13340             indent++;
13341     }
13342     CLEAR_OPTSTART;
13343 #ifdef DEBUG_DUMPUNTIL
13344     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13345 #endif
13346     return node;
13347 }
13348
13349 #endif  /* DEBUGGING */
13350
13351 /*
13352  * Local variables:
13353  * c-indentation-style: bsd
13354  * c-basic-offset: 4
13355  * indent-tabs-mode: t
13356  * End:
13357  *
13358  * ex: set ts=8 sts=4 sw=4 noet:
13359  */