src/5015008/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * be confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89 #ifndef PERL_IN_XSUB_RE
  90 #  include "charclass_invlists.h"
  91 #endif
  92
  93 #ifdef op
  94 #undef op
  95 #endif /* op */
  96
  97 #ifdef MSDOS
  98 #  if defined(BUGGY_MSC6)
  99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 100 #    pragma optimize("a",off)
 101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 102 #    pragma optimize("w",on )
 103 #  endif /* BUGGY_MSC6 */
 104 #endif /* MSDOS */
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 typedef struct RExC_state_t {   /* state for one regex compile; fields are reached through the RExC_* macros */
 111     U32         flags;                  /* are we folding, multilining? */
 112     char        *precomp;               /* uncompiled string. */
 113     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 114     regexp      *rx;                    /* perl core regexp structure */
 115     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 116     char        *start;                 /* Start of input for compile */
 117     char        *end;                   /* End of input for compile */
 118     char        *parse;                 /* Input-scan pointer. */
 119     I32         whilem_seen;            /* number of WHILEM in this expr */
 120     regnode     *emit_start;            /* Start of emitted-code area */
 121     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 122     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't emit
                                               (NOTE(review): presumably the sizing pass,
                                               cf. SIZE_ONLY -- confirm) */
 123     I32         naughty;                /* How bad is this pattern? */
 124     I32         sawback;                /* Did we see \1, ...? */
 125     U32         seen;                   /* NOTE(review): presumably a bit set of constructs seen so far -- confirm */
 126     I32         size;                   /* Code size. */
 127     I32         npar;                   /* Capture buffer count, (OPEN). */
 128     I32         cpar;                   /* Capture buffer count, (CLOSE); no RExC_cpar accessor is defined with the others */
 129     I32         nestroot;               /* root parens we are in - used by accept */
 130     I32         extralen;
 131     I32         seen_zerolen;
 132     I32         seen_evals;             /* NOTE(review): presumably counts code blocks seen -- confirm */
 133     regnode     **open_parens;          /* pointers to open parens */
 134     regnode     **close_parens;         /* pointers to close parens */
 135     regnode     *opend;                 /* END node in program */
 136     I32         utf8;           /* whether the pattern is utf8 or not */
 137     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 138                                 /* XXX use this for future optimisation of case
 139                                  * where pattern must be upgraded to utf8. */
 140     I32         uni_semantics;  /* If a d charset modifier should use unicode
 141                                    rules, even if the pattern is not in
 142                                    utf8 */
 143     HV          *paren_names;           /* Paren names */
 144
 145     regnode     **recurse;              /* Recurse regops */
 146     I32         recurse_count;          /* Number of recurse regops */
 147     I32         in_lookbehind;          /* NOTE(review): presumably nonzero while parsing inside a lookbehind -- confirm */
 148     I32         contains_locale;        /* NOTE(review): presumably set when part of the pattern uses locale rules -- confirm */
 149     I32         override_recoding;
 150 #if ADD_TO_REGEXEC                      /* this field and its accessor exist only under this guard */
 151     char        *starttry;              /* -Dr: where regtry was called. */
 152 #define RExC_starttry   (pRExC_state->starttry)
 153 #endif
 154 #ifdef DEBUGGING                        /* debug-only fields; their accessors are defined under the same guard */
 155     const char  *lastparse;
 156     I32         lastnum;
 157     AV          *paren_name_list;       /* idx -> name */
 158 #define RExC_lastparse  (pRExC_state->lastparse)
 159 #define RExC_lastnum    (pRExC_state->lastnum)
 160 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 161 #endif
 162 } RExC_state_t;
 163 /* Shorthand accessors for the fields of RExC_state_t.  Each one expands to
      * a member access through a pointer named pRExC_state, so a variable of
      * that name must be in scope wherever one of these macros is used.  The
      * expansions are plain member accesses and can therefore be assigned to.
      */
 164 #define RExC_flags      (pRExC_state->flags)
 165 #define RExC_precomp    (pRExC_state->precomp)
 166 #define RExC_rx_sv      (pRExC_state->rx_sv)
 167 #define RExC_rx         (pRExC_state->rx)
 168 #define RExC_rxi        (pRExC_state->rxi)
 169 #define RExC_start      (pRExC_state->start)
 170 #define RExC_end        (pRExC_state->end)
 171 #define RExC_parse      (pRExC_state->parse)
 172 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 173 #ifdef RE_TRACK_PATTERN_OFFSETS
 174 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others: reaches through rxi, not a direct state field */
 175 #endif
 176 #define RExC_emit       (pRExC_state->emit)
 177 #define RExC_emit_start (pRExC_state->emit_start)
 178 #define RExC_emit_bound (pRExC_state->emit_bound)
 179 #define RExC_naughty    (pRExC_state->naughty)
 180 #define RExC_sawback    (pRExC_state->sawback)
 181 #define RExC_seen       (pRExC_state->seen)
 182 #define RExC_size       (pRExC_state->size)
 183 #define RExC_npar       (pRExC_state->npar)
 184 #define RExC_nestroot   (pRExC_state->nestroot)
 185 #define RExC_extralen   (pRExC_state->extralen)
 186 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 187 #define RExC_seen_evals (pRExC_state->seen_evals)
 188 #define RExC_utf8       (pRExC_state->utf8)
 189 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 190 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 191 #define RExC_open_parens        (pRExC_state->open_parens)
 192 #define RExC_close_parens       (pRExC_state->close_parens)
 193 #define RExC_opend      (pRExC_state->opend)
 194 #define RExC_paren_names        (pRExC_state->paren_names)
 195 #define RExC_recurse    (pRExC_state->recurse)
 196 #define RExC_recurse_count      (pRExC_state->recurse_count)
 197 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 198 #define RExC_contains_locale    (pRExC_state->contains_locale)
 199 #define RExC_override_recoding  (pRExC_state->override_recoding)
 200
 201 /* ISMULT1(c): true if the character c is one of the single-character
      * quantifiers '*', '+' or '?'.
      * ISMULT2(s): true if the pointer s points at the start of a quantifier:
      * one of those three characters, or a '{' for which regcurly(s) is true.
      * Both macros evaluate their argument more than once, so it must be free
      * of side effects.  The argument is parenthesised before dereferencing so
      * that an expression such as ISMULT2(p + 1) tests *(p + 1), not (*p) + 1.
      */
 202 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 203 #define ISMULT2(s)      ((*(s)) == '*' || (*(s)) == '+' || (*(s)) == '?' || \
 204         ((*(s)) == '{' && regcurly(s)))
 205
 206 #ifdef SPSTART
 207 #undef SPSTART          /* dratted cpp namespace... */
 208 #endif
 209 /*
 210  * Flags to be passed up and down.
 211  */
 212 #define WORST           0       /* Worst case. */
 213 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 214
 215 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 216  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 217 #define SIMPLE          0x02
 218 #define SPSTART         0x04    /* Starts with * or +. */
 219 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 220 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 221
 222 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 223
 224 /* whether trie related optimizations are enabled */
 225 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 226 #define TRIE_STUDY_OPT
 227 #define FULL_TRIE_STUDY
 228 #define TRIE_STCLASS
 229 #endif
 230
 231
 232 /* Bit-vector helpers: u8str is treated as an array of U8 in which bit
      * (paren & 7) of byte (paren >> 3) stands for index 'paren'.
      *   PBYTE       the byte holding the bit (usable as an lvalue)
      *   PBITVAL     the mask for the bit within that byte
      *   PAREN_TEST  non-zero if the bit is set
      *   PAREN_SET / PAREN_UNSET  set / clear the bit
      * 'paren' is evaluated more than once.  Every expansion is fully
      * parenthesised so it can be used inside a larger expression.
      */
 233 #define PBYTE(u8str,paren) (((U8*)(u8str))[(paren) >> 3])
 234 #define PBITVAL(paren) (1 << ((paren) & 7))
 235 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 236 #define PAREN_SET(u8str,paren) (PBYTE(u8str,paren) |= PBITVAL(paren))
 237 #define PAREN_UNSET(u8str,paren) (PBYTE(u8str,paren) &= (~PBITVAL(paren)))
 238
 239 /* If not already in utf8, do a longjmp back to the beginning */
 240 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 241 #define REQUIRE_UTF8    STMT_START {                                       \
 242                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 243                         } STMT_END
 244
 245 /* About scan_data_t.
 246
 247   During optimisation we recurse through the regexp program performing
 248   various inplace (keyhole style) optimisations. In addition study_chunk
 249   and scan_commit populate this data structure with information about
 250   what strings MUST appear in the pattern. We look for the longest
 251   string that must appear at a fixed location, and we look for the
 252   longest string that may appear at a floating location. So for instance
 253   in the pattern:
 254
 255     /FOO[xX]A.*B[xX]BAR/
 256
 257   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 258   strings (because they follow a .* construct). study_chunk will identify
 259   both FOO and BAR as being the longest fixed and floating strings respectively.
 260
 261   The strings can be composites, for instance
 262
 263      /(f)(o)(o)/
 264
 265   will result in a composite fixed substring 'foo'.
 266
 267   For each string some basic information is maintained:
 268
 269   - offset or min_offset
 270     This is the position the string must appear at, or not before.
 271     It also implicitly (when combined with minlenp) tells us how many
 272     characters must match before the string we are searching for.
 273     Likewise when combined with minlenp and the length of the string it
 274     tells us how many characters must appear after the string we have
 275     found.
 276
 277   - max_offset
 278     Only used for floating strings. This is the rightmost point that
 279     the string can appear at. If set to I32 max it indicates that the
 280     string can occur infinitely far to the right.
 281
 282   - minlenp
 283     A pointer to the minimum length of the pattern that the string
 284     was found inside. This is important as in the case of positive
 285     lookahead or positive lookbehind we can have multiple patterns
 286     involved. Consider
 287
 288     /(?=FOO).*F/
 289
 290     The minimum length of the pattern overall is 3, the minimum length
 291     of the lookahead part is 3, but the minimum length of the part that
 292     will actually match is 1. So 'FOO's minimum length is 3, but the
 293     minimum length for the F is 1. This is important as the minimum length
 294     is used to determine offsets in front of and behind the string being
 295     looked for.  Since strings can be composites this is the length of the
 296     pattern at the time it was committed with a scan_commit. Note that
 297     the length is calculated by study_chunk, so that the minimum lengths
 298     are not known until the full pattern has been compiled, thus the
 299     pointer to the value.
 300
 301   - lookbehind
 302
 303     In the case of lookbehind the string being searched for can be
 304     offset past the start point of the final matching string.
 305     If this value was just blithely removed from the min_offset it would
 306     invalidate some of the calculations for how many chars must match
 307     before or after (as they are derived from min_offset and minlen and
 308     the length of the string being searched for).
 309     When the final pattern is compiled and the data is moved from the
 310     scan_data_t structure into the regexp structure the information
 311     about lookbehind is factored in, with the information that would
 312     have been lost precalculated in the end_shift field for the
 313     associated string.
 314
 315   The fields pos_min and pos_delta are used to store the minimum offset
 316   and the delta to the maximum offset at the current point in the pattern.
 317
 318 */
 319
 320 typedef struct scan_data_t {   /* field order must stay in step with the positional initializer of zero_scan_data */
 321     /*I32 len_min;      unused */
 322     /*I32 len_delta;    unused */
 323     I32 pos_min;            /* minimum offset at the current point in the pattern */
 324     I32 pos_delta;          /* delta from pos_min to the maximum offset at that point */
 325     SV *last_found;         /* candidate string; scan_commit copies it into *longest */
 326     I32 last_end;           /* min value, <0 unless valid. */
 327     I32 last_start_min;     /* scan_commit takes the committed string's offset from here */
 328     I32 last_start_max;     /* scan_commit takes offset_float_max from here */
 329     SV **longest;           /* Either &longest_fixed, or &longest_float. */
 330     SV *longest_fixed;      /* longest fixed string found in pattern */
 331     I32 offset_fixed;       /* offset where it starts */
 332     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 333     I32 lookbehind_fixed;   /* is the position of the string modified by LB */
 334     SV *longest_float;      /* longest floating string found in pattern */
 335     I32 offset_float_min;   /* earliest point in string it can appear */
 336     I32 offset_float_max;   /* latest point in string it can appear */
 337     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 338     I32 lookbehind_float;   /* is the position of the string modified by LB */
 339     I32 flags;              /* SF_* bits (scan_commit tests and sets them); NOTE(review): SCF_* bits
                                presumably share this word -- confirm */
 340     I32 whilem_c;
 341     I32 *last_closep;       /* may be NULL (DEBUG_STUDYDATA checks before dereferencing) */
 342     struct regnode_charclass_class *start_class;
 343 } scan_data_t;
 344
 345 /*
 346  * Forward declarations for pregcomp()'s friends.
 347  */
 348 /* All-zero scan_data_t.  The initializer is positional: twenty zeros, one
      * for each of the twenty fields of scan_data_t, so it has to be kept in
      * step with the struct whenever a field is added or removed.
      */
 349 static const scan_data_t zero_scan_data =
 350   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 351
 352 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)  /* 0x0003 */
 353 #define SF_BEFORE_SEOL          0x0001
 354 #define SF_BEFORE_MEOL          0x0002
 355 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)  /* 0x000c */
 356 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)  /* 0x0030 */
 357
 358 #ifdef NO_UNARY_PLUS            /* same shift counts, spelled without a unary '+' */
 359 #  define SF_FIX_SHIFT_EOL      (0+2)
 360 #  define SF_FL_SHIFT_EOL               (0+4)
 361 #else
 362 #  define SF_FIX_SHIFT_EOL      (+2)
 363 #  define SF_FL_SHIFT_EOL               (+4)
 364 #endif
 365
 366 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) /* 0x04 */
 367 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) /* 0x08 */
 368
 369 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) /* 0x10 */
 370 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 371 #define SF_IS_INF               0x0040
 372 #define SF_HAS_PAR              0x0080
 373 #define SF_IN_PAR               0x0100
 374 #define SF_HAS_EVAL             0x0200
 375 #define SCF_DO_SUBSTR           0x0400
 376 #define SCF_DO_STCLASS_AND      0x0800
 377 #define SCF_DO_STCLASS_OR       0x1000
 378 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)  /* 0x1800 */
 379 #define SCF_WHILEM_VISITED_POS  0x2000
 380
 381 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 382 #define SCF_SEEN_ACCEPT         0x8000
 383
 384 #define UTF cBOOL(RExC_utf8)
 385 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 386 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 387 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 388 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 389 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 390 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 391 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 392
 393 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 394
 395 #define OOB_UNICODE             12345678
 396 #define OOB_NAMEDCLASS          -1
 397 /* CHR_SVLEN(sv): length of sv, taken from sv_len_utf8() when the pattern is
      * UTF-8 (UTF is true) and from SvCUR() otherwise.
      * CHR_DIST(a,b): distance from b to a, taken from utf8_distance() when UTF
      * is true and from plain pointer subtraction otherwise.  The operands of
      * the subtraction are parenthesised so that expression arguments such as
      * CHR_DIST(p, q + 1) subtract the whole of the second argument.
      * Both macros read UTF and so need pRExC_state in scope.
      */
 398 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 399 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : (a) - (b))
 400
 401
 402 /* length of regex to show in messages that don't mark a position within */
 403 #define RegexLengthToShowInErrorMessages 127
 404
 405 /*
 406  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 407  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 408  * op/pragma/warn/regcomp.
 409  */
 410 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 411 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 412
 413 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 414
 415 /*
 416  * _FAIL(code): calls SAVEDESTRUCTOR_X(clear_re, ...) unless SIZE_ONLY, then
 417  * executes 'code' (FAIL and FAIL2 below pass a Perl_croak call).  'code' is
 418  * expanded where two locals are in scope: 'len', the number of pattern
      * bytes to show, and 'ellipses', which is "" or "...".  A pattern longer
      * than RegexLengthToShowInErrorMessages is chopped to 10 less than that
      * limit and "..." is added.
 419  */
 420 #define _FAIL(code) STMT_START {                                        \
 421     const char *ellipses = "";                                          \
 422     IV len = RExC_end - RExC_precomp;                                   \
 423                                                                         \
 424     if (!SIZE_ONLY)                                                     \
 425         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 426     if (len > RegexLengthToShowInErrorMessages) {                       \
 427         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 428         len = RegexLengthToShowInErrorMessages - 10;                    \
 429         ellipses = "...";                                               \
 430     }                                                                   \
 431     code;                                                               \
 432 } STMT_END
 433 /* FAIL(msg): croak with msg (printed through "%s") followed by the pattern. */
 434 #define FAIL(msg) _FAIL(                            \
 435     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 436             msg, (int)len, RExC_precomp, ellipses))
 437 /* FAIL2(msg,arg): msg is pasted onto the format string, so it must be a string literal; arg is its argument. */
 438 #define FAIL2(msg,arg) _FAIL(                       \
 439     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 440             arg, (int)len, RExC_precomp, ellipses))
 441
 442 /*
 443  * Simple_vFAIL -- like FAIL, but marks the current location in the scan.
      * The pattern is printed in two pieces split at RExC_parse: the first
      * 'offset' bytes through "%.*s", then MARKER2, then the rest through "%s".
      * Unlike FAIL, the pattern is not truncated here.
 444  */
 445 #define Simple_vFAIL(m) STMT_START {                                    \
 446     const IV offset = RExC_parse - RExC_precomp;                        \
 447     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 448             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 449 } STMT_END
 450
 451 /*
 452  * Calls SAVEDESTRUCTOR_X if needed (i.e. unless SIZE_ONLY), then Simple_vFAIL()
 453  */
 454 #define vFAIL(m) STMT_START {                           \
 455     if (!SIZE_ONLY)                                     \
 456         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 457     Simple_vFAIL(m);                                    \
 458 } STMT_END
 459
 460 /*
 461  * Like Simple_vFAIL(), but accepts two arguments: m is handed to
      * S_re_croak2() as a format together with REPORT_LOCATION, and a1 is the
      * argument for m.
 462  */
 463 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 464     const IV offset = RExC_parse - RExC_precomp;                        \
 465     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 466             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 467 } STMT_END
 468
 469 /*
 470  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 471  */
 472 #define vFAIL2(m,a1) STMT_START {                       \
 473     if (!SIZE_ONLY)                                     \
 474         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 475     Simple_vFAIL2(m, a1);                               \
 476 } STMT_END
 477
 478
 479 /*
 480  * Like Simple_vFAIL(), but accepts three arguments (m plus a1 and a2).
 481  */
 482 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 483     const IV offset = RExC_parse - RExC_precomp;                \
 484     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 485             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 486 } STMT_END
 487
 488 /*
 489  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 490  */
 491 #define vFAIL3(m,a1,a2) STMT_START {                    \
 492     if (!SIZE_ONLY)                                     \
 493         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 494     Simple_vFAIL3(m, a1, a2);                           \
 495 } STMT_END
 496
 497 /*
 498  * Like Simple_vFAIL(), but accepts four arguments (m plus a1, a2 and a3).
      * No vFAIL4 wrapper is defined alongside it in this part of the file.
 499  */
 500 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 501     const IV offset = RExC_parse - RExC_precomp;                \
 502     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 503             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 504 } STMT_END
 505
 506 #define ckWARNreg(loc,m) STMT_START {                                   \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 509             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 510 } STMT_END
 511
 512 #define ckWARNregdep(loc,m) STMT_START {                                \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 515             m REPORT_LOCATION,                                          \
 516             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 517 } STMT_END
 518
 519 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 520     const IV offset = loc - RExC_precomp;                               \
 521     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 522             m REPORT_LOCATION,                                          \
 523             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 524 } STMT_END
 525
 526 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 527     const IV offset = loc - RExC_precomp;                               \
 528     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 529             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 530 } STMT_END
 531
 532 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 533     const IV offset = loc - RExC_precomp;                               \
 534     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 535             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 536 } STMT_END
 537
 538 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 539     const IV offset = loc - RExC_precomp;                               \
 540     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 541             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 542 } STMT_END
 543
 544 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 545     const IV offset = loc - RExC_precomp;                               \
 546     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 547             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 548 } STMT_END
 549
 550 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 551     const IV offset = loc - RExC_precomp;                               \
 552     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 553             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 554 } STMT_END
 555
 556 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 557     const IV offset = loc - RExC_precomp;                               \
 558     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 559             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 560 } STMT_END
 561
 562
 563 /* Allow for side effects in s */
 564 #define REGC(c,s) STMT_START {                  \
 565     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 566 } STMT_END
 567
 568 /* Macros for recording node offsets.   20001227 mjd@plover.com
 569  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 570  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 571  * Element 0 holds the number n.
 572  * Position is 1 indexed.
 573  */
 574 #ifndef RE_TRACK_PATTERN_OFFSETS
 575 #define Set_Node_Offset_To_R(node,byte)
 576 #define Set_Node_Offset(node,byte)
 577 #define Set_Cur_Node_Offset
 578 #define Set_Node_Length_To_R(node,len)
 579 #define Set_Node_Length(node,len)
 580 #define Set_Node_Cur_Length(node)
 581 #define Node_Offset(n)
 582 #define Node_Length(n)
 583 #define Set_Node_Offset_Length(node,offset,len)
 584 #define ProgLen(ri) ri->u.proglen
 585 #define SetProgLen(ri,x) ri->u.proglen = x
 586 #else
 587 #define ProgLen(ri) ri->u.offsets[0]
 588 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 589 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 590     if (! SIZE_ONLY) {                                                  \
 591         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 592                     __LINE__, (int)(node), (int)(byte)));               \
 593         if((node) < 0) {                                                \
 594             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 595         } else {                                                        \
 596             RExC_offsets[2*(node)-1] = (byte);                          \
 597         }                                                               \
 598     }                                                                   \
 599 } STMT_END
 600
 601 #define Set_Node_Offset(node,byte) \
 602     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 603 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 604
 605 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 606     if (! SIZE_ONLY) {                                                  \
 607         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 608                 __LINE__, (int)(node), (int)(len)));                    \
 609         if((node) < 0) {                                                \
 610             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 611         } else {                                                        \
 612             RExC_offsets[2*(node)] = (len);                             \
 613         }                                                               \
 614     }                                                                   \
 615 } STMT_END
 616
 617 #define Set_Node_Length(node,len) \
 618     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 619 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 620 #define Set_Node_Cur_Length(node) \
 621     Set_Node_Length(node, RExC_parse - parse_start)
 622
 623 /* Get offsets and lengths */
 624 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 625 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 626
 627 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 628     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 629     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 630 } STMT_END
 631 #endif
 632
 633 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 634 #define EXPERIMENTAL_INPLACESCAN
 635 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 636 /* DEBUG_STUDYDATA(str,data,depth): debugging dump of a scan_data_t, wrapped
      * in DEBUG_OPTIMISE_MORE_r().  Prints nothing when data is NULL.  str is
      * pasted into the format string and so must be a string literal; the
      * output is indented by depth*2 columns.
      * The expansion reads a variable named is_inf, which must be in scope at
      * the point of use.  The definition itself ends in ';', so writing
      * "DEBUG_STUDYDATA(...);" leaves an extra empty statement.
      * NOTE(review): when last_found is set, longest_fixed and longest_float
      * are handed to SvPVX_const() without a NULL check -- presumably both are
      * always allocated by then; confirm.
      */
 637 #define DEBUG_STUDYDATA(str,data,depth)                              \
 638 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 639     PerlIO_printf(Perl_debug_log,                                    \
 640         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 641         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 642         (int)(depth)*2, "",                                          \
 643         (IV)((data)->pos_min),                                       \
 644         (IV)((data)->pos_delta),                                     \
 645         (UV)((data)->flags),                                         \
 646         (IV)((data)->whilem_c),                                      \
 647         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 648         is_inf ? "INF " : ""                                         \
 649     );                                                               \
 650     if ((data)->last_found)                                          \
 651         PerlIO_printf(Perl_debug_log,                                \
 652             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 653             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 654             SvPVX_const((data)->last_found),                         \
 655             (IV)((data)->last_end),                                  \
 656             (IV)((data)->last_start_min),                            \
 657             (IV)((data)->last_start_max),                            \
 658             ((data)->longest &&                                      \
 659              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 660             SvPVX_const((data)->longest_fixed),                      \
 661             (IV)((data)->offset_fixed),                              \
 662             ((data)->longest &&                                      \
 663              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 664             SvPVX_const((data)->longest_float),                      \
 665             (IV)((data)->offset_float_min),                          \
 666             (IV)((data)->offset_float_max)                           \
 667         );                                                           \
 668     PerlIO_printf(Perl_debug_log,"\n");                              \
 669 });
 670
     /* Forward declaration of a file-local helper; its definition is not part
      * of this section of the file. */
 671 static void clear_re(pTHX_ void *r);
 672
 673 /* Mark that we cannot extend a found fixed substring at this point.
 674    Update the longest found anchored substring and the longest found
 675    floating substrings if needed. */
 676
 677 STATIC void
 678 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 679 {
 680     const STRLEN l = CHR_SVLEN(data->last_found);
 681     const STRLEN old_l = CHR_SVLEN(*data->longest);
 682     GET_RE_DEBUG_FLAGS_DECL;
 683
 684     PERL_ARGS_ASSERT_SCAN_COMMIT;
 685
 686     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 687         SvSetMagicSV(*data->longest, data->last_found);
 688         if (*data->longest == data->longest_fixed) {
 689             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 690             if (data->flags & SF_BEFORE_EOL)
 691                 data->flags
 692                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 693             else
 694                 data->flags &= ~SF_FIX_BEFORE_EOL;
 695             data->minlen_fixed=minlenp;
 696             data->lookbehind_fixed=0;
 697         }
 698         else { /* *data->longest == data->longest_float */
 699             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 700             data->offset_float_max = (l
 701                                       ? data->last_start_max
 702                                       : data->pos_min + data->pos_delta);
 703             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 704                 data->offset_float_max = I32_MAX;
 705             if (data->flags & SF_BEFORE_EOL)
 706                 data->flags
 707                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 708             else
 709                 data->flags &= ~SF_FL_BEFORE_EOL;
 710             data->minlen_float=minlenp;
 711             data->lookbehind_float=0;
 712         }
 713     }
 714     SvCUR_set(data->last_found, 0);
 715     {
 716         SV * const sv = data->last_found;
 717         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 718             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 719             if (mg)
 720                 mg->mg_len = 0;
 721         }
 722     }
 723     data->last_end = -1;
 724     data->flags &= ~SF_BEFORE_EOL;
 725     DEBUG_STUDYDATA("commit: ",data,0);
 726 }
 727
 728 /* Can match anything (initialization) */
 729 STATIC void
 730 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 731 {
 732     PERL_ARGS_ASSERT_CL_ANYTHING;
 733
 734     ANYOF_BITMAP_SETALL(cl);
 735     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 736                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 737
 738     /* If any portion of the regex is to operate under locale rules,
 739      * initialization includes it.  The reason this isn't done for all regexes
 740      * is that the optimizer was written under the assumption that locale was
 741      * all-or-nothing.  Given the complexity and lack of documentation in the
 742      * optimizer, and that there are inadequate test cases for locale, so many
 743      * parts of it may not work properly, it is safest to avoid locale unless
 744      * necessary. */
 745     if (RExC_contains_locale) {
 746         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 747         cl->flags |= ANYOF_LOCALE;
 748     }
 749     else {
 750         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 751     }
 752 }
 753
 754 /* Can match anything (initialization) */
 755 STATIC int
 756 S_cl_is_anything(const struct regnode_charclass_class *cl)
 757 {
 758     int value;
 759
 760     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 761
 762     for (value = 0; value <= ANYOF_MAX; value += 2)
 763         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 764             return 1;
 765     if (!(cl->flags & ANYOF_UNICODE_ALL))
 766         return 0;
 767     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 768         return 0;
 769     return 1;
 770 }
 771
 772 /* Initialize 'cl' as a node that can match anything: the whole structure is
      * zeroed, its type is set to ANYOF, cl_anything() fills in the bitmap,
      * flags and class bits, and finally the node's argument is set to
      * ANYOF_NONBITMAP_EMPTY (i.e. nothing is matched outside the bitmap via
      * the argument). */
 773 STATIC void
 774 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 775 {
 776     PERL_ARGS_ASSERT_CL_INIT;
 777
 778     Zero(cl, 1, struct regnode_charclass_class);
 779     cl->type = ANYOF;
 780     cl_anything(pRExC_state, cl);
 781     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 782 }
 783
 784 /* cl_init_zero and cl_init currently do exactly the same thing, so
      * cl_init_zero is simply another name for S_cl_init.  Because it maps
      * straight onto S_cl_init, it takes the same arguments (pRExC_state, cl). */
 785 #define cl_init_zero            S_cl_init
 786
 787 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 788  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 789  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class.
      *
      * 'cl' is modified in place; 'and_with' is only read and must be of type
      * ANYOF (asserted).  The function works in two steps:
      *   1) the bitmaps are intersected, but only when the two nodes are simple
      *      enough (see the conditions on the first 'if'); otherwise cl's
      *      bitmap is left untouched, which can only make the result larger
      *      than the true intersection;
      *   2) the flags and the outside-the-bitmap argument are combined, with
      *      separate handling for an inverted and a non-inverted 'and_with'. */
 790 STATIC void
 791 S_cl_and(struct regnode_charclass_class *cl,
 792         const struct regnode_charclass_class *and_with)
 793 {
 794     PERL_ARGS_ASSERT_CL_AND;
 795
 796     assert(and_with->type == ANYOF);
 797
 798     /* I (khw) am not sure all these restrictions are necessary XXX */
          /* Intersect the bitmaps only if neither node has any class bit set,
           * both agree on ANYOF_LOCALE and neither uses
           * ANYOF_LOC_NONBITMAP_FOLD.  An inverted 'and_with' contributes the
           * complement of its bitmap. */
 799     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 800         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 801         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 802         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 803         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 804         int i;
 805
 806         if (and_with->flags & ANYOF_INVERT)
 807             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 808                 cl->bitmap[i] &= ~and_with->bitmap[i];
 809         else
 810             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 811                 cl->bitmap[i] &= and_with->bitmap[i];
 812     } /* XXXX: logic is complicated otherwise, leave it alone for a moment. */
 813
 814     if (and_with->flags & ANYOF_INVERT) {
 815
 816         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 817          * aren't affected by the inversion.  Those that are affected are
 818          * handled individually below */
 819         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 820         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 821         cl->flags |= affected_flags;
 822
 823         /* We currently don't know how to deal with things that aren't in the
 824          * bitmap, but we know that the intersection is no greater than what
 825          * is already in cl, so let there be false positives that get sorted
 826          * out after the synthetic start class succeeds, and the node is
 827          * matched for real. */
 828
 829         /* The inversion of these two flags indicate that the resulting
 830          * intersection doesn't have them */
 831         if (and_with->flags & ANYOF_UNICODE_ALL) {
 832             cl->flags &= ~ANYOF_UNICODE_ALL;
 833         }
 834         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 835             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 836         }
 837     }
 838     else {   /* and'd node is not inverted */
 839         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 840
 841         if (! ANYOF_NONBITMAP(and_with)) {
 842
 843             /* Here 'and_with' doesn't match anything outside the bitmap
 844              * (except possibly ANYOF_UNICODE_ALL), which means the
 845              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 846              * which case we don't know what the intersection is, but it's no
 847              * greater than what cl already has, so can just leave it alone,
 848              * with possible false positives */
 849             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 850                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 851                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 852             }
 853         }
 854         else if (! ANYOF_NONBITMAP(cl)) {
 855
 856             /* Here, 'and_with' does match something outside the bitmap, and cl
 857              * doesn't have a list of things to match outside the bitmap.  If
 858              * cl can match all code points above 255, the intersection will
 859              * be those above-255 code points that 'and_with' matches.  If cl
 860              * can't match all Unicode code points, it means that it can't
 861              * match anything outside the bitmap (since the 'if' that got us
 862              * into this block tested for that), so we leave the bitmap empty.
 863              */
 864             if (cl->flags & ANYOF_UNICODE_ALL) {
 865                 ARG_SET(cl, ARG(and_with));
 866
 867                 /* and_with's ARG may match things that don't require UTF8.
 868                  * And now cl's will too, in spite of this being an 'and'.  See
 869                  * the comments below about the kludge */
 870                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 871             }
 872         }
 873         else {
 874             /* Here, both 'and_with' and cl match something outside the
 875              * bitmap.  Currently we do not do the intersection, so just match
 876              * whatever cl had at the beginning.  */
 877         }
 878
 879
 880         /* Take the intersection of the two sets of flags.  However, the
 881          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 882          * kludge around the fact that this flag is not treated like the others
 883          * which are initialized in cl_anything().  The way the optimizer works
 884          * is that the synthetic start class (SSC) is initialized to match
 885          * anything, and then the first time a real node is encountered, its
 886          * values are AND'd with the SSC's with the result being the values of
 887          * the real node.  However, there are paths through the optimizer where
 888          * the AND never gets called, so those initialized bits are set
 889          * inappropriately, which is not usually a big deal, as they just cause
 890          * false positives in the SSC, which will just mean a probably
 891          * imperceptible slow down in execution.  However this bit has a
 892          * higher false positive consequence in that it can cause utf8.pm,
 893          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 894          * bigger slowdown and also causes significant extra memory to be used.
 895          * In order to prevent this, the code now takes a different tack.  The
 896          * bit isn't set unless some part of the regular expression needs it,
 897          * but once set it won't get cleared.  This means that these extra
 898          * modules won't get loaded unless there was some path through the
 899          * pattern that would have required them anyway, and  so any false
 900          * positives that occur by not ANDing them out when they could be
 901          * aren't as severe as they would be if we treated this bit like all
 902          * the others */
              /* Remember the OR of that one bit before the AND below can clear
               * it, then put it back afterwards. */
 903         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 904                                       & ANYOF_NONBITMAP_NON_UTF8;
 905         cl->flags &= and_with->flags;
 906         cl->flags |= outside_bitmap_but_not_utf8;
 907     }
 908 }
 909
 910 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 911  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 912  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class.
      *
      * 'cl' is modified in place; 'or_with' is only read.  Whenever the exact
      * union cannot be computed, the function falls back on cl_anything(),
      * which makes 'cl' match everything (hence the false positives).
      * 'pRExC_state' is only used for those cl_anything() calls. */
 913 STATIC void
 914 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 915 {
 916     PERL_ARGS_ASSERT_CL_OR;
 917
 918     if (or_with->flags & ANYOF_INVERT) {
 919
 920         /* Here, the or'd node is to be inverted.  This means we take the
 921          * complement of everything not in the bitmap, but currently we don't
 922          * know what that is, so give up and match anything */
 923         if (ANYOF_NONBITMAP(or_with)) {
 924             cl_anything(pRExC_state, cl);
 925         }
 926         /* We do not use
 927          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 928          *   <= (B1 | !B2) | (CL1 | !CL2)
 929          * which is wasteful if CL2 is small, but we ignore CL2:
 930          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 931          * XXXX Can we handle case-fold?  Unclear:
 932          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 933          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 934          */
 935         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 936              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 937              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 938             int i;
 939
                  /* OR in the complement of or_with's bitmap */
 940             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 941                 cl->bitmap[i] |= ~or_with->bitmap[i];
 942         } /* XXXX: logic is complicated otherwise */
 943         else {
 944             cl_anything(pRExC_state, cl);
 945         }
 946
 947         /* And, we can just take the union of the flags that aren't affected
 948          * by the inversion */
 949         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 950
 951         /* For the remaining flags:
 952             ANYOF_UNICODE_ALL and inverted means to not match anything above
 953                     255, which means that the union with cl should just be
 954                     what cl has in it, so can ignore this flag
 955             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 956                     is 127-255 to match them, but then invert that, so the
 957                     union with cl should just be what cl has in it, so can
 958                     ignore this flag
 959          */
 960     } else {    /* 'or_with' is not inverted */
 961         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
              /* Possible only when both nodes agree on ANYOF_LOCALE and or_with
               * doesn't use ANYOF_LOC_NONBITMAP_FOLD unless cl does too. */
 962         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 963              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 964                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 965             int i;
 966
 967             /* OR char bitmap and class bitmap separately */
 968             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 969                 cl->bitmap[i] |= or_with->bitmap[i];
 970             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 971                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 972                     cl->classflags[i] |= or_with->classflags[i];
 973                 cl->flags |= ANYOF_CLASS;
 974             }
 975         }
 976         else { /* XXXX: logic is complicated, leave it alone for a moment. */
 977             cl_anything(pRExC_state, cl);
 978         }
 979
 980         if (ANYOF_NONBITMAP(or_with)) {
 981
 982             /* Use the added node's outside-the-bit-map match if there isn't a
 983              * conflict.  If there is a conflict (both nodes match something
 984              * outside the bitmap, but what they match outside is not the same
 985              * pointer, and hence not easily compared until XXX we extend
 986              * inversion lists this far), give up and allow the start class to
 987              * match everything outside the bitmap.  If that stuff is all above
 988              * 255, can just set UNICODE_ALL, otherwise could be anything. */
 989             if (! ANYOF_NONBITMAP(cl)) {
 990                 ARG_SET(cl, ARG(or_with));
 991             }
 992             else if (ARG(cl) != ARG(or_with)) {
 993
 994                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
 995                     cl_anything(pRExC_state, cl);
 996                 }
 997                 else {
 998                     cl->flags |= ANYOF_UNICODE_ALL;
 999                 }
1000             }
1001         }
1002
1003         /* Take the union */
1004         cl->flags |= or_with->flags;
1005     }
1006 }
1007
/* Accessors for the interim "list" form of a trie state's transitions.  All
 * of them expect a variable named 'trie' to be in scope at the point of use.
 *
 * Element 0 of a state's list is a header rather than a transition: its
 * 'forid' field is what TRIE_LIST_CUR yields (one more than the number of
 * transitions in use, see TRIE_LIST_USED) and its 'newstate' field is what
 * TRIE_LIST_LEN yields (presumably the allocated length).  Real transitions
 * start at element 1. */
#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
/* Number of transitions stored for state 'idx'; 0 when that state has no list
 * at all.  The list pointer tested is the one of the state passed in, not that
 * of whatever variable happens to be called 'state' at the call site. */
#define TRIE_LIST_USED(idx)  ( trie->states[idx].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1012
1013
1014 #ifdef DEBUGGING
1015 /*
1016    dump_trie(trie,widecharmap,revcharmap)
1017    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1018    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1019
1020    These routines dump out a trie in a somewhat readable format.
1021    The _interim_ variants are used for debugging the interim
1022    tables that are used to generate the final compressed
1023    representation which is what dump_trie expects.
1024
1025    Part of the reason for their existence is to provide a form
1026    of documentation as to how the different representations function.
1027
1028 */
1029
1030 /*
1031   Dumps the final compressed table form of the trie to Perl_debug_log.
1032   Used for debugging make_trie().
1033 */
1034
1035 STATIC void
1036 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1037             AV *revcharmap, U32 depth)
1038 {
1039     U32 state;
1040     SV *sv=sv_newmortal();
1041     int colwidth= widecharmap ? 6 : 4;
1042     U16 word;
1043     GET_RE_DEBUG_FLAGS_DECL;
1044
1045     PERL_ARGS_ASSERT_DUMP_TRIE;
1046
1047     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1048         (int)depth * 2 + 2,"",
1049         "Match","Base","Ofs" );
1050
1051     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1052         SV ** const tmp = av_fetch( revcharmap, state, 0);
1053         if ( tmp ) {
1054             PerlIO_printf( Perl_debug_log, "%*s",
1055                 colwidth,
1056                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1057                             PL_colors[0], PL_colors[1],
1058                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1059                             PERL_PV_ESCAPE_FIRSTCHAR
1060                 )
1061             );
1062         }
1063     }
1064     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1065         (int)depth * 2 + 2,"");
1066
1067     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1068         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1069     PerlIO_printf( Perl_debug_log, "\n");
1070
1071     for( state = 1 ; state < trie->statecount ; state++ ) {
1072         const U32 base = trie->states[ state ].trans.base;
1073
1074         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1075
1076         if ( trie->states[ state ].wordnum ) {
1077             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1078         } else {
1079             PerlIO_printf( Perl_debug_log, "%6s", "" );
1080         }
1081
1082         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1083
1084         if ( base ) {
1085             U32 ofs = 0;
1086
1087             while( ( base + ofs  < trie->uniquecharcount ) ||
1088                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1089                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1090                     ofs++;
1091
1092             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1093
1094             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1095                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1096                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1097                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1098                 {
1099                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1100                     colwidth,
1101                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1102                 } else {
1103                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1104                 }
1105             }
1106
1107             PerlIO_printf( Perl_debug_log, "]");
1108
1109         }
1110         PerlIO_printf( Perl_debug_log, "\n" );
1111     }
1112     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1113     for (word=1; word <= trie->wordcount; word++) {
1114         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1115             (int)word, (int)(trie->wordinfo[word].prev),
1116             (int)(trie->wordinfo[word].len));
1117     }
1118     PerlIO_printf(Perl_debug_log, "\n" );
1119 }
1120 /*
1121   Dumps a fully constructed but uncompressed trie in list form.
1122   List tries normally only are used for construction when the number of
1123   possible chars (trie->uniquecharcount) is very high.
1124   Used for debugging make_trie().
1125 */
1126 STATIC void
1127 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1128                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1129                          U32 depth)
1130 {
1131     U32 state;
1132     SV *sv=sv_newmortal();
1133     int colwidth= widecharmap ? 6 : 4;
1134     GET_RE_DEBUG_FLAGS_DECL;
1135
1136     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1137
1138     /* print out the table precompression.  */
1139     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1140         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1141         "------:-----+-----------------\n" );
1142
1143     for( state=1 ; state < next_alloc ; state ++ ) {
1144         U16 charid;
1145
1146         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1147             (int)depth * 2 + 2,"", (UV)state  );
1148         if ( ! trie->states[ state ].wordnum ) {
1149             PerlIO_printf( Perl_debug_log, "%5s| ","");
1150         } else {
1151             PerlIO_printf( Perl_debug_log, "W%4x| ",
1152                 trie->states[ state ].wordnum
1153             );
1154         }
1155         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1156             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1157             if ( tmp ) {
1158                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1159                     colwidth,
1160                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1161                             PL_colors[0], PL_colors[1],
1162                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1163                             PERL_PV_ESCAPE_FIRSTCHAR
1164                     ) ,
1165                     TRIE_LIST_ITEM(state,charid).forid,
1166                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1167                 );
1168                 if (!(charid % 10))
1169                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1170                         (int)((depth * 2) + 14), "");
1171             }
1172         }
1173         PerlIO_printf( Perl_debug_log, "\n");
1174     }
1175 }
1176
1177 /*
1178   Dumps a fully constructed but uncompressed trie in table form.
1179   This is the normal DFA style state transition table, with a few
1180   twists to facilitate compression later.
1181   Used for debugging make_trie().
1182 */
1183 STATIC void
1184 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1185                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1186                           U32 depth)
1187 {
1188     U32 state;
1189     U16 charid;
1190     SV *sv=sv_newmortal();
1191     int colwidth= widecharmap ? 6 : 4;
1192     GET_RE_DEBUG_FLAGS_DECL;
1193
1194     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1195
1196     /*
1197        print out the table precompression so that we can do a visual check
1198        that they are identical.
1199      */
1200
1201     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1202
1203     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1205         if ( tmp ) {
1206             PerlIO_printf( Perl_debug_log, "%*s",
1207                 colwidth,
1208                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1209                             PL_colors[0], PL_colors[1],
1210                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1211                             PERL_PV_ESCAPE_FIRSTCHAR
1212                 )
1213             );
1214         }
1215     }
1216
1217     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1218
1219     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1220         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1221     }
1222
1223     PerlIO_printf( Perl_debug_log, "\n" );
1224
1225     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1226
1227         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1228             (int)depth * 2 + 2,"",
1229             (UV)TRIE_NODENUM( state ) );
1230
1231         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1232             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1233             if (v)
1234                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1235             else
1236                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1237         }
1238         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1239             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1240         } else {
1241             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1242             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1243         }
1244     }
1245 }
1246
1247 #endif
1248
1249
1250 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1251   startbranch: the first branch in the whole branch sequence
1252   first      : start branch of sequence of branch-exact nodes.
1253                May be the same as startbranch
1254   last       : Thing following the last branch.
1255                May be the same as tail.
1256   tail       : item following the branch sequence
1257   word_count : words in the sequence
1258   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1259   depth      : indent depth
1260
1261 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1262
1263 A trie is an N'ary tree where the branches are determined by digital
1264 decomposition of the key. I.e., at the root node you look up the 1st character
1265 and follow that branch, repeating until you reach the end of the branches. Nodes
1266 can be marked as "accepting", meaning they represent a complete word. E.g.:
1267
1268   /he|she|his|hers/
1269
1270 would convert into the following structure. Numbers represent states, letters
1271 following numbers represent valid transitions on the letter from that state, if
1272 the number is in square brackets it represents an accepting state, otherwise it
1273 will be in parenthesis.
1274
1275       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1276       |    |
1277       |   (2)
1278       |    |
1279      (1)   +-i->(6)-+-s->[7]
1280       |
1281       +-s->(3)-+-h->(4)-+-e->[5]
1282
1283       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1284
1285 This shows that when matching against the string 'hers' we will begin at state 1
1286 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1287 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1288 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1289 single traverse. We store a mapping from each accepting state to the word that
1290 was matched, and then when we have multiple possibilities we try to complete the
1291 rest of the regex in the order in which they occurred in the alternation.
1292
1293 The only prior NFA like behaviour that would be changed by the TRIE support is
1294 the silent ignoring of duplicate alternations which are of the form:
1295
1296  / (DUPE|DUPE) X? (?{ ... }) Y /x
1297
1298 Thus EVAL blocks following a trie may be called a different number of times with
1299 and without the optimisation. With the optimisations dupes will be silently
1300 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1301 the following demonstrates:
1302
1303  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1304
1305 which prints out 'word' three times, but
1306
1307  'words'=~/(word|word|word)(?{ print $1 })S/
1308
1309 which doesn't print it out at all. This is due to other optimisations kicking in.
1310
1311 Example of what happens on a structural level:
1312
1313 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1314
1315    1: CURLYM[1] {1,32767}(18)
1316    5:   BRANCH(8)
1317    6:     EXACT <ac>(16)
1318    8:   BRANCH(11)
1319    9:     EXACT <ad>(16)
1320   11:   BRANCH(14)
1321   12:     EXACT <ab>(16)
1322   16:   SUCCEED(0)
1323   17:   NOTHING(18)
1324   18: END(0)
1325
1326 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1327 and should turn into:
1328
1329    1: CURLYM[1] {1,32767}(18)
1330    5:   TRIE(16)
1331         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1332           <ac>
1333           <ad>
1334           <ab>
1335   16:   SUCCEED(0)
1336   17:   NOTHING(18)
1337   18: END(0)
1338
1339 Cases where tail != last would be like /(?:foo|bar)baz/:
1340
1341    1: BRANCH(4)
1342    2:   EXACT <foo>(8)
1343    4: BRANCH(7)
1344    5:   EXACT <bar>(8)
1345    7: TAIL(8)
1346    8: EXACT <baz>(10)
1347   10: END(0)
1348
1349 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1350 and would end up looking like:
1351
1352     1: TRIE(8)
1353       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1354         <foo>
1355         <bar>
1356    7: TAIL(8)
1357    8: EXACT <baz>(10)
1358   10: END(0)
1359
1360     d = uvuni_to_utf8_flags(d, uv, 0);
1361
1362 is the recommended Unicode-aware way of saying
1363
1364     *(d++) = uv;
1365 */
1366
     /* TRIE_STORE_REVCHAR: push onto 'revcharmap' a new SV holding the character
      * currently in 'uvc'.  Under UTF, the low 8 bits of 'uvc' are encoded with
      * uvuni_to_utf8() into a freshly created SV, whose length is set from the
      * returned end pointer and which is flagged POK and UTF8.  Otherwise a
      * one-byte string SV is built from (char)uvc.
      *
      * The macro takes no arguments: 'uvc', 'revcharmap' and whatever UTF relies
      * on must be in scope where it is used.  The temporaries have deliberately
      * unusual names, presumably so they cannot clash with names at the use
      * site. */
1367 #define TRIE_STORE_REVCHAR                                                 \
1368     STMT_START {                                                           \
1369         if (UTF) {                                                         \
1370             SV *zlopp = newSV(2);                                          \
1371             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1372             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1373             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1374             SvPOK_on(zlopp);                                               \
1375             SvUTF8_on(zlopp);                                              \
1376             av_push(revcharmap, zlopp);                                    \
1377         } else {                                                           \
1378             char ooooff = (char)uvc;                                               \
1379             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1380         }                                                                  \
1381         } STMT_END
1382
/* TRIE_READ_CHAR
 *
 * Read the next character of the word being scanned into uvc and count it
 * in wordlen.  Takes no arguments; it works on the expansion site's
 * locals: uc, uvc, len, wordlen, folder, foldlen, foldbuf, scan, uniflags.
 *
 * Four cases, tested in this order:
 *   1. not UTF            - uvc is the byte at uc and len is 1.
 *   2. UTF, no folder     - uvc is decoded from uc; the decoder sets len.
 *   3. UTF, folder, fold bytes still pending (foldlen > 0)
 *                         - uvc is decoded from scan; scan and foldlen are
 *                           moved past it and len is forced to 0, so a
 *                           caller that advances uc by len stays put.
 *   4. UTF, folder, nothing pending
 *                         - uc is folded into foldbuf; uvc is what
 *                           to_utf8_fold() returns, len is UTF8SKIP(uc),
 *                           and scan/foldlen are set to skip UNISKIP(uvc)
 *                           bytes of foldbuf.
 */
#define TRIE_READ_CHAR STMT_START {                                           \
    wordlen++;                                                                \
    if ( !(UTF) ) {                                                           \
        len = 1;                                                              \
        uvc = (U32)*uc;                                                       \
    }                                                                         \
    else if ( !folder ) {                                                     \
        uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);    \
    }                                                                         \
    else if ( foldlen > 0 ) {                                                 \
        uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );            \
        scan += len;                                                          \
        foldlen -= len;                                                       \
        len = 0;                                                              \
    }                                                                         \
    else {                                                                    \
        uvc = to_utf8_fold( uc, foldbuf, &foldlen);                           \
        len = UTF8SKIP(uc);                                                   \
        scan = foldbuf + UNISKIP( uvc );                                      \
        foldlen -= UNISKIP( uvc );                                            \
    }                                                                         \
} STMT_END
1406
1407
1408
/* TRIE_LIST_PUSH(state, fid, ns)
 *
 * Append one record (forid = fid, newstate = ns) to the transition list of
 * trie state `state`, using the `trie` variable of the expansion site.
 *
 * If the list is full (CUR has reached LEN) its recorded LEN is doubled
 * and the array is Renew'ed to that many reg_trie_trans_le entries before
 * the record is written.  The record goes into slot CUR, and CUR is then
 * incremented.  The new length is captured in a local before Renew so the
 * reallocation never has to read it back through the list being resized.
 */
#define TRIE_LIST_PUSH(state,fid,ns) STMT_START {                       \
    reg_trie_trans_le *trie_push_slot_;                                 \
    if ( ! ( TRIE_LIST_CUR( state ) < TRIE_LIST_LEN( state ) ) ) {      \
        const U32 trie_push_cap_ = ( TRIE_LIST_LEN( state ) *= 2 );     \
        Renew( trie->states[ state ].trans.list, trie_push_cap_, reg_trie_trans_le ); \
    }                                                                   \
    trie_push_slot_ = &( TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ) ); \
    trie_push_slot_->forid    = fid;                                    \
    trie_push_slot_->newstate = ns;                                     \
    TRIE_LIST_CUR( state )++;                                           \
} STMT_END
1418
/* TRIE_LIST_NEW(state)
 *
 * Give trie state `state` a fresh, zero-filled transition list of four
 * reg_trie_trans_le entries (via Newxz) and initialise its bookkeeping:
 * CUR is set to 1 and LEN to the allocated length, 4.
 * Uses the `trie` variable of the expansion site.
 */
#define TRIE_LIST_NEW(state) STMT_START {                       \
    const U32 trie_new_cap_ = 4;                                \
    Newxz( trie->states[ state ].trans.list,                    \
        trie_new_cap_, reg_trie_trans_le );                     \
    TRIE_LIST_CUR( state ) = 1;                                 \
    TRIE_LIST_LEN( state ) = trie_new_cap_;                     \
} STMT_END
1425
/* TRIE_HANDLE_WORD(state)
 *
 * Register the word that has just been walked into the trie as ending at
 * trie state `state`.  Expands in place and uses these locals of the
 * expansion site: trie, noper, cur, curword, wordlen, word_count, tail,
 * convert, jumper, nextbranch (and trie_words inside DEBUG_r).
 *
 * Steps:
 *   - (DEBUG_r only) push a copy of the word's text, or an empty string
 *     when noper is a NOTHING node, onto trie_words.
 *   - Advance curword and fill in wordinfo[curword]: accept = state,
 *     len = wordlen, prev = 0.
 *   - If the node after noper lies before tail, record its offset from
 *     convert in trie->jump[curword] (allocating trie->jump with
 *     word_count + 1 entries on first use) and remember the first such
 *     node in jumper and the first regnext(cur) in nextbranch.
 *   - If `state` already has a word number, this word is a duplicate: it
 *     is spliced into the earlier word's wordinfo[].prev chain.
 *     Otherwise curword becomes the state's word number.
 */
#define TRIE_HANDLE_WORD(state) STMT_START {                    \
    regnode * const noper_next = regnext( noper );              \
    const U16 dupe = trie->states[ state ].wordnum;             \
                                                                \
    DEBUG_r({                                                   \
        /* keep a copy of the word for dumping */               \
        SV * const tmp = ( OP(noper) == NOTHING )               \
            ? newSVpvn_utf8( "", 0, UTF )                       \
            : newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
        av_push( trie_words, tmp );                             \
    });                                                         \
                                                                \
    ++curword;                                                  \
    trie->wordinfo[curword].accept = state;                     \
    trie->wordinfo[curword].len    = wordlen;                   \
    trie->wordinfo[curword].prev   = 0;                         \
                                                                \
    if ( noper_next < tail ) {                                  \
        if ( !trie->jump ) {                                    \
            trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
        }                                                       \
        trie->jump[curword] = (U16)(noper_next - convert);      \
        if ( !jumper ) {                                        \
            jumper = noper_next;                                \
        }                                                       \
        if ( !nextbranch ) {                                    \
            nextbranch = regnext(cur);                          \
        }                                                       \
    }                                                           \
                                                                \
    if ( !dupe ) {                                              \
        /* first word to end at this state */                   \
        trie->states[ state ].wordnum = curword;                \
    } else {                                                    \
        /* Duplicate word: pre-insert it into the             */\
        /* wordinfo[].prev chain so that when the pieces of   */\
        /* the chain are linked up later the dups are in it.  */\
        trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
        trie->wordinfo[dupe].prev = curword;                    \
    }                                                           \
} STMT_END
1466
1467
/* TRIE_TRANS_STATE(state, base, ucharcount, charid, special)
 *
 * Expression macro: look up the transition out of `state` on `charid` in
 * the trans[] table of the `trie` variable at the expansion site.
 *
 * The candidate slot is trans[ base - ucharcount + charid ].  It is used
 * only if base + charid is at least ucharcount and below `ubound` (also
 * taken from the expansion site), the slot's .check equals `state`, and
 * its .next is nonzero; the result is then that .next.  Otherwise the
 * result is `special` when state is 1, and 0 for any other state.
 *
 * Every parameter is parenthesized where it is used, so that compound
 * arguments (for example `a + b` passed as ucharcount) keep their
 * grouping inside the index arithmetic and comparisons.  Arguments are
 * evaluated more than once, so they must be free of side effects.
 */
#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)                  \
     ( ( (base) + (charid) >= (ucharcount)                                      \
         && (base) + (charid) < ubound                                          \
         && (state) == trie->trans[ (base) - (ucharcount) + (charid) ].check    \
         && trie->trans[ (base) - (ucharcount) + (charid) ].next )              \
           ? trie->trans[ (base) - (ucharcount) + (charid) ].next               \
           : ( (state) == 1 ? (special) : 0 )                                   \
      )
1476
/* Three distinct single-bit values, one per kind of trie.
 * NOTE(review): presumably these are what make_trie() reports back to its
 * caller; their use is not visible in this part of the file -- confirm.
 */
#define MADE_TRIE       0x1
#define MADE_JUMP_TRIE  0x2
#define MADE_EXACT_TRIE 0x4
1480
1481 STATIC I32
1482 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1483 {
1484     dVAR;
1485     /* first pass, loop through and scan words */
1486     reg_trie_data *trie;
1487     HV *widecharmap = NULL;
1488     AV *revcharmap = newAV();
1489     regnode *cur;
1490     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1491     STRLEN len = 0;
1492     UV uvc = 0;
1493     U16 curword = 0;
1494     U32 next_alloc = 0;
1495     regnode *jumper = NULL;
1496     regnode *nextbranch = NULL;
1497     regnode *convert = NULL;
1498     U32 *prev_states; /* temp array mapping each state to previous one */
1499     /* we just use folder as a flag in utf8 */
1500     const U8 * folder = NULL;
1501
1502 #ifdef DEBUGGING
1503     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1504     AV *trie_words = NULL;
1505     /* along with revcharmap, this only used during construction but both are
1506      * useful during debugging so we store them in the struct when debugging.
1507      */
1508 #else
1509     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1510     STRLEN trie_charcount=0;
1511 #endif
1512     SV *re_trie_maxbuff;
1513     GET_RE_DEBUG_FLAGS_DECL;
1514
1515     PERL_ARGS_ASSERT_MAKE_TRIE;
1516 #ifndef DEBUGGING
1517     PERL_UNUSED_ARG(depth);
1518 #endif
1519
1520     switch (flags) {
1521         case EXACT: break;
1522         case EXACTFA:
1523         case EXACTFU: folder = PL_fold_latin1; break;
1524         case EXACTF:  folder = PL_fold; break;
1525         case EXACTFL: folder = PL_fold_locale; break;
1526         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
1527     }
1528
1529     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1530     trie->refcount = 1;
1531     trie->startstate = 1;
1532     trie->wordcount = word_count;
1533     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1534     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1535     if (!(UTF && folder))
1536         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1537     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1538                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1539
1540     DEBUG_r({
1541         trie_words = newAV();
1542     });
1543
1544     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1545     if (!SvIOK(re_trie_maxbuff)) {
1546         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1547     }
1548     DEBUG_OPTIMISE_r({
1549                 PerlIO_printf( Perl_debug_log,
1550                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1551                   (int)depth * 2 + 2, "",
1552                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1553                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1554                   (int)depth);
1555     });
1556
1557    /* Find the node we are going to overwrite */
1558     if ( first == startbranch && OP( last ) != BRANCH ) {
1559         /* whole branch chain */
1560         convert = first;
1561     } else {
1562         /* branch sub-chain */
1563         convert = NEXTOPER( first );
1564     }
1565
1566     /*  -- First loop and Setup --
1567
1568        We first traverse the branches and scan each word to determine if it
1569        contains widechars, and how many unique chars there are, this is
1570        important as we have to build a table with at least as many columns as we
1571        have unique chars.
1572
1573        We use an array of integers to represent the character codes 0..255
1574        (trie->charmap) and we use an HV* to store Unicode characters. We use the
1575        native representation of the character value as the key and IV's for the
1576        coded index.
1577
1578        *TODO* If we keep track of how many times each character is used we can
1579        remap the columns so that the table compression later on is more
1580        efficient in terms of memory by ensuring the most common value is in the
1581        middle and the least common are on the outside.  IMO this would be better
1582        than a most to least common mapping as there's a decent chance the most
1583        common letter will share a node with the least common, meaning the node
1584        will not be compressible. With a middle is most common approach the worst
1585        case is when we have the least common nodes twice.
1586
1587      */
1588
1589     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1590         regnode * const noper = NEXTOPER( cur );
1591         const U8 *uc = (U8*)STRING( noper );
1592         const U8 * const e  = uc + STR_LEN( noper );
1593         STRLEN foldlen = 0;
1594         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1595         const U8 *scan = (U8*)NULL;
1596         U32 wordlen      = 0;         /* required init */
1597         STRLEN chars = 0;
1598         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1599
1600         if (OP(noper) == NOTHING) {
1601             trie->minlen= 0;
1602             continue;
1603         }
1604         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1605             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1606                                           regardless of encoding */
1607
1608         for ( ; uc < e ; uc += len ) {
1609             TRIE_CHARCOUNT(trie)++;
1610             TRIE_READ_CHAR;
1611             chars++;
1612             if ( uvc < 256 ) {
1613                 if ( !trie->charmap[ uvc ] ) {
1614                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1615                     if ( folder )
1616                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1617                     TRIE_STORE_REVCHAR;
1618                 }
1619                 if ( set_bit ) {
1620                     /* store the codepoint in the bitmap, and its folded
1621                      * equivalent. */
1622                     TRIE_BITMAP_SET(trie,uvc);
1623
1624                     /* store the folded codepoint */
1625                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1626
1627                     if ( !UTF ) {
1628                         /* store first byte of utf8 representation of
1629                            variant codepoints */
1630                         if (! UNI_IS_INVARIANT(uvc)) {
1631                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1632                         }
1633                     }
1634                     set_bit = 0; /* We've done our bit :-) */
1635                 }
1636             } else {
1637                 SV** svpp;
1638                 if ( !widecharmap )
1639                     widecharmap = newHV();
1640
1641                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1642
1643                 if ( !svpp )
1644                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1645
1646                 if ( !SvTRUE( *svpp ) ) {
1647                     sv_setiv( *svpp, ++trie->uniquecharcount );
1648                     TRIE_STORE_REVCHAR;
1649                 }
1650             }
1651         }
1652         if( cur == first ) {
1653             trie->minlen=chars;
1654             trie->maxlen=chars;
1655         } else if (chars < trie->minlen) {
1656             trie->minlen=chars;
1657         } else if (chars > trie->maxlen) {
1658             trie->maxlen=chars;
1659         }
1660
1661     } /* end first pass */
1662     DEBUG_TRIE_COMPILE_r(
1663         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1664                 (int)depth * 2 + 2,"",
1665                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1666                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1667                 (int)trie->minlen, (int)trie->maxlen )
1668     );
1669
1670     /*
1671         We now know what we are dealing with in terms of unique chars and
1672         string sizes so we can calculate how much memory a naive
1673         representation using a flat table  will take. If it's over a reasonable
1674         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1675         conservative but potentially much slower representation using an array
1676         of lists.
1677
1678         At the end we convert both representations into the same compressed
1679         form that will be used in regexec.c for matching with. The latter
1680         is a form that cannot be used to construct with but has memory
1681         properties similar to the list form and access properties similar
1682         to the table form making it both suitable for fast searches and
1683         small enough that it's feasible to store for the duration of a program.
1684
1685         See the comment in the code where the compressed table is produced
1686         in place from the flat table representation for an explanation of how
1687         the compression works.
1688
1689     */
1690
1691
1692     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1693     prev_states[1] = 0;
1694
1695     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1696         /*
1697             Second Pass -- Array Of Lists Representation
1698
1699             Each state will be represented by a list of charid:state records
1700             (reg_trie_trans_le) the first such element holds the CUR and LEN
1701             points of the allocated array. (See defines above).
1702
1703             We build the initial structure using the lists, and then convert
1704             it into the compressed table form which allows faster lookups
1705             (but can't be modified once converted).
1706         */
1707
1708         STRLEN transcount = 1;
1709
1710         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1711             "%*sCompiling trie using list compiler\n",
1712             (int)depth * 2 + 2, ""));
1713
1714         trie->states = (reg_trie_state *)
1715             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1716                                   sizeof(reg_trie_state) );
1717         TRIE_LIST_NEW(1);
1718         next_alloc = 2;
1719
1720         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1721
1722             regnode * const noper = NEXTOPER( cur );
1723             U8 *uc           = (U8*)STRING( noper );
1724             const U8 * const e = uc + STR_LEN( noper );
1725             U32 state        = 1;         /* required init */
1726             U16 charid       = 0;         /* sanity init */
1727             U8 *scan         = (U8*)NULL; /* sanity init */
1728             STRLEN foldlen   = 0;         /* required init */
1729             U32 wordlen      = 0;         /* required init */
1730             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1731
1732             if (OP(noper) != NOTHING) {
1733                 for ( ; uc < e ; uc += len ) {
1734
1735                     TRIE_READ_CHAR;
1736
1737                     if ( uvc < 256 ) {
1738                         charid = trie->charmap[ uvc ];
1739                     } else {
1740                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1741                         if ( !svpp ) {
1742                             charid = 0;
1743                         } else {
1744                             charid=(U16)SvIV( *svpp );
1745                         }
1746                     }
1747                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1748                     if ( charid ) {
1749
1750                         U16 check;
1751                         U32 newstate = 0;
1752
1753                         charid--;
1754                         if ( !trie->states[ state ].trans.list ) {
1755                             TRIE_LIST_NEW( state );
1756                         }
1757                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1758                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1759                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1760                                 break;
1761                             }
1762                         }
1763                         if ( ! newstate ) {
1764                             newstate = next_alloc++;
1765                             prev_states[newstate] = state;
1766                             TRIE_LIST_PUSH( state, charid, newstate );
1767                             transcount++;
1768                         }
1769                         state = newstate;
1770                     } else {
1771                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1772                     }
1773                 }
1774             }
1775             TRIE_HANDLE_WORD(state);
1776
1777         } /* end second pass */
1778
1779         /* next alloc is the NEXT state to be allocated */
1780         trie->statecount = next_alloc;
1781         trie->states = (reg_trie_state *)
1782             PerlMemShared_realloc( trie->states,
1783                                    next_alloc
1784                                    * sizeof(reg_trie_state) );
1785
1786         /* and now dump it out before we compress it */
1787         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1788                                                          revcharmap, next_alloc,
1789                                                          depth+1)
1790         );
1791
1792         trie->trans = (reg_trie_trans *)
1793             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1794         {
1795             U32 state;
1796             U32 tp = 0;
1797             U32 zp = 0;
1798
1799
1800             for( state=1 ; state < next_alloc ; state ++ ) {
1801                 U32 base=0;
1802
1803                 /*
1804                 DEBUG_TRIE_COMPILE_MORE_r(
1805                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1806                 );
1807                 */
1808
1809                 if (trie->states[state].trans.list) {
1810                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1811                     U16 maxid=minid;
1812                     U16 idx;
1813
1814                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1815                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1816                         if ( forid < minid ) {
1817                             minid=forid;
1818                         } else if ( forid > maxid ) {
1819                             maxid=forid;
1820                         }
1821                     }
1822                     if ( transcount < tp + maxid - minid + 1) {
1823                         transcount *= 2;
1824                         trie->trans = (reg_trie_trans *)
1825                             PerlMemShared_realloc( trie->trans,
1826                                                      transcount
1827                                                      * sizeof(reg_trie_trans) );
1828                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1829                     }
1830                     base = trie->uniquecharcount + tp - minid;
1831                     if ( maxid == minid ) {
1832                         U32 set = 0;
1833                         for ( ; zp < tp ; zp++ ) {
1834                             if ( ! trie->trans[ zp ].next ) {
1835                                 base = trie->uniquecharcount + zp - minid;
1836                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1837                                 trie->trans[ zp ].check = state;
1838                                 set = 1;
1839                                 break;
1840                             }
1841                         }
1842                         if ( !set ) {
1843                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1844                             trie->trans[ tp ].check = state;
1845                             tp++;
1846                             zp = tp;
1847                         }
1848                     } else {
1849                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1850                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1851                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1852                             trie->trans[ tid ].check = state;
1853                         }
1854                         tp += ( maxid - minid + 1 );
1855                     }
1856                     Safefree(trie->states[ state ].trans.list);
1857                 }
1858                 /*
1859                 DEBUG_TRIE_COMPILE_MORE_r(
1860                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1861                 );
1862                 */
1863                 trie->states[ state ].trans.base=base;
1864             }
1865             trie->lasttrans = tp + 1;
1866         }
1867     } else {
1868         /*
1869            Second Pass -- Flat Table Representation.
1870
1871            We don't use the 0 slot of either trans[] or states[] so we add 1 to each.
1872            We know that we will need Charcount+1 trans at most to store the data
1873            (one row per char at worst case) So we preallocate both structures
1874            assuming worst case.
1875
1876            We then construct the trie using only the .next slots of the entry
1877            structs.
1878
1879            We use the .check field of the first entry of the node temporarily to
1880            make compression both faster and easier by keeping track of how many non
1881            zero fields are in the node.
1882
1883            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1884            transition.
1885
1886            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1887            number representing the first entry of the node, and state as a
1888            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1889            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1890            are 2 entries per node, e.g.:
1891
1892              A B       A B
1893           1. 2 4    1. 3 7
1894           2. 0 3    3. 0 5
1895           3. 0 0    5. 0 0
1896           4. 0 0    7. 0 0
1897
1898            The table is internally in the right hand, idx form. However as we also
1899            have to deal with the states array which is indexed by nodenum we have to
1900            use TRIE_NODENUM() to convert.
1901
1902         */
1903         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1904             "%*sCompiling trie using table compiler\n",
1905             (int)depth * 2 + 2, ""));
1906
1907         trie->trans = (reg_trie_trans *)
1908             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1909                                   * trie->uniquecharcount + 1,
1910                                   sizeof(reg_trie_trans) );
1911         trie->states = (reg_trie_state *)
1912             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1913                                   sizeof(reg_trie_state) );
1914         next_alloc = trie->uniquecharcount + 1;
1915
1916
1917         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1918
1919             regnode * const noper   = NEXTOPER( cur );
1920             const U8 *uc     = (U8*)STRING( noper );
1921             const U8 * const e = uc + STR_LEN( noper );
1922
1923             U32 state        = 1;         /* required init */
1924
1925             U16 charid       = 0;         /* sanity init */
1926             U32 accept_state = 0;         /* sanity init */
1927             U8 *scan         = (U8*)NULL; /* sanity init */
1928
1929             STRLEN foldlen   = 0;         /* required init */
1930             U32 wordlen      = 0;         /* required init */
1931             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1932
1933             if ( OP(noper) != NOTHING ) {
1934                 for ( ; uc < e ; uc += len ) {
1935
1936                     TRIE_READ_CHAR;
1937
1938                     if ( uvc < 256 ) {
1939                         charid = trie->charmap[ uvc ];
1940                     } else {
1941                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1942                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1943                     }
1944                     if ( charid ) {
1945                         charid--;
1946                         if ( !trie->trans[ state + charid ].next ) {
1947                             trie->trans[ state + charid ].next = next_alloc;
1948                             trie->trans[ state ].check++;
1949                             prev_states[TRIE_NODENUM(next_alloc)]
1950                                     = TRIE_NODENUM(state);
1951                             next_alloc += trie->uniquecharcount;
1952                         }
1953                         state = trie->trans[ state + charid ].next;
1954                     } else {
1955                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1956                     }
1957                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1958                 }
1959             }
1960             accept_state = TRIE_NODENUM( state );
1961             TRIE_HANDLE_WORD(accept_state);
1962
1963         } /* end second pass */
1964
1965         /* and now dump it out before we compress it */
1966         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1967                                                           revcharmap,
1968                                                           next_alloc, depth+1));
1969
1970         {
1971         /*
1972            * Inplace compress the table.*
1973
1974            For sparse data sets the table constructed by the trie algorithm will
1975            be mostly 0/FAIL transitions or to put it another way mostly empty.
1976            (Note that leaf nodes will not contain any transitions.)
1977
1978            This algorithm compresses the tables by eliminating most such
1979            transitions, at the cost of a modest bit of extra work during lookup:
1980
1981            - Each states[] entry contains a .base field which indicates the
1982            index in the trans[] array where its transition data is stored.
1983
1984            - If .base is 0 there are no valid transitions from that node.
1985
1986            - If .base is nonzero then charid is added to it to find an entry in
1987            the trans array.
1988
1989            -If trans[states[state].base+charid].check!=state then the
1990            transition is taken to be a 0/Fail transition. Thus if there are fail
1991            transitions at the front of the node then the .base offset will point
1992            somewhere inside the previous nodes data (or maybe even into a node
1993            even earlier), but the .check field determines if the transition is
1994            valid.
1995
1996            XXX - wrong maybe?
1997            The following process inplace converts the table to the compressed
1998            table: We first do not compress the root node 1,and mark all its
1999            .check pointers as 1 and set its .base pointer as 1 as well. This
2000            allows us to do a DFA construction from the compressed table later,
2001            and ensures that any .base pointers we calculate later are greater
2002            than 0.
2003
2004            - We set 'pos' to indicate the first entry of the second node.
2005
2006            - We then iterate over the columns of the node, finding the first and
2007            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2008            and set the .check pointers accordingly, and advance pos
2009            appropriately and repeat for the next node. Note that when we copy
2010            the next pointers we have to convert them from the original
2011            NODEIDX form to NODENUM form as the former is not valid post
2012            compression.
2013
2014            - If a node has no transitions used we mark its base as 0 and do not
2015            advance the pos pointer.
2016
2017            - If a node only has one transition we use a second pointer into the
2018            structure to fill in allocated fail transitions from other states.
2019            This pointer is independent of the main pointer and scans forward
2020            looking for null transitions that are allocated to a state. When it
2021            finds one it writes the single transition into the "hole".  If the
2022            pointer doesn't find one the single transition is appended as normal.
2023
2024            - Once compressed we can Renew/realloc the structures to release the
2025            excess space.
2026
2027            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2028            specifically Fig 3.47 and the associated pseudocode.
2029
2030            demq
2031         */
2032         const U32 laststate = TRIE_NODENUM( next_alloc );
2033         U32 state, charid;
2034         U32 pos = 0, zp=0;
2035         trie->statecount = laststate;
2036
2037         for ( state = 1 ; state < laststate ; state++ ) {
2038             U8 flag = 0;
2039             const U32 stateidx = TRIE_NODEIDX( state );
2040             const U32 o_used = trie->trans[ stateidx ].check;
2041             U32 used = trie->trans[ stateidx ].check;
2042             trie->trans[ stateidx ].check = 0;
2043
2044             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2045                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2046                     if ( trie->trans[ stateidx + charid ].next ) {
2047                         if (o_used == 1) {
2048                             for ( ; zp < pos ; zp++ ) {
2049                                 if ( ! trie->trans[ zp ].next ) {
2050                                     break;
2051                                 }
2052                             }
2053                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2054                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2055                             trie->trans[ zp ].check = state;
2056                             if ( ++zp > pos ) pos = zp;
2057                             break;
2058                         }
2059                         used--;
2060                     }
2061                     if ( !flag ) {
2062                         flag = 1;
2063                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2064                     }
2065                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2066                     trie->trans[ pos ].check = state;
2067                     pos++;
2068                 }
2069             }
2070         }
2071         trie->lasttrans = pos + 1;
2072         trie->states = (reg_trie_state *)
2073             PerlMemShared_realloc( trie->states, laststate
2074                                    * sizeof(reg_trie_state) );
2075         DEBUG_TRIE_COMPILE_MORE_r(
2076                 PerlIO_printf( Perl_debug_log,
2077                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2078                     (int)depth * 2 + 2,"",
2079                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2080                     (IV)next_alloc,
2081                     (IV)pos,
2082                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2083             );
2084
2085         } /* end table compress */
2086     }
2087     DEBUG_TRIE_COMPILE_MORE_r(
2088             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2089                 (int)depth * 2 + 2, "",
2090                 (UV)trie->statecount,
2091                 (UV)trie->lasttrans)
2092     );
2093     /* resize the trans array to remove unused space */
2094     trie->trans = (reg_trie_trans *)
2095         PerlMemShared_realloc( trie->trans, trie->lasttrans
2096                                * sizeof(reg_trie_trans) );
2097
2098     {   /* Modify the program and insert the new TRIE node */
2099         U8 nodetype =(U8)(flags & 0xFF);
2100         char *str=NULL;
2101
2102 #ifdef DEBUGGING
2103         regnode *optimize = NULL;
2104 #ifdef RE_TRACK_PATTERN_OFFSETS
2105
2106         U32 mjd_offset = 0;
2107         U32 mjd_nodelen = 0;
2108 #endif /* RE_TRACK_PATTERN_OFFSETS */
2109 #endif /* DEBUGGING */
2110         /*
2111            This means we convert either the first branch or the first Exact,
2112            depending on whether the thing following (in 'last') is a branch
2113            or not and whether first is the startbranch (ie is it a sub part of
2114            the alternation or is it the whole thing.)
2115            Assuming its a sub part we convert the EXACT otherwise we convert
2116            the whole branch sequence, including the first.
2117          */
2118         /* Find the node we are going to overwrite */
2119         if ( first != startbranch || OP( last ) == BRANCH ) {
2120             /* branch sub-chain */
2121             NEXT_OFF( first ) = (U16)(last - first);
2122 #ifdef RE_TRACK_PATTERN_OFFSETS
2123             DEBUG_r({
2124                 mjd_offset= Node_Offset((convert));
2125                 mjd_nodelen= Node_Length((convert));
2126             });
2127 #endif
2128             /* whole branch chain */
2129         }
2130 #ifdef RE_TRACK_PATTERN_OFFSETS
2131         else {
2132             DEBUG_r({
2133                 const  regnode *nop = NEXTOPER( convert );
2134                 mjd_offset= Node_Offset((nop));
2135                 mjd_nodelen= Node_Length((nop));
2136             });
2137         }
2138         DEBUG_OPTIMISE_r(
2139             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2140                 (int)depth * 2 + 2, "",
2141                 (UV)mjd_offset, (UV)mjd_nodelen)
2142         );
2143 #endif
2144         /* But first we check to see if there is a common prefix we can
2145            split out as an EXACT and put in front of the TRIE node.  */
2146         trie->startstate= 1;
2147         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2148             U32 state;
2149             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2150                 U32 ofs = 0;
2151                 I32 idx = -1;
2152                 U32 count = 0;
2153                 const U32 base = trie->states[ state ].trans.base;
2154
2155                 if ( trie->states[state].wordnum )
2156                         count = 1;
2157
2158                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2159                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2160                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2161                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2162                     {
2163                         if ( ++count > 1 ) {
2164                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2165                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2166                             if ( state == 1 ) break;
2167                             if ( count == 2 ) {
2168                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2169                                 DEBUG_OPTIMISE_r(
2170                                     PerlIO_printf(Perl_debug_log,
2171                                         "%*sNew Start State=%"UVuf" Class: [",
2172                                         (int)depth * 2 + 2, "",
2173                                         (UV)state));
2174                                 if (idx >= 0) {
2175                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2176                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2177
2178                                     TRIE_BITMAP_SET(trie,*ch);
2179                                     if ( folder )
2180                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2181                                     DEBUG_OPTIMISE_r(
2182                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2183                                     );
2184                                 }
2185                             }
2186                             TRIE_BITMAP_SET(trie,*ch);
2187                             if ( folder )
2188                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2189                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2190                         }
2191                         idx = ofs;
2192                     }
2193                 }
2194                 if ( count == 1 ) {
2195                     SV **tmp = av_fetch( revcharmap, idx, 0);
2196                     STRLEN len;
2197                     char *ch = SvPV( *tmp, len );
2198                     DEBUG_OPTIMISE_r({
2199                         SV *sv=sv_newmortal();
2200                         PerlIO_printf( Perl_debug_log,
2201                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2202                             (int)depth * 2 + 2, "",
2203                             (UV)state, (UV)idx,
2204                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2205                                 PL_colors[0], PL_colors[1],
2206                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2207                                 PERL_PV_ESCAPE_FIRSTCHAR
2208                             )
2209                         );
2210                     });
2211                     if ( state==1 ) {
2212                         OP( convert ) = nodetype;
2213                         str=STRING(convert);
2214                         STR_LEN(convert)=0;
2215                     }
2216                     STR_LEN(convert) += len;
2217                     while (len--)
2218                         *str++ = *ch++;
2219                 } else {
2220 #ifdef DEBUGGING
2221                     if (state>1)
2222                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2223 #endif
2224                     break;
2225                 }
2226             }
2227             trie->prefixlen = (state-1);
2228             if (str) {
2229                 regnode *n = convert+NODE_SZ_STR(convert);
2230                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2231                 trie->startstate = state;
2232                 trie->minlen -= (state - 1);
2233                 trie->maxlen -= (state - 1);
2234 #ifdef DEBUGGING
2235                /* At least the UNICOS C compiler choked on this
2236                 * being argument to DEBUG_r(), so let's just have
2237                 * it right here. */
2238                if (
2239 #ifdef PERL_EXT_RE_BUILD
2240                    1
2241 #else
2242                    DEBUG_r_TEST
2243 #endif
2244                    ) {
2245                    regnode *fix = convert;
2246                    U32 word = trie->wordcount;
2247                    mjd_nodelen++;
2248                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2249                    while( ++fix < n ) {
2250                        Set_Node_Offset_Length(fix, 0, 0);
2251                    }
2252                    while (word--) {
2253                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2254                        if (tmp) {
2255                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2256                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2257                            else
2258                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2259                        }
2260                    }
2261                }
2262 #endif
2263                 if (trie->maxlen) {
2264                     convert = n;
2265                 } else {
2266                     NEXT_OFF(convert) = (U16)(tail - convert);
2267                     DEBUG_r(optimize= n);
2268                 }
2269             }
2270         }
2271         if (!jumper)
2272             jumper = last;
2273         if ( trie->maxlen ) {
2274             NEXT_OFF( convert ) = (U16)(tail - convert);
2275             ARG_SET( convert, data_slot );
2276             /* Store the offset to the first unabsorbed branch in
2277                jump[0], which is otherwise unused by the jump logic.
2278                We use this when dumping a trie and during optimisation. */
2279             if (trie->jump)
2280                 trie->jump[0] = (U16)(nextbranch - convert);
2281
2282             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2283              *   and there is a bitmap
2284              *   and the first "jump target" node we found leaves enough room
2285              * then convert the TRIE node into a TRIEC node, with the bitmap
2286              * embedded inline in the opcode - this is hypothetically faster.
2287              */
2288             if ( !trie->states[trie->startstate].wordnum
2289                  && trie->bitmap
2290                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2291             {
2292                 OP( convert ) = TRIEC;
2293                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2294                 PerlMemShared_free(trie->bitmap);
2295                 trie->bitmap= NULL;
2296             } else
2297                 OP( convert ) = TRIE;
2298
2299             /* store the type in the flags */
2300             convert->flags = nodetype;
2301             DEBUG_r({
2302             optimize = convert
2303                       + NODE_STEP_REGNODE
2304                       + regarglen[ OP( convert ) ];
2305             });
2306             /* XXX We really should free up the resource in trie now,
2307                    as we won't use them - (which resources?) dmq */
2308         }
2309         /* needed for dumping*/
2310         DEBUG_r(if (optimize) {
2311             regnode *opt = convert;
2312
2313             while ( ++opt < optimize) {
2314                 Set_Node_Offset_Length(opt,0,0);
2315             }
2316             /*
2317                 Try to clean up some of the debris left after the
2318                 optimisation.
2319              */
2320             while( optimize < jumper ) {
2321                 mjd_nodelen += Node_Length((optimize));
2322                 OP( optimize ) = OPTIMIZED;
2323                 Set_Node_Offset_Length(optimize,0,0);
2324                 optimize++;
2325             }
2326             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2327         });
2328     } /* end node insert */
2329
2330     /*  Finish populating the prev field of the wordinfo array.  Walk back
2331      *  from each accept state until we find another accept state, and if
2332      *  so, point the first word's .prev field at the second word. If the
2333      *  second already has a .prev field set, stop now. This will be the
2334      *  case either if we've already processed that word's accept state,
2335      *  or that state had multiple words, and the overspill words were
2336      *  already linked up earlier.
2337      */
2338     {
2339         U16 word;
2340         U32 state;
2341         U16 prev;
2342
2343         for (word=1; word <= trie->wordcount; word++) {
2344             prev = 0;
2345             if (trie->wordinfo[word].prev)
2346                 continue;
2347             state = trie->wordinfo[word].accept;
2348             while (state) {
2349                 state = prev_states[state];
2350                 if (!state)
2351                     break;
2352                 prev = trie->states[state].wordnum;
2353                 if (prev)
2354                     break;
2355             }
2356             trie->wordinfo[word].prev = prev;
2357         }
2358         Safefree(prev_states);
2359     }
2360
2361
2362     /* and now dump out the compressed format */
2363     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2364
2365     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2366 #ifdef DEBUGGING
2367     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2368     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2369 #else
2370     SvREFCNT_dec(revcharmap);
2371 #endif
2372     return trie->jump
2373            ? MADE_JUMP_TRIE
2374            : trie->startstate>1
2375              ? MADE_EXACT_TRIE
2376              : MADE_TRIE;
2377 }
2378
2379 STATIC void
2380 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2381 {
2382 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2383
2384    This is basically the Aho-Corasick algorithm. It is from exercise 3.31 and 3.32 in the
2385    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2386    ISBN 0-201-10088-6
2387
2388    We find the fail state for each state in the trie, this state is the longest proper
2389    suffix of the current state's 'word' that is also a proper prefix of another word in our
2390    trie. State 1 represents the word '' and is thus the default fail state. This allows
2391    the DFA not to have to restart after it has tried and failed a word at a given point, it
2392    simply continues as though it had been matching the other word in the first place.
2393    Consider
2394       'abcdgu'=~/abcdefg|cdgu/
2395    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2396    fail, which would bring us to the state representing 'd' in the second word where we would
2397    try 'g' and succeed, proceeding to match 'cdgu'.

        Parameters, as used below:
          pRExC_state - compile state; handed to add_data(), whose result is used
                        as the index of the data slot that receives the new
                        reg_ac_data.
          source      - node whose ARG() is the data slot index of the already
                        built reg_trie_data.
          stclass     - node whose ARG is set to the index of the new data slot.
          depth       - only used to indent the debug output; marked unused when
                        DEBUGGING is not defined.

        Effects: allocates a reg_ac_data that records the slot index of the trie,
        its own copy of trie->states (numstates entries) and a fail[] array with
        one U32 per state, and stores it in the new data slot.  The temporary work
        queue q is released before returning.  Nothing is returned.
2398  */
2399  /* add a fail transition */
2400     const U32 trie_offset = ARG(source);
2401     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
         /* work queue of state numbers, numstates entries, zero-filled below */
2402     U32 *q;
2403     const U32 ucharcount = trie->uniquecharcount;
2404     const U32 numstates = trie->statecount;
         /* NOTE(review): ubound is not referenced by name anywhere else in this
          * function; presumably the TRIE_TRANS_STATE macro (defined outside this
          * chunk) picks it up -- confirm before renaming or removing it. */
2405     const U32 ubound = trie->lasttrans + ucharcount;
         /* read and write cursors into q */
2406     U32 q_read = 0;
2407     U32 q_write = 0;
2408     U32 charid;
2409     U32 base = trie->states[ 1 ].trans.base;
2410     U32 *fail;
2411     reg_ac_data *aho;
2412     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2413     GET_RE_DEBUG_FLAGS_DECL;
2414
2415     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2416 #ifndef DEBUGGING
2417     PERL_UNUSED_ARG(depth);
2418 #endif
2419
2420
         /* Point stclass at the new slot and fill that slot with a fresh
          * reg_ac_data: a copy of the trie states plus a zeroed fail table. */
2421     ARG_SET( stclass, data_slot );
2422     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2423     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2424     aho->trie=trie_offset;
2425     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2426     Copy( trie->states, aho->states, numstates, reg_trie_state );
2427     Newxz( q, numstates, U32);
2428     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2429     aho->refcount = 1;
2430     fail = aho->fail;
2431     /* initialize fail[0..1] to be 1 so that we always have
2432        a valid final fail state */
2433     fail[ 0 ] = fail[ 1 ] = 1;
2434
         /* Seed the queue: every state that TRIE_TRANS_STATE reports as a
          * transition target of state 1 is queued and fails back to state 1. */
2435     for ( charid = 0; charid < ucharcount ; charid++ ) {
2436         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2437         if ( newstate ) {
2438             q[ q_write ] = newstate;
2439             /* set to point at the root */
2440             fail[ q[ q_write++ ] ]=1;
2441         }
2442     }
         /* Process queued states in the order they were added.  From here on
          * the cursors are reduced modulo numstates when indexing q. */
2443     while ( q_read < q_write) {
2444         const U32 cur = q[ q_read++ % numstates ];
2445         base = trie->states[ cur ].trans.base;
2446
2447         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2448             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2449             if (ch_state) {
2450                 U32 fail_state = cur;
2451                 U32 fail_base;
                     /* Follow the fail chain starting at cur until
                      * TRIE_TRANS_STATE yields a non-zero result for charid.
                      * NOTE(review): termination presumably relies on
                      * fail[0..1] being 1 (set above) together with the
                      * final macro argument of 1 -- confirm against the
                      * macro definition. */
2452                 do {
2453                     fail_state = fail[ fail_state ];
2454                     fail_base = aho->states[ fail_state ].trans.base;
2455                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2456
                     /* The target of that transition is the fail state of
                      * ch_state. */
2457                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2458                 fail[ ch_state ] = fail_state;
                     /* If ch_state has no word number of its own but its fail
                      * state does, copy that word number.  Only the aho copy
                      * of the states is written; trie->states is not. */
2459                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2460                 {
2461                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2462                 }
2463                 q[ q_write++ % numstates] = ch_state;
2464             }
2465         }
2466     }
2467     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2468        when we fail in state 1, this allows us to use the
2469        charclass scan to find a valid start char. This is based on the principle
2470        that there is a good chance the string being searched contains lots of stuff
2471        that cannot be a start char.
2472      */
2473     fail[ 0 ] = fail[ 1 ] = 0;
         /* Debug only: print the fail table, one entry per state.  q_read is
          * reused here as a plain loop counter. */
2474     DEBUG_TRIE_COMPILE_r({
2475         PerlIO_printf(Perl_debug_log,
2476                       "%*sStclass Failtable (%"UVuf" states): 0",
2477                       (int)(depth * 2), "", (UV)numstates
2478         );
2479         for( q_read=1; q_read<numstates; q_read++ ) {
2480             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2481         }
2482         PerlIO_printf(Perl_debug_log, "\n");
2483     });
2484     Safefree(q);
2485     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2486 }
2487
2488
2489 /*
2490  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2491  * These need to be revisited when a newer toolchain becomes available.
      *
      * The test below (re)defines SPARC64_GCC_WORKAROUND to 1 when both
      * __sparc64__ and __GNUC__ are defined and the GCC version is below 2.96.
      * The macro is not referenced in the code immediately around this guard.
2492  */
2493 #if defined(__sparc64__) && defined(__GNUC__)
2494 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2495 #       undef  SPARC64_GCC_WORKAROUND
2496 #       define SPARC64_GCC_WORKAROUND 1
2497 #   endif
2498 #endif
2499
/* DEBUG_PEEP(str, scan, depth): debug trace for one regnode.
 *
 * The whole body is wrapped in DEBUG_OPTIMISE_r.  If scan is non-NULL it
 * prints a single line to Perl_debug_log: depth*2 columns of indentation,
 * the label str, the node number of scan, the regprop() text for scan, and
 * the node number of regnext(scan) (0 when there is no next node).
 *
 * str is pasted into the format string by literal concatenation, so it has
 * to be a string literal.  The expansion references RExC_rx, which must be
 * usable at the call site, and it already ends in a semicolon.
 */
2500 #define DEBUG_PEEP(str,scan,depth) \
2501     DEBUG_OPTIMISE_r({if (scan){ \
2502        SV * const mysv=sv_newmortal(); \
2503        regnode *Next = regnext(scan); \
2504        regprop(RExC_rx, mysv, scan); \
2505        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2506        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2507        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2508    }});
2509
2510
2511 /* The below joins as many adjacent EXACTish nodes as possible into a single
2512  * one, and looks for problematic sequences of characters whose folds vs.
2513  * non-folds have sufficiently different lengths, that the optimizer would be
2514  * fooled into rejecting legitimate matches of them, and the trie construction
2515  * code can't cope with them.  The joining is only done if:
2516  * 1) there is room in the current conglomerated node to entirely contain the
2517  *    next one.
2518  * 2) they are the exact same node type
2519  *
2520  * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2521  * these get optimized out
2522  *
2523  * If there are problematic code sequences, *min_subtract is set to the delta
2524  * that the minimum size of the node can be less than its actual size.  And,
2525  * the node type of the result is changed to reflect that it contains these
2526  * sequences.
2527  *
2528  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2529  * and contains LATIN SMALL LETTER SHARP S
2530  *
2531  * This is as good a place as any to discuss the design of handling these
2532  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2533  * are three code points in Unicode whose folded lengths differ so much from
2534  * the un-folded lengths that it causes problems for the optimizer and trie
2535  * construction.  Why only these are problematic, and not others where lengths
2536  * also differ is something I (khw) do not understand.  New versions of Unicode
2537  * might add more such code points.  Hopefully the logic in fold_grind.t that
2538  * figures out what to test (in part by verifying that each size-combination
2539  * gets tested) will catch any that do come along, so they can be added to the
2540  * special handling below.  The chances of new ones are actually rather small,
2541  * as most, if not all, of the world's scripts that have casefolding have
2542  * already been encoded by Unicode.  Also, a number of Unicode's decisions were
2543  * made to allow compatibility with pre-existing standards, and almost all of
2544  * those have already been dealt with.  These would otherwise be the most
2545  * likely candidates for generating further tricky sequences.  In other words,
2546  * Unicode by itself is unlikely to add new ones unless it is for compatibility
2547  * with pre-existing standards, and there aren't many of those left.
2548  *
2549  * The previous designs for dealing with these involved assigning a special
2550  * node for them.  This approach doesn't work, as evidenced by this example:
2551  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2552  * Both these fold to "sss", but if the pattern is parsed to create a node
2553  * that would match just the \xDF, it won't be able to handle the case where a
2554  * successful match would have to cross the node's boundary.  The new approach
2555  * that hopefully generally solves the problem generates an EXACTFU_SS node
2556  * that is "sss".
2557  *
2558  * There are a number of components to the approach (a lot of work for just
2559  * three code points!):
2560  * 1)   This routine examines each EXACTFish node that could contain the
2561  *      problematic sequences.  It returns in *min_subtract how much to
2562  *      subtract from the actual length of the string to get a real minimum
2563  *      for one that could match it.  This number is usually 0 except for the
2564  *      problematic sequences.  This delta is used by the caller to adjust the
2565  *      min length of the match, and the delta between min and max, so that the
2566  *      optimizer doesn't reject these possibilities based on size constraints.
2567  * 2)   These sequences are not currently correctly handled by the trie code
2568  *      either, so it changes the joined node type to ops that are not handled
2569  *      by tries, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
2570  * 3)   This is sufficient for the two Greek sequences (described below), but
2571  *      the one involving the Sharp s (\xDF) needs more.  The node type
2572  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2573  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2574  *      case where there is a possible fold length change.  That means that a
2575  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2576  *      itself with length changes, and so can be processed faster.  regexec.c
2577  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2578  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2579  *      However, probably mostly for historical reasons, the pre-folding isn't
2580  *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2581  *      nodes, as what they fold to isn't known until runtime.)  The fold
2582  *      possibilities for the non-UTF8 patterns are quite simple, except for
2583  *      the sharp s.  All the ones that don't involve a UTF-8 target string
2584  *      are members of a fold-pair, and arrays are set up for all of them
2585  *      that quickly find the other member of the pair.  It might actually
2586  *      be faster to pre-fold these, but it isn't currently done, except for
2587  *      the sharp s.  Code elsewhere in this file makes sure that it gets
2588  *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
2589  *      issues described in the next item.
2590  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2591  *      'ss' or not is not knowable at compile time.  It will match iff the
2592  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2593  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2594  *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
2595  *      described in item 3).  An assumption that the optimizer part of
2596  *      regexec.c (probably unwittingly) makes is that a character in the
2597  *      pattern corresponds to at most a single character in the target string.
2598  *      (And I do mean character, and not byte here, unlike other parts of the
2599  *      documentation that have never been updated to account for multibyte
2600  *      Unicode.)  This assumption is wrong only in this case, as all other
2601  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2602  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2603  *      reluctant to try to change this assumption, so instead the code punts.
2604  *      This routine examines EXACTF nodes for the sharp s, and returns a
2605  *      boolean indicating whether or not the node is an EXACTF node that
2606  *      contains a sharp s.  When it is true, the caller sets a flag that later
2607  *      causes the optimizer in this file to not set values for the floating
2608  *      and fixed string lengths, and thus avoids the optimizer code in
2609  *      regexec.c that makes the invalid assumption.  Thus, there is no
2610  *      optimization based on string lengths for EXACTF nodes that contain the
2611  *      sharp s.  This only happens for /id rules (which means the pattern
2612  *      isn't in UTF-8).
2613  */
2614
/* JOIN_EXACT(scan, min_subtract, has_exactf_sharp_s, flags):
 * calls join_exact() on scan, but only when PL_regkind[OP(scan)] is EXACT.
 * NULL is passed for the val argument and depth+1 for the depth argument.
 *
 * The expansion uses pRExC_state and depth from the call site, so both
 * must be in scope there.
 *
 * NOTE(review): this expands to a bare if statement, so an else written
 * directly after the invocation would attach to it; scan is evaluated twice
 * and has_exactf_sharp_s is passed through without parentheses.  Worth
 * wrapping in do { } while (0) once all call sites have been checked.
 */
2615 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2616     if (PL_regkind[OP(scan)] == EXACT) \
2617         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2618
2619 STATIC U32
2620 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2621     /* Merge several consecutive EXACTish nodes into one. */
2622     regnode *n = regnext(scan);
2623     U32 stringok = 1;
2624     regnode *next = scan + NODE_SZ_STR(scan);
2625     U32 merged = 0;
2626     U32 stopnow = 0;
2627 #ifdef DEBUGGING
2628     regnode *stop = scan;
2629     GET_RE_DEBUG_FLAGS_DECL;
2630 #else
2631     PERL_UNUSED_ARG(depth);
2632 #endif
2633
2634     PERL_ARGS_ASSERT_JOIN_EXACT;
2635 #ifndef EXPERIMENTAL_INPLACESCAN
2636     PERL_UNUSED_ARG(flags);
2637     PERL_UNUSED_ARG(val);
2638 #endif
2639     DEBUG_PEEP("join",scan,depth);
2640
2641     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2642      * EXACT ones that are mergeable to the current one. */
2643     while (n
2644            && (PL_regkind[OP(n)] == NOTHING
2645                || (stringok && OP(n) == OP(scan)))
2646            && NEXT_OFF(n)
2647            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2648     {
2649
2650         if (OP(n) == TAIL || n > next)
2651             stringok = 0;
2652         if (PL_regkind[OP(n)] == NOTHING) {
2653             DEBUG_PEEP("skip:",n,depth);
2654             NEXT_OFF(scan) += NEXT_OFF(n);
2655             next = n + NODE_STEP_REGNODE;
2656 #ifdef DEBUGGING
2657             if (stringok)
2658                 stop = n;
2659 #endif
2660             n = regnext(n);
2661         }
2662         else if (stringok) {
2663             const unsigned int oldl = STR_LEN(scan);
2664             regnode * const nnext = regnext(n);
2665
2666             if (oldl + STR_LEN(n) > U8_MAX)
2667                 break;
2668
2669             DEBUG_PEEP("merg",n,depth);
2670             merged++;
2671
2672             NEXT_OFF(scan) += NEXT_OFF(n);
2673             STR_LEN(scan) += STR_LEN(n);
2674             next = n + NODE_SZ_STR(n);
2675             /* Now we can overwrite *n : */
2676             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2677 #ifdef DEBUGGING
2678             stop = next - 1;
2679 #endif
2680             n = nnext;
2681             if (stopnow) break;
2682         }
2683
2684 #ifdef EXPERIMENTAL_INPLACESCAN
2685         if (flags && !NEXT_OFF(n)) {
2686             DEBUG_PEEP("atch", val, depth);
2687             if (reg_off_by_arg[OP(n)]) {
2688                 ARG_SET(n, val - n);
2689             }
2690             else {
2691                 NEXT_OFF(n) = val - n;
2692             }
2693             stopnow = 1;
2694         }
2695 #endif
2696     }
2697
2698     *min_subtract = 0;
2699     *has_exactf_sharp_s = FALSE;
2700
2701     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2702      * can now analyze for sequences of problematic code points.  (Prior to
2703      * this final joining, sequences could have been split over boundaries, and
2704      * hence missed).  The sequences only happen in folding, hence for any
2705      * non-EXACT EXACTish node */
2706     if (OP(scan) != EXACT) {
2707         U8 *s;
2708         U8 * s0 = (U8*) STRING(scan);
2709         U8 * const s_end = s0 + STR_LEN(scan);
2710
2711         /* The below is perhaps overboard, but this allows us to save a test
2712          * each time through the loop at the expense of a mask.  This is
2713          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2714          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2715          * This uses an exclusive 'or' to find that bit and then inverts it to
2716          * form a mask, with just a single 0, in the bit position where 'S' and
2717          * 's' differ. */
2718         const U8 S_or_s_mask = ~ ('S' ^ 's');
2719         const U8 s_masked = 's' & S_or_s_mask;
2720
2721         /* One pass is made over the node's string looking for all the
2722          * possibilities.  to avoid some tests in the loop, there are two main
2723          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2724          * non-UTF-8 */
2725         if (UTF) {
2726
2727             /* There are two problematic Greek code points in Unicode
2728              * casefolding
2729              *
2730              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2731              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2732              *
2733              * which casefold to
2734              *
2735              * Unicode                      UTF-8
2736              *
2737              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2738              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2739              *
2740              * This means that in case-insensitive matching (or "loose
2741              * matching", as Unicode calls it), an EXACTF of length six (the
2742              * UTF-8 encoded byte length of the above casefolded versions) can
2743              * match a target string of length two (the byte length of UTF-8
2744              * encoded U+0390 or U+03B0).  This would rather mess up the
2745              * minimum length computation.  (there are other code points that
2746              * also fold to these two sequences, but the delta is smaller)
2747              *
2748              * If these sequences are found, the minimum length is decreased by
2749              * four (six minus two).
2750              *
2751              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2752              * LETTER SHARP S.  We decrease the min length by 1 for each
2753              * occurrence of 'ss' found */
2754
2755 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2756 #           define U390_first_byte 0xb4
2757             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2758 #           define U3B0_first_byte 0xb5
2759             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2760 #else
2761 #           define U390_first_byte 0xce
2762             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2763 #           define U3B0_first_byte 0xcf
2764             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2765 #endif
2766             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2767                                                  yields a net of 0 */
2768             /* Examine the string for one of the problematic sequences */
2769             for (s = s0;
2770                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2771                                  * sequence we are looking for is 2 */
2772                  s += UTF8SKIP(s))
2773             {
2774
2775                 /* Look for the first byte in each problematic sequence */
2776                 switch (*s) {
2777                     /* We don't have to worry about other things that fold to
2778                      * 's' (such as the long s, U+017F), as all above-latin1
2779                      * code points have been pre-folded */
2780                     case 's':
2781                     case 'S':
2782
2783                         /* Current character is an 's' or 'S'.  If next one is
2784                          * as well, we have the dreaded sequence */
2785                         if (((*(s+1) & S_or_s_mask) == s_masked)
2786                             /* These two node types don't have special handling
2787                              * for 'ss' */
2788                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2789                         {
2790                             *min_subtract += 1;
2791                             OP(scan) = EXACTFU_SS;
2792                             s++;    /* No need to look at this character again */
2793                         }
2794                         break;
2795
2796                     case U390_first_byte:
2797                         if (s_end - s >= len
2798
2799                             /* The 1's are because are skipping comparing the
2800                              * first byte */
2801                             && memEQ(s + 1, U390_tail, len - 1))
2802                         {
2803                             goto greek_sequence;
2804                         }
2805                         break;
2806
2807                     case U3B0_first_byte:
2808                         if (! (s_end - s >= len
2809                                && memEQ(s + 1, U3B0_tail, len - 1)))
2810                         {
2811                             break;
2812                         }
2813                       greek_sequence:
2814                         *min_subtract += 4;
2815
2816                         /* This can't currently be handled by trie's, so change
2817                          * the node type to indicate this.  If EXACTFA and
2818                          * EXACTFL were ever to be handled by trie's, this
2819                          * would have to be changed.  If this node has already
2820                          * been changed to EXACTFU_SS in this loop, leave it as
2821                          * is.  (I (khw) think it doesn't matter in regexec.c
2822                          * for UTF patterns, but no need to change it */
2823                         if (OP(scan) == EXACTFU) {
2824                             OP(scan) = EXACTFU_NO_TRIE;
2825                         }
2826                         s += 6; /* We already know what this sequence is.  Skip
2827                                    the rest of it */
2828                         break;
2829                 }
2830             }
2831         }
2832         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2833
2834             /* Here, the pattern is not UTF-8.  We need to look only for the
2835              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2836              * in the final position.  Otherwise we can stop looking 1 byte
2837              * earlier because have to find both the first and second 's' */
2838             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2839
2840             for (s = s0; s < upper; s++) {
2841                 switch (*s) {
2842                     case 'S':
2843                     case 's':
2844                         if (s_end - s > 1
2845                             && ((*(s+1) & S_or_s_mask) == s_masked))
2846                         {
2847                             *min_subtract += 1;
2848
2849                             /* EXACTF nodes need to know that the minimum
2850                              * length changed so that a sharp s in the string
2851                              * can match this ss in the pattern, but they
2852                              * remain EXACTF nodes, as they are not trie'able,
2853                              * so don't have to invent a new node type to
2854                              * exclude them from the trie code */
2855                             if (OP(scan) != EXACTF) {
2856                                 OP(scan) = EXACTFU_SS;
2857                             }
2858                             s++;
2859                         }
2860                         break;
2861                     case LATIN_SMALL_LETTER_SHARP_S:
2862                         if (OP(scan) == EXACTF) {
2863                             *has_exactf_sharp_s = TRUE;
2864                         }
2865                         break;
2866                 }
2867             }
2868         }
2869     }
2870
2871 #ifdef DEBUGGING
2872     /* Allow dumping but overwriting the collection of skipped
2873      * ops and/or strings with fake optimized ops */
2874     n = scan + NODE_SZ_STR(scan);
2875     while (n <= stop) {
2876         OP(n) = OPTIMIZED;
2877         FLAGS(n) = 0;
2878         NEXT_OFF(n) = 0;
2879         n++;
2880     }
2881 #endif
2882     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2883     return stopnow;
2884 }
2885
2886 /* REx optimizer.  Converts nodes into quicker variants "in place".
2887    Finds fixed substrings.  */
2888
2889 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2890    to the position after last scanned or to NULL. */
2891
     /* Allocate the 'and_withp' variable of the invoking scope: asserts that it
      * is currently NULL, allocates room for one struct
      * regnode_charclass_class with Newx(), and registers the pointer with
      * SAVEFREEPV() (presumably so it is released when the enclosing save
      * scope is unwound -- confirm against the savestack documentation).
      * Expands to several statements, so it must be used inside a braced
      * block, never as the sole body of an unbraced if/else. */
2892 #define INIT_AND_WITHP \
2893     assert(!and_withp); \
2894     Newx(and_withp,1,struct regnode_charclass_class); \
2895     SAVEFREEPV(and_withp)
2896
2897 /* A linked chain of frames describing the sub-patterns currently being
2898    processed that need to be handled separately/specially in study_chunk.
2899    It exists so that study_chunk can simulate recursion without losing
        state: each frame records where the sub-pattern ends, where to resume
        afterwards, and links back to the frame that was active before it.  */
2900 struct scan_frame;
2901 typedef struct scan_frame {
2902     regnode *last;  /* last node to process in this frame */
2903     regnode *next;  /* next node to process when last is reached */
2904     struct scan_frame *prev; /*previous frame*/
2905     I32 stop; /* what stopparen do we use */
2906 } scan_frame;
2907
2908
     /* Shorthand for scan_commit() that supplies the 'is_inf' variable of the
      * invoking scope as the final argument; usable only where a variable of
      * that name is in scope. */
2909 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2910
     /* Expands to two switch clauses, 'case nAmE:' and 'case N<nAmE>:', each of
      * which loops over the code points 0..255 and tests them with
      * is_<nAmE>_cp().
      *
      * For 'case nAmE': when SCF_DO_STCLASS_AND is set in 'flags', every code
      * point that fails the test is cleared from the bitmap of
      * data->start_class; otherwise every code point that passes the test is
      * set in it.  The N-prefixed case does the same with the sense of the
      * test inverted.
      *
      * Relies on 'flags', 'data' and 'value' being declared in the invoking
      * scope.  The expansion ends in a 'break' without a semicolon, so the
      * invocation must supply the terminating ';'. */
2911 #define CASE_SYNST_FNC(nAmE)                                       \
2912 case nAmE:                                                         \
2913     if (flags & SCF_DO_STCLASS_AND) {                              \
2914             for (value = 0; value < 256; value++)                  \
2915                 if (!is_ ## nAmE ## _cp(value))                       \
2916                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2917     }                                                              \
2918     else {                                                         \
2919             for (value = 0; value < 256; value++)                  \
2920                 if (is_ ## nAmE ## _cp(value))                        \
2921                     ANYOF_BITMAP_SET(data->start_class, value);    \
2922     }                                                              \
2923     break;                                                         \
2924 case N ## nAmE:                                                    \
2925     if (flags & SCF_DO_STCLASS_AND) {                              \
2926             for (value = 0; value < 256; value++)                   \
2927                 if (is_ ## nAmE ## _cp(value))                         \
2928                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2929     }                                                               \
2930     else {                                                          \
2931             for (value = 0; value < 256; value++)                   \
2932                 if (!is_ ## nAmE ## _cp(value))                        \
2933                     ANYOF_BITMAP_SET(data->start_class, value);     \
2934     }                                                               \
2935     break
2936
2937
2938
2939 STATIC I32
2940 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2941                         I32 *minlenp, I32 *deltap,
2942                         regnode *last,
2943                         scan_data_t *data,
2944                         I32 stopparen,
2945                         U8* recursed,
2946                         struct regnode_charclass_class *and_withp,
2947                         U32 flags, U32 depth)
2948                         /* scanp: Start here (read-write). */
2949                         /* deltap: Write maxlen-minlen here. */
2950                         /* last: Stop before this one. */
2951                         /* data: string data about the pattern */
2952                         /* stopparen: treat close N as END */
2953                         /* recursed: which subroutines have we recursed into */
2954                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2955 {
2956     dVAR;
2957     I32 min = 0, pars = 0, code;
2958     regnode *scan = *scanp, *next;
2959     I32 delta = 0;
2960     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2961     int is_inf_internal = 0;            /* The studied chunk is infinite */
2962     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2963     scan_data_t data_fake;
2964     SV *re_trie_maxbuff = NULL;
2965     regnode *first_non_open = scan;
2966     I32 stopmin = I32_MAX;
2967     scan_frame *frame = NULL;
2968     GET_RE_DEBUG_FLAGS_DECL;
2969
2970     PERL_ARGS_ASSERT_STUDY_CHUNK;
2971
2972 #ifdef DEBUGGING
2973     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2974 #endif
2975
2976     if ( depth == 0 ) {
2977         while (first_non_open && OP(first_non_open) == OPEN)
2978             first_non_open=regnext(first_non_open);
2979     }
2980
2981
2982   fake_study_recurse:
2983     while ( scan && OP(scan) != END && scan < last ){
2984         UV min_subtract = 0;    /* How much to subtract from the minimum node
2985                                    length to get a real minimum (because the
2986                                    folded version may be shorter) */
2987         bool has_exactf_sharp_s = FALSE;
2988         /* Peephole optimizer: */
2989         DEBUG_STUDYDATA("Peep:", data,depth);
2990         DEBUG_PEEP("Peep",scan,depth);
2991
2992         /* It's not clear to khw or hv why this is done here, and not in the
2993          * clauses that deal with EXACT nodes.  khw's guess is that it's
2994          * because of a previous design */
2995         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
2996
2997         /* Follow the next-chain of the current node and optimize
2998            away all the NOTHINGs from it.  */
2999         if (OP(scan) != CURLYX) {
3000             const int max = (reg_off_by_arg[OP(scan)]
3001                        ? I32_MAX
3002                        /* I32 may be smaller than U16 on CRAYs! */
3003                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3004             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3005             int noff;
3006             regnode *n = scan;
3007
3008             /* Skip NOTHING and LONGJMP. */
3009             while ((n = regnext(n))
3010                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3011                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3012                    && off + noff < max)
3013                 off += noff;
3014             if (reg_off_by_arg[OP(scan)])
3015                 ARG(scan) = off;
3016             else
3017                 NEXT_OFF(scan) = off;
3018         }
3019
3020
3021
3022         /* The principal pseudo-switch.  Cannot be a switch, since we
3023            look into several different things.  */
3024         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3025                    || OP(scan) == IFTHEN) {
3026             next = regnext(scan);
3027             code = OP(scan);
3028             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3029
3030             if (OP(next) == code || code == IFTHEN) {
3031                 /* NOTE - There is similar code to this block below for handling
3032                    TRIE nodes on a re-study.  If you change stuff here check there
3033                    too. */
3034                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3035                 struct regnode_charclass_class accum;
3036                 regnode * const startbranch=scan;
3037
3038                 if (flags & SCF_DO_SUBSTR)
3039                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3040                 if (flags & SCF_DO_STCLASS)
3041                     cl_init_zero(pRExC_state, &accum);
3042
3043                 while (OP(scan) == code) {
3044                     I32 deltanext, minnext, f = 0, fake;
3045                     struct regnode_charclass_class this_class;
3046
3047                     num++;
3048                     data_fake.flags = 0;
3049                     if (data) {
3050                         data_fake.whilem_c = data->whilem_c;
3051                         data_fake.last_closep = data->last_closep;
3052                     }
3053                     else
3054                         data_fake.last_closep = &fake;
3055
3056                     data_fake.pos_delta = delta;
3057                     next = regnext(scan);
3058                     scan = NEXTOPER(scan);
3059                     if (code != BRANCH)
3060                         scan = NEXTOPER(scan);
3061                     if (flags & SCF_DO_STCLASS) {
3062                         cl_init(pRExC_state, &this_class);
3063                         data_fake.start_class = &this_class;
3064                         f = SCF_DO_STCLASS_AND;
3065                     }
3066                     if (flags & SCF_WHILEM_VISITED_POS)
3067                         f |= SCF_WHILEM_VISITED_POS;
3068
3069                     /* we suppose the run is continuous, last=next...*/
3070                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3071                                           next, &data_fake,
3072                                           stopparen, recursed, NULL, f,depth+1);
3073                     if (min1 > minnext)
3074                         min1 = minnext;
3075                     if (max1 < minnext + deltanext)
3076                         max1 = minnext + deltanext;
3077                     if (deltanext == I32_MAX)
3078                         is_inf = is_inf_internal = 1;
3079                     scan = next;
3080                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3081                         pars++;
3082                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3083                         if ( stopmin > minnext)
3084                             stopmin = min + min1;
3085                         flags &= ~SCF_DO_SUBSTR;
3086                         if (data)
3087                             data->flags |= SCF_SEEN_ACCEPT;
3088                     }
3089                     if (data) {
3090                         if (data_fake.flags & SF_HAS_EVAL)
3091                             data->flags |= SF_HAS_EVAL;
3092                         data->whilem_c = data_fake.whilem_c;
3093                     }
3094                     if (flags & SCF_DO_STCLASS)
3095                         cl_or(pRExC_state, &accum, &this_class);
3096                 }
3097                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3098                     min1 = 0;
3099                 if (flags & SCF_DO_SUBSTR) {
3100                     data->pos_min += min1;
3101                     data->pos_delta += max1 - min1;
3102                     if (max1 != min1 || is_inf)
3103                         data->longest = &(data->longest_float);
3104                 }
3105                 min += min1;
3106                 delta += max1 - min1;
3107                 if (flags & SCF_DO_STCLASS_OR) {
3108                     cl_or(pRExC_state, data->start_class, &accum);
3109                     if (min1) {
3110                         cl_and(data->start_class, and_withp);
3111                         flags &= ~SCF_DO_STCLASS;
3112                     }
3113                 }
3114                 else if (flags & SCF_DO_STCLASS_AND) {
3115                     if (min1) {
3116                         cl_and(data->start_class, &accum);
3117                         flags &= ~SCF_DO_STCLASS;
3118                     }
3119                     else {
3120                         /* Switch to OR mode: cache the old value of
3121                          * data->start_class */
3122                         INIT_AND_WITHP;
3123                         StructCopy(data->start_class, and_withp,
3124                                    struct regnode_charclass_class);
3125                         flags &= ~SCF_DO_STCLASS_AND;
3126                         StructCopy(&accum, data->start_class,
3127                                    struct regnode_charclass_class);
3128                         flags |= SCF_DO_STCLASS_OR;
3129                         data->start_class->flags |= ANYOF_EOS;
3130                     }
3131                 }
3132
3133                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3134                 /* demq.
3135
3136                    Assuming this was/is a branch we are dealing with: 'scan' now
3137                    points at the item that follows the branch sequence, whatever
3138                    it is. We now start at the beginning of the sequence and look
3139                    for subsequences of
3140
3141                    BRANCH->EXACT=>x1
3142                    BRANCH->EXACT=>x2
3143                    tail
3144
3145                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3146
3147                    If we can find such a subsequence we need to turn the first
3148                    element into a trie and then add the subsequent branch exact
3149                    strings to the trie.
3150
3151                    We have two cases
3152
3153                      1. patterns where the whole set of branches can be converted.
3154
3155                      2. patterns where only a subset can be converted.
3156
3157                    In case 1 we can replace the whole set with a single regop
3158                    for the trie. In case 2 we need to keep the start and end
3159                    branches so
3160
3161                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3162                      becomes BRANCH TRIE; BRANCH X;
3163
3164                   There is an additional case, that being where there is a
3165                   common prefix, which gets split out into an EXACT like node
3166                   preceding the TRIE node.
3167
3168                   If x(1..n)==tail then we can do a simple trie, if not we make
3169                   a "jump" trie, such that when we match the appropriate word
3170                   we "jump" to the appropriate tail node. Essentially we turn
3171                   a nested if into a case structure of sorts.
3172
3173                 */
3174
3175                     int made=0;
3176                     if (!re_trie_maxbuff) {
3177                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3178                         if (!SvIOK(re_trie_maxbuff))
3179                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3180                     }
3181                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3182                         regnode *cur;
3183                         regnode *first = (regnode *)NULL;
3184                         regnode *last = (regnode *)NULL;
3185                         regnode *tail = scan;
3186                         U8 optype = 0;
3187                         U32 count=0;
3188
3189 #ifdef DEBUGGING
3190                         SV * const mysv = sv_newmortal();       /* for dumping */
3191 #endif
3192                         /* var tail is used because there may be a TAIL
3193                            regop in the way. Ie, the exacts will point to the
3194                            thing following the TAIL, but the last branch will
3195                            point at the TAIL. So we advance tail. If we
3196                            have nested (?:) we may have to move through several
3197                            tails.
3198                          */
3199
3200                         while ( OP( tail ) == TAIL ) {
3201                             /* this is the TAIL generated by (?:) */
3202                             tail = regnext( tail );
3203                         }
3204
3205
3206                         DEBUG_OPTIMISE_r({
3207                             regprop(RExC_rx, mysv, tail );
3208                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3209                                 (int)depth * 2 + 2, "",
3210                                 "Looking for TRIE'able sequences. Tail node is: ",
3211                                 SvPV_nolen_const( mysv )
3212                             );
3213                         });
3214
3215                         /*
3216
3217                            step through the branches, cur represents each
3218                            branch, noper is the first thing to be matched
3219                            as part of that branch and noper_next is the
3220                            regnext() of that node. if noper is an EXACT
3221                            and noper_next is the same as scan (our current
3222                            position in the regex) then the EXACT branch is
3223                            a possible optimization target. Once we have
3224                            two or more consecutive such branches we can
3225                            create a trie of the EXACT's contents and stitch
3226                            it in place. If the sequence represents all of
3227                            the branches we eliminate the whole thing and
3228                            replace it with a single TRIE. If it is a
3229                            subsequence then we need to stitch it in. This
3230                            means the first branch has to remain, and needs
3231                            to be repointed at the item on the branch chain
3232                            following the last branch optimized. This could
3233                            be either a BRANCH, in which case the
3234                            subsequence is internal, or it could be the
3235                            item following the branch sequence in which
3236                            case the subsequence is at the end.
3237
3238                         */
3239
3240                         /* don't use tail as the end marker for this traversal */
3241                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3242                             regnode * const noper = NEXTOPER( cur );
3243 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3244                             regnode * const noper_next = regnext( noper );
3245 #endif
3246
3247                             DEBUG_OPTIMISE_r({
3248                                 regprop(RExC_rx, mysv, cur);
3249                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3250                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3251
3252                                 regprop(RExC_rx, mysv, noper);
3253                                 PerlIO_printf( Perl_debug_log, " -> %s",
3254                                     SvPV_nolen_const(mysv));
3255
3256                                 if ( noper_next ) {
3257                                   regprop(RExC_rx, mysv, noper_next );
3258                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3259                                     SvPV_nolen_const(mysv));
3260                                 }
3261                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3262                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3263                             });
3264                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3265                                          : PL_regkind[ OP( noper ) ] == EXACT )
3266                                   || OP(noper) == NOTHING )
3267 #ifdef NOJUMPTRIE
3268                                   && noper_next == tail
3269 #endif
3270                                   && count < U16_MAX)
3271                             {
3272                                 count++;
3273                                 if ( !first || optype == NOTHING ) {
3274                                     if (!first) first = cur;
3275                                     optype = OP( noper );
3276                                 } else {
3277                                     last = cur;
3278                                 }
3279                             } else {
3280 /*
3281     Currently the trie logic handles case insensitive matching properly only
3282     when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
3283     semantics).
3284
3285     If/when this is fixed the following define can be swapped
3286     in below to fully enable trie logic.
3287
3288 #define TRIE_TYPE_IS_SAFE 1
3289
3290 Note that join_exact() assumes that the other types of EXACTFish nodes are not
3291 used in tries, so that would have to be updated if this changed
3292
3293 */
3294 #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
3295
3296                                 if ( last && TRIE_TYPE_IS_SAFE ) {
3297                                     make_trie( pRExC_state,
3298                                             startbranch, first, cur, tail, count,
3299                                             optype, depth+1 );
3300                                 }
3301                                 if ( PL_regkind[ OP( noper ) ] == EXACT
3302 #ifdef NOJUMPTRIE
3303                                      && noper_next == tail
3304 #endif
3305                                 ){
3306                                     count = 1;
3307                                     first = cur;
3308                                     optype = OP( noper );
3309                                 } else {
3310                                     count = 0;
3311                                     first = NULL;
3312                                     optype = 0;
3313                                 }
3314                                 last = NULL;
3315                             }
3316                         }
3317                         DEBUG_OPTIMISE_r({
3318                             regprop(RExC_rx, mysv, cur);
3319                             PerlIO_printf( Perl_debug_log,
3320                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3321                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3322
3323                         });
3324
3325                         if ( last && TRIE_TYPE_IS_SAFE ) {
3326                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3327 #ifdef TRIE_STUDY_OPT
3328                             if ( ((made == MADE_EXACT_TRIE &&
3329                                  startbranch == first)
3330                                  || ( first_non_open == first )) &&
3331                                  depth==0 ) {
3332                                 flags |= SCF_TRIE_RESTUDY;
3333                                 if ( startbranch == first
3334                                      && scan == tail )
3335                                 {
3336                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3337                                 }
3338                             }
3339 #endif
3340                         }
3341                     }
3342
3343                 } /* do trie */
3344
3345             }
3346             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3347                 scan = NEXTOPER(NEXTOPER(scan));
3348             } else                      /* single branch is optimized. */
3349                 scan = NEXTOPER(scan);
3350             continue;
3351         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3352             scan_frame *newframe = NULL;
3353             I32 paren;
3354             regnode *start;
3355             regnode *end;
3356
3357             if (OP(scan) != SUSPEND) {
3358             /* set the pointer */
3359                 if (OP(scan) == GOSUB) {
3360                     paren = ARG(scan);
3361                     RExC_recurse[ARG2L(scan)] = scan;
3362                     start = RExC_open_parens[paren-1];
3363                     end   = RExC_close_parens[paren-1];
3364                 } else {
3365                     paren = 0;
3366                     start = RExC_rxi->program + 1;
3367                     end   = RExC_opend;
3368                 }
3369                 if (!recursed) {
3370                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3371                     SAVEFREEPV(recursed);
3372                 }
3373                 if (!PAREN_TEST(recursed,paren+1)) {
3374                     PAREN_SET(recursed,paren+1);
3375                     Newx(newframe,1,scan_frame);
3376                 } else {
3377                     if (flags & SCF_DO_SUBSTR) {
3378                         SCAN_COMMIT(pRExC_state,data,minlenp);
3379                         data->longest = &(data->longest_float);
3380                     }
3381                     is_inf = is_inf_internal = 1;
3382                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3383                         cl_anything(pRExC_state, data->start_class);
3384                     flags &= ~SCF_DO_STCLASS;
3385                 }
3386             } else {
3387                 Newx(newframe,1,scan_frame);
3388                 paren = stopparen;
3389                 start = scan+2;
3390                 end = regnext(scan);
3391             }
3392             if (newframe) {
3393                 assert(start);
3394                 assert(end);
3395                 SAVEFREEPV(newframe);
3396                 newframe->next = regnext(scan);
3397                 newframe->last = last;
3398                 newframe->stop = stopparen;
3399                 newframe->prev = frame;
3400
3401                 frame = newframe;
3402                 scan =  start;
3403                 stopparen = paren;
3404                 last = end;
3405
3406                 continue;
3407             }
3408         }
3409         else if (OP(scan) == EXACT) {
3410             I32 l = STR_LEN(scan);
3411             UV uc;
3412             if (UTF) {
3413                 const U8 * const s = (U8*)STRING(scan);
3414                 l = utf8_length(s, s + l);
3415                 uc = utf8_to_uvchr(s, NULL);
3416             } else {
3417                 uc = *((U8*)STRING(scan));
3418             }
3419             min += l;
3420             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3421                 /* The code below prefers earlier match for fixed
3422                    offset, later match for variable offset.  */
3423                 if (data->last_end == -1) { /* Update the start info. */
3424                     data->last_start_min = data->pos_min;
3425                     data->last_start_max = is_inf
3426                         ? I32_MAX : data->pos_min + data->pos_delta;
3427                 }
3428                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3429                 if (UTF)
3430                     SvUTF8_on(data->last_found);
3431                 {
3432                     SV * const sv = data->last_found;
3433                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3434                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3435                     if (mg && mg->mg_len >= 0)
3436                         mg->mg_len += utf8_length((U8*)STRING(scan),
3437                                                   (U8*)STRING(scan)+STR_LEN(scan));
3438                 }
3439                 data->last_end = data->pos_min + l;
3440                 data->pos_min += l; /* As in the first entry. */
3441                 data->flags &= ~SF_BEFORE_EOL;
3442             }
3443             if (flags & SCF_DO_STCLASS_AND) {
3444                 /* Check whether it is compatible with what we know already! */
3445                 int compat = 1;
3446
3447
3448                 /* If compatible, we or it in below.  It is compatible if it is
3449                  * in the bitmap and either 1) its bit or its fold is set, or 2)
3450                  * it's for a locale.  Even if there isn't unicode semantics
3451                  * here, at runtime there may be because of matching against a
3452                  * utf8 string, so accept a possible false positive for
3453                  * latin1-range folds */
3454                 if (uc >= 0x100 ||
3455                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3456                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3457                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3458                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3459                     )
3460                 {
3461                     compat = 0;
3462                 }
3463                 ANYOF_CLASS_ZERO(data->start_class);
3464                 ANYOF_BITMAP_ZERO(data->start_class);
3465                 if (compat)
3466                     ANYOF_BITMAP_SET(data->start_class, uc);
3467                 else if (uc >= 0x100) {
3468                     int i;
3469
3470                     /* Some Unicode code points fold to the Latin1 range; as
3471                      * XXX temporary code, instead of figuring out if this is
3472                      * one, just assume it is and set all the start class bits
3473                      * that could be some such above 255 code point's fold
3474                      * which will generate false positives.  As the code
3475                      * elsewhere that does compute the fold settles down, it
3476                      * can be extracted out and re-used here */
3477                     for (i = 0; i < 256; i++){
3478                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3479                             ANYOF_BITMAP_SET(data->start_class, i);
3480                         }
3481                     }
3482                 }
3483                 data->start_class->flags &= ~ANYOF_EOS;
3484                 if (uc < 0x100)
3485                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3486             }
3487             else if (flags & SCF_DO_STCLASS_OR) {
3488                 /* false positive possible if the class is case-folded */
3489                 if (uc < 0x100)
3490                     ANYOF_BITMAP_SET(data->start_class, uc);
3491                 else
3492                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3493                 data->start_class->flags &= ~ANYOF_EOS;
3494                 cl_and(data->start_class, and_withp);
3495             }
3496             flags &= ~SCF_DO_STCLASS;
3497         }
3498         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3499             I32 l = STR_LEN(scan);
3500             UV uc = *((U8*)STRING(scan));
3501
3502             /* Search for fixed substrings supports EXACT only. */
3503             if (flags & SCF_DO_SUBSTR) {
3504                 assert(data);
3505                 SCAN_COMMIT(pRExC_state, data, minlenp);
3506             }
3507             if (UTF) {
3508                 const U8 * const s = (U8 *)STRING(scan);
3509                 l = utf8_length(s, s + l);
3510                 uc = utf8_to_uvchr(s, NULL);
3511             }
3512             else if (has_exactf_sharp_s) {
3513                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3514             }
3515             min += l - min_subtract;
3516             if (min < 0) {
3517                 min = 0;
3518             }
3519             delta += min_subtract;
3520             if (flags & SCF_DO_SUBSTR) {
3521                 data->pos_min += l - min_subtract;
3522                 if (data->pos_min < 0) {
3523                     data->pos_min = 0;
3524                 }
3525                 data->pos_delta += min_subtract;
3526                 if (min_subtract) {
3527                     data->longest = &(data->longest_float);
3528                 }
3529             }
3530             if (flags & SCF_DO_STCLASS_AND) {
3531                 /* Check whether it is compatible with what we know already! */
3532                 int compat = 1;
3533                 if (uc >= 0x100 ||
3534                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3535                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3536                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3537                 {
3538                     compat = 0;
3539                 }
3540                 ANYOF_CLASS_ZERO(data->start_class);
3541                 ANYOF_BITMAP_ZERO(data->start_class);
3542                 if (compat) {
3543                     ANYOF_BITMAP_SET(data->start_class, uc);
3544                     data->start_class->flags &= ~ANYOF_EOS;
3545                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3546                     if (OP(scan) == EXACTFL) {
3547                         /* XXX This set is probably no longer necessary, and
3548                          * probably wrong as LOCALE now is on in the initial
3549                          * state */
3550                         data->start_class->flags |= ANYOF_LOCALE;
3551                     }
3552                     else {
3553
3554                         /* Also set the other member of the fold pair.  In case
3555                          * that unicode semantics is called for at runtime, use
3556                          * the full latin1 fold.  (Can't do this for locale,
3557                          * because not known until runtime) */
3558                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3559
3560                         /* All other (EXACTFL handled above) folds except under
3561                          * /iaa that include s, S, and sharp_s also may include
3562                          * the others */
3563                         if (OP(scan) != EXACTFA) {
3564                             if (uc == 's' || uc == 'S') {
3565                                 ANYOF_BITMAP_SET(data->start_class,
3566                                                  LATIN_SMALL_LETTER_SHARP_S);
3567                             }
3568                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3569                                 ANYOF_BITMAP_SET(data->start_class, 's');
3570                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3571                             }
3572                         }
3573                     }
3574                 }
3575                 else if (uc >= 0x100) {
3576                     int i;
3577                     for (i = 0; i < 256; i++){
3578                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3579                             ANYOF_BITMAP_SET(data->start_class, i);
3580                         }
3581                     }
3582                 }
3583             }
3584             else if (flags & SCF_DO_STCLASS_OR) {
3585                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3586                     /* false positive possible if the class is case-folded.
3587                        Assume that the locale settings are the same... */
3588                     if (uc < 0x100) {
3589                         ANYOF_BITMAP_SET(data->start_class, uc);
3590                         if (OP(scan) != EXACTFL) {
3591
3592                             /* And set the other member of the fold pair, but
3593                              * can't do that in locale because not known until
3594                              * run-time */
3595                             ANYOF_BITMAP_SET(data->start_class,
3596                                              PL_fold_latin1[uc]);
3597
3598                             /* All folds except under /iaa that include s, S,
3599                              * and sharp_s also may include the others */
3600                             if (OP(scan) != EXACTFA) {
3601                                 if (uc == 's' || uc == 'S') {
3602                                     ANYOF_BITMAP_SET(data->start_class,
3603                                                    LATIN_SMALL_LETTER_SHARP_S);
3604                                 }
3605                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3606                                     ANYOF_BITMAP_SET(data->start_class, 's');
3607                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3608                                 }
3609                             }
3610                         }
3611                     }
3612                     data->start_class->flags &= ~ANYOF_EOS;
3613                 }
3614                 cl_and(data->start_class, and_withp);
3615             }
3616             flags &= ~SCF_DO_STCLASS;
3617         }
3618         else if (REGNODE_VARIES(OP(scan))) {
3619             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3620             I32 f = flags, pos_before = 0;
3621             regnode * const oscan = scan;
3622             struct regnode_charclass_class this_class;
3623             struct regnode_charclass_class *oclass = NULL;
3624             I32 next_is_eval = 0;
3625
3626             switch (PL_regkind[OP(scan)]) {
3627             case WHILEM:                /* End of (?:...)* . */
3628                 scan = NEXTOPER(scan);
3629                 goto finish;
3630             case PLUS:
3631                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3632                     next = NEXTOPER(scan);
3633                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3634                         mincount = 1;
3635                         maxcount = REG_INFTY;
3636                         next = regnext(scan);
3637                         scan = NEXTOPER(scan);
3638                         goto do_curly;
3639                     }
3640                 }
3641                 if (flags & SCF_DO_SUBSTR)
3642                     data->pos_min++;
3643                 min++;
3644                 /* Fall through. */
3645             case STAR:
3646                 if (flags & SCF_DO_STCLASS) {
3647                     mincount = 0;
3648                     maxcount = REG_INFTY;
3649                     next = regnext(scan);
3650                     scan = NEXTOPER(scan);
3651                     goto do_curly;
3652                 }
3653                 is_inf = is_inf_internal = 1;
3654                 scan = regnext(scan);
3655                 if (flags & SCF_DO_SUBSTR) {
3656                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3657                     data->longest = &(data->longest_float);
3658                 }
3659                 goto optimize_curly_tail;
3660             case CURLY:
3661                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3662                     && (scan->flags == stopparen))
3663                 {
3664                     mincount = 1;
3665                     maxcount = 1;
3666                 } else {
3667                     mincount = ARG1(scan);
3668                     maxcount = ARG2(scan);
3669                 }
3670                 next = regnext(scan);
3671                 if (OP(scan) == CURLYX) {
3672                     I32 lp = (data ? *(data->last_closep) : 0);
3673                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3674                 }
3675                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3676                 next_is_eval = (OP(scan) == EVAL);
3677               do_curly:
3678                 if (flags & SCF_DO_SUBSTR) {
3679                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3680                     pos_before = data->pos_min;
3681                 }
3682                 if (data) {
3683                     fl = data->flags;
3684                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3685                     if (is_inf)
3686                         data->flags |= SF_IS_INF;
3687                 }
3688                 if (flags & SCF_DO_STCLASS) {
3689                     cl_init(pRExC_state, &this_class);
3690                     oclass = data->start_class;
3691                     data->start_class = &this_class;
3692                     f |= SCF_DO_STCLASS_AND;
3693                     f &= ~SCF_DO_STCLASS_OR;
3694                 }
3695                 /* Exclude from super-linear cache processing any {n,m}
3696                    regops for which the combination of input pos and regex
3697                    pos is not enough information to determine if a match
3698                    will be possible.
3699
3700                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3701                    regex pos at the \s*, the prospects for a match depend not
3702                    only on the input position but also on how many (bar\s*)
3703                    repeats into the {4,8} we are. */
3704                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3705                     f &= ~SCF_WHILEM_VISITED_POS;
3706
3707                 /* This will finish on WHILEM, setting scan, or on NULL: */
3708                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3709                                       last, data, stopparen, recursed, NULL,
3710                                       (mincount == 0
3711                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3712
3713                 if (flags & SCF_DO_STCLASS)
3714                     data->start_class = oclass;
3715                 if (mincount == 0 || minnext == 0) {
3716                     if (flags & SCF_DO_STCLASS_OR) {
3717                         cl_or(pRExC_state, data->start_class, &this_class);
3718                     }
3719                     else if (flags & SCF_DO_STCLASS_AND) {
3720                         /* Switch to OR mode: cache the old value of
3721                          * data->start_class */
3722                         INIT_AND_WITHP;
3723                         StructCopy(data->start_class, and_withp,
3724                                    struct regnode_charclass_class);
3725                         flags &= ~SCF_DO_STCLASS_AND;
3726                         StructCopy(&this_class, data->start_class,
3727                                    struct regnode_charclass_class);
3728                         flags |= SCF_DO_STCLASS_OR;
3729                         data->start_class->flags |= ANYOF_EOS;
3730                     }
3731                 } else {                /* Non-zero len */
3732                     if (flags & SCF_DO_STCLASS_OR) {
3733                         cl_or(pRExC_state, data->start_class, &this_class);
3734                         cl_and(data->start_class, and_withp);
3735                     }
3736                     else if (flags & SCF_DO_STCLASS_AND)
3737                         cl_and(data->start_class, &this_class);
3738                     flags &= ~SCF_DO_STCLASS;
3739                 }
3740                 if (!scan)              /* It was not CURLYX, but CURLY. */
3741                     scan = next;
3742                 if ( /* ? quantifier ok, except for (?{ ... }) */
3743                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3744                     && (minnext == 0) && (deltanext == 0)
3745                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3746                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3747                 {
3748                     ckWARNreg(RExC_parse,
3749                               "Quantifier unexpected on zero-length expression");
3750                 }
3751
3752                 min += minnext * mincount;
3753                 is_inf_internal |= ((maxcount == REG_INFTY
3754                                      && (minnext + deltanext) > 0)
3755                                     || deltanext == I32_MAX);
3756                 is_inf |= is_inf_internal;
3757                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3758
3759                 /* Try powerful optimization CURLYX => CURLYN. */
3760                 if (  OP(oscan) == CURLYX && data
3761                       && data->flags & SF_IN_PAR
3762                       && !(data->flags & SF_HAS_EVAL)
3763                       && !deltanext && minnext == 1 ) {
3764                     /* Try to optimize to CURLYN.  */
3765                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3766                     regnode * const nxt1 = nxt;
3767 #ifdef DEBUGGING
3768                     regnode *nxt2;
3769 #endif
3770
3771                     /* Skip open. */
3772                     nxt = regnext(nxt);
3773                     if (!REGNODE_SIMPLE(OP(nxt))
3774                         && !(PL_regkind[OP(nxt)] == EXACT
3775                              && STR_LEN(nxt) == 1))
3776                         goto nogo;
3777 #ifdef DEBUGGING
3778                     nxt2 = nxt;
3779 #endif
3780                     nxt = regnext(nxt);
3781                     if (OP(nxt) != CLOSE)
3782                         goto nogo;
3783                     if (RExC_open_parens) {
3784                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3785                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3786                     }
3787                     /* Now we know that nxt2 is the only contents: */
3788                     oscan->flags = (U8)ARG(nxt);
3789                     OP(oscan) = CURLYN;
3790                     OP(nxt1) = NOTHING; /* was OPEN. */
3791
3792 #ifdef DEBUGGING
3793                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3794                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3795                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3796                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3797                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3798                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3799 #endif
3800                 }
3801               nogo:
3802
3803                 /* Try optimization CURLYX => CURLYM. */
3804                 if (  OP(oscan) == CURLYX && data
3805                       && !(data->flags & SF_HAS_PAR)
3806                       && !(data->flags & SF_HAS_EVAL)
3807                       && !deltanext     /* atom is fixed width */
3808                       && minnext != 0   /* CURLYM can't handle zero width */
3809                 ) {
3810                     /* XXXX How to optimize if data == 0? */
3811                     /* Optimize to a simpler form.  */
3812                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3813                     regnode *nxt2;
3814
3815                     OP(oscan) = CURLYM;
3816                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3817                             && (OP(nxt2) != WHILEM))
3818                         nxt = nxt2;
3819                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3820                     /* Need to optimize away parentheses. */
3821                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3822                         /* Set the parenth number.  */
3823                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3824
3825                         oscan->flags = (U8)ARG(nxt);
3826                         if (RExC_open_parens) {
3827                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3828                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3829                         }
3830                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3831                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3832
3833 #ifdef DEBUGGING
3834                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3835                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3836                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3837                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3838 #endif
3839 #if 0
3840                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3841                             regnode *nnxt = regnext(nxt1);
3842                             if (nnxt == nxt) {
3843                                 if (reg_off_by_arg[OP(nxt1)])
3844                                     ARG_SET(nxt1, nxt2 - nxt1);
3845                                 else if (nxt2 - nxt1 < U16_MAX)
3846                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3847                                 else
3848                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3849                             }
3850                             nxt1 = nnxt;
3851                         }
3852 #endif
3853                         /* Optimize again: */
3854                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3855                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3856                     }
3857                     else
3858                         oscan->flags = 0;
3859                 }
3860                 else if ((OP(oscan) == CURLYX)
3861                          && (flags & SCF_WHILEM_VISITED_POS)
3862                          /* See the comment on a similar expression above.
3863                             However, this time it's not a subexpression
3864                             we care about, but the expression itself. */
3865                          && (maxcount == REG_INFTY)
3866                          && data && ++data->whilem_c < 16) {
3867                     /* This stays as CURLYX, we can put the count/of pair. */
3868                     /* Find WHILEM (as in regexec.c) */
3869                     regnode *nxt = oscan + NEXT_OFF(oscan);
3870
3871                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3872                         nxt += ARG(nxt);
3873                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3874                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3875                 }
3876                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3877                     pars++;
3878                 if (flags & SCF_DO_SUBSTR) {
3879                     SV *last_str = NULL;
3880                     int counted = mincount != 0;
3881
3882                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3883 #if defined(SPARC64_GCC_WORKAROUND)
3884                         I32 b = 0;
3885                         STRLEN l = 0;
3886                         const char *s = NULL;
3887                         I32 old = 0;
3888
3889                         if (pos_before >= data->last_start_min)
3890                             b = pos_before;
3891                         else
3892                             b = data->last_start_min;
3893
3894                         l = 0;
3895                         s = SvPV_const(data->last_found, l);
3896                         old = b - data->last_start_min;
3897
3898 #else
3899                         I32 b = pos_before >= data->last_start_min
3900                             ? pos_before : data->last_start_min;
3901                         STRLEN l;
3902                         const char * const s = SvPV_const(data->last_found, l);
3903                         I32 old = b - data->last_start_min;
3904 #endif
3905
3906                         if (UTF)
3907                             old = utf8_hop((U8*)s, old) - (U8*)s;
3908                         l -= old;
3909                         /* Get the added string: */
3910                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3911                         if (deltanext == 0 && pos_before == b) {
3912                             /* What was added is a constant string */
3913                             if (mincount > 1) {
3914                                 SvGROW(last_str, (mincount * l) + 1);
3915                                 repeatcpy(SvPVX(last_str) + l,
3916                                           SvPVX_const(last_str), l, mincount - 1);
3917                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3918                                 /* Add additional parts. */
3919                                 SvCUR_set(data->last_found,
3920                                           SvCUR(data->last_found) - l);
3921                                 sv_catsv(data->last_found, last_str);
3922                                 {
3923                                     SV * sv = data->last_found;
3924                                     MAGIC *mg =
3925                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3926                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3927                                     if (mg && mg->mg_len >= 0)
3928                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3929                                 }
3930                                 data->last_end += l * (mincount - 1);
3931                             }
3932                         } else {
3933                             /* start offset must point into the last copy */
3934                             data->last_start_min += minnext * (mincount - 1);
3935                             data->last_start_max += is_inf ? I32_MAX
3936                                 : (maxcount - 1) * (minnext + data->pos_delta);
3937                         }
3938                     }
3939                     /* It is counted once already... */
3940                     data->pos_min += minnext * (mincount - counted);
3941                     data->pos_delta += - counted * deltanext +
3942                         (minnext + deltanext) * maxcount - minnext * mincount;
3943                     if (mincount != maxcount) {
3944                          /* Cannot extend fixed substrings found inside
3945                             the group.  */
3946                         SCAN_COMMIT(pRExC_state,data,minlenp);
3947                         if (mincount && last_str) {
3948                             SV * const sv = data->last_found;
3949                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3950                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3951
3952                             if (mg)
3953                                 mg->mg_len = -1;
3954                             sv_setsv(sv, last_str);
3955                             data->last_end = data->pos_min;
3956                             data->last_start_min =
3957                                 data->pos_min - CHR_SVLEN(last_str);
3958                             data->last_start_max = is_inf
3959                                 ? I32_MAX
3960                                 : data->pos_min + data->pos_delta
3961                                 - CHR_SVLEN(last_str);
3962                         }
3963                         data->longest = &(data->longest_float);
3964                     }
3965                     SvREFCNT_dec(last_str);
3966                 }
3967                 if (data && (fl & SF_HAS_EVAL))
3968                     data->flags |= SF_HAS_EVAL;
3969               optimize_curly_tail:
3970                 if (OP(oscan) != CURLYX) {
3971                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3972                            && NEXT_OFF(next))
3973                         NEXT_OFF(oscan) += NEXT_OFF(next);
3974                 }
3975                 continue;
3976             default:                    /* REF, ANYOFV, and CLUMP only? */
3977                 if (flags & SCF_DO_SUBSTR) {
3978                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3979                     data->longest = &(data->longest_float);
3980                 }
3981                 is_inf = is_inf_internal = 1;
3982                 if (flags & SCF_DO_STCLASS_OR)
3983                     cl_anything(pRExC_state, data->start_class);
3984                 flags &= ~SCF_DO_STCLASS;
3985                 break;
3986             }
3987         }
3988         else if (OP(scan) == LNBREAK) {
3989             if (flags & SCF_DO_STCLASS) {
3990                 int value = 0;
3991                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3992                 if (flags & SCF_DO_STCLASS_AND) {
3993                     for (value = 0; value < 256; value++)
3994                         if (!is_VERTWS_cp(value))
3995                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3996                 }
3997                 else {
3998                     for (value = 0; value < 256; value++)
3999                         if (is_VERTWS_cp(value))
4000                             ANYOF_BITMAP_SET(data->start_class, value);
4001                 }
4002                 if (flags & SCF_DO_STCLASS_OR)
4003                     cl_and(data->start_class, and_withp);
4004                 flags &= ~SCF_DO_STCLASS;
4005             }
4006             min += 1;
4007             delta += 1;
4008             if (flags & SCF_DO_SUBSTR) {
4009                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4010                 data->pos_min += 1;
4011                 data->pos_delta += 1;
4012                 data->longest = &(data->longest_float);
4013             }
4014         }
4015         else if (REGNODE_SIMPLE(OP(scan))) {
4016             int value = 0;
4017
4018             if (flags & SCF_DO_SUBSTR) {
4019                 SCAN_COMMIT(pRExC_state,data,minlenp);
4020                 data->pos_min++;
4021             }
4022             min++;
4023             if (flags & SCF_DO_STCLASS) {
4024                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4025
4026                 /* Some of the logic below assumes that switching
4027                    locale on will only add false positives. */
4028                 switch (PL_regkind[OP(scan)]) {
4029                 case SANY:
4030                 default:
4031                   do_default:
4032                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4033                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4034                         cl_anything(pRExC_state, data->start_class);
4035                     break;
4036                 case REG_ANY:
4037                     if (OP(scan) == SANY)
4038                         goto do_default;
4039                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4040                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4041                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4042                         cl_anything(pRExC_state, data->start_class);
4043                     }
4044                     if (flags & SCF_DO_STCLASS_AND || !value)
4045                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4046                     break;
4047                 case ANYOF:
4048                     if (flags & SCF_DO_STCLASS_AND)
4049                         cl_and(data->start_class,
4050                                (struct regnode_charclass_class*)scan);
4051                     else
4052                         cl_or(pRExC_state, data->start_class,
4053                               (struct regnode_charclass_class*)scan);
4054                     break;
4055                 case ALNUM:
4056                     if (flags & SCF_DO_STCLASS_AND) {
4057                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4058                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4059                             if (OP(scan) == ALNUMU) {
4060                                 for (value = 0; value < 256; value++) {
4061                                     if (!isWORDCHAR_L1(value)) {
4062                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4063                                     }
4064                                 }
4065                             } else {
4066                                 for (value = 0; value < 256; value++) {
4067                                     if (!isALNUM(value)) {
4068                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4069                                     }
4070                                 }
4071                             }
4072                         }
4073                     }
4074                     else {
4075                         if (data->start_class->flags & ANYOF_LOCALE)
4076                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4077
4078                         /* Even if under locale, set the bits for non-locale
4079                          * in case it isn't a true locale-node.  This will
4080                          * create false positives if it truly is locale */
4081                         if (OP(scan) == ALNUMU) {
4082                             for (value = 0; value < 256; value++) {
4083                                 if (isWORDCHAR_L1(value)) {
4084                                     ANYOF_BITMAP_SET(data->start_class, value);
4085                                 }
4086                             }
4087                         } else {
4088                             for (value = 0; value < 256; value++) {
4089                                 if (isALNUM(value)) {
4090                                     ANYOF_BITMAP_SET(data->start_class, value);
4091                                 }
4092                             }
4093                         }
4094                     }
4095                     break;
4096                 case NALNUM:
4097                     if (flags & SCF_DO_STCLASS_AND) {
4098                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4099                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4100                             if (OP(scan) == NALNUMU) {
4101                                 for (value = 0; value < 256; value++) {
4102                                     if (isWORDCHAR_L1(value)) {
4103                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4104                                     }
4105                                 }
4106                             } else {
4107                                 for (value = 0; value < 256; value++) {
4108                                     if (isALNUM(value)) {
4109                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4110                                     }
4111                                 }
4112                             }
4113                         }
4114                     }
4115                     else {
4116                         if (data->start_class->flags & ANYOF_LOCALE)
4117                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4118
4119                         /* Even if under locale, set the bits for non-locale in
4120                          * case it isn't a true locale-node.  This will create
4121                          * false positives if it truly is locale */
4122                         if (OP(scan) == NALNUMU) {
4123                             for (value = 0; value < 256; value++) {
4124                                 if (! isWORDCHAR_L1(value)) {
4125                                     ANYOF_BITMAP_SET(data->start_class, value);
4126                                 }
4127                             }
4128                         } else {
4129                             for (value = 0; value < 256; value++) {
4130                                 if (! isALNUM(value)) {
4131                                     ANYOF_BITMAP_SET(data->start_class, value);
4132                                 }
4133                             }
4134                         }
4135                     }
4136                     break;
4137                 case SPACE:
4138                     if (flags & SCF_DO_STCLASS_AND) {
4139                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4140                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4141                             if (OP(scan) == SPACEU) {
4142                                 for (value = 0; value < 256; value++) {
4143                                     if (!isSPACE_L1(value)) {
4144                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4145                                     }
4146                                 }
4147                             } else {
4148                                 for (value = 0; value < 256; value++) {
4149                                     if (!isSPACE(value)) {
4150                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4151                                     }
4152                                 }
4153                             }
4154                         }
4155                     }
4156                     else {
4157                         if (data->start_class->flags & ANYOF_LOCALE) {
4158                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4159                         }
4160                         if (OP(scan) == SPACEU) {
4161                             for (value = 0; value < 256; value++) {
4162                                 if (isSPACE_L1(value)) {
4163                                     ANYOF_BITMAP_SET(data->start_class, value);
4164                                 }
4165                             }
4166                         } else {
4167                             for (value = 0; value < 256; value++) {
4168                                 if (isSPACE(value)) {
4169                                     ANYOF_BITMAP_SET(data->start_class, value);
4170                                 }
4171                             }
4172                         }
4173                     }
4174                     break;
4175                 case NSPACE:
4176                     if (flags & SCF_DO_STCLASS_AND) {
4177                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4178                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4179                             if (OP(scan) == NSPACEU) {
4180                                 for (value = 0; value < 256; value++) {
4181                                     if (isSPACE_L1(value)) {
4182                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4183                                     }
4184                                 }
4185                             } else {
4186                                 for (value = 0; value < 256; value++) {
4187                                     if (isSPACE(value)) {
4188                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4189                                     }
4190                                 }
4191                             }
4192                         }
4193                     }
4194                     else {
4195                         if (data->start_class->flags & ANYOF_LOCALE)
4196                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4197                         if (OP(scan) == NSPACEU) {
4198                             for (value = 0; value < 256; value++) {
4199                                 if (!isSPACE_L1(value)) {
4200                                     ANYOF_BITMAP_SET(data->start_class, value);
4201                                 }
4202                             }
4203                         }
4204                         else {
4205                             for (value = 0; value < 256; value++) {
4206                                 if (!isSPACE(value)) {
4207                                     ANYOF_BITMAP_SET(data->start_class, value);
4208                                 }
4209                             }
4210                         }
4211                     }
4212                     break;
4213                 case DIGIT:
4214                     if (flags & SCF_DO_STCLASS_AND) {
4215                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4216                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4217                             for (value = 0; value < 256; value++)
4218                                 if (!isDIGIT(value))
4219                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4220                         }
4221                     }
4222                     else {
4223                         if (data->start_class->flags & ANYOF_LOCALE)
4224                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4225                         for (value = 0; value < 256; value++)
4226                             if (isDIGIT(value))
4227                                 ANYOF_BITMAP_SET(data->start_class, value);
4228                     }
4229                     break;
4230                 case NDIGIT:
4231                     if (flags & SCF_DO_STCLASS_AND) {
4232                         if (!(data->start_class->flags & ANYOF_LOCALE))
4233                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4234                         for (value = 0; value < 256; value++)
4235                             if (isDIGIT(value))
4236                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4237                     }
4238                     else {
4239                         if (data->start_class->flags & ANYOF_LOCALE)
4240                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4241                         for (value = 0; value < 256; value++)
4242                             if (!isDIGIT(value))
4243                                 ANYOF_BITMAP_SET(data->start_class, value);
4244                     }
4245                     break;
4246                 CASE_SYNST_FNC(VERTWS);
4247                 CASE_SYNST_FNC(HORIZWS);
4248
4249                 }
4250                 if (flags & SCF_DO_STCLASS_OR)
4251                     cl_and(data->start_class, and_withp);
4252                 flags &= ~SCF_DO_STCLASS;
4253             }
4254         }
4255         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4256             data->flags |= (OP(scan) == MEOL
4257                             ? SF_BEFORE_MEOL
4258                             : SF_BEFORE_SEOL);
4259         }
4260         else if (  PL_regkind[OP(scan)] == BRANCHJ
4261                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4262                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4263                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4264             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4265                 || OP(scan) == UNLESSM )
4266             {
4267                 /* Negative Lookahead/lookbehind
4268                    In this case we can't do fixed string optimisation.
4269                 */
4270
4271                 I32 deltanext, minnext, fake = 0;
4272                 regnode *nscan;
4273                 struct regnode_charclass_class intrnl;
4274                 int f = 0;
4275
4276                 data_fake.flags = 0;
4277                 if (data) {
4278                     data_fake.whilem_c = data->whilem_c;
4279                     data_fake.last_closep = data->last_closep;
4280                 }
4281                 else
4282                     data_fake.last_closep = &fake;
4283                 data_fake.pos_delta = delta;
4284                 if ( flags & SCF_DO_STCLASS && !scan->flags
4285                      && OP(scan) == IFMATCH ) { /* Lookahead */
4286                     cl_init(pRExC_state, &intrnl);
4287                     data_fake.start_class = &intrnl;
4288                     f |= SCF_DO_STCLASS_AND;
4289                 }
4290                 if (flags & SCF_WHILEM_VISITED_POS)
4291                     f |= SCF_WHILEM_VISITED_POS;
4292                 next = regnext(scan);
4293                 nscan = NEXTOPER(NEXTOPER(scan));
4294                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4295                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4296                 if (scan->flags) {
4297                     if (deltanext) {
4298                         FAIL("Variable length lookbehind not implemented");
4299                     }
4300                     else if (minnext > (I32)U8_MAX) {
4301                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4302                     }
4303                     scan->flags = (U8)minnext;
4304                 }
4305                 if (data) {
4306                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4307                         pars++;
4308                     if (data_fake.flags & SF_HAS_EVAL)
4309                         data->flags |= SF_HAS_EVAL;
4310                     data->whilem_c = data_fake.whilem_c;
4311                 }
4312                 if (f & SCF_DO_STCLASS_AND) {
4313                     if (flags & SCF_DO_STCLASS_OR) {
4314                         /* OR before, AND after: ideally we would recurse with
4315                          * data_fake to get the AND applied by study of the
4316                          * remainder of the pattern, and then derecurse;
4317                          * *** HACK *** for now just treat as "no information".
4318                          * See [perl #56690].
4319                          */
4320                         cl_init(pRExC_state, data->start_class);
4321                     }  else {
4322                         /* AND before and after: combine and continue */
4323                         const int was = (data->start_class->flags & ANYOF_EOS);
4324
4325                         cl_and(data->start_class, &intrnl);
4326                         if (was)
4327                             data->start_class->flags |= ANYOF_EOS;
4328                     }
4329                 }
4330             }
4331 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4332             else {
4333                 /* Positive Lookahead/lookbehind
4334                    In this case we can do fixed string optimisation,
4335                    but we must be careful about it. Note in the case of
4336                    lookbehind the positions will be offset by the minimum
4337                    length of the pattern, something we won't know about
4338                    until after the recurse.
4339                 */
4340                 I32 deltanext, fake = 0;
4341                 regnode *nscan;
4342                 struct regnode_charclass_class intrnl;
4343                 int f = 0;
4344                 /* We use SAVEFREEPV so that when the full compile
4345                     is finished perl will clean up the allocated
4346                     minlens when it's all done. This way we don't
4347                     have to worry about freeing them when we know
4348                     they won't be used, which would be a pain.
4349                  */
4350                 I32 *minnextp;
4351                 Newx( minnextp, 1, I32 );
4352                 SAVEFREEPV(minnextp);
4353
4354                 if (data) {
4355                     StructCopy(data, &data_fake, scan_data_t);
4356                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4357                         f |= SCF_DO_SUBSTR;
4358                         if (scan->flags)
4359                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4360                         data_fake.last_found=newSVsv(data->last_found);
4361                     }
4362                 }
4363                 else
4364                     data_fake.last_closep = &fake;
4365                 data_fake.flags = 0;
4366                 data_fake.pos_delta = delta;
4367                 if (is_inf)
4368                     data_fake.flags |= SF_IS_INF;
4369                 if ( flags & SCF_DO_STCLASS && !scan->flags
4370                      && OP(scan) == IFMATCH ) { /* Lookahead */
4371                     cl_init(pRExC_state, &intrnl);
4372                     data_fake.start_class = &intrnl;
4373                     f |= SCF_DO_STCLASS_AND;
4374                 }
4375                 if (flags & SCF_WHILEM_VISITED_POS)
4376                     f |= SCF_WHILEM_VISITED_POS;
4377                 next = regnext(scan);
4378                 nscan = NEXTOPER(NEXTOPER(scan));
4379
4380                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4381                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4382                 if (scan->flags) {
4383                     if (deltanext) {
4384                         FAIL("Variable length lookbehind not implemented");
4385                     }
4386                     else if (*minnextp > (I32)U8_MAX) {
4387                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4388                     }
4389                     scan->flags = (U8)*minnextp;
4390                 }
4391
4392                 *minnextp += min;
4393
4394                 if (f & SCF_DO_STCLASS_AND) {
4395                     const int was = (data->start_class->flags & ANYOF_EOS);
4396
4397                     cl_and(data->start_class, &intrnl);
4398                     if (was)
4399                         data->start_class->flags |= ANYOF_EOS;
4400                 }
4401                 if (data) {
4402                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4403                         pars++;
4404                     if (data_fake.flags & SF_HAS_EVAL)
4405                         data->flags |= SF_HAS_EVAL;
4406                     data->whilem_c = data_fake.whilem_c;
4407                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4408                         if (RExC_rx->minlen<*minnextp)
4409                             RExC_rx->minlen=*minnextp;
4410                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4411                         SvREFCNT_dec(data_fake.last_found);
4412
4413                         if ( data_fake.minlen_fixed != minlenp )
4414                         {
4415                             data->offset_fixed= data_fake.offset_fixed;
4416                             data->minlen_fixed= data_fake.minlen_fixed;
4417                             data->lookbehind_fixed+= scan->flags;
4418                         }
4419                         if ( data_fake.minlen_float != minlenp )
4420                         {
4421                             data->minlen_float= data_fake.minlen_float;
4422                             data->offset_float_min=data_fake.offset_float_min;
4423                             data->offset_float_max=data_fake.offset_float_max;
4424                             data->lookbehind_float+= scan->flags;
4425                         }
4426                     }
4427                 }
4428
4429
4430             }
4431 #endif
4432         }
4433         else if (OP(scan) == OPEN) {
4434             if (stopparen != (I32)ARG(scan))
4435                 pars++;
4436         }
4437         else if (OP(scan) == CLOSE) {
4438             if (stopparen == (I32)ARG(scan)) {
4439                 break;
4440             }
4441             if ((I32)ARG(scan) == is_par) {
4442                 next = regnext(scan);
4443
4444                 if ( next && (OP(next) != WHILEM) && next < last)
4445                     is_par = 0;         /* Disable optimization */
4446             }
4447             if (data)
4448                 *(data->last_closep) = ARG(scan);
4449         }
4450         else if (OP(scan) == EVAL) {
4451                 if (data)
4452                     data->flags |= SF_HAS_EVAL;
4453         }
4454         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4455             if (flags & SCF_DO_SUBSTR) {
4456                 SCAN_COMMIT(pRExC_state,data,minlenp);
4457                 flags &= ~SCF_DO_SUBSTR;
4458             }
4459             if (data && OP(scan)==ACCEPT) {
4460                 data->flags |= SCF_SEEN_ACCEPT;
4461                 if (stopmin > min)
4462                     stopmin = min;
4463             }
4464         }
4465         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4466         {
4467                 if (flags & SCF_DO_SUBSTR) {
4468                     SCAN_COMMIT(pRExC_state,data,minlenp);
4469                     data->longest = &(data->longest_float);
4470                 }
4471                 is_inf = is_inf_internal = 1;
4472                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4473                     cl_anything(pRExC_state, data->start_class);
4474                 flags &= ~SCF_DO_STCLASS;
4475         }
4476         else if (OP(scan) == GPOS) {
4477             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4478                 !(delta || is_inf || (data && data->pos_delta)))
4479             {
4480                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4481                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4482                 if (RExC_rx->gofs < (U32)min)
4483                     RExC_rx->gofs = min;
4484             } else {
4485                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4486                 RExC_rx->gofs = 0;
4487             }
4488         }
4489 #ifdef TRIE_STUDY_OPT
4490 #ifdef FULL_TRIE_STUDY
4491         else if (PL_regkind[OP(scan)] == TRIE) {
4492             /* NOTE - There is similar code to this block above for handling
4493                BRANCH nodes on the initial study.  If you change stuff here
4494                check there too. */
4495             regnode *trie_node= scan;
4496             regnode *tail= regnext(scan);
4497             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4498             I32 max1 = 0, min1 = I32_MAX;
4499             struct regnode_charclass_class accum;
4500
4501             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4502                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4503             if (flags & SCF_DO_STCLASS)
4504                 cl_init_zero(pRExC_state, &accum);
4505
4506             if (!trie->jump) {
4507                 min1= trie->minlen;
4508                 max1= trie->maxlen;
4509             } else {
4510                 const regnode *nextbranch= NULL;
4511                 U32 word;
4512
4513                 for ( word=1 ; word <= trie->wordcount ; word++)
4514                 {
4515                     I32 deltanext=0, minnext=0, f = 0, fake;
4516                     struct regnode_charclass_class this_class;
4517
4518                     data_fake.flags = 0;
4519                     if (data) {
4520                         data_fake.whilem_c = data->whilem_c;
4521                         data_fake.last_closep = data->last_closep;
4522                     }
4523                     else
4524                         data_fake.last_closep = &fake;
4525                     data_fake.pos_delta = delta;
4526                     if (flags & SCF_DO_STCLASS) {
4527                         cl_init(pRExC_state, &this_class);
4528                         data_fake.start_class = &this_class;
4529                         f = SCF_DO_STCLASS_AND;
4530                     }
4531                     if (flags & SCF_WHILEM_VISITED_POS)
4532                         f |= SCF_WHILEM_VISITED_POS;
4533
4534                     if (trie->jump[word]) {
4535                         if (!nextbranch)
4536                             nextbranch = trie_node + trie->jump[0];
4537                         scan= trie_node + trie->jump[word];
4538                         /* We go from the jump point to the branch that follows
4539                            it. Note this means we need the vestigial unused branches
4540                            even though they aren't otherwise used.
4541                          */
4542                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4543                             &deltanext, (regnode *)nextbranch, &data_fake,
4544                             stopparen, recursed, NULL, f,depth+1);
4545                     }
4546                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4547                         nextbranch= regnext((regnode*)nextbranch);
4548
4549                     if (min1 > (I32)(minnext + trie->minlen))
4550                         min1 = minnext + trie->minlen;
4551                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4552                         max1 = minnext + deltanext + trie->maxlen;
4553                     if (deltanext == I32_MAX)
4554                         is_inf = is_inf_internal = 1;
4555
4556                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4557                         pars++;
4558                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4559                         if ( stopmin > min + min1)
4560                             stopmin = min + min1;
4561                         flags &= ~SCF_DO_SUBSTR;
4562                         if (data)
4563                             data->flags |= SCF_SEEN_ACCEPT;
4564                     }
4565                     if (data) {
4566                         if (data_fake.flags & SF_HAS_EVAL)
4567                             data->flags |= SF_HAS_EVAL;
4568                         data->whilem_c = data_fake.whilem_c;
4569                     }
4570                     if (flags & SCF_DO_STCLASS)
4571                         cl_or(pRExC_state, &accum, &this_class);
4572                 }
4573             }
4574             if (flags & SCF_DO_SUBSTR) {
4575                 data->pos_min += min1;
4576                 data->pos_delta += max1 - min1;
4577                 if (max1 != min1 || is_inf)
4578                     data->longest = &(data->longest_float);
4579             }
4580             min += min1;
4581             delta += max1 - min1;
4582             if (flags & SCF_DO_STCLASS_OR) {
4583                 cl_or(pRExC_state, data->start_class, &accum);
4584                 if (min1) {
4585                     cl_and(data->start_class, and_withp);
4586                     flags &= ~SCF_DO_STCLASS;
4587                 }
4588             }
4589             else if (flags & SCF_DO_STCLASS_AND) {
4590                 if (min1) {
4591                     cl_and(data->start_class, &accum);
4592                     flags &= ~SCF_DO_STCLASS;
4593                 }
4594                 else {
4595                     /* Switch to OR mode: cache the old value of
4596                      * data->start_class */
4597                     INIT_AND_WITHP;
4598                     StructCopy(data->start_class, and_withp,
4599                                struct regnode_charclass_class);
4600                     flags &= ~SCF_DO_STCLASS_AND;
4601                     StructCopy(&accum, data->start_class,
4602                                struct regnode_charclass_class);
4603                     flags |= SCF_DO_STCLASS_OR;
4604                     data->start_class->flags |= ANYOF_EOS;
4605                 }
4606             }
4607             scan= tail;
4608             continue;
4609         }
4610 #else
4611         else if (PL_regkind[OP(scan)] == TRIE) {
4612             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4613             U8*bang=NULL;
4614
4615             min += trie->minlen;
4616             delta += (trie->maxlen - trie->minlen);
4617             flags &= ~SCF_DO_STCLASS; /* xxx */
4618             if (flags & SCF_DO_SUBSTR) {
4619                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4620                 data->pos_min += trie->minlen;
4621                 data->pos_delta += (trie->maxlen - trie->minlen);
4622                 if (trie->maxlen != trie->minlen)
4623                     data->longest = &(data->longest_float);
4624             }
4625             if (trie->jump) /* no more substrings -- for now /grr*/
4626                 flags &= ~SCF_DO_SUBSTR;
4627         }
4628 #endif /* old or new */
4629 #endif /* TRIE_STUDY_OPT */
4630
4631         /* Else: zero-length, ignore. */
4632         scan = regnext(scan);
4633     }
4634     if (frame) {
4635         last = frame->last;
4636         scan = frame->next;
4637         stopparen = frame->stop;
4638         frame = frame->prev;
4639         goto fake_study_recurse;
4640     }
4641
4642   finish:
4643     assert(!frame);
4644     DEBUG_STUDYDATA("pre-fin:",data,depth);
4645
4646     *scanp = scan;
4647     *deltap = is_inf_internal ? I32_MAX : delta;
4648     if (flags & SCF_DO_SUBSTR && is_inf)
4649         data->pos_delta = I32_MAX - data->pos_min;
4650     if (is_par > (I32)U8_MAX)
4651         is_par = 0;
4652     if (is_par && pars==1 && data) {
4653         data->flags |= SF_IN_PAR;
4654         data->flags &= ~SF_HAS_PAR;
4655     }
4656     else if (pars && data) {
4657         data->flags |= SF_HAS_PAR;
4658         data->flags &= ~SF_IN_PAR;
4659     }
4660     if (flags & SCF_DO_STCLASS_OR)
4661         cl_and(data->start_class, and_withp);
4662     if (flags & SCF_TRIE_RESTUDY)
4663         data->flags |=  SCF_TRIE_RESTUDY;
4664
4665     DEBUG_STUDYDATA("post-fin:",data,depth);
4666
4667     return min < stopmin ? min : stopmin;
4668 }
4669 /* S_add_data: grow the regexp's reg_data table by n slots, copy the n bytes at s into its `what` array for those slots, and return the index of the first new slot. */
4670 STATIC U32
4671 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4672 {
4673     const U32 old_count = RExC_rxi->data ? RExC_rxi->data->count : 0; /* 0 when no table exists yet */
4674     const U32 new_count = old_count + n;                              /* slot total after growing */
4675
4676     PERL_ARGS_ASSERT_ADD_DATA;
4677     Renewc(RExC_rxi->data,      /* header and void* slots share one allocation */
4678            sizeof(*RExC_rxi->data) + sizeof(void*) * (new_count - 1), /* -1: presumably the struct already holds one slot -- confirm */
4679            char, struct reg_data);
4680     if (old_count == 0)         /* table was empty: `what` has never been allocated */
4681         Newx(RExC_rxi->data->what, n, U8);
4682     else                        /* extend the existing per-slot byte array */
4683         Renew(RExC_rxi->data->what, new_count, U8);
4684     Copy(s, RExC_rxi->data->what + old_count, n, U8);
4685     RExC_rxi->data->count = new_count;
4686     return old_count;
4687 }
4688 /* Perl_reginitcolors: fill PL_colors[0..5] from the tab-separated PERL_RE_COLORS environment
4689  * value (all empty strings when it is unset), then set PL_colorset.  XXX: todo make this not included in a non debugging perl */
4690 #ifndef PERL_IN_XSUB_RE
4691 void
4692 Perl_reginitcolors(pTHX)
4693 {
4694     dVAR;
4695     const char * const env = PerlEnv_getenv("PERL_RE_COLORS");
4696     int slot;                               /* index into PL_colors[] */
4697     if (env == NULL) {
4698         for (slot = 0; slot < 6; slot++)
4699             PL_colors[slot] = (char *)"";   /* no setting: every entry is empty */
4700     }
4701     else {
4702         /* Work on a writable copy; each tab is overwritten with NUL to split fields. */
4703         char *field = savepv(env);
4704         PL_colors[0] = field;
4705         for (slot = 1; slot < 6; slot++) {
4706             char * const tab = strchr(field, '\t');
4707             if (tab)
4708                 *tab = '\0';
4709             /* Once a tab is missing, this slot and every later one get "". */
4710             field = tab ? tab + 1 : (char *)"";
4711             PL_colors[slot] = field;
4712         }
4713     }
4714     PL_colorset = 1;
4715 }
4716 #endif
4717 /* CHECK_RESTUDY_GOTO: with TRIE_STUDY_OPT, jump to the reStudy label when data.flags has SCF_TRIE_RESTUDY set and
4718  * restudied is still 0; the post-increment stops the jump repeating until restudied is reset.  Otherwise expands to nothing. */
4719 #ifdef TRIE_STUDY_OPT
4720 #define CHECK_RESTUDY_GOTO                                  \
4721         if (                                                \
4722               (data.flags & SCF_TRIE_RESTUDY)               \
4723               && ! restudied++                              \
4724         )     goto reStudy
4725 #else
4726 #define CHECK_RESTUDY_GOTO
4727 #endif
4728
4729 /*
4730  - pregcomp - compile a regular expression into internal code
4731  *
4732  * We can't allocate space until we know how big the compiled form will be,
4733  * but we can't compile it (and thus know how big it is) until we've got a
4734  * place to put the code.  So we cheat:  we compile it twice, once with code
4735  * generation turned off and size counting turned on, and once "for real".
4736  * This also means that we don't allocate space until we are sure that the
4737  * thing really will compile successfully, and we never have to move the
4738  * code and thus invalidate pointers into it.  (Note that it has to be in
4739  * one piece because free() must be able to free it all.) [NB: not true in perl]
4740  *
4741  * Beware that the optimization-preparation code in here knows about some
4742  * of the structure of the compiled regexp.  [I'll say.]
4743  */
4744
4745 /* RE_ENGINE_PTR: address of the regexp_engine that Perl_re_compile stores in r->engine:
4746  * PL_core_reg_engine normally, or my_reg_engine (declared extern here) under PERL_IN_XSUB_RE. */
4747 #ifndef PERL_IN_XSUB_RE
4748 #define RE_ENGINE_PTR &PL_core_reg_engine
4749 #else
4750 extern const struct regexp_engine my_reg_engine;
4751 #define RE_ENGINE_PTR &my_reg_engine
4752 #endif
4753 /* Perl_pregcomp: compile `pattern` with the engine whose address is stored under "regcomp" in the PL_hintgv hash, else fall back to Perl_re_compile. */
4754 #ifndef PERL_IN_XSUB_RE
4755 REGEXP *
4756 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4757 {
4758     dVAR;
4759     HV * const table = GvHV(PL_hintgv);     /* checked for NULL below */
4760
4761     PERL_ARGS_ASSERT_PREGCOMP;
4762
4763     /* Dispatch a request to compile a regexp to correct
4764        regexp engine. */
4765     if (table) {
4766         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4767         GET_RE_DEBUG_FLAGS_DECL;
4768         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {     /* non-zero integer is used as the engine address */
4769             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4770             DEBUG_COMPILE_r({
4771                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4772                     (UV)SvIV(*ptr));        /* cast so the argument matches the unsigned UVxf format */
4773             });
4774             return CALLREGCOMP_ENG(eng, pattern, flags);
4775         }
4776     }
4777     return Perl_re_compile(aTHX_ pattern, flags);  /* no override installed: use this file's compiler */
4778 }
4779 #endif
4780
4781 REGEXP *
4782 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4783 {
4784     dVAR;
4785     REGEXP *rx;
4786     struct regexp *r;
4787     register regexp_internal *ri;
4788     STRLEN plen;
4789     char* VOL exp;
4790     char* xend;
4791     regnode *scan;
4792     I32 flags;
4793     I32 minlen = 0;
4794     U32 pm_flags;
4795
4796     /* these are all flags - maybe they should be turned
4797      * into a single int with different bit masks */
4798     I32 sawlookahead = 0;
4799     I32 sawplus = 0;
4800     I32 sawopen = 0;
4801     bool used_setjump = FALSE;
4802     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4803
4804     U8 jump_ret = 0;
4805     dJMPENV;
4806     scan_data_t data;
4807     RExC_state_t RExC_state;
4808     RExC_state_t * const pRExC_state = &RExC_state;
4809 #ifdef TRIE_STUDY_OPT
4810     int restudied;
4811     RExC_state_t copyRExC_state;
4812 #endif
4813     GET_RE_DEBUG_FLAGS_DECL;
4814
4815     PERL_ARGS_ASSERT_RE_COMPILE;
4816
4817     DEBUG_r(if (!PL_colorset) reginitcolors());
4818
4819 #ifndef PERL_IN_XSUB_RE
4820     /* Initialize these here instead of as-needed, as is quick and avoids
4821      * having to test them each time otherwise */
4822     if (! PL_AboveLatin1) {
4823         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4824         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4825         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
4826
4827         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4828         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4829
4830         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4831         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4832
4833         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4834         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4835
4836         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4837
4838         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4839         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4840
4841         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4842
4843         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4844         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4845
4846         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4847         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4848
4849         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4850         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4851
4852         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4853         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4854
4855         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4856         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4857
4858         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4859         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4860
4861         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4862         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4863
4864         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4865         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4866
4867         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4868
4869         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4870         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4871
4872         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4873         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
4874     }
4875 #endif
4876
4877     exp = SvPV(pattern, plen);
4878
4879     if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4880         RExC_utf8 = RExC_orig_utf8 = 0;
4881     }
4882     else {
4883         RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4884     }
4885     RExC_uni_semantics = 0;
4886     RExC_contains_locale = 0;
4887
4888     /****************** LONG JUMP TARGET HERE***********************/
4889     /* Longjmp back to here if have to switch in midstream to utf8 */
4890     if (! RExC_orig_utf8) {
4891         JMPENV_PUSH(jump_ret);
4892         used_setjump = TRUE;
4893     }
4894
4895     if (jump_ret == 0) {    /* First time through */
4896         xend = exp + plen;
4897
4898         DEBUG_COMPILE_r({
4899             SV *dsv= sv_newmortal();
4900             RE_PV_QUOTED_DECL(s, RExC_utf8,
4901                 dsv, exp, plen, 60);
4902             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4903                            PL_colors[4],PL_colors[5],s);
4904         });
4905     }
4906     else {  /* longjumped back */
4907         STRLEN len = plen;
4908
4909         /* If the cause for the longjmp was other than changing to utf8, pop
4910          * our own setjmp, and longjmp to the correct handler */
4911         if (jump_ret != UTF8_LONGJMP) {
4912             JMPENV_POP;
4913             JMPENV_JUMP(jump_ret);
4914         }
4915
4916         GET_RE_DEBUG_FLAGS;
4917
4918         /* It's possible to write a regexp in ascii that represents Unicode
4919         codepoints outside of the byte range, such as via \x{100}. If we
4920         detect such a sequence we have to convert the entire pattern to utf8
4921         and then recompile, as our sizing calculation will have been based
4922         on 1 byte == 1 character, but we will need to use utf8 to encode
4923         at least some part of the pattern, and therefore must convert the whole
4924         thing.
4925         -- dmq */
4926         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4927             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4928         exp = (char*)Perl_bytes_to_utf8(aTHX_
4929                                         (U8*)SvPV_nomg(pattern, plen),
4930                                         &len);
4931         xend = exp + len;
4932         RExC_orig_utf8 = RExC_utf8 = 1;
4933         SAVEFREEPV(exp);
4934     }
4935
4936 #ifdef TRIE_STUDY_OPT
4937     restudied = 0;
4938 #endif
4939
4940     pm_flags = orig_pm_flags;
4941
4942     if (initial_charset == REGEX_LOCALE_CHARSET) {
4943         RExC_contains_locale = 1;
4944     }
4945     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4946
4947         /* Set to use unicode semantics if the pattern is in utf8 and has the
4948          * 'depends' charset specified, as it means unicode when utf8  */
4949         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4950     }
4951
4952     RExC_precomp = exp;
4953     RExC_flags = pm_flags;
4954     RExC_sawback = 0;
4955
4956     RExC_seen = 0;
4957     RExC_in_lookbehind = 0;
4958     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4959     RExC_seen_evals = 0;
4960     RExC_extralen = 0;
4961     RExC_override_recoding = 0;
4962
4963     /* First pass: determine size, legality. */
4964     RExC_parse = exp;
4965     RExC_start = exp;
4966     RExC_end = xend;
4967     RExC_naughty = 0;
4968     RExC_npar = 1;
4969     RExC_nestroot = 0;
4970     RExC_size = 0L;
4971     RExC_emit = &PL_regdummy;
4972     RExC_whilem_seen = 0;
4973     RExC_open_parens = NULL;
4974     RExC_close_parens = NULL;
4975     RExC_opend = NULL;
4976     RExC_paren_names = NULL;
4977 #ifdef DEBUGGING
4978     RExC_paren_name_list = NULL;
4979 #endif
4980     RExC_recurse = NULL;
4981     RExC_recurse_count = 0;
4982
4983 #if 0 /* REGC() is (currently) a NOP at the first pass.
4984        * Clever compilers notice this and complain. --jhi */
4985     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4986 #endif
4987     DEBUG_PARSE_r(
4988         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
4989         RExC_lastnum=0;
4990         RExC_lastparse=NULL;
4991     );
4992     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4993         RExC_precomp = NULL;
4994         return(NULL);
4995     }
4996
4997     /* Here, finished first pass.  Get rid of any added setjmp */
4998     if (used_setjump) {
4999         JMPENV_POP;
5000     }
5001
5002     DEBUG_PARSE_r({
5003         PerlIO_printf(Perl_debug_log,
5004             "Required size %"IVdf" nodes\n"
5005             "Starting second pass (creation)\n",
5006             (IV)RExC_size);
5007         RExC_lastnum=0;
5008         RExC_lastparse=NULL;
5009     });
5010
5011     /* The first pass could have found things that force Unicode semantics */
5012     if ((RExC_utf8 || RExC_uni_semantics)
5013          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5014     {
5015         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5016     }
5017
5018     /* Small enough for pointer-storage convention?
5019        If extralen==0, this means that we will not need long jumps. */
5020     if (RExC_size >= 0x10000L && RExC_extralen)
5021         RExC_size += RExC_extralen;
5022     else
5023         RExC_extralen = 0;
5024     if (RExC_whilem_seen > 15)
5025         RExC_whilem_seen = 15;
5026
5027     /* Allocate space and zero-initialize. Note, the two step process
5028        of zeroing when in debug mode, thus anything assigned has to
5029        happen after that */
5030     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5031     r = (struct regexp*)SvANY(rx);
5032     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5033          char, regexp_internal);
5034     if ( r == NULL || ri == NULL )
5035         FAIL("Regexp out of space");
5036 #ifdef DEBUGGING
5037     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5038     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5039 #else
5040     /* bulk initialize base fields with 0. */
5041     Zero(ri, sizeof(regexp_internal), char);
5042 #endif
5043
5044     /* non-zero initialization begins here */
5045     RXi_SET( r, ri );
5046     r->engine= RE_ENGINE_PTR;
5047     r->extflags = pm_flags;
5048     {
5049         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5050         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5051
5052         /* The caret is output if there are any defaults: if not all the STD
5053          * flags are set, or if no character set specifier is needed */
5054         bool has_default =
5055                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5056                     || ! has_charset);
5057         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5058         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5059                             >> RXf_PMf_STD_PMMOD_SHIFT);
5060         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5061         char *p;
5062         /* Allocate for the worst case, which is all the std flags are turned
5063          * on.  If more precision is desired, we could do a population count of
5064          * the flags set.  This could be done with a small lookup table, or by
5065          * shifting, masking and adding, or even, when available, assembly
5066          * language for a machine-language population count.
5067          * We never output a minus, as all those are defaults, so are
5068          * covered by the caret */
5069         const STRLEN wraplen = plen + has_p + has_runon
5070             + has_default       /* If needs a caret */
5071
5072                 /* If needs a character set specifier */
5073             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5074             + (sizeof(STD_PAT_MODS) - 1)
5075             + (sizeof("(?:)") - 1);
5076
5077         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5078         SvPOK_on(rx);
5079         SvFLAGS(rx) |= SvUTF8(pattern);
5080         *p++='('; *p++='?';
5081
5082         /* If a default, cover it using the caret */
5083         if (has_default) {
5084             *p++= DEFAULT_PAT_MOD;
5085         }
5086         if (has_charset) {
5087             STRLEN len;
5088             const char* const name = get_regex_charset_name(r->extflags, &len);
5089             Copy(name, p, len, char);
5090             p += len;
5091         }
5092         if (has_p)
5093             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5094         {
5095             char ch;
5096             while((ch = *fptr++)) {
5097                 if(reganch & 1)
5098                     *p++ = ch;
5099                 reganch >>= 1;
5100             }
5101         }
5102
5103         *p++ = ':';
5104         Copy(RExC_precomp, p, plen, char);
5105         assert ((RX_WRAPPED(rx) - p) < 16);
5106         r->pre_prefix = p - RX_WRAPPED(rx);
5107         p += plen;
5108         if (has_runon)
5109             *p++ = '\n';
5110         *p++ = ')';
5111         *p = 0;
5112         SvCUR_set(rx, p - SvPVX_const(rx));
5113     }
5114
5115     r->intflags = 0;
5116     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5117
5118     if (RExC_seen & REG_SEEN_RECURSE) {
5119         Newxz(RExC_open_parens, RExC_npar,regnode *);
5120         SAVEFREEPV(RExC_open_parens);
5121         Newxz(RExC_close_parens,RExC_npar,regnode *);
5122         SAVEFREEPV(RExC_close_parens);
5123     }
5124
5125     /* Useful during FAIL. */
5126 #ifdef RE_TRACK_PATTERN_OFFSETS
5127     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5128     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5129                           "%s %"UVuf" bytes for offset annotations.\n",
5130                           ri->u.offsets ? "Got" : "Couldn't get",
5131                           (UV)((2*RExC_size+1) * sizeof(U32))));
5132 #endif
5133     SetProgLen(ri,RExC_size);
5134     RExC_rx_sv = rx;
5135     RExC_rx = r;
5136     RExC_rxi = ri;
5137
5138     /* Second pass: emit code. */
5139     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
5140     RExC_parse = exp;
5141     RExC_end = xend;
5142     RExC_naughty = 0;
5143     RExC_npar = 1;
5144     RExC_emit_start = ri->program;
5145     RExC_emit = ri->program;
5146     RExC_emit_bound = ri->program + RExC_size + 1;
5147
5148     /* Store the count of eval-groups for security checks: */
5149     RExC_rx->seen_evals = RExC_seen_evals;
5150     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5151     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5152         ReREFCNT_dec(rx);
5153         return(NULL);
5154     }
5155     /* XXXX To minimize changes to RE engine we always allocate
5156        3-units-long substrs field. */
5157     Newx(r->substrs, 1, struct reg_substr_data);
5158     if (RExC_recurse_count) {
5159         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5160         SAVEFREEPV(RExC_recurse);
5161     }
5162
5163 reStudy:
5164     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5165     Zero(r->substrs, 1, struct reg_substr_data);
5166
5167 #ifdef TRIE_STUDY_OPT
5168     if (!restudied) {
5169         StructCopy(&zero_scan_data, &data, scan_data_t);
5170         copyRExC_state = RExC_state;
5171     } else {
5172         U32 seen=RExC_seen;
5173         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5174
5175         RExC_state = copyRExC_state;
5176         if (seen & REG_TOP_LEVEL_BRANCHES)
5177             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5178         else
5179             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5180         if (data.last_found) {
5181             SvREFCNT_dec(data.longest_fixed);
5182             SvREFCNT_dec(data.longest_float);
5183             SvREFCNT_dec(data.last_found);
5184         }
5185         StructCopy(&zero_scan_data, &data, scan_data_t);
5186     }
5187 #else
5188     StructCopy(&zero_scan_data, &data, scan_data_t);
5189 #endif
5190
5191     /* Dig out information for optimizations. */
5192     r->extflags = RExC_flags; /* was pm_op */
5193     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5194
5195     if (UTF)
5196         SvUTF8_on(rx);  /* Unicode in it? */
5197     ri->regstclass = NULL;
5198     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5199         r->intflags |= PREGf_NAUGHTY;
5200     scan = ri->program + 1;             /* First BRANCH. */
5201
5202     /* testing for BRANCH here tells us whether there is "must appear"
5203        data in the pattern. If there is then we can use it for optimisations */
5204     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5205         I32 fake;
5206         STRLEN longest_float_length, longest_fixed_length;
5207         struct regnode_charclass_class ch_class; /* pointed to by data */
5208         int stclass_flag;
5209         I32 last_close = 0; /* pointed to by data */
5210         regnode *first= scan;
5211         regnode *first_next= regnext(first);
5212         /*
5213          * Skip introductions and multiplicators >= 1
5214          * so that we can extract the 'meat' of the pattern that must
5215          * match in the large if() sequence following.
5216          * NOTE that EXACT is NOT covered here, as it is normally
5217          * picked up by the optimiser separately.
5218          *
5219          * This is unfortunate as the optimiser isn't handling lookahead
5220          * properly currently.
5221          *
5222          */
5223         while ((OP(first) == OPEN && (sawopen = 1)) ||
5224                /* An OR of *one* alternative - should not happen now. */
5225             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5226             /* for now we can't handle lookbehind IFMATCH*/
5227             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5228             (OP(first) == PLUS) ||
5229             (OP(first) == MINMOD) ||
5230                /* An {n,m} with n>0 */
5231             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5232             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5233         {
5234                 /*
5235                  * the only op that could be a regnode is PLUS, all the rest
5236                  * will be regnode_1 or regnode_2.
5237                  *
5238                  */
5239                 if (OP(first) == PLUS)
5240                     sawplus = 1;
5241                 else
5242                     first += regarglen[OP(first)];
5243
5244                 first = NEXTOPER(first);
5245                 first_next= regnext(first);
5246         }
5247
5248         /* Starting-point info. */
5249       again:
5250         DEBUG_PEEP("first:",first,0);
5251         /* Ignore EXACT as we deal with it later. */
5252         if (PL_regkind[OP(first)] == EXACT) {
5253             if (OP(first) == EXACT)
5254                 NOOP;   /* Empty, get anchored substr later. */
5255             else
5256                 ri->regstclass = first;
5257         }
5258 #ifdef TRIE_STCLASS
5259         else if (PL_regkind[OP(first)] == TRIE &&
5260                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5261         {
5262             regnode *trie_op;
5263             /* this can happen only on restudy */
5264             if ( OP(first) == TRIE ) {
5265                 struct regnode_1 *trieop = (struct regnode_1 *)
5266                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
5267                 StructCopy(first,trieop,struct regnode_1);
5268                 trie_op=(regnode *)trieop;
5269             } else {
5270                 struct regnode_charclass *trieop = (struct regnode_charclass *)
5271                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5272                 StructCopy(first,trieop,struct regnode_charclass);
5273                 trie_op=(regnode *)trieop;
5274             }
5275             OP(trie_op)+=2;
5276             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5277             ri->regstclass = trie_op;
5278         }
5279 #endif
5280         else if (REGNODE_SIMPLE(OP(first)))
5281             ri->regstclass = first;
5282         else if (PL_regkind[OP(first)] == BOUND ||
5283                  PL_regkind[OP(first)] == NBOUND)
5284             ri->regstclass = first;
5285         else if (PL_regkind[OP(first)] == BOL) {
5286             r->extflags |= (OP(first) == MBOL
5287                            ? RXf_ANCH_MBOL
5288                            : (OP(first) == SBOL
5289                               ? RXf_ANCH_SBOL
5290                               : RXf_ANCH_BOL));
5291             first = NEXTOPER(first);
5292             goto again;
5293         }
5294         else if (OP(first) == GPOS) {
5295             r->extflags |= RXf_ANCH_GPOS;
5296             first = NEXTOPER(first);
5297             goto again;
5298         }
5299         else if ((!sawopen || !RExC_sawback) &&
5300             (OP(first) == STAR &&
5301             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5302             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5303         {
5304             /* turn .* into ^.* with an implied $*=1 */
5305             const int type =
5306                 (OP(NEXTOPER(first)) == REG_ANY)
5307                     ? RXf_ANCH_MBOL
5308                     : RXf_ANCH_SBOL;
5309             r->extflags |= type;
5310             r->intflags |= PREGf_IMPLICIT;
5311             first = NEXTOPER(first);
5312             goto again;
5313         }
5314         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5315             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5316             /* x+ must match at the 1st pos of run of x's */
5317             r->intflags |= PREGf_SKIP;
5318
5319         /* Scan is after the zeroth branch, first is atomic matcher. */
5320 #ifdef TRIE_STUDY_OPT
5321         DEBUG_PARSE_r(
5322             if (!restudied)
5323                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5324                               (IV)(first - scan + 1))
5325         );
5326 #else
5327         DEBUG_PARSE_r(
5328             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5329                 (IV)(first - scan + 1))
5330         );
5331 #endif
5332
5333
5334         /*
5335         * If there's something expensive in the r.e., find the
5336         * longest literal string that must appear and make it the
5337         * regmust.  Resolve ties in favor of later strings, since
5338         * the regstart check works with the beginning of the r.e.
5339         * and avoiding duplication strengthens checking.  Not a
5340         * strong reason, but sufficient in the absence of others.
5341         * [Now we resolve ties in favor of the earlier string if
5342         * it happens that c_offset_min has been invalidated, since the
5343         * earlier string may buy us something the later one won't.]
5344         */
5345
5346         data.longest_fixed = newSVpvs("");
5347         data.longest_float = newSVpvs("");
5348         data.last_found = newSVpvs("");
5349         data.longest = &(data.longest_fixed);
5350         first = scan;
5351         if (!ri->regstclass) {
5352             cl_init(pRExC_state, &ch_class);
5353             data.start_class = &ch_class;
5354             stclass_flag = SCF_DO_STCLASS_AND;
5355         } else                          /* XXXX Check for BOUND? */
5356             stclass_flag = 0;
5357         data.last_closep = &last_close;
5358
5359         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5360             &data, -1, NULL, NULL,
5361             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5362
5363
5364         CHECK_RESTUDY_GOTO;
5365
5366
5367         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5368              && data.last_start_min == 0 && data.last_end > 0
5369              && !RExC_seen_zerolen
5370              && !(RExC_seen & REG_SEEN_VERBARG)
5371              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5372             r->extflags |= RXf_CHECK_ALL;
5373         scan_commit(pRExC_state, &data,&minlen,0);
5374         SvREFCNT_dec(data.last_found);
5375
5376         /* Note that code very similar to this but for anchored string
5377            follows immediately below, changes may need to be made to both.
5378            Be careful.
5379          */
5380         longest_float_length = CHR_SVLEN(data.longest_float);
5381         if (longest_float_length
5382             || (data.flags & SF_FL_BEFORE_EOL
5383                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5384                     || (RExC_flags & RXf_PMf_MULTILINE))))
5385         {
5386             I32 t,ml;
5387
5388             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5389             if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5390                 || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5391                     && data.offset_fixed == data.offset_float_min
5392                     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5393                     goto remove_float;          /* As in (a)+. */
5394
5395             /* copy the information about the longest float from the reg_scan_data
5396                over to the program. */
5397             if (SvUTF8(data.longest_float)) {
5398                 r->float_utf8 = data.longest_float;
5399                 r->float_substr = NULL;
5400             } else {
5401                 r->float_substr = data.longest_float;
5402                 r->float_utf8 = NULL;
5403             }
5404             /* float_end_shift is how many chars that must be matched that
5405                follow this item. We calculate it ahead of time as once the
5406                lookbehind offset is added in we lose the ability to correctly
5407                calculate it.*/
5408             ml = data.minlen_float ? *(data.minlen_float)
5409                                    : (I32)longest_float_length;
5410             r->float_end_shift = ml - data.offset_float_min
5411                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5412                 + data.lookbehind_float;
5413             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5414             r->float_max_offset = data.offset_float_max;
5415             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5416                 r->float_max_offset -= data.lookbehind_float;
5417
5418             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5419                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5420                            || (RExC_flags & RXf_PMf_MULTILINE)));
5421             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5422         }
5423         else {
5424           remove_float:
5425             r->float_substr = r->float_utf8 = NULL;
5426             SvREFCNT_dec(data.longest_float);
5427             longest_float_length = 0;
5428         }
5429
5430         /* Note that code very similar to this but for floating string
5431            is immediately above, changes may need to be made to both.
5432            Be careful.
5433          */
5434         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5435
5436         /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5437         if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5438             && (longest_fixed_length
5439                 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5440                     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5441                         || (RExC_flags & RXf_PMf_MULTILINE)))) )
5442         {
5443             I32 t,ml;
5444
5445             /* copy the information about the longest fixed
5446                from the reg_scan_data over to the program. */
5447             if (SvUTF8(data.longest_fixed)) {
5448                 r->anchored_utf8 = data.longest_fixed;
5449                 r->anchored_substr = NULL;
5450             } else {
5451                 r->anchored_substr = data.longest_fixed;
5452                 r->anchored_utf8 = NULL;
5453             }
5454             /* fixed_end_shift is how many chars that must be matched that
5455                follow this item. We calculate it ahead of time as once the
5456                lookbehind offset is added in we lose the ability to correctly
5457                calculate it.*/
5458             ml = data.minlen_fixed ? *(data.minlen_fixed)
5459                                    : (I32)longest_fixed_length;
5460             r->anchored_end_shift = ml - data.offset_fixed
5461                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5462                 + data.lookbehind_fixed;
5463             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5464
5465             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5466                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5467                      || (RExC_flags & RXf_PMf_MULTILINE)));
5468             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5469         }
5470         else {
5471             r->anchored_substr = r->anchored_utf8 = NULL;
5472             SvREFCNT_dec(data.longest_fixed);
5473             longest_fixed_length = 0;
5474         }
5475         if (ri->regstclass
5476             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5477             ri->regstclass = NULL;
5478
5479         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5480             && stclass_flag
5481             && !(data.start_class->flags & ANYOF_EOS)
5482             && !cl_is_anything(data.start_class))
5483         {
5484             const U32 n = add_data(pRExC_state, 1, "f");
5485             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5486
5487             Newx(RExC_rxi->data->data[n], 1,
5488                 struct regnode_charclass_class);
5489             StructCopy(data.start_class,
5490                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5491                        struct regnode_charclass_class);
5492             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5493             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5494             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5495                       regprop(r, sv, (regnode*)data.start_class);
5496                       PerlIO_printf(Perl_debug_log,
5497                                     "synthetic stclass \"%s\".\n",
5498                                     SvPVX_const(sv));});
5499         }
5500
5501         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5502         if (longest_fixed_length > longest_float_length) {
5503             r->check_end_shift = r->anchored_end_shift;
5504             r->check_substr = r->anchored_substr;
5505             r->check_utf8 = r->anchored_utf8;
5506             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5507             if (r->extflags & RXf_ANCH_SINGLE)
5508                 r->extflags |= RXf_NOSCAN;
5509         }
5510         else {
5511             r->check_end_shift = r->float_end_shift;
5512             r->check_substr = r->float_substr;
5513             r->check_utf8 = r->float_utf8;
5514             r->check_offset_min = r->float_min_offset;
5515             r->check_offset_max = r->float_max_offset;
5516         }
5517         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5518            This should be changed ASAP!  */
5519         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5520             r->extflags |= RXf_USE_INTUIT;
5521             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5522                 r->extflags |= RXf_INTUIT_TAIL;
5523         }
5524         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5525         if ( (STRLEN)minlen < longest_float_length )
5526             minlen= longest_float_length;
5527         if ( (STRLEN)minlen < longest_fixed_length )
5528             minlen= longest_fixed_length;
5529         */
5530     }
5531     else {
        /* Several toplevels. The best we can do is to set minlen. */
5533         I32 fake;
5534         struct regnode_charclass_class ch_class;
5535         I32 last_close = 0;
5536
5537         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5538
5539         scan = ri->program + 1;
5540         cl_init(pRExC_state, &ch_class);
5541         data.start_class = &ch_class;
5542         data.last_closep = &last_close;
5543
5544
5545         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5546             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5547
5548         CHECK_RESTUDY_GOTO;
5549
5550         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5551                 = r->float_substr = r->float_utf8 = NULL;
5552
5553         if (!(data.start_class->flags & ANYOF_EOS)
5554             && !cl_is_anything(data.start_class))
5555         {
5556             const U32 n = add_data(pRExC_state, 1, "f");
5557             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5558
5559             Newx(RExC_rxi->data->data[n], 1,
5560                 struct regnode_charclass_class);
5561             StructCopy(data.start_class,
5562                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5563                        struct regnode_charclass_class);
5564             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5565             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5566             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5567                       regprop(r, sv, (regnode*)data.start_class);
5568                       PerlIO_printf(Perl_debug_log,
5569                                     "synthetic stclass \"%s\".\n",
5570                                     SvPVX_const(sv));});
5571         }
5572     }
5573
5574     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5575        the "real" pattern. */
5576     DEBUG_OPTIMISE_r({
5577         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5578                       (IV)minlen, (IV)r->minlen);
5579     });
5580     r->minlenret = minlen;
5581     if (r->minlen < minlen)
5582         r->minlen = minlen;
5583
5584     if (RExC_seen & REG_SEEN_GPOS)
5585         r->extflags |= RXf_GPOS_SEEN;
5586     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5587         r->extflags |= RXf_LOOKBEHIND_SEEN;
5588     if (RExC_seen & REG_SEEN_EVAL)
5589         r->extflags |= RXf_EVAL_SEEN;
5590     if (RExC_seen & REG_SEEN_CANY)
5591         r->extflags |= RXf_CANY_SEEN;
5592     if (RExC_seen & REG_SEEN_VERBARG)
5593         r->intflags |= PREGf_VERBARG_SEEN;
5594     if (RExC_seen & REG_SEEN_CUTGROUP)
5595         r->intflags |= PREGf_CUTGROUP_SEEN;
5596     if (RExC_paren_names)
5597         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5598     else
5599         RXp_PAREN_NAMES(r) = NULL;
5600
5601 #ifdef STUPID_PATTERN_CHECKS
5602     if (RX_PRELEN(rx) == 0)
5603         r->extflags |= RXf_NULL;
5604     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5605         /* XXX: this should happen BEFORE we compile */
5606         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5607     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5608         r->extflags |= RXf_WHITE;
5609     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5610         r->extflags |= RXf_START_ONLY;
5611 #else
5612     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5613             /* XXX: this should happen BEFORE we compile */
5614             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5615     else {
5616         regnode *first = ri->program + 1;
5617         U8 fop = OP(first);
5618
5619         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5620             r->extflags |= RXf_NULL;
5621         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5622             r->extflags |= RXf_START_ONLY;
5623         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5624                              && OP(regnext(first)) == END)
5625             r->extflags |= RXf_WHITE;
5626     }
5627 #endif
5628 #ifdef DEBUGGING
5629     if (RExC_paren_names) {
5630         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5631         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5632     } else
5633 #endif
5634         ri->name_list_idx = 0;
5635
5636     if (RExC_recurse_count) {
5637         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5638             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5639             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5640         }
5641     }
5642     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5643     /* assume we don't need to swap parens around before we match */
5644
5645     DEBUG_DUMP_r({
5646         PerlIO_printf(Perl_debug_log,"Final program:\n");
5647         regdump(r);
5648     });
5649 #ifdef RE_TRACK_PATTERN_OFFSETS
5650     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5651         const U32 len = ri->u.offsets[0];
5652         U32 i;
5653         GET_RE_DEBUG_FLAGS_DECL;
5654         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5655         for (i = 1; i <= len; i++) {
5656             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5657                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5658                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5659             }
5660         PerlIO_printf(Perl_debug_log, "\n");
5661     });
5662 #endif
5663     return rx;
5664 }
5665
5666 #undef RE_ENGINE_PTR
5667
5668
5669 SV*
5670 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5671                     const U32 flags)
5672 {
5673     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5674
5675     PERL_UNUSED_ARG(value);
5676
5677     if (flags & RXapif_FETCH) {
5678         return reg_named_buff_fetch(rx, key, flags);
5679     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5680         Perl_croak_no_modify(aTHX);
5681         return NULL;
5682     } else if (flags & RXapif_EXISTS) {
5683         return reg_named_buff_exists(rx, key, flags)
5684             ? &PL_sv_yes
5685             : &PL_sv_no;
5686     } else if (flags & RXapif_REGNAMES) {
5687         return reg_named_buff_all(rx, flags);
5688     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5689         return reg_named_buff_scalar(rx, flags);
5690     } else {
5691         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5692         return NULL;
5693     }
5694 }
5695
5696 SV*
5697 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5698                          const U32 flags)
5699 {
5700     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5701     PERL_UNUSED_ARG(lastkey);
5702
5703     if (flags & RXapif_FIRSTKEY)
5704         return reg_named_buff_firstkey(rx, flags);
5705     else if (flags & RXapif_NEXTKEY)
5706         return reg_named_buff_nextkey(rx, flags);
5707     else {
5708         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5709         return NULL;
5710     }
5711 }
5712
5713 SV*
5714 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5715                           const U32 flags)
5716 {
5717     AV *retarray = NULL;
5718     SV *ret;
5719     struct regexp *const rx = (struct regexp *)SvANY(r);
5720
5721     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5722
5723     if (flags & RXapif_ALL)
5724         retarray=newAV();
5725
5726     if (rx && RXp_PAREN_NAMES(rx)) {
5727         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5728         if (he_str) {
5729             IV i;
5730             SV* sv_dat=HeVAL(he_str);
5731             I32 *nums=(I32*)SvPVX(sv_dat);
5732             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5733                 if ((I32)(rx->nparens) >= nums[i]
5734                     && rx->offs[nums[i]].start != -1
5735                     && rx->offs[nums[i]].end != -1)
5736                 {
5737                     ret = newSVpvs("");
5738                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5739                     if (!retarray)
5740                         return ret;
5741                 } else {
5742                     if (retarray)
5743                         ret = newSVsv(&PL_sv_undef);
5744                 }
5745                 if (retarray)
5746                     av_push(retarray, ret);
5747             }
5748             if (retarray)
5749                 return newRV_noinc(MUTABLE_SV(retarray));
5750         }
5751     }
5752     return NULL;
5753 }
5754
5755 bool
5756 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5757                            const U32 flags)
5758 {
5759     struct regexp *const rx = (struct regexp *)SvANY(r);
5760
5761     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5762
5763     if (rx && RXp_PAREN_NAMES(rx)) {
5764         if (flags & RXapif_ALL) {
5765             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5766         } else {
5767             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5768             if (sv) {
5769                 SvREFCNT_dec(sv);
5770                 return TRUE;
5771             } else {
5772                 return FALSE;
5773             }
5774         }
5775     } else {
5776         return FALSE;
5777     }
5778 }
5779
5780 SV*
5781 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5782 {
5783     struct regexp *const rx = (struct regexp *)SvANY(r);
5784
5785     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5786
5787     if ( rx && RXp_PAREN_NAMES(rx) ) {
5788         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5789
5790         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5791     } else {
5792         return FALSE;
5793     }
5794 }
5795
5796 SV*
5797 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5798 {
5799     struct regexp *const rx = (struct regexp *)SvANY(r);
5800     GET_RE_DEBUG_FLAGS_DECL;
5801
5802     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5803
5804     if (rx && RXp_PAREN_NAMES(rx)) {
5805         HV *hv = RXp_PAREN_NAMES(rx);
5806         HE *temphe;
5807         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5808             IV i;
5809             IV parno = 0;
5810             SV* sv_dat = HeVAL(temphe);
5811             I32 *nums = (I32*)SvPVX(sv_dat);
5812             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5813                 if ((I32)(rx->lastparen) >= nums[i] &&
5814                     rx->offs[nums[i]].start != -1 &&
5815                     rx->offs[nums[i]].end != -1)
5816                 {
5817                     parno = nums[i];
5818                     break;
5819                 }
5820             }
5821             if (parno || flags & RXapif_ALL) {
5822                 return newSVhek(HeKEY_hek(temphe));
5823             }
5824         }
5825     }
5826     return NULL;
5827 }
5828
5829 SV*
5830 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5831 {
5832     SV *ret;
5833     AV *av;
5834     I32 length;
5835     struct regexp *const rx = (struct regexp *)SvANY(r);
5836
5837     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5838
5839     if (rx && RXp_PAREN_NAMES(rx)) {
5840         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5841             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5842         } else if (flags & RXapif_ONE) {
5843             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5844             av = MUTABLE_AV(SvRV(ret));
5845             length = av_len(av);
5846             SvREFCNT_dec(ret);
5847             return newSViv(length + 1);
5848         } else {
5849             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5850             return NULL;
5851         }
5852     }
5853     return &PL_sv_undef;
5854 }
5855
5856 SV*
5857 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5858 {
5859     struct regexp *const rx = (struct regexp *)SvANY(r);
5860     AV *av = newAV();
5861
5862     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5863
5864     if (rx && RXp_PAREN_NAMES(rx)) {
5865         HV *hv= RXp_PAREN_NAMES(rx);
5866         HE *temphe;
5867         (void)hv_iterinit(hv);
5868         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5869             IV i;
5870             IV parno = 0;
5871             SV* sv_dat = HeVAL(temphe);
5872             I32 *nums = (I32*)SvPVX(sv_dat);
5873             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5874                 if ((I32)(rx->lastparen) >= nums[i] &&
5875                     rx->offs[nums[i]].start != -1 &&
5876                     rx->offs[nums[i]].end != -1)
5877                 {
5878                     parno = nums[i];
5879                     break;
5880                 }
5881             }
5882             if (parno || flags & RXapif_ALL) {
5883                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5884             }
5885         }
5886     }
5887
5888     return newRV_noinc(MUTABLE_SV(av));
5889 }
5890
5891 void
5892 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5893                              SV * const sv)
5894 {
5895     struct regexp *const rx = (struct regexp *)SvANY(r);
5896     char *s = NULL;
5897     I32 i = 0;
5898     I32 s1, t1;
5899
5900     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5901
5902     if (!rx->subbeg) {
5903         sv_setsv(sv,&PL_sv_undef);
5904         return;
5905     }
5906     else
5907     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5908         /* $` */
5909         i = rx->offs[0].start;
5910         s = rx->subbeg;
5911     }
5912     else
5913     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5914         /* $' */
5915         s = rx->subbeg + rx->offs[0].end;
5916         i = rx->sublen - rx->offs[0].end;
5917     }
5918     else
5919     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5920         (s1 = rx->offs[paren].start) != -1 &&
5921         (t1 = rx->offs[paren].end) != -1)
5922     {
5923         /* $& $1 ... */
5924         i = t1 - s1;
5925         s = rx->subbeg + s1;
5926     } else {
5927         sv_setsv(sv,&PL_sv_undef);
5928         return;
5929     }
5930     assert(rx->sublen >= (s - rx->subbeg) + i );
5931     if (i >= 0) {
5932         const int oldtainted = PL_tainted;
5933         TAINT_NOT;
5934         sv_setpvn(sv, s, i);
5935         PL_tainted = oldtainted;
5936         if ( (rx->extflags & RXf_CANY_SEEN)
5937             ? (RXp_MATCH_UTF8(rx)
5938                         && (!i || is_utf8_string((U8*)s, i)))
5939             : (RXp_MATCH_UTF8(rx)) )
5940         {
5941             SvUTF8_on(sv);
5942         }
5943         else
5944             SvUTF8_off(sv);
5945         if (PL_tainting) {
5946             if (RXp_MATCH_TAINTED(rx)) {
5947                 if (SvTYPE(sv) >= SVt_PVMG) {
5948                     MAGIC* const mg = SvMAGIC(sv);
5949                     MAGIC* mgt;
5950                     PL_tainted = 1;
5951                     SvMAGIC_set(sv, mg->mg_moremagic);
5952                     SvTAINT(sv);
5953                     if ((mgt = SvMAGIC(sv))) {
5954                         mg->mg_moremagic = mgt;
5955                         SvMAGIC_set(sv, mg);
5956                     }
5957                 } else {
5958                     PL_tainted = 1;
5959                     SvTAINT(sv);
5960                 }
5961             } else
5962                 SvTAINTED_off(sv);
5963         }
5964     } else {
5965         sv_setsv(sv,&PL_sv_undef);
5966         return;
5967     }
5968 }
5969
5970 void
5971 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5972                                                          SV const * const value)
5973 {
5974     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5975
5976     PERL_UNUSED_ARG(rx);
5977     PERL_UNUSED_ARG(paren);
5978     PERL_UNUSED_ARG(value);
5979
5980     if (!PL_localizing)
5981         Perl_croak_no_modify(aTHX);
5982 }
5983
5984 I32
5985 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5986                               const I32 paren)
5987 {
5988     struct regexp *const rx = (struct regexp *)SvANY(r);
5989     I32 i;
5990     I32 s1, t1;
5991
5992     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5993
5994     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5995         switch (paren) {
5996       /* $` / ${^PREMATCH} */
5997       case RX_BUFF_IDX_PREMATCH:
5998         if (rx->offs[0].start != -1) {
5999                         i = rx->offs[0].start;
6000                         if (i > 0) {
6001                                 s1 = 0;
6002                                 t1 = i;
6003                                 goto getlen;
6004                         }
6005             }
6006         return 0;
6007       /* $' / ${^POSTMATCH} */
6008       case RX_BUFF_IDX_POSTMATCH:
6009             if (rx->offs[0].end != -1) {
6010                         i = rx->sublen - rx->offs[0].end;
6011                         if (i > 0) {
6012                                 s1 = rx->offs[0].end;
6013                                 t1 = rx->sublen;
6014                                 goto getlen;
6015                         }
6016             }
6017         return 0;
6018       /* $& / ${^MATCH}, $1, $2, ... */
6019       default:
6020             if (paren <= (I32)rx->nparens &&
6021             (s1 = rx->offs[paren].start) != -1 &&
6022             (t1 = rx->offs[paren].end) != -1)
6023             {
6024             i = t1 - s1;
6025             goto getlen;
6026         } else {
6027             if (ckWARN(WARN_UNINITIALIZED))
6028                 report_uninit((const SV *)sv);
6029             return 0;
6030         }
6031     }
6032   getlen:
6033     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6034         const char * const s = rx->subbeg + s1;
6035         const U8 *ep;
6036         STRLEN el;
6037
6038         i = t1 - s1;
6039         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6040                         i = el;
6041     }
6042     return i;
6043 }
6044
6045 SV*
6046 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6047 {
6048     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6049         PERL_UNUSED_ARG(rx);
6050         if (0)
6051             return NULL;
6052         else
6053             return newSVpvs("Regexp");
6054 }
6055
6056 /* Scans the name of a named buffer from the pattern.
6057  * If flags is REG_RSN_RETURN_NULL returns null.
6058  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6059  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6060  * to the parsed name as looked up in the RExC_paren_names hash.
 * If there is an error, throws a vFAIL()-type exception.
6062  */
6063
6064 #define REG_RSN_RETURN_NULL    0
6065 #define REG_RSN_RETURN_NAME    1
6066 #define REG_RSN_RETURN_DATA    2
6067
6068 STATIC SV*
6069 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6070 {
6071     char *name_start = RExC_parse;
6072
6073     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6074
6075     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6076          /* skip IDFIRST by using do...while */
6077         if (UTF)
6078             do {
6079                 RExC_parse += UTF8SKIP(RExC_parse);
6080             } while (isALNUM_utf8((U8*)RExC_parse));
6081         else
6082             do {
6083                 RExC_parse++;
6084             } while (isALNUM(*RExC_parse));
6085     }
6086
6087     if ( flags ) {
6088         SV* sv_name
6089             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6090                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6091         if ( flags == REG_RSN_RETURN_NAME)
6092             return sv_name;
6093         else if (flags==REG_RSN_RETURN_DATA) {
6094             HE *he_str = NULL;
6095             SV *sv_dat = NULL;
6096             if ( ! sv_name )      /* should not happen*/
6097                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6098             if (RExC_paren_names)
6099                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6100             if ( he_str )
6101                 sv_dat = HeVAL(he_str);
6102             if ( ! sv_dat )
6103                 vFAIL("Reference to nonexistent named group");
6104             return sv_dat;
6105         }
6106         else {
6107             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6108                        (unsigned long) flags);
6109         }
6110         /* NOT REACHED */
6111     }
6112     return NULL;
6113 }
6114
/* DEBUG_PARSE_MSG(funcname): under DEBUG_PARSE_r, print the left-hand
 * columns of one parse-trace line (no trailing newline):
 *   - " >" followed by up to 10 characters of the not-yet-parsed pattern,
 *     then "..." if more than 10 remained or "<" otherwise, padded to a
 *     fixed width; if RExC_parse has not moved since the previous call,
 *     16 blanks are printed instead;
 *   - "|" and the current node number (RExC_size + 1 when SIZE_ONLY is
 *     true, otherwise REG_NODE_NUM(RExC_emit)), or blanks if the number is
 *     the same as at the previous call;
 *   - "|", an indent of 2*depth blanks, and funcname.
 * The macro reads a variable named `depth` from the scope in which it is
 * expanded, and records the position and node number it printed in
 * RExC_lastparse and RExC_lastnum for the next call.
 */
#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
    int rem=(int)(RExC_end - RExC_parse);                       \
    int cut;                                                    \
    int num;                                                    \
    int iscut=0;                                                \
    if (rem>10) {                                               \
        rem=10;                                                 \
        iscut=1;                                                \
    }                                                           \
    cut=10-rem;                                                 \
    if (RExC_lastparse!=RExC_parse)                             \
        PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
            rem, RExC_parse,                                    \
            cut + 4,                                            \
            iscut ? "..." : "<"                                 \
        );                                                      \
    else                                                        \
        PerlIO_printf(Perl_debug_log,"%16s","");                \
                                                                \
    if (SIZE_ONLY)                                              \
       num = RExC_size + 1;                                     \
    else                                                        \
       num=REG_NODE_NUM(RExC_emit);                             \
    if (RExC_lastnum!=num)                                      \
       PerlIO_printf(Perl_debug_log,"|%4d",num);                \
    else                                                        \
       PerlIO_printf(Perl_debug_log,"|%4s","");                 \
    PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
        (int)((depth*2)), "",                                   \
        (funcname)                                              \
    );                                                          \
    RExC_lastnum=num;                                           \
    RExC_lastparse=RExC_parse;                                  \
})
6149
6150
6151
/* DEBUG_PARSE(funcname): under DEBUG_PARSE_r, print the DEBUG_PARSE_MSG
 * columns for funcname and then end the line (the newline is printed
 * right-justified in a 4-character field). */
#define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
    DEBUG_PARSE_MSG((funcname));                            \
    PerlIO_printf(Perl_debug_log,"%4s","\n");               \
})
/* DEBUG_PARSE_FMT(funcname,fmt,args): under DEBUG_PARSE_r, print the
 * DEBUG_PARSE_MSG columns for funcname followed by a formatted message
 * and a newline.  fmt must be a string literal, because "\n" is appended
 * to it by literal concatenation; args is pasted in as the remaining
 * PerlIO_printf argument(s). */
#define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
    DEBUG_PARSE_MSG((funcname));                            \
    PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
})
6160
6161 /* This section of code defines the inversion list object and its methods.  The
6162  * interfaces are highly subject to change, so as much as possible is static to
6163  * this file.  An inversion list is here implemented as a malloc'd C UV array
6164  * with some added info that is placed as UVs at the beginning in a header
6165  * portion.  An inversion list for Unicode is an array of code points, sorted
6166  * by ordinal number.  The zeroth element is the first code point in the list.
 * The element at index 1 is the first code point beyond that which is not in
 * the list.  In other
6168  * words, the first range is
6169  *  invlist[0]..(invlist[1]-1)
6170  * The other ranges follow.  Thus every element whose index is divisible by two
6171  * marks the beginning of a range that is in the list, and every element not
6172  * divisible by two marks the beginning of a range not in the list.  A single
6173  * element inversion list that contains the single code point N generally
6174  * consists of two elements
6175  *  invlist[0] == N
6176  *  invlist[1] == N+1
6177  * (The exception is when N is the highest representable value on the
6178  * machine, in which case the list containing just it would be a single
6179  * element, itself.  By extension, if the last range in the list extends to
6180  * infinity, then the first element of that range will be in the inversion list
6181  * at a position that is divisible by two, and is the final element in the
6182  * list.)
6183  * Taking the complement (inverting) an inversion list is quite simple, if the
6184  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6185  * This implementation reserves an element at the beginning of each inversion list
6186  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6187  * beginning of the list is either that element if 0, or the next one if 1.
6188  *
6189  * More about inversion lists can be found in "Unicode Demystified"
6190  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6191  * More will be coming when functionality is added later.
6192  *
6193  * The inversion list data structure is currently implemented as an SV pointing
6194  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6195  * array of UV whose memory management is automatically handled by the existing
6196  * facilities for SV's.
6197  *
6198  * Some of the methods should always be private to the implementation, and some
6199  * should eventually be made public */
6200
/* Layout of the header that precedes the array proper.  Each entry is the
 * index, in UVs, of a header field. */
#define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
#define INVLIST_ITER_OFFSET 1   /* Current iteration position */

/* This is a combination of a version and data structure type, so that one
 * being passed in can be validated to be an inversion list of the correct
 * vintage.  When the structure of the header is changed, a new random number
 * in the range 2**31-1 should be generated and the new() method changed to
 * insert that at this location.  Then, if an auxiliary program doesn't change
 * correspondingly, it will be discovered immediately */
#define INVLIST_VERSION_ID_OFFSET 2
#define INVLIST_VERSION_ID 1064334010

/* For safety, when adding new elements, remember to #undef them at the end of
 * the inversion list code section */

#define INVLIST_ZERO_OFFSET 3   /* 0 or 1; must be last element in header */
/* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
 * contains the code point U+0000, and begins here.  If 1, the inversion list
 * doesn't contain U+0000, and it begins at the next UV in the array.
 * Inverting an inversion list consists of adding or removing the 0 at the
 * beginning of it.  By reserving a space for that 0, inversion can be made
 * very fast */

#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)

/* Internally things are UVs.  TO_INTERNAL_SIZE converts a count of list
 * elements into the number of bytes needed to hold them plus the header;
 * FROM_INTERNAL_SIZE converts a byte count back into the number of list
 * elements that fit after the header.  The argument is fully parenthesized so
 * that an expression (e.g. 'a + b' or 'c ? x : y') may be passed safely. */
#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
#define FROM_INTERNAL_SIZE(x) (((x) / sizeof(UV)) - HEADER_LENGTH)

#define INVLIST_INITIAL_LEN 10
6231
6232 PERL_STATIC_INLINE UV*
6233 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6234 {
6235     /* Returns a pointer to the first element in the inversion list's array.
6236      * This is called upon initialization of an inversion list.  Where the
6237      * array begins depends on whether the list has the code point U+0000
6238      * in it or not.  The other parameter tells it whether the code that
6239      * follows this call is about to put a 0 in the inversion list or not.
6240      * The first element is either the element with 0, if 0, or the next one,
6241      * if 1 */
6242
6243     UV* zero = get_invlist_zero_addr(invlist);
6244
6245     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6246
6247     /* Must be empty */
6248     assert(! *get_invlist_len_addr(invlist));
6249
6250     /* 1^1 = 0; 1^0 = 1 */
6251     *zero = 1 ^ will_have_0;
6252     return zero + *zero;
6253 }
6254
6255 PERL_STATIC_INLINE UV*
6256 S_invlist_array(pTHX_ SV* const invlist)
6257 {
6258     /* Returns the pointer to the inversion list's array.  Every time the
6259      * length changes, this needs to be called in case malloc or realloc moved
6260      * it */
6261
6262     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6263
6264     /* Must not be empty.  If these fail, you probably didn't check for <len>
6265      * being non-zero before trying to get the array */
6266     assert(*get_invlist_len_addr(invlist));
6267     assert(*get_invlist_zero_addr(invlist) == 0
6268            || *get_invlist_zero_addr(invlist) == 1);
6269
6270     /* The array begins either at the element reserved for zero if the
6271      * list contains 0 (that element will be set to 0), or otherwise the next
6272      * element (in which case the reserved element will be set to 1). */
6273     return (UV *) (get_invlist_zero_addr(invlist)
6274                    + *get_invlist_zero_addr(invlist));
6275 }
6276
6277 PERL_STATIC_INLINE UV*
6278 S_get_invlist_len_addr(pTHX_ SV* invlist)
6279 {
6280     /* Return the address of the UV that contains the current number
6281      * of used elements in the inversion list */
6282
6283     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6284
6285     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6286 }
6287
6288 PERL_STATIC_INLINE UV
6289 S_invlist_len(pTHX_ SV* const invlist)
6290 {
6291     /* Returns the current number of elements stored in the inversion list's
6292      * array */
6293
6294     PERL_ARGS_ASSERT_INVLIST_LEN;
6295
6296     return *get_invlist_len_addr(invlist);
6297 }
6298
6299 PERL_STATIC_INLINE void
6300 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6301 {
6302     /* Sets the current number of elements stored in the inversion list */
6303
6304     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6305
6306     *get_invlist_len_addr(invlist) = len;
6307
6308     assert(len <= SvLEN(invlist));
6309
6310     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6311     /* If the list contains U+0000, that element is part of the header,
6312      * and should not be counted as part of the array.  It will contain
6313      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
6314      * subtract:
6315      *  SvCUR_set(invlist,
6316      *            TO_INTERNAL_SIZE(len
6317      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
6318      * But, this is only valid if len is not 0.  The consequences of not doing
6319      * this is that the memory allocation code may think that 1 more UV is
6320      * being used than actually is, and so might do an unnecessary grow.  That
6321      * seems worth not bothering to make this the precise amount.
6322      *
6323      * Note that when inverting, SvCUR shouldn't change */
6324 }
6325
6326 PERL_STATIC_INLINE UV
6327 S_invlist_max(pTHX_ SV* const invlist)
6328 {
6329     /* Returns the maximum number of elements storable in the inversion list's
6330      * array, without having to realloc() */
6331
6332     PERL_ARGS_ASSERT_INVLIST_MAX;
6333
6334     return FROM_INTERNAL_SIZE(SvLEN(invlist));
6335 }
6336
6337 PERL_STATIC_INLINE UV*
6338 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6339 {
6340     /* Return the address of the UV that is reserved to hold 0 if the inversion
6341      * list contains 0.  This has to be the last element of the heading, as the
6342      * list proper starts with either it if 0, or the next element if not.
6343      * (But we force it to contain either 0 or 1) */
6344
6345     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6346
6347     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6348 }
6349
6350 #ifndef PERL_IN_XSUB_RE
6351 SV*
6352 Perl__new_invlist(pTHX_ IV initial_size)
6353 {
6354
6355     /* Return a pointer to a newly constructed inversion list, with enough
6356      * space to store 'initial_size' elements.  If that number is negative, a
6357      * system default is used instead */
6358
6359     SV* new_list;
6360
6361     if (initial_size < 0) {
6362         initial_size = INVLIST_INITIAL_LEN;
6363     }
6364
6365     /* Allocate the initial space */
6366     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6367     invlist_set_len(new_list, 0);
6368
6369     /* Force iterinit() to be used to get iteration to work */
6370     *get_invlist_iter_addr(new_list) = UV_MAX;
6371
6372     /* This should force a segfault if a method doesn't initialize this
6373      * properly */
6374     *get_invlist_zero_addr(new_list) = UV_MAX;
6375
6376     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6377 #if HEADER_LENGTH != 4
6378 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6379 #endif
6380
6381     return new_list;
6382 }
6383 #endif
6384
6385 STATIC SV*
6386 S__new_invlist_C_array(pTHX_ UV* list)
6387 {
6388     /* Return a pointer to a newly constructed inversion list, initialized to
6389      * point to <list>, which has to be in the exact correct inversion list
6390      * form, including internal fields.  Thus this is a dangerous routine that
6391      * should not be used in the wrong hands */
6392
6393     SV* invlist = newSV_type(SVt_PV);
6394
6395     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6396
6397     SvPV_set(invlist, (char *) list);
6398     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
6399                                shouldn't touch it */
6400     SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6401
6402     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6403         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6404     }
6405
6406     return invlist;
6407 }
6408
6409 STATIC void
6410 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6411 {
6412     /* Grow the maximum size of an inversion list */
6413
6414     PERL_ARGS_ASSERT_INVLIST_EXTEND;
6415
6416     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6417 }
6418
6419 PERL_STATIC_INLINE void
6420 S_invlist_trim(pTHX_ SV* const invlist)
6421 {
6422     PERL_ARGS_ASSERT_INVLIST_TRIM;
6423
6424     /* Change the length of the inversion list to how many entries it currently
6425      * has */
6426
6427     SvPV_shrink_to_cur((SV *) invlist);
6428 }
6429
/* The ranges in an inversion list alternate between being in the list and not
 * being in it, starting with one that is in it.  Hence the range that begins
 * at index <i> is in the list iff <i> is even: 0, 2, 4, etc.; and the range
 * just before it is in the list iff <i> is odd */
#define ELEMENT_RANGE_MATCHES_INVLIST(i) (((i) & 1) == 0)
#define PREV_RANGE_MATCHES_INVLIST(i) (((i) & 1) != 0)

/* Union of <a> with the complement of <b> */
#define _invlist_union_complement_2nd(a, b, output) \
                _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6436
6437 #ifndef PERL_IN_XSUB_RE
6438 void
6439 Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6440 {
6441    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6442     * the end of the inversion list.  The range must be above any existing
6443     * ones. */
6444
6445     UV* array;
6446     UV max = invlist_max(invlist);
6447     UV len = invlist_len(invlist);
6448
6449     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6450
6451     if (len == 0) { /* Empty lists must be initialized */
6452         array = _invlist_array_init(invlist, start == 0);
6453     }
6454     else {
6455         /* Here, the existing list is non-empty. The current max entry in the
6456          * list is generally the first value not in the set, except when the
6457          * set extends to the end of permissible values, in which case it is
6458          * the first entry in that final set, and so this call is an attempt to
6459          * append out-of-order */
6460
6461         UV final_element = len - 1;
6462         array = invlist_array(invlist);
6463         if (array[final_element] > start
6464             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6465         {
6466             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6467                        array[final_element], start,
6468                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6469         }
6470
6471         /* Here, it is a legal append.  If the new range begins with the first
6472          * value not in the set, it is extending the set, so the new first
6473          * value not in the set is one greater than the newly extended range.
6474          * */
6475         if (array[final_element] == start) {
6476             if (end != UV_MAX) {
6477                 array[final_element] = end + 1;
6478             }
6479             else {
6480                 /* But if the end is the maximum representable on the machine,
6481                  * just let the range that this would extend to have no end */
6482                 invlist_set_len(invlist, len - 1);
6483             }
6484             return;
6485         }
6486     }
6487
6488     /* Here the new range doesn't extend any existing set.  Add it */
6489
6490     len += 2;   /* Includes an element each for the start and end of range */
6491
6492     /* If overflows the existing space, extend, which may cause the array to be
6493      * moved */
6494     if (max < len) {
6495         invlist_extend(invlist, len);
6496         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
6497                                            failure in invlist_array() */
6498         array = invlist_array(invlist);
6499     }
6500     else {
6501         invlist_set_len(invlist, len);
6502     }
6503
6504     /* The next item on the list starts the range, the one after that is
6505      * one past the new range.  */
6506     array[len - 2] = start;
6507     if (end != UV_MAX) {
6508         array[len - 1] = end + 1;
6509     }
6510     else {
6511         /* But if the end is the maximum representable on the machine, just let
6512          * the range have no end */
6513         invlist_set_len(invlist, len - 1);
6514     }
6515 }
6516
6517 STATIC IV
6518 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6519 {
6520     /* Searches the inversion list for the entry that contains the input code
6521      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
6522      * return value is the index into the list's array of the range that
6523      * contains <cp> */
6524
6525     IV low = 0;
6526     IV high = invlist_len(invlist);
6527     const UV * const array = invlist_array(invlist);
6528
6529     PERL_ARGS_ASSERT_INVLIST_SEARCH;
6530
6531     /* If list is empty or the code point is before the first element, return
6532      * failure. */
6533     if (high == 0 || cp < array[0]) {
6534         return -1;
6535     }
6536
6537     /* Binary search.  What we are looking for is <i> such that
6538      *  array[i] <= cp < array[i+1]
6539      * The loop below converges on the i+1. */
6540     while (low < high) {
6541         IV mid = (low + high) / 2;
6542         if (array[mid] <= cp) {
6543             low = mid + 1;
6544
6545             /* We could do this extra test to exit the loop early.
6546             if (cp < array[low]) {
6547                 return mid;
6548             }
6549             */
6550         }
6551         else { /* cp < array[mid] */
6552             high = mid;
6553         }
6554     }
6555
6556     return high - 1;
6557 }
6558
6559 void
6560 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6561 {
6562     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6563      * but is used when the swash has an inversion list.  This makes this much
6564      * faster, as it uses a binary search instead of a linear one.  This is
6565      * intimately tied to that function, and perhaps should be in utf8.c,
6566      * except it is intimately tied to inversion lists as well.  It assumes
6567      * that <swatch> is all 0's on input */
6568
6569     UV current = start;
6570     const IV len = invlist_len(invlist);
6571     IV i;
6572     const UV * array;
6573
6574     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6575
6576     if (len == 0) { /* Empty inversion list */
6577         return;
6578     }
6579
6580     array = invlist_array(invlist);
6581
6582     /* Find which element it is */
6583     i = invlist_search(invlist, start);
6584
6585     /* We populate from <start> to <end> */
6586     while (current < end) {
6587         UV upper;
6588
6589         /* The inversion list gives the results for every possible code point
6590          * after the first one in the list.  Only those ranges whose index is
6591          * even are ones that the inversion list matches.  For the odd ones,
6592          * and if the initial code point is not in the list, we have to skip
6593          * forward to the next element */
6594         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6595             i++;
6596             if (i >= len) { /* Finished if beyond the end of the array */
6597                 return;
6598             }
6599             current = array[i];
6600             if (current >= end) {   /* Finished if beyond the end of what we
6601                                        are populating */
6602                 return;
6603             }
6604         }
6605         assert(current >= start);
6606
6607         /* The current range ends one below the next one, except don't go past
6608          * <end> */
6609         i++;
6610         upper = (i < len && array[i] < end) ? array[i] : end;
6611
6612         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
6613          * for each code point in it */
6614         for (; current < upper; current++) {
6615             const STRLEN offset = (STRLEN)(current - start);
6616             swatch[offset >> 3] |= 1 << (offset & 7);
6617         }
6618
6619         /* Quit if at the end of the list */
6620         if (i >= len) {
6621
6622             /* But first, have to deal with the highest possible code point on
6623              * the platform.  The previous code assumes that <end> is one
6624              * beyond where we want to populate, but that is impossible at the
6625              * platform's infinity, so have to handle it specially */
6626             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6627             {
6628                 const STRLEN offset = (STRLEN)(end - start);
6629                 swatch[offset >> 3] |= 1 << (offset & 7);
6630             }
6631             return;
6632         }
6633
6634         /* Advance to the next range, which will be for code points not in the
6635          * inversion list */
6636         current = array[i];
6637     }
6638
6639     return;
6640 }
6641
6642
6643 void
6644 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
6645 {
6646     /* Take the union of two inversion lists and point <output> to it.  *output
6647      * should be defined upon input, and if it points to one of the two lists,
6648      * the reference count to that list will be decremented.  The first list,
6649      * <a>, may be NULL, in which case a copy of the second list is returned.
6650      * If <complement_b> is TRUE, the union is taken of the complement
6651      * (inversion) of <b> instead of b itself.
6652      *
6653      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6654      * Richard Gillam, published by Addison-Wesley, and explained at some
6655      * length there.  The preface says to incorporate its examples into your
6656      * code at your own risk.
6657      *
6658      * The algorithm is like a merge sort.
6659      *
6660      * XXX A potential performance improvement is to keep track as we go along
6661      * if only one of the inputs contributes to the result, meaning the other
6662      * is a subset of that one.  In that case, we can skip the final copy and
6663      * return the larger of the input lists, but then outside code might need
6664      * to keep track of whether to free the input list or not */
6665
6666     UV* array_a;    /* a's array */
6667     UV* array_b;
6668     UV len_a;       /* length of a's array */
6669     UV len_b;
6670
6671     SV* u;                      /* the resulting union */
6672     UV* array_u;
6673     UV len_u;
6674
6675     UV i_a = 0;             /* current index into a's array */
6676     UV i_b = 0;
6677     UV i_u = 0;
6678
6679     /* running count, as explained in the algorithm source book; items are
6680      * stopped accumulating and are output when the count changes to/from 0.
6681      * The count is incremented when we start a range that's in the set, and
6682      * decremented when we start a range that's not in the set.  So its range
6683      * is 0 to 2.  Only when the count is zero is something not in the set.
6684      */
6685     UV count = 0;
6686
6687     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
6688     assert(a != b);
6689
6690     /* If either one is empty, the union is the other one */
6691     if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
6692         if (*output == a) {
6693             if (a != NULL) {
6694                 SvREFCNT_dec(a);
6695             }
6696         }
6697         if (*output != b) {
6698             *output = invlist_clone(b);
6699             if (complement_b) {
6700                 _invlist_invert(*output);
6701             }
6702         } /* else *output already = b; */
6703         return;
6704     }
6705     else if ((len_b = invlist_len(b)) == 0) {
6706         if (*output == b) {
6707             SvREFCNT_dec(b);
6708         }
6709
6710         /* The complement of an empty list is a list that has everything in it,
6711          * so the union with <a> includes everything too */
6712         if (complement_b) {
6713             if (a == *output) {
6714                 SvREFCNT_dec(a);
6715             }
6716             *output = _new_invlist(1);
6717             _append_range_to_invlist(*output, 0, UV_MAX);
6718         }
6719         else if (*output != a) {
6720             *output = invlist_clone(a);
6721         }
6722         /* else *output already = a; */
6723         return;
6724     }
6725
6726     /* Here both lists exist and are non-empty */
6727     array_a = invlist_array(a);
6728     array_b = invlist_array(b);
6729
6730     /* If are to take the union of 'a' with the complement of b, set it
6731      * up so are looking at b's complement. */
6732     if (complement_b) {
6733
6734         /* To complement, we invert: if the first element is 0, remove it.  To
6735          * do this, we just pretend the array starts one later, and clear the
6736          * flag as we don't have to do anything else later */
6737         if (array_b[0] == 0) {
6738             array_b++;
6739             len_b--;
6740             complement_b = FALSE;
6741         }
6742         else {
6743
6744             /* But if the first element is not zero, we unshift a 0 before the
6745              * array.  The data structure reserves a space for that 0 (which
6746              * should be a '1' right now), so physical shifting is unneeded,
6747              * but temporarily change that element to 0.  Before exiting the
6748              * routine, we must restore the element to '1' */
6749             array_b--;
6750             len_b++;
6751             array_b[0] = 0;
6752         }
6753     }
6754
6755     /* Size the union for the worst case: that the sets are completely
6756      * disjoint */
6757     u = _new_invlist(len_a + len_b);
6758
6759     /* Will contain U+0000 if either component does */
6760     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6761                                       || (len_b > 0 && array_b[0] == 0));
6762
6763     /* Go through each list item by item, stopping when exhausted one of
6764      * them */
6765     while (i_a < len_a && i_b < len_b) {
6766         UV cp;      /* The element to potentially add to the union's array */
6767         bool cp_in_set;   /* is it in the the input list's set or not */
6768
6769         /* We need to take one or the other of the two inputs for the union.
6770          * Since we are merging two sorted lists, we take the smaller of the
6771          * next items.  In case of a tie, we take the one that is in its set
6772          * first.  If we took one not in the set first, it would decrement the
6773          * count, possibly to 0 which would cause it to be output as ending the
6774          * range, and the next time through we would take the same number, and
6775          * output it again as beginning the next range.  By doing it the
6776          * opposite way, there is no possibility that the count will be
6777          * momentarily decremented to 0, and thus the two adjoining ranges will
6778          * be seamlessly merged.  (In a tie and both are in the set or both not
6779          * in the set, it doesn't matter which we take first.) */
6780         if (array_a[i_a] < array_b[i_b]
6781             || (array_a[i_a] == array_b[i_b]
6782                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6783         {
6784             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6785             cp= array_a[i_a++];
6786         }
6787         else {
6788             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6789             cp= array_b[i_b++];
6790         }
6791
6792         /* Here, have chosen which of the two inputs to look at.  Only output
6793          * if the running count changes to/from 0, which marks the
6794          * beginning/end of a range in that's in the set */
6795         if (cp_in_set) {
6796             if (count == 0) {
6797                 array_u[i_u++] = cp;
6798             }
6799             count++;
6800         }
6801         else {
6802             count--;
6803             if (count == 0) {
6804                 array_u[i_u++] = cp;
6805             }
6806         }
6807     }
6808
6809     /* Here, we are finished going through at least one of the lists, which
6810      * means there is something remaining in at most one.  We check if the list
6811      * that hasn't been exhausted is positioned such that we are in the middle
6812      * of a range in its set or not.  (i_a and i_b point to the element beyond
6813      * the one we care about.) If in the set, we decrement 'count'; if 0, there
6814      * is potentially more to output.
6815      * There are four cases:
6816      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6817      *     in the union is entirely from the non-exhausted set.
6818      *  2) Both were in their sets, count is 2.  Nothing further should
6819      *     be output, as everything that remains will be in the exhausted
6820      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6821      *     that
6822      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6823      *     Nothing further should be output because the union includes
6824      *     everything from the exhausted set.  Not decrementing ensures that.
6825      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6826      *     decrementing to 0 insures that we look at the remainder of the
6827      *     non-exhausted set */
6828     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6829         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6830     {
6831         count--;
6832     }
6833
6834     /* The final length is what we've output so far, plus what else is about to
6835      * be output.  (If 'count' is non-zero, then the input list we exhausted
6836      * has everything remaining up to the machine's limit in its set, and hence
6837      * in the union, so there will be no further output. */
6838     len_u = i_u;
6839     if (count == 0) {
6840         /* At most one of the subexpressions will be non-zero */
6841         len_u += (len_a - i_a) + (len_b - i_b);
6842     }
6843
6844     /* Set result to final length, which can change the pointer to array_u, so
6845      * re-find it */
6846     if (len_u != invlist_len(u)) {
6847         invlist_set_len(u, len_u);
6848         invlist_trim(u);
6849         array_u = invlist_array(u);
6850     }
6851
6852     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6853      * the other) ended with everything above it not in its set.  That means
6854      * that the remaining part of the union is precisely the same as the
6855      * non-exhausted list, so can just copy it unchanged.  (If both list were
6856      * exhausted at the same time, then the operations below will be both 0.)
6857      */
6858     if (count == 0) {
6859         IV copy_count; /* At most one will have a non-zero copy count */
6860         if ((copy_count = len_a - i_a) > 0) {
6861             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6862         }
6863         else if ((copy_count = len_b - i_b) > 0) {
6864             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6865         }
6866     }
6867
6868     /*  We may be removing a reference to one of the inputs */
6869     if (a == *output || b == *output) {
6870         SvREFCNT_dec(*output);
6871     }
6872
6873     /* If we've changed b, restore it */
6874     if (complement_b) {
6875         array_b[0] = 1;
6876     }
6877
6878     *output = u;
6879     return;
6880 }
6881
6882 void
6883 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
6884 {
6885     /* Take the intersection of two inversion lists and point <i> to it.  *i
6886      * should be defined upon input, and if it points to one of the two lists,
6887      * the reference count to that list will be decremented.
6888      * If <complement_b> is TRUE, the result will be the intersection of <a>
6889      * and the complement (or inversion) of <b> instead of <b> directly.
6890      *
6891      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6892      * Richard Gillam, published by Addison-Wesley, and explained at some
6893      * length there.  The preface says to incorporate its examples into your
6894      * code at your own risk.  In fact, it had bugs
6895      *
6896      * The algorithm is like a merge sort, and is essentially the same as the
6897      * union above
6898      */
6899
6900     UV* array_a;                /* a's array */
6901     UV* array_b;
6902     UV len_a;   /* length of a's array */
6903     UV len_b;
6904
6905     SV* r;                   /* the resulting intersection */
6906     UV* array_r;
6907     UV len_r;
6908
6909     UV i_a = 0;             /* current index into a's array */
6910     UV i_b = 0;
6911     UV i_r = 0;
6912
6913     /* running count, as explained in the algorithm source book; items are
6914      * stopped accumulating and are output when the count changes to/from 2.
6915      * The count is incremented when we start a range that's in the set, and
6916      * decremented when we start a range that's not in the set.  So its range
6917      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6918      */
6919     UV count = 0;
6920
6921     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
6922     assert(a != b);
6923
6924     /* Special case if either one is empty */
6925     len_a = invlist_len(a);
6926     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
6927
6928         if (len_a != 0 && complement_b) {
6929
6930             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
6931              * be empty.  Here, also we are using 'b's complement, which hence
6932              * must be every possible code point.  Thus the intersection is
6933              * simply 'a'. */
6934             if (*i != a) {
6935                 *i = invlist_clone(a);
6936
6937                 if (*i == b) {
6938                     SvREFCNT_dec(b);
6939                 }
6940             }
6941             /* else *i is already 'a' */
6942             return;
6943         }
6944
6945         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
6946          * intersection must be empty */
6947         if (*i == a) {
6948             SvREFCNT_dec(a);
6949         }
6950         else if (*i == b) {
6951             SvREFCNT_dec(b);
6952         }
6953         *i = _new_invlist(0);
6954         return;
6955     }
6956
6957     /* Here both lists exist and are non-empty */
6958     array_a = invlist_array(a);
6959     array_b = invlist_array(b);
6960
6961     /* If are to take the intersection of 'a' with the complement of b, set it
6962      * up so are looking at b's complement. */
6963     if (complement_b) {
6964
6965         /* To complement, we invert: if the first element is 0, remove it.  To
6966          * do this, we just pretend the array starts one later, and clear the
6967          * flag as we don't have to do anything else later */
6968         if (array_b[0] == 0) {
6969             array_b++;
6970             len_b--;
6971             complement_b = FALSE;
6972         }
6973         else {
6974
6975             /* But if the first element is not zero, we unshift a 0 before the
6976              * array.  The data structure reserves a space for that 0 (which
6977              * should be a '1' right now), so physical shifting is unneeded,
6978              * but temporarily change that element to 0.  Before exiting the
6979              * routine, we must restore the element to '1' */
6980             array_b--;
6981             len_b++;
6982             array_b[0] = 0;
6983         }
6984     }
6985
6986     /* Size the intersection for the worst case: that the intersection ends up
6987      * fragmenting everything to be completely disjoint */
6988     r= _new_invlist(len_a + len_b);
6989
6990     /* Will contain U+0000 iff both components do */
6991     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
6992                                      && len_b > 0 && array_b[0] == 0);
6993
6994     /* Go through each list item by item, stopping when exhausted one of
6995      * them */
6996     while (i_a < len_a && i_b < len_b) {
6997         UV cp;      /* The element to potentially add to the intersection's
6998                        array */
6999         bool cp_in_set; /* Is it in the input list's set or not */
7000
7001         /* We need to take one or the other of the two inputs for the
7002          * intersection.  Since we are merging two sorted lists, we take the
7003          * smaller of the next items.  In case of a tie, we take the one that
7004          * is not in its set first (a difference from the union algorithm).  If
7005          * we took one in the set first, it would increment the count, possibly
7006          * to 2 which would cause it to be output as starting a range in the
7007          * intersection, and the next time through we would take that same
7008          * number, and output it again as ending the set.  By doing it the
7009          * opposite of this, there is no possibility that the count will be
7010          * momentarily incremented to 2.  (In a tie and both are in the set or
7011          * both not in the set, it doesn't matter which we take first.) */
7012         if (array_a[i_a] < array_b[i_b]
7013             || (array_a[i_a] == array_b[i_b]
7014                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7015         {
7016             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7017             cp= array_a[i_a++];
7018         }
7019         else {
7020             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7021             cp= array_b[i_b++];
7022         }
7023
7024         /* Here, have chosen which of the two inputs to look at.  Only output
7025          * if the running count changes to/from 2, which marks the
7026          * beginning/end of a range that's in the intersection */
7027         if (cp_in_set) {
7028             count++;
7029             if (count == 2) {
7030                 array_r[i_r++] = cp;
7031             }
7032         }
7033         else {
7034             if (count == 2) {
7035                 array_r[i_r++] = cp;
7036             }
7037             count--;
7038         }
7039     }
7040
7041     /* Here, we are finished going through at least one of the lists, which
7042      * means there is something remaining in at most one.  We check if the list
7043      * that has been exhausted is positioned such that we are in the middle
7044      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7045      * the ones we care about.)  There are four cases:
7046      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7047      *     nothing left in the intersection.
7048      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7049      *     above 2.  What should be output is exactly that which is in the
7050      *     non-exhausted set, as everything it has is also in the intersection
7051      *     set, and everything it doesn't have can't be in the intersection
7052      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7053      *     gets incremented to 2.  Like the previous case, the intersection is
7054      *     everything that remains in the non-exhausted set.
7055      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7056      *     remains 1.  And the intersection has nothing more. */
7057     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7058         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7059     {
7060         count++;
7061     }
7062
7063     /* The final length is what we've output so far plus what else is in the
7064      * intersection.  At most one of the subexpressions below will be non-zero */
7065     len_r = i_r;
7066     if (count >= 2) {
7067         len_r += (len_a - i_a) + (len_b - i_b);
7068     }
7069
7070     /* Set result to final length, which can change the pointer to array_r, so
7071      * re-find it */
7072     if (len_r != invlist_len(r)) {
7073         invlist_set_len(r, len_r);
7074         invlist_trim(r);
7075         array_r = invlist_array(r);
7076     }
7077
7078     /* Finish outputting any remaining */
7079     if (count >= 2) { /* At most one will have a non-zero copy count */
7080         IV copy_count;
7081         if ((copy_count = len_a - i_a) > 0) {
7082             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7083         }
7084         else if ((copy_count = len_b - i_b) > 0) {
7085             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7086         }
7087     }
7088
7089     /*  We may be removing a reference to one of the inputs */
7090     if (a == *i || b == *i) {
7091         SvREFCNT_dec(*i);
7092     }
7093
7094     /* If we've changed b, restore it */
7095     if (complement_b) {
7096         array_b[0] = 1;
7097     }
7098
7099     *i = r;
7100     return;
7101 }
7102
7103 #endif
7104
7105 STATIC SV*
7106 S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7107 {
7108     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7109      * set.  A pointer to the inversion list is returned.  This may actually be
7110      * a new list, in which case the passed in one has been destroyed.  The
7111      * passed in inversion list can be NULL, in which case a new one is created
7112      * with just the one range in it */
7113
7114     SV* range_invlist;
7115     UV len;
7116
7117     if (invlist == NULL) {
7118         invlist = _new_invlist(2);
7119         len = 0;
7120     }
7121     else {
7122         len = invlist_len(invlist);
7123     }
7124
7125     /* If comes after the final entry, can just append it to the end */
7126     if (len == 0
7127         || start >= invlist_array(invlist)
7128                                     [invlist_len(invlist) - 1])
7129     {
7130         _append_range_to_invlist(invlist, start, end);
7131         return invlist;
7132     }
7133
7134     /* Here, can't just append things, create and return a new inversion list
7135      * which is the union of this range and the existing inversion list */
7136     range_invlist = _new_invlist(2);
7137     _append_range_to_invlist(range_invlist, start, end);
7138
7139     _invlist_union(invlist, range_invlist, &invlist);
7140
7141     /* The temporary can be freed */
7142     SvREFCNT_dec(range_invlist);
7143
7144     return invlist;
7145 }
7146
7147 PERL_STATIC_INLINE SV*
7148 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7149     return add_range_to_invlist(invlist, cp, cp);
7150 }
7151
7152 #ifndef PERL_IN_XSUB_RE
7153 void
7154 Perl__invlist_invert(pTHX_ SV* const invlist)
7155 {
7156     /* Complement the input inversion list.  This adds a 0 if the list didn't
7157      * have a zero; removes it otherwise.  As described above, the data
7158      * structure is set up so that this is very efficient */
7159
7160     UV* len_pos = get_invlist_len_addr(invlist);
7161
7162     PERL_ARGS_ASSERT__INVLIST_INVERT;
7163
7164     /* The inverse of matching nothing is matching everything */
7165     if (*len_pos == 0) {
7166         _append_range_to_invlist(invlist, 0, UV_MAX);
7167         return;
7168     }
7169
7170     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7171      * zero element was a 0, so it is being removed, so the length decrements
7172      * by 1; and vice-versa.  SvCUR is unaffected */
7173     if (*get_invlist_zero_addr(invlist) ^= 1) {
7174         (*len_pos)--;
7175     }
7176     else {
7177         (*len_pos)++;
7178     }
7179 }
7180
7181 void
7182 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7183 {
7184     /* Complement the input inversion list (which must be a Unicode property,
7185      * all of which don't match above the Unicode maximum code point.)  And
7186      * Perl has chosen to not have the inversion match above that either.  This
7187      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7188      */
7189
7190     UV len;
7191     UV* array;
7192
7193     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7194
7195     _invlist_invert(invlist);
7196
7197     len = invlist_len(invlist);
7198
7199     if (len != 0) { /* If empty do nothing */
7200         array = invlist_array(invlist);
7201         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7202             /* Add 0x110000.  First, grow if necessary */
7203             len++;
7204             if (invlist_max(invlist) < len) {
7205                 invlist_extend(invlist, len);
7206                 array = invlist_array(invlist);
7207             }
7208             invlist_set_len(invlist, len);
7209             array[len - 1] = PERL_UNICODE_MAX + 1;
7210         }
7211         else {  /* Remove the 0x110000 */
7212             invlist_set_len(invlist, len - 1);
7213         }
7214     }
7215
7216     return;
7217 }
7218 #endif
7219
7220 PERL_STATIC_INLINE SV*
7221 S_invlist_clone(pTHX_ SV* const invlist)
7222 {
7223
7224     /* Return a new inversion list that is a copy of the input one, which is
7225      * unchanged */
7226
7227     /* Need to allocate extra space to accommodate Perl's addition of a
7228      * trailing NUL to SvPV's, since it thinks they are always strings */
7229     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7230     STRLEN length = SvCUR(invlist);
7231
7232     PERL_ARGS_ASSERT_INVLIST_CLONE;
7233
7234     SvCUR_set(new_invlist, length); /* This isn't done automatically */
7235     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7236
7237     return new_invlist;
7238 }
7239
7240 PERL_STATIC_INLINE UV*
7241 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7242 {
7243     /* Return the address of the UV that contains the current iteration
7244      * position */
7245
7246     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7247
7248     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7249 }
7250
7251 PERL_STATIC_INLINE UV*
7252 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7253 {
7254     /* Return the address of the UV that contains the version id. */
7255
7256     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7257
7258     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7259 }
7260
7261 PERL_STATIC_INLINE void
7262 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
7263 {
7264     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7265
7266     *get_invlist_iter_addr(invlist) = 0;
7267 }
7268
7269 STATIC bool
7270 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7271 {
7272     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7273      * This call sets in <*start> and <*end>, the next range in <invlist>.
7274      * Returns <TRUE> if successful and the next call will return the next
7275      * range; <FALSE> if was already at the end of the list.  If the latter,
7276      * <*start> and <*end> are unchanged, and the next call to this function
7277      * will start over at the beginning of the list */
7278
7279     UV* pos = get_invlist_iter_addr(invlist);
7280     UV len = invlist_len(invlist);
7281     UV *array;
7282
7283     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7284
7285     if (*pos >= len) {
7286         *pos = UV_MAX;  /* Force iternit() to be required next time */
7287         return FALSE;
7288     }
7289
7290     array = invlist_array(invlist);
7291
7292     *start = array[(*pos)++];
7293
7294     if (*pos >= len) {
7295         *end = UV_MAX;
7296     }
7297     else {
7298         *end = array[(*pos)++] - 1;
7299     }
7300
7301     return TRUE;
7302 }
7303
7304 #ifndef PERL_IN_XSUB_RE
7305 SV *
7306 Perl__invlist_contents(pTHX_ SV* const invlist)
7307 {
7308     /* Get the contents of an inversion list into a string SV so that they can
7309      * be printed out.  It uses the format traditionally done for debug tracing
7310      */
7311
7312     UV start, end;
7313     SV* output = newSVpvs("\n");
7314
7315     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7316
7317     invlist_iterinit(invlist);
7318     while (invlist_iternext(invlist, &start, &end)) {
7319         if (end == UV_MAX) {
7320             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7321         }
7322         else if (end != start) {
7323             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7324                     start,       end);
7325         }
7326         else {
7327             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7328         }
7329     }
7330
7331     return output;
7332 }
7333 #endif
7334
7335 #if 0
7336 void
7337 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7338 {
7339     /* Dumps out the ranges in an inversion list.  The string 'header'
7340      * if present is output on a line before the first range */
7341
7342     UV start, end;
7343
7344     if (header && strlen(header)) {
7345         PerlIO_printf(Perl_debug_log, "%s\n", header);
7346     }
7347     invlist_iterinit(invlist);
7348     while (invlist_iternext(invlist, &start, &end)) {
7349         if (end == UV_MAX) {
7350             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7351         }
7352         else {
7353             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7354         }
7355     }
7356 }
7357 #endif
7358
7359 #undef HEADER_LENGTH
7360 #undef INVLIST_INITIAL_LENGTH
7361 #undef TO_INTERNAL_SIZE
7362 #undef FROM_INTERNAL_SIZE
7363 #undef INVLIST_LEN_OFFSET
7364 #undef INVLIST_ZERO_OFFSET
7365 #undef INVLIST_ITER_OFFSET
7366 #undef INVLIST_VERSION_ID
7367
7368 /* End of inversion list object */
7369
7370 /*
7371  - reg - regular expression, i.e. main body or parenthesized thing
7372  *
7373  * Caller must absorb opening parenthesis.
7374  *
7375  * Combining parenthesis handling with the base level of regular expression
7376  * is a trifle forced, but the need to tie the tails of the branches to what
7377  * follows makes it hard to avoid.
7378  */
/* Shorthands for calling regtail() with the depth argument filled in as
 * 'depth+1'; they can therefore only be used where a variable named 'depth'
 * is in scope.  REGTAIL_STUDY calls regtail_study() instead when DEBUGGING is
 * defined, and is the same as REGTAIL otherwise. */
#ifndef DEBUGGING
#  define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
#else
#  define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
#endif
#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7385
7386 STATIC regnode *
7387 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7388     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7389 {
7390     dVAR;
7391     register regnode *ret;              /* Will be the head of the group. */
7392     register regnode *br;
7393     register regnode *lastbr;
7394     register regnode *ender = NULL;
7395     register I32 parno = 0;
7396     I32 flags;
7397     U32 oregflags = RExC_flags;
7398     bool have_branch = 0;
7399     bool is_open = 0;
7400     I32 freeze_paren = 0;
7401     I32 after_freeze = 0;
7402
7403     /* for (?g), (?gc), and (?o) warnings; warning
7404        about (?c) will warn about (?g) -- japhy    */
7405
7406 #define WASTED_O  0x01
7407 #define WASTED_G  0x02
7408 #define WASTED_C  0x04
7409 #define WASTED_GC (0x02|0x04)
7410     I32 wastedflags = 0x00;
7411
7412     char * parse_start = RExC_parse; /* MJD */
7413     char * const oregcomp_parse = RExC_parse;
7414
7415     GET_RE_DEBUG_FLAGS_DECL;
7416
7417     PERL_ARGS_ASSERT_REG;
7418     DEBUG_PARSE("reg ");
7419
7420     *flagp = 0;                         /* Tentatively. */
7421
7422
7423     /* Make an OPEN node, if parenthesized. */
7424     if (paren) {
7425         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7426             char *start_verb = RExC_parse;
7427             STRLEN verb_len = 0;
7428             char *start_arg = NULL;
7429             unsigned char op = 0;
7430             int argok = 1;
7431             int internal_argval = 0; /* internal_argval is only useful if !argok */
7432             while ( *RExC_parse && *RExC_parse != ')' ) {
7433                 if ( *RExC_parse == ':' ) {
7434                     start_arg = RExC_parse + 1;
7435                     break;
7436                 }
7437                 RExC_parse++;
7438             }
7439             ++start_verb;
7440             verb_len = RExC_parse - start_verb;
7441             if ( start_arg ) {
7442                 RExC_parse++;
7443                 while ( *RExC_parse && *RExC_parse != ')' )
7444                     RExC_parse++;
7445                 if ( *RExC_parse != ')' )
7446                     vFAIL("Unterminated verb pattern argument");
7447                 if ( RExC_parse == start_arg )
7448                     start_arg = NULL;
7449             } else {
7450                 if ( *RExC_parse != ')' )
7451                     vFAIL("Unterminated verb pattern");
7452             }
7453
7454             switch ( *start_verb ) {
7455             case 'A':  /* (*ACCEPT) */
7456                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7457                     op = ACCEPT;
7458                     internal_argval = RExC_nestroot;
7459                 }
7460                 break;
7461             case 'C':  /* (*COMMIT) */
7462                 if ( memEQs(start_verb,verb_len,"COMMIT") )
7463                     op = COMMIT;
7464                 break;
7465             case 'F':  /* (*FAIL) */
7466                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7467                     op = OPFAIL;
7468                     argok = 0;
7469                 }
7470                 break;
7471             case ':':  /* (*:NAME) */
7472             case 'M':  /* (*MARK:NAME) */
7473                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7474                     op = MARKPOINT;
7475                     argok = -1;
7476                 }
7477                 break;
7478             case 'P':  /* (*PRUNE) */
7479                 if ( memEQs(start_verb,verb_len,"PRUNE") )
7480                     op = PRUNE;
7481                 break;
7482             case 'S':   /* (*SKIP) */
7483                 if ( memEQs(start_verb,verb_len,"SKIP") )
7484                     op = SKIP;
7485                 break;
7486             case 'T':  /* (*THEN) */
7487                 /* [19:06] <TimToady> :: is then */
7488                 if ( memEQs(start_verb,verb_len,"THEN") ) {
7489                     op = CUTGROUP;
7490                     RExC_seen |= REG_SEEN_CUTGROUP;
7491                 }
7492                 break;
7493             }
7494             if ( ! op ) {
7495                 RExC_parse++;
7496                 vFAIL3("Unknown verb pattern '%.*s'",
7497                     verb_len, start_verb);
7498             }
7499             if ( argok ) {
7500                 if ( start_arg && internal_argval ) {
7501                     vFAIL3("Verb pattern '%.*s' may not have an argument",
7502                         verb_len, start_verb);
7503                 } else if ( argok < 0 && !start_arg ) {
7504                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7505                         verb_len, start_verb);
7506                 } else {
7507                     ret = reganode(pRExC_state, op, internal_argval);
7508                     if ( ! internal_argval && ! SIZE_ONLY ) {
7509                         if (start_arg) {
7510                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7511                             ARG(ret) = add_data( pRExC_state, 1, "S" );
7512                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7513                             ret->flags = 0;
7514                         } else {
7515                             ret->flags = 1;
7516                         }
7517                     }
7518                 }
7519                 if (!internal_argval)
7520                     RExC_seen |= REG_SEEN_VERBARG;
7521             } else if ( start_arg ) {
7522                 vFAIL3("Verb pattern '%.*s' may not have an argument",
7523                         verb_len, start_verb);
7524             } else {
7525                 ret = reg_node(pRExC_state, op);
7526             }
7527             nextchar(pRExC_state);
7528             return ret;
7529         } else
7530         if (*RExC_parse == '?') { /* (?...) */
7531             bool is_logical = 0;
7532             const char * const seqstart = RExC_parse;
7533             bool has_use_defaults = FALSE;
7534
7535             RExC_parse++;
7536             paren = *RExC_parse++;
7537             ret = NULL;                 /* For look-ahead/behind. */
7538             switch (paren) {
7539
7540             case 'P':   /* (?P...) variants for those used to PCRE/Python */
7541                 paren = *RExC_parse++;
7542                 if ( paren == '<')         /* (?P<...>) named capture */
7543                     goto named_capture;
7544                 else if (paren == '>') {   /* (?P>name) named recursion */
7545                     goto named_recursion;
7546                 }
7547                 else if (paren == '=') {   /* (?P=...)  named backref */
7548                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
7549                        you change this make sure you change that */
7550                     char* name_start = RExC_parse;
7551                     U32 num = 0;
7552                     SV *sv_dat = reg_scan_name(pRExC_state,
7553                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7554                     if (RExC_parse == name_start || *RExC_parse != ')')
7555                         vFAIL2("Sequence %.3s... not terminated",parse_start);
7556
7557                     if (!SIZE_ONLY) {
7558                         num = add_data( pRExC_state, 1, "S" );
7559                         RExC_rxi->data->data[num]=(void*)sv_dat;
7560                         SvREFCNT_inc_simple_void(sv_dat);
7561                     }
7562                     RExC_sawback = 1;
7563                     ret = reganode(pRExC_state,
7564                                    ((! FOLD)
7565                                      ? NREF
7566                                      : (MORE_ASCII_RESTRICTED)
7567                                        ? NREFFA
7568                                        : (AT_LEAST_UNI_SEMANTICS)
7569                                          ? NREFFU
7570                                          : (LOC)
7571                                            ? NREFFL
7572                                            : NREFF),
7573                                     num);
7574                     *flagp |= HASWIDTH;
7575
7576                     Set_Node_Offset(ret, parse_start+1);
7577                     Set_Node_Cur_Length(ret); /* MJD */
7578
7579                     nextchar(pRExC_state);
7580                     return ret;
7581                 }
7582                 RExC_parse++;
7583                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7584                 /*NOTREACHED*/
7585             case '<':           /* (?<...) */
7586                 if (*RExC_parse == '!')
7587                     paren = ',';
7588                 else if (*RExC_parse != '=')
7589               named_capture:
7590                 {               /* (?<...>) */
7591                     char *name_start;
7592                     SV *svname;
7593                     paren= '>';
7594             case '\'':          /* (?'...') */
7595                     name_start= RExC_parse;
7596                     svname = reg_scan_name(pRExC_state,
7597                         SIZE_ONLY ?  /* reverse test from the others */
7598                         REG_RSN_RETURN_NAME :
7599                         REG_RSN_RETURN_NULL);
7600                     if (RExC_parse == name_start) {
7601                         RExC_parse++;
7602                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7603                         /*NOTREACHED*/
7604                     }
7605                     if (*RExC_parse != paren)
7606                         vFAIL2("Sequence (?%c... not terminated",
7607                             paren=='>' ? '<' : paren);
7608                     if (SIZE_ONLY) {
7609                         HE *he_str;
7610                         SV *sv_dat = NULL;
7611                         if (!svname) /* shouldn't happen */
7612                             Perl_croak(aTHX_
7613                                 "panic: reg_scan_name returned NULL");
7614                         if (!RExC_paren_names) {
7615                             RExC_paren_names= newHV();
7616                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
7617 #ifdef DEBUGGING
7618                             RExC_paren_name_list= newAV();
7619                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7620 #endif
7621                         }
7622                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7623                         if ( he_str )
7624                             sv_dat = HeVAL(he_str);
7625                         if ( ! sv_dat ) {
7626                             /* croak baby croak */
7627                             Perl_croak(aTHX_
7628                                 "panic: paren_name hash element allocation failed");
7629                         } else if ( SvPOK(sv_dat) ) {
7630                             /* (?|...) can mean we have dupes so scan to check
7631                                its already been stored. Maybe a flag indicating
7632                                we are inside such a construct would be useful,
7633                                but the arrays are likely to be quite small, so
7634                                for now we punt -- dmq */
7635                             IV count = SvIV(sv_dat);
7636                             I32 *pv = (I32*)SvPVX(sv_dat);
7637                             IV i;
7638                             for ( i = 0 ; i < count ; i++ ) {
7639                                 if ( pv[i] == RExC_npar ) {
7640                                     count = 0;
7641                                     break;
7642                                 }
7643                             }
7644                             if ( count ) {
7645                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7646                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7647                                 pv[count] = RExC_npar;
7648                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7649                             }
7650                         } else {
7651                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
7652                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7653                             SvIOK_on(sv_dat);
7654                             SvIV_set(sv_dat, 1);
7655                         }
7656 #ifdef DEBUGGING
7657                         /* Yes this does cause a memory leak in debugging Perls */
7658                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7659                             SvREFCNT_dec(svname);
7660 #endif
7661
7662                         /*sv_dump(sv_dat);*/
7663                     }
7664                     nextchar(pRExC_state);
7665                     paren = 1;
7666                     goto capturing_parens;
7667                 }
7668                 RExC_seen |= REG_SEEN_LOOKBEHIND;
7669                 RExC_in_lookbehind++;
7670                 RExC_parse++;
7671             case '=':           /* (?=...) */
7672                 RExC_seen_zerolen++;
7673                 break;
7674             case '!':           /* (?!...) */
7675                 RExC_seen_zerolen++;
7676                 if (*RExC_parse == ')') {
7677                     ret=reg_node(pRExC_state, OPFAIL);
7678                     nextchar(pRExC_state);
7679                     return ret;
7680                 }
7681                 break;
7682             case '|':           /* (?|...) */
7683                 /* branch reset, behave like a (?:...) except that
7684                    buffers in alternations share the same numbers */
7685                 paren = ':';
7686                 after_freeze = freeze_paren = RExC_npar;
7687                 break;
7688             case ':':           /* (?:...) */
7689             case '>':           /* (?>...) */
7690                 break;
7691             case '$':           /* (?$...) */
7692             case '@':           /* (?@...) */
7693                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7694                 break;
7695             case '#':           /* (?#...) */
7696                 while (*RExC_parse && *RExC_parse != ')')
7697                     RExC_parse++;
7698                 if (*RExC_parse != ')')
7699                     FAIL("Sequence (?#... not terminated");
7700                 nextchar(pRExC_state);
7701                 *flagp = TRYAGAIN;
7702                 return NULL;
7703             case '0' :           /* (?0) */
7704             case 'R' :           /* (?R) */
7705                 if (*RExC_parse != ')')
7706                     FAIL("Sequence (?R) not terminated");
7707                 ret = reg_node(pRExC_state, GOSTART);
7708                 *flagp |= POSTPONED;
7709                 nextchar(pRExC_state);
7710                 return ret;
7711                 /*notreached*/
7712             { /* named and numeric backreferences */
7713                 I32 num;
7714             case '&':            /* (?&NAME) */
7715                 parse_start = RExC_parse - 1;
7716               named_recursion:
7717                 {
7718                     SV *sv_dat = reg_scan_name(pRExC_state,
7719                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7720                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7721                 }
7722                 goto gen_recurse_regop;
7723                 /* NOT REACHED */
7724             case '+':
7725                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7726                     RExC_parse++;
7727                     vFAIL("Illegal pattern");
7728                 }
7729                 goto parse_recursion;
7730                 /* NOT REACHED*/
7731             case '-': /* (?-1) */
7732                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7733                     RExC_parse--; /* rewind to let it be handled later */
7734                     goto parse_flags;
7735                 }
7736                 /*FALLTHROUGH */
7737             case '1': case '2': case '3': case '4': /* (?1) */
7738             case '5': case '6': case '7': case '8': case '9':
7739                 RExC_parse--;
7740               parse_recursion:
7741                 num = atoi(RExC_parse);
7742                 parse_start = RExC_parse - 1; /* MJD */
7743                 if (*RExC_parse == '-')
7744                     RExC_parse++;
7745                 while (isDIGIT(*RExC_parse))
7746                         RExC_parse++;
7747                 if (*RExC_parse!=')')
7748                     vFAIL("Expecting close bracket");
7749
7750               gen_recurse_regop:
7751                 if ( paren == '-' ) {
7752                     /*
7753                     Diagram of capture buffer numbering.
7754                     Top line is the normal capture buffer numbers
7755                     Bottom line is the negative indexing as from
7756                     the X (the (?-2))
7757
7758                     +   1 2    3 4 5 X          6 7
7759                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7760                     -   5 4    3 2 1 X          x x
7761
7762                     */
7763                     num = RExC_npar + num;
7764                     if (num < 1)  {
7765                         RExC_parse++;
7766                         vFAIL("Reference to nonexistent group");
7767                     }
7768                 } else if ( paren == '+' ) {
7769                     num = RExC_npar + num - 1;
7770                 }
7771
7772                 ret = reganode(pRExC_state, GOSUB, num);
7773                 if (!SIZE_ONLY) {
7774                     if (num > (I32)RExC_rx->nparens) {
7775                         RExC_parse++;
7776                         vFAIL("Reference to nonexistent group");
7777                     }
7778                     ARG2L_SET( ret, RExC_recurse_count++);
7779                     RExC_emit++;
7780                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7781                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7782                 } else {
7783                     RExC_size++;
7784                 }
7785                 RExC_seen |= REG_SEEN_RECURSE;
7786                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7787                 Set_Node_Offset(ret, parse_start); /* MJD */
7788
7789                 *flagp |= POSTPONED;
7790                 nextchar(pRExC_state);
7791                 return ret;
7792             } /* named and numeric backreferences */
7793             /* NOT REACHED */
7794
7795             case '?':           /* (??...) */
7796                 is_logical = 1;
7797                 if (*RExC_parse != '{') {
7798                     RExC_parse++;
7799                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7800                     /*NOTREACHED*/
7801                 }
7802                 *flagp |= POSTPONED;
7803                 paren = *RExC_parse++;
7804                 /* FALL THROUGH */
7805             case '{':           /* (?{...}) */
7806             {
7807                 I32 count = 1;
7808                 U32 n = 0;
7809                 char c;
7810                 char *s = RExC_parse;
7811
7812                 RExC_seen_zerolen++;
7813                 RExC_seen |= REG_SEEN_EVAL;
7814                 while (count && (c = *RExC_parse)) {
7815                     if (c == '\\') {
7816                         if (RExC_parse[1])
7817                             RExC_parse++;
7818                     }
7819                     else if (c == '{')
7820                         count++;
7821                     else if (c == '}')
7822                         count--;
7823                     RExC_parse++;
7824                 }
7825                 if (*RExC_parse != ')') {
7826                     RExC_parse = s;
7827                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7828                 }
7829                 if (!SIZE_ONLY) {
7830                     PAD *pad;
7831                     OP_4tree *sop, *rop;
7832                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7833
7834                     ENTER;
7835                     Perl_save_re_context(aTHX);
7836                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7837                     sop->op_private |= OPpREFCOUNTED;
7838                     /* re_dup will OpREFCNT_inc */
7839                     OpREFCNT_set(sop, 1);
7840                     LEAVE;
7841
7842                     n = add_data(pRExC_state, 3, "nop");
7843                     RExC_rxi->data->data[n] = (void*)rop;
7844                     RExC_rxi->data->data[n+1] = (void*)sop;
7845                     RExC_rxi->data->data[n+2] = (void*)pad;
7846                     SvREFCNT_dec(sv);
7847                 }
7848                 else {                                          /* First pass */
7849                     if (PL_reginterp_cnt < ++RExC_seen_evals
7850                         && IN_PERL_RUNTIME)
7851                         /* No compiled RE interpolated, has runtime
7852                            components ===> unsafe.  */
7853                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
7854                     if (PL_tainting && PL_tainted)
7855                         FAIL("Eval-group in insecure regular expression");
7856 #if PERL_VERSION > 8
7857                     if (IN_PERL_COMPILETIME)
7858                         PL_cv_has_eval = 1;
7859 #endif
7860                 }
7861
7862                 nextchar(pRExC_state);
7863                 if (is_logical) {
7864                     ret = reg_node(pRExC_state, LOGICAL);
7865                     if (!SIZE_ONLY)
7866                         ret->flags = 2;
7867                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7868                     /* deal with the length of this later - MJD */
7869                     return ret;
7870                 }
7871                 ret = reganode(pRExC_state, EVAL, n);
7872                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7873                 Set_Node_Offset(ret, parse_start);
7874                 return ret;
7875             }
7876             case '(':           /* (?(?{...})...) and (?(?=...)...) */
7877             {
7878                 int is_define= 0;
7879                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
7880                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7881                         || RExC_parse[1] == '<'
7882                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
7883                         I32 flag;
7884
7885                         ret = reg_node(pRExC_state, LOGICAL);
7886                         if (!SIZE_ONLY)
7887                             ret->flags = 1;
7888                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7889                         goto insert_if;
7890                     }
7891                 }
7892                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
7893                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7894                 {
7895                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
7896                     char *name_start= RExC_parse++;
7897                     U32 num = 0;
7898                     SV *sv_dat=reg_scan_name(pRExC_state,
7899                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7900                     if (RExC_parse == name_start || *RExC_parse != ch)
7901                         vFAIL2("Sequence (?(%c... not terminated",
7902                             (ch == '>' ? '<' : ch));
7903                     RExC_parse++;
7904                     if (!SIZE_ONLY) {
7905                         num = add_data( pRExC_state, 1, "S" );
7906                         RExC_rxi->data->data[num]=(void*)sv_dat;
7907                         SvREFCNT_inc_simple_void(sv_dat);
7908                     }
7909                     ret = reganode(pRExC_state,NGROUPP,num);
7910                     goto insert_if_check_paren;
7911                 }
7912                 else if (RExC_parse[0] == 'D' &&
7913                          RExC_parse[1] == 'E' &&
7914                          RExC_parse[2] == 'F' &&
7915                          RExC_parse[3] == 'I' &&
7916                          RExC_parse[4] == 'N' &&
7917                          RExC_parse[5] == 'E')
7918                 {
7919                     ret = reganode(pRExC_state,DEFINEP,0);
7920                     RExC_parse +=6 ;
7921                     is_define = 1;
7922                     goto insert_if_check_paren;
7923                 }
7924                 else if (RExC_parse[0] == 'R') {
7925                     RExC_parse++;
7926                     parno = 0;
7927                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7928                         parno = atoi(RExC_parse++);
7929                         while (isDIGIT(*RExC_parse))
7930                             RExC_parse++;
7931                     } else if (RExC_parse[0] == '&') {
7932                         SV *sv_dat;
7933                         RExC_parse++;
7934                         sv_dat = reg_scan_name(pRExC_state,
7935                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7936                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7937                     }
7938                     ret = reganode(pRExC_state,INSUBP,parno);
7939                     goto insert_if_check_paren;
7940                 }
7941                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7942                     /* (?(1)...) */
7943                     char c;
7944                     parno = atoi(RExC_parse++);
7945
7946                     while (isDIGIT(*RExC_parse))
7947                         RExC_parse++;
7948                     ret = reganode(pRExC_state, GROUPP, parno);
7949
7950                  insert_if_check_paren:
7951                     if ((c = *nextchar(pRExC_state)) != ')')
7952                         vFAIL("Switch condition not recognized");
7953                   insert_if:
7954                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
7955                     br = regbranch(pRExC_state, &flags, 1,depth+1);
7956                     if (br == NULL)
7957                         br = reganode(pRExC_state, LONGJMP, 0);
7958                     else
7959                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
7960                     c = *nextchar(pRExC_state);
7961                     if (flags&HASWIDTH)
7962                         *flagp |= HASWIDTH;
7963                     if (c == '|') {
7964                         if (is_define)
7965                             vFAIL("(?(DEFINE)....) does not allow branches");
7966                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7967                         regbranch(pRExC_state, &flags, 1,depth+1);
7968                         REGTAIL(pRExC_state, ret, lastbr);
7969                         if (flags&HASWIDTH)
7970                             *flagp |= HASWIDTH;
7971                         c = *nextchar(pRExC_state);
7972                     }
7973                     else
7974                         lastbr = NULL;
7975                     if (c != ')')
7976                         vFAIL("Switch (?(condition)... contains too many branches");
7977                     ender = reg_node(pRExC_state, TAIL);
7978                     REGTAIL(pRExC_state, br, ender);
7979                     if (lastbr) {
7980                         REGTAIL(pRExC_state, lastbr, ender);
7981                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7982                     }
7983                     else
7984                         REGTAIL(pRExC_state, ret, ender);
7985                     RExC_size++; /* XXX WHY do we need this?!!
7986                                     For large programs it seems to be required
7987                                     but I can't figure out why. -- dmq*/
7988                     return ret;
7989                 }
7990                 else {
7991                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7992                 }
7993             }
7994             case 0:
7995                 RExC_parse--; /* for vFAIL to print correctly */
7996                 vFAIL("Sequence (? incomplete");
7997                 break;
7998             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
7999                                        that follow */
8000                 has_use_defaults = TRUE;
8001                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8002                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8003                                                 ? REGEX_UNICODE_CHARSET
8004                                                 : REGEX_DEPENDS_CHARSET);
8005                 goto parse_flags;
8006             default:
8007                 --RExC_parse;
8008                 parse_flags:      /* (?i) */
8009             {
8010                 U32 posflags = 0, negflags = 0;
8011                 U32 *flagsp = &posflags;
8012                 char has_charset_modifier = '\0';
8013                 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
8014                                     ? REGEX_UNICODE_CHARSET
8015                                     : REGEX_DEPENDS_CHARSET;
8016
8017                 while (*RExC_parse) {
8018                     /* && strchr("iogcmsx", *RExC_parse) */
8019                     /* (?g), (?gc) and (?o) are useless here
8020                        and must be globally applied -- japhy */
8021                     switch (*RExC_parse) {
8022                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8023                     case LOCALE_PAT_MOD:
8024                         if (has_charset_modifier) {
8025                             goto excess_modifier;
8026                         }
8027                         else if (flagsp == &negflags) {
8028                             goto neg_modifier;
8029                         }
8030                         cs = REGEX_LOCALE_CHARSET;
8031                         has_charset_modifier = LOCALE_PAT_MOD;
8032                         RExC_contains_locale = 1;
8033                         break;
8034                     case UNICODE_PAT_MOD:
8035                         if (has_charset_modifier) {
8036                             goto excess_modifier;
8037                         }
8038                         else if (flagsp == &negflags) {
8039                             goto neg_modifier;
8040                         }
8041                         cs = REGEX_UNICODE_CHARSET;
8042                         has_charset_modifier = UNICODE_PAT_MOD;
8043                         break;
8044                     case ASCII_RESTRICT_PAT_MOD:
8045                         if (flagsp == &negflags) {
8046                             goto neg_modifier;
8047                         }
8048                         if (has_charset_modifier) {
8049                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8050                                 goto excess_modifier;
8051                             }
8052                             /* Doubled modifier implies more restricted */
8053                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8054                         }
8055                         else {
8056                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8057                         }
8058                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8059                         break;
8060                     case DEPENDS_PAT_MOD:
8061                         if (has_use_defaults) {
8062                             goto fail_modifiers;
8063                         }
8064                         else if (flagsp == &negflags) {
8065                             goto neg_modifier;
8066                         }
8067                         else if (has_charset_modifier) {
8068                             goto excess_modifier;
8069                         }
8070
8071                         /* The dual charset means unicode semantics if the
8072                          * pattern (or target, not known until runtime) are
8073                          * utf8, or something in the pattern indicates unicode
8074                          * semantics */
8075                         cs = (RExC_utf8 || RExC_uni_semantics)
8076                              ? REGEX_UNICODE_CHARSET
8077                              : REGEX_DEPENDS_CHARSET;
8078                         has_charset_modifier = DEPENDS_PAT_MOD;
8079                         break;
8080                     excess_modifier:
8081                         RExC_parse++;
8082                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8083                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8084                         }
8085                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8086                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8087                         }
8088                         else {
8089                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8090                         }
8091                         /*NOTREACHED*/
8092                     neg_modifier:
8093                         RExC_parse++;
8094                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8095                         /*NOTREACHED*/
8096                     case ONCE_PAT_MOD: /* 'o' */
8097                     case GLOBAL_PAT_MOD: /* 'g' */
8098                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8099                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8100                             if (! (wastedflags & wflagbit) ) {
8101                                 wastedflags |= wflagbit;
8102                                 vWARN5(
8103                                     RExC_parse + 1,
8104                                     "Useless (%s%c) - %suse /%c modifier",
8105                                     flagsp == &negflags ? "?-" : "?",
8106                                     *RExC_parse,
8107                                     flagsp == &negflags ? "don't " : "",
8108                                     *RExC_parse
8109                                 );
8110                             }
8111                         }
8112                         break;
8113
8114                     case CONTINUE_PAT_MOD: /* 'c' */
8115                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8116                             if (! (wastedflags & WASTED_C) ) {
8117                                 wastedflags |= WASTED_GC;
8118                                 vWARN3(
8119                                     RExC_parse + 1,
8120                                     "Useless (%sc) - %suse /gc modifier",
8121                                     flagsp == &negflags ? "?-" : "?",
8122                                     flagsp == &negflags ? "don't " : ""
8123                                 );
8124                             }
8125                         }
8126                         break;
8127                     case KEEPCOPY_PAT_MOD: /* 'p' */
8128                         if (flagsp == &negflags) {
8129                             if (SIZE_ONLY)
8130                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8131                         } else {
8132                             *flagsp |= RXf_PMf_KEEPCOPY;
8133                         }
8134                         break;
8135                     case '-':
8136                         /* A flag is a default iff it is following a minus, so
8137                          * if there is a minus, it means we will be trying to
8138                          * re-specify a default, which is an error */
8139                         if (has_use_defaults || flagsp == &negflags) {
8140             fail_modifiers:
8141                             RExC_parse++;
8142                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8143                             /*NOTREACHED*/
8144                         }
8145                         flagsp = &negflags;
8146                         wastedflags = 0;  /* reset so (?g-c) warns twice */
8147                         break;
8148                     case ':':
8149                         paren = ':';
8150                         /*FALLTHROUGH*/
8151                     case ')':
8152                         RExC_flags |= posflags;
8153                         RExC_flags &= ~negflags;
8154                         set_regex_charset(&RExC_flags, cs);
8155                         if (paren != ':') {
8156                             oregflags |= posflags;
8157                             oregflags &= ~negflags;
8158                             set_regex_charset(&oregflags, cs);
8159                         }
8160                         nextchar(pRExC_state);
8161                         if (paren != ':') {
8162                             *flagp = TRYAGAIN;
8163                             return NULL;
8164                         } else {
8165                             ret = NULL;
8166                             goto parse_rest;
8167                         }
8168                         /*NOTREACHED*/
8169                     default:
8170                         RExC_parse++;
8171                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8172                         /*NOTREACHED*/
8173                     }
8174                     ++RExC_parse;
8175                 }
8176             }} /* one for the default block, one for the switch */
8177         }
8178         else {                  /* (...) */
8179           capturing_parens:
8180             parno = RExC_npar;
8181             RExC_npar++;
8182
8183             ret = reganode(pRExC_state, OPEN, parno);
8184             if (!SIZE_ONLY ){
8185                 if (!RExC_nestroot)
8186                     RExC_nestroot = parno;
8187                 if (RExC_seen & REG_SEEN_RECURSE
8188                     && !RExC_open_parens[parno-1])
8189                 {
8190                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8191                         "Setting open paren #%"IVdf" to %d\n",
8192                         (IV)parno, REG_NODE_NUM(ret)));
8193                     RExC_open_parens[parno-1]= ret;
8194                 }
8195             }
8196             Set_Node_Length(ret, 1); /* MJD */
8197             Set_Node_Offset(ret, RExC_parse); /* MJD */
8198             is_open = 1;
8199         }
8200     }
8201     else                        /* ! paren */
8202         ret = NULL;
8203
8204    parse_rest:
8205     /* Pick up the branches, linking them together. */
8206     parse_start = RExC_parse;   /* MJD */
8207     br = regbranch(pRExC_state, &flags, 1,depth+1);
8208
8209     /*     branch_len = (paren != 0); */
8210
8211     if (br == NULL)
8212         return(NULL);
8213     if (*RExC_parse == '|') {
8214         if (!SIZE_ONLY && RExC_extralen) {
8215             reginsert(pRExC_state, BRANCHJ, br, depth+1);
8216         }
8217         else {                  /* MJD */
8218             reginsert(pRExC_state, BRANCH, br, depth+1);
8219             Set_Node_Length(br, paren != 0);
8220             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8221         }
8222         have_branch = 1;
8223         if (SIZE_ONLY)
8224             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
8225     }
8226     else if (paren == ':') {
8227         *flagp |= flags&SIMPLE;
8228     }
8229     if (is_open) {                              /* Starts with OPEN. */
8230         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
8231     }
8232     else if (paren != '?')              /* Not Conditional */
8233         ret = br;
8234     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8235     lastbr = br;
8236     while (*RExC_parse == '|') {
8237         if (!SIZE_ONLY && RExC_extralen) {
8238             ender = reganode(pRExC_state, LONGJMP,0);
8239             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8240         }
8241         if (SIZE_ONLY)
8242             RExC_extralen += 2;         /* Account for LONGJMP. */
8243         nextchar(pRExC_state);
8244         if (freeze_paren) {
8245             if (RExC_npar > after_freeze)
8246                 after_freeze = RExC_npar;
8247             RExC_npar = freeze_paren;
8248         }
8249         br = regbranch(pRExC_state, &flags, 0, depth+1);
8250
8251         if (br == NULL)
8252             return(NULL);
8253         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
8254         lastbr = br;
8255         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8256     }
8257
8258     if (have_branch || paren != ':') {
8259         /* Make a closing node, and hook it on the end. */
8260         switch (paren) {
8261         case ':':
8262             ender = reg_node(pRExC_state, TAIL);
8263             break;
8264         case 1:
8265             ender = reganode(pRExC_state, CLOSE, parno);
8266             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8267                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8268                         "Setting close paren #%"IVdf" to %d\n",
8269                         (IV)parno, REG_NODE_NUM(ender)));
8270                 RExC_close_parens[parno-1]= ender;
8271                 if (RExC_nestroot == parno)
8272                     RExC_nestroot = 0;
8273             }
8274             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8275             Set_Node_Length(ender,1); /* MJD */
8276             break;
8277         case '<':
8278         case ',':
8279         case '=':
8280         case '!':
8281             *flagp &= ~HASWIDTH;
8282             /* FALL THROUGH */
8283         case '>':
8284             ender = reg_node(pRExC_state, SUCCEED);
8285             break;
8286         case 0:
8287             ender = reg_node(pRExC_state, END);
8288             if (!SIZE_ONLY) {
8289                 assert(!RExC_opend); /* there can only be one! */
8290                 RExC_opend = ender;
8291             }
8292             break;
8293         }
8294         REGTAIL(pRExC_state, lastbr, ender);
8295
8296         if (have_branch && !SIZE_ONLY) {
8297             if (depth==1)
8298                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8299
8300             /* Hook the tails of the branches to the closing node. */
8301             for (br = ret; br; br = regnext(br)) {
8302                 const U8 op = PL_regkind[OP(br)];
8303                 if (op == BRANCH) {
8304                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8305                 }
8306                 else if (op == BRANCHJ) {
8307                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8308                 }
8309             }
8310         }
8311     }
8312
8313     {
8314         const char *p;
8315         static const char parens[] = "=!<,>";
8316
8317         if (paren && (p = strchr(parens, paren))) {
8318             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8319             int flag = (p - parens) > 1;
8320
8321             if (paren == '>')
8322                 node = SUSPEND, flag = 0;
8323             reginsert(pRExC_state, node,ret, depth+1);
8324             Set_Node_Cur_Length(ret);
8325             Set_Node_Offset(ret, parse_start + 1);
8326             ret->flags = flag;
8327             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8328         }
8329     }
8330
8331     /* Check for proper termination. */
8332     if (paren) {
8333         RExC_flags = oregflags;
8334         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8335             RExC_parse = oregcomp_parse;
8336             vFAIL("Unmatched (");
8337         }
8338     }
8339     else if (!paren && RExC_parse < RExC_end) {
8340         if (*RExC_parse == ')') {
8341             RExC_parse++;
8342             vFAIL("Unmatched )");
8343         }
8344         else
8345             FAIL("Junk on end of regexp");      /* "Can't happen". */
8346         /* NOTREACHED */
8347     }
8348
8349     if (RExC_in_lookbehind) {
8350         RExC_in_lookbehind--;
8351     }
8352     if (after_freeze > RExC_npar)
8353         RExC_npar = after_freeze;
8354     return(ret);
8355 }
8356
8357 /*
8358  - regbranch - one alternative of an | operator
8359  *
8360  * Implements the concatenation operator.
      *
      * Repeatedly calls regpiece() until the parse position reaches RExC_end,
      * a '|' or a ')', chaining each returned piece onto the previous one
      * with REGTAIL().
      *
      * pRExC_state - compile state; handed through to every helper called here.
      * flagp       - out: reset to WORST, then OR-ed with HASWIDTH/POSTPONED
      *               from every piece, SPSTART from the first piece only, and
      *               SIMPLE only when exactly one piece was parsed.
      * first       - nonzero: emit no leading branch node (ret starts as NULL).
      *               zero: start with a BRANCHJ node (outside the sizing pass,
      *               when RExC_extralen is set) or otherwise a BRANCH node.
      * depth       - passed on to regpiece() as depth+1.
      *               NOTE(review): the REGTAIL/DEBUG_PARSE macros presumably
      *               read it as well - confirm against their definitions.
      *
      * Returns the first node of the alternative: the BRANCH/BRANCHJ node when
      * one was emitted, else the first piece, else a NOTHING node for an empty
      * alternative.  Returns NULL when regpiece() returns NULL without setting
      * TRYAGAIN in its flags.
8361  */
8362 STATIC regnode *
8363 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8364 {
8365     dVAR;
8366     register regnode *ret;              /* head of this alternative (return value) */
8367     register regnode *chain = NULL;     /* most recently linked piece, NULL until the first one */
8368     register regnode *latest;           /* node returned by the current regpiece() call */
8369     I32 flags = 0, c = 0;               /* flags: result of the latest regpiece(); c: pieces parsed */
8370     GET_RE_DEBUG_FLAGS_DECL;
8371
8372     PERL_ARGS_ASSERT_REGBRANCH;
8373
8374     DEBUG_PARSE("brnc");
8375
8376     if (first)
8377         ret = NULL;                     /* no leading branch node; head is set by the first piece */
8378     else {
8379         if (!SIZE_ONLY && RExC_extralen)
8380             ret = reganode(pRExC_state, BRANCHJ,0);
8381         else {
8382             ret = reg_node(pRExC_state, BRANCH);
8383             Set_Node_Length(ret, 1);
8384         }
8385     }
8386
     /* Sizing pass only: every non-first alternative adds one to extralen. */
8387     if (!first && SIZE_ONLY)
8388         RExC_extralen += 1;                     /* BRANCHJ */
8389
8390     *flagp = WORST;                     /* Tentatively. */
8391
     /* Step back one character and let nextchar() advance again.
      * NOTE(review): presumably this reuses nextchar()'s skipping of
      * ignorable text without consuming a real character - confirm
      * against nextchar(). */
8392     RExC_parse--;
8393     nextchar(pRExC_state);
8394     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8395         flags &= ~TRYAGAIN;             /* drop any retry request left by the previous call */
8396         latest = regpiece(pRExC_state, &flags,depth+1);
8397         if (latest == NULL) {
8398             if (flags & TRYAGAIN)
8399                 continue;               /* no node produced, but parsing carries on */
8400             return(NULL);               /* no node and no retry request: give up */
8401         }
8402         else if (ret == NULL)
8403             ret = latest;               /* first piece becomes the head */
8404         *flagp |= flags&(HASWIDTH|POSTPONED);
8405         if (chain == NULL)      /* First piece. */
8406             *flagp |= flags&SPSTART;
8407         else {
8408             RExC_naughty++;             /* bumped once per piece after the first */
8409             REGTAIL(pRExC_state, chain, latest);
8410         }
8411         chain = latest;
8412         c++;
8413     }
8414     if (chain == NULL) {        /* Loop ran zero times. */
         /* Empty alternative: emit a NOTHING node.  When a BRANCH/BRANCHJ
          * node was already emitted, ret keeps pointing at that node and
          * the NOTHING node is not explicitly linked here. */
8415         chain = reg_node(pRExC_state, NOTHING);
8416         if (ret == NULL)
8417             ret = chain;
8418     }
     /* SIMPLE is propagated only for a single-piece alternative; flags holds
      * whatever the last regpiece() call left in it. */
8419     if (c == 1) {
8420         *flagp |= flags&SIMPLE;
8421     }
8422
8423     return ret;
8424 }
8425
8426 /*
8427  - regpiece - something followed by possible [*+?]
8428  *
8429  * Note that the branching code sequences used for ? and the general cases
8430  * of * and + are somewhat optimized:  they use the same NOTHING node as
8431  * both the endmarker for their branch list and the body of the last branch.
8432  * It might seem that this node could be dispensed with entirely, but the
8433  * endmarker role is not redundant.
8434  */
8435 STATIC regnode *
8436 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8437 {
         /* Parses one atom (via regatom) followed by an optional quantifier:
          * {n}, {n,}, {n,m}, '*', '+' or '?', which may itself be followed by
          * a '?' (a MINMOD node is inserted) or a '+' (the construct is
          * wrapped in SUSPEND ... SUCCEED/TAIL).
          *
          *   flagp - out: the atom's flags when there is no quantifier;
          *           otherwise set as described at the individual sites below.
          *           TRYAGAIN is ORed in when regatom asks for a retry.
          *   depth - nesting depth; callees are invoked with depth+1.
          *
          * Returns the (possibly re-headed) node for the piece, or NULL when
          * regatom returns NULL.  Errors are raised through vFAIL/vFAIL2. */
8438     dVAR;
8439     register regnode *ret;
8440     register char op;
8441     register char *next;
8442     I32 flags;
8443     const char * const origparse = RExC_parse;
8444     I32 min;
8445     I32 max = REG_INFTY;
         /* NOTE(review): parse_start only exists under RE_TRACK_PATTERN_OFFSETS
          * but is named unconditionally in the Set_Node_Offset/Set_Node_Length
          * calls below; presumably those macros discard their arguments when
          * offset tracking is disabled -- confirm. */
8446 #ifdef RE_TRACK_PATTERN_OFFSETS
8447     char *parse_start;
8448 #endif
8449     const char *maxpos = NULL;
8450     GET_RE_DEBUG_FLAGS_DECL;
8451
8452     PERL_ARGS_ASSERT_REGPIECE;
8453
8454     DEBUG_PARSE("piec");
8455
8456     ret = regatom(pRExC_state, &flags,depth+1);
8457     if (ret == NULL) {
             /* Pass a retry request up to the caller. */
8458         if (flags & TRYAGAIN)
8459             *flagp |= TRYAGAIN;
8460         return(NULL);
8461     }
8462
8463     op = *RExC_parse;
8464
8465     if (op == '{' && regcurly(RExC_parse)) {
8466         maxpos = NULL;
8467 #ifdef RE_TRACK_PATTERN_OFFSETS
8468         parse_start = RExC_parse; /* MJD */
8469 #endif
             /* Scan digits and at most one comma; maxpos remembers where the
              * comma is.  A second comma stops the scan. */
8470         next = RExC_parse + 1;
8471         while (isDIGIT(*next) || *next == ',') {
8472             if (*next == ',') {
8473                 if (maxpos)
8474                     break;
8475                 else
8476                     maxpos = next;
8477             }
8478             next++;
8479         }
8480         if (*next == '}') {             /* got one */
8481             if (!maxpos)
8482                 maxpos = next;
8483             RExC_parse++;
8484             min = atoi(RExC_parse);
                 /* With a comma the maximum starts right after it; without
                  * one ({n}) the minimum text is parsed again as the maximum. */
8485             if (*maxpos == ',')
8486                 maxpos++;
8487             else
8488                 maxpos = RExC_parse;
8489             max = atoi(maxpos);
                 /* atoi() gave 0 but the text is not a literal '0': no maximum
                  * was written ({n,}), so treat it as unbounded. */
8490             if (!max && *maxpos != '0')
8491                 max = REG_INFTY;                /* meaning "infinity" */
8492             else if (max >= REG_INFTY)
8493                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8494             RExC_parse = next;
8495             nextchar(pRExC_state);
8496
                 /* Shared by {n,m} and by the '*', '+', '?' cases below, which
                  * jump here with min (and for '?' also max) already set. */
8497         do_curly:
8498             if ((flags&SIMPLE)) {
                     /* Simple operand: a single CURLY node in front of it. */
8499                 RExC_naughty += 2 + RExC_naughty / 2;
8500                 reginsert(pRExC_state, CURLY, ret, depth+1);
8501                 Set_Node_Offset(ret, parse_start+1); /* MJD */
8502                 Set_Node_Cur_Length(ret);
8503             }
8504             else {
                     /* Complex operand: CURLYX in front, WHILEM behind, and a
                      * trailing NOTHING node. */
8505                 regnode * const w = reg_node(pRExC_state, WHILEM);
8506
8507                 w->flags = 0;
8508                 REGTAIL(pRExC_state, ret, w);
8509                 if (!SIZE_ONLY && RExC_extralen) {
8510                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
8511                     reginsert(pRExC_state, NOTHING,ret, depth+1);
8512                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
8513                 }
8514                 reginsert(pRExC_state, CURLYX,ret, depth+1);
8515                                 /* MJD hk */
8516                 Set_Node_Offset(ret, parse_start+1);
8517                 Set_Node_Length(ret,
8518                                 op == '{' ? (RExC_parse - parse_start) : 1);
8519
8520                 if (!SIZE_ONLY && RExC_extralen)
8521                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
8522                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
                     /* Sizing pass: count the WHILEM and reserve the three
                      * extra units used by the LONGJMP/NOTHING form above. */
8523                 if (SIZE_ONLY)
8524                     RExC_whilem_seen++, RExC_extralen += 3;
8525                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
8526             }
8527             ret->flags = 0;
8528
                 /* NOTE(review): on the {n,m} path with min == 0, *flagp has
                  * not been assigned by this function before this point, so it
                  * keeps whatever the caller had stored there -- confirm that
                  * callers initialise it. */
8529             if (min > 0)
8530                 *flagp = WORST;
8531             if (max > 0)
8532                 *flagp |= HASWIDTH;
8533             if (max < min)
8534                 vFAIL("Can't do {n,m} with n > m");
                 /* The bounds are stored as 16-bit values in the node. */
8535             if (!SIZE_ONLY) {
8536                 ARG1_SET(ret, (U16)min);
8537                 ARG2_SET(ret, (U16)max);
8538             }
8539
8540             goto nest_check;
8541         }
8542     }
8543
         /* No quantifier follows: the piece is just the atom. */
8544     if (!ISMULT1(op)) {
8545         *flagp = flags;
8546         return(ret);
8547     }
8548
8549 #if 0                           /* Now runtime fix should be reliable. */
8550
8551     /* if this is reinstated, don't forget to put this back into perldiag:
8552
8553             =item Regexp *+ operand could be empty at {#} in regex m/%s/
8554
8555            (F) The part of the regexp subject to either the * or + quantifier
8556            could match an empty string. The {#} shows in the regular
8557            expression about where the problem was discovered.
8558
8559     */
8560
8561     if (!(flags&HASWIDTH) && op != '?')
8562       vFAIL("Regexp *+ operand could be empty");
8563 #endif
8564
8565 #ifdef RE_TRACK_PATTERN_OFFSETS
8566     parse_start = RExC_parse;
8567 #endif
8568     nextchar(pRExC_state);
8569
         /* SPSTART is set for '*' and '?' but not for '+'. */
8570     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8571
         /* '*' and '+' on a SIMPLE operand get dedicated STAR/PLUS nodes;
          * every other combination is expressed through do_curly with
          * max left at its initial REG_INFTY unless '?' sets it to 1. */
8572     if (op == '*' && (flags&SIMPLE)) {
8573         reginsert(pRExC_state, STAR, ret, depth+1);
8574         ret->flags = 0;
8575         RExC_naughty += 4;
8576     }
8577     else if (op == '*') {
8578         min = 0;
8579         goto do_curly;
8580     }
8581     else if (op == '+' && (flags&SIMPLE)) {
8582         reginsert(pRExC_state, PLUS, ret, depth+1);
8583         ret->flags = 0;
8584         RExC_naughty += 3;
8585     }
8586     else if (op == '+') {
8587         min = 1;
8588         goto do_curly;
8589     }
8590     else if (op == '?') {
8591         min = 0; max = 1;
8592         goto do_curly;
8593     }
8594   nest_check:
         /* Outside the sizing pass, warn when an operand flagged neither
          * HASWIDTH nor POSTPONED is repeated with a very large maximum. */
8595     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8596         ckWARN3reg(RExC_parse,
8597                    "%.*s matches null string many times",
8598                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8599                    origparse);
8600     }
8601
         /* A '?' after the quantifier: put a MINMOD node in front and link it
          * to the node that immediately follows it. */
8602     if (RExC_parse < RExC_end && *RExC_parse == '?') {
8603         nextchar(pRExC_state);
8604         reginsert(pRExC_state, MINMOD, ret, depth+1);
8605         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8606     }
         /* Unless REG_ALLOW_MINMOD_SUSPEND is defined, the '+' modifier below
          * is only considered when no '?' modifier was consumed above. */
8607 #ifndef REG_ALLOW_MINMOD_SUSPEND
8608     else
8609 #endif
8610     if (RExC_parse < RExC_end && *RExC_parse == '+') {
             /* A '+' after the quantifier: terminate the construct with
              * SUCCEED, wrap it in SUSPEND, then append a TAIL. */
8611         regnode *ender;
8612         nextchar(pRExC_state);
8613         ender = reg_node(pRExC_state, SUCCEED);
8614         REGTAIL(pRExC_state, ret, ender);
8615         reginsert(pRExC_state, SUSPEND, ret, depth+1);
8616         ret->flags = 0;
8617         ender = reg_node(pRExC_state, TAIL);
8618         REGTAIL(pRExC_state, ret, ender);
8619         /*ret= ender;*/
8620     }
8621
         /* Anything that still looks like a quantifier here is an error. */
8622     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8623         RExC_parse++;
8624         vFAIL("Nested quantifiers");
8625     }
8626
8627     return(ret);
8628 }
8629
8630
8631 /* reg_namedseq(pRExC_state, valuep, flagp, depth)
8632
8633    This is expected to be called by a parser routine that has
8634    recognized '\N' and needs to handle the rest. RExC_parse is
8635    expected to point at the first char following the N at the time
8636    of the call.
8637
8638    The \N may be inside (indicated by valuep not being NULL) or outside a
8639    character class.
8640
8641    \N may begin either a named sequence, or if outside a character class, mean
8642    to match a non-newline.  For non single-quoted regexes, the tokenizer has
8643    attempted to decide which, and in the case of a named sequence converted it
8644    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8645    where c1... are the characters in the sequence.  For single-quoted regexes,
8646    the tokenizer passes the \N sequence through unchanged; this code will not
8647    attempt to determine this nor expand those.  The net effect is that if the
8648    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8649    signals that this \N occurrence means to match a non-newline.
8650
8651    Only the \N{U+...} form should occur in a character class, for the same
8652    reason that '.' inside a character class means to just match a period: it
8653    just doesn't make sense.
8654
8655    If valuep is non-null then it is assumed that we are parsing inside
8656    of a charclass definition and the first codepoint in the resolved
8657    string is returned via *valuep and the routine will return NULL.
8658    In this mode if a multichar string is returned from the charnames
8659    handler, a warning will be issued, and only the first char in the
8660    sequence will be examined. If the string returned is zero length
8661    then the value of *valuep is undefined and NON-NULL will
8662    be returned to indicate failure. (This will NOT be a valid pointer
8663    to a regnode.)
8664
8665    If valuep is null then it is assumed that we are parsing normal text and a
8666    new EXACT node is inserted into the program containing the resolved string,
8667    and a pointer to the new node is returned.  But if the string is zero length
8668    a NOTHING node is emitted instead.
8669
8670    On success RExC_parse is set to the char following the endbrace.
8671    Parsing failures will generate a fatal error via vFAIL(...)
8672  */
8673 STATIC regnode *
8674 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8675 {
         /* Handles what follows a '\N' in the pattern.
          *
          *   valuep - NULL outside a bracketed character class; otherwise
          *            receives the (first) code point of \N{U+...}.
          *   flagp  - out: gets HASWIDTH|SIMPLE ORed in for a bare \N, and is
          *            handed to reg() for a \N{U+...} outside a class.
          *   depth  - nesting depth; reg() is called with depth+1.
          *
          * Return value as implemented below:
          *   outside a class: a REG_ANY node (bare \N), a NOTHING node (\N{}),
          *     or whatever reg() returns for the generated sub-pattern;
          *   inside a class: NULL when exactly one code point was read into
          *     *valuep; otherwise (\N{} or a multi-character sequence) the
          *     address of RExC_parse cast to regnode*, which is a non-NULL
          *     marker and must not be used as a node.
          *
          * Whenever the construct is consumed, RExC_parse is left just past
          * the closing '}' -- in both compilation passes. */
8676     char * endbrace;    /* '}' following the name */
8677     regnode *ret = NULL;
8678     char* p;
8679
8680     GET_RE_DEBUG_FLAGS_DECL;
8681
8682     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8683
8684     GET_RE_DEBUG_FLAGS;
8685
8686     /* The [^\n] meaning of \N ignores spaces and comments under the /x
8687      * modifier.  The other meaning does not */
8688     p = (RExC_flags & RXf_PMf_EXTENDED)
8689         ? regwhite( pRExC_state, RExC_parse )
8690         : RExC_parse;
8691
8692     /* Disambiguate between \N meaning a named character versus \N meaning
8693      * [^\n].  The former is assumed when it can't be the latter. */
8694     if (*p != '{' || regcurly(p)) {
8695         RExC_parse = p;
8696         if (valuep) {
8697             /* no bare \N in a charclass */
8698             vFAIL("\\N in a character class must be a named character: \\N{...}");
8699         }
8700         nextchar(pRExC_state);
8701         ret = reg_node(pRExC_state, REG_ANY);
8702         *flagp |= HASWIDTH|SIMPLE;
8703         RExC_naughty++;
8704         RExC_parse--;
8705         Set_Node_Length(ret, 1); /* MJD */
8706         return ret;
8707     }
8708
8709     /* Here, we have decided it should be a named sequence */
8710
8711     /* The test above made sure that the next real character is a '{', but
8712      * under the /x modifier, it could be separated by space (or a comment and
8713      * \n) and this is not allowed (for consistency with \x{...} and the
8714      * tokenizer handling of \N{NAME}). */
8715     if (*RExC_parse != '{') {
8716         vFAIL("Missing braces on \\N{}");
8717     }
8718
8719     RExC_parse++;       /* Skip past the '{' */
8720
8721     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8722         || ! (endbrace == RExC_parse            /* nothing between the {} */
8723               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
8724                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8725     {
8726         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
8727         vFAIL("\\N{NAME} must be resolved by the lexer");
8728     }
8729
8730     if (endbrace == RExC_parse) {   /* empty: \N{} */
8731         if (! valuep) {
8732             RExC_parse = endbrace + 1;
8733             return reg_node(pRExC_state,NOTHING);
8734         }
8735
             /* Warn only once, in the sizing pass ... */
8736         if (SIZE_ONLY) {
8737             ckWARNreg(RExC_parse,
8738                     "Ignoring zero length \\N{} in character class"
8739             );
8740         }
             /* ... but consume the '}' in BOTH passes, so that the caller
              * resumes at the same place in each pass instead of seeing the
              * '}' as class content in the non-sizing pass. */
8741         RExC_parse = endbrace + 1;
8742         *valuep = 0;
8743         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8744     }
8745
8746     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
8747     RExC_parse += 2;    /* Skip past the 'U+' */
8748
8749     if (valuep) {   /* In a bracketed char class */
8750         /* We only pay attention to the first char of
8751         multichar strings being returned. I kinda wonder
8752         if this makes sense as it does change the behaviour
8753         from earlier versions, OTOH that behaviour was broken
8754         as well. XXX Solution is to recharacterize as
8755         [rest-of-class]|multi1|multi2... */
8756
8757         STRLEN length_of_hex;
8758         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8759             | PERL_SCAN_DISALLOW_PREFIX
8760             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8761
             /* endchar: the '.' or '}' that ends the first code point. */
8762         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8763         if (endchar < endbrace) {
8764             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8765         }
8766
8767         length_of_hex = (STRLEN)(endchar - RExC_parse);
8768         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8769
8770         /* The tokenizer should have guaranteed validity, but it's possible to
8771          * bypass it by using single quoting, so check */
8772         if (length_of_hex == 0
8773             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8774         {
8775             RExC_parse += length_of_hex;        /* Includes all the valid */
8776             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
8777                             ? UTF8SKIP(RExC_parse)
8778                             : 1;
8779             /* Guard against malformed utf8 */
8780             if (RExC_parse >= endchar) RExC_parse = endchar;
8781             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8782         }
8783
8784         RExC_parse = endbrace + 1;
             /* Exactly one code point: success, value is in *valuep. */
8785         if (endchar == endbrace) return NULL;
8786
             /* More than one code point: hand back the non-NULL marker. */
8787         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
8788     }
8789     else {      /* Not a char class */
8790
8791         /* What is done here is to convert this to a sub-pattern of the form
8792          * (?:\x{char1}\x{char2}...)
8793          * and then call reg recursively.  That way, it retains its atomicness,
8794          * while not having to worry about special handling that some code
8795          * points may have.  toke.c has converted the original Unicode values
8796          * to native, so that we can just pass on the hex values unchanged.  We
8797          * do have to set a flag to keep recoding from happening in the
8798          * recursion */
8799
8800         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8801         STRLEN len;
8802         char *endchar;      /* Points to '.' or '}' ending cur char in the input
8803                                stream */
8804         char *orig_end = RExC_end;
8805
8806         while (RExC_parse < endbrace) {
8807
8808             /* Code points are separated by dots.  If none, there is only one
8809              * code point, and is terminated by the brace */
8810             endchar = RExC_parse + strcspn(RExC_parse, ".}");
8811
8812             /* Convert to notation the rest of the code understands */
8813             sv_catpv(substitute_parse, "\\x{");
8814             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8815             sv_catpv(substitute_parse, "}");
8816
8817             /* Point to the beginning of the next character in the sequence. */
8818             RExC_parse = endchar + 1;
8819         }
8820         sv_catpv(substitute_parse, ")");
8821
             /* From here until the restore below, the parser state points at
              * the temporary sub-pattern rather than the original pattern. */
8822         RExC_parse = SvPV(substitute_parse, len);
8823
8824         /* Don't allow empty number */
             /* The shortest valid text, "?:\x{H})", is 8 bytes long.
              * NOTE(review): RExC_parse already points into substitute_parse
              * here, so any location vFAIL derives from RExC_parse would refer
              * to the temporary buffer, not the original pattern -- confirm. */
8825         if (len < 8) {
8826             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8827         }
8828         RExC_end = RExC_parse + len;
8829
8830         /* The values are Unicode, and therefore not subject to recoding */
8831         RExC_override_recoding = 1;
8832
8833         ret = reg(pRExC_state, 1, flagp, depth+1);
8834
             /* Restore the real pattern; nextchar() then steps over the '}'. */
8835         RExC_parse = endbrace;
8836         RExC_end = orig_end;
8837         RExC_override_recoding = 0;
8838
8839         nextchar(pRExC_state);
8840     }
8841
8842     return ret;
8843 }
8844
8845
8846 /*
8847  * reg_recode
8848  *
8849  * Returns the code point obtained by recoding 'value' to UTF-8 using *encp.
8850  *    value: a code value in the source encoding
8851  *    encp:  a pointer to an Encode object
8852  *
8853  * If the result from Encode is not a single character,
8854  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8855  */
8856 STATIC UV
8857 S_reg_recode(pTHX_ const char value, SV **encp)
8858 {
         /* Wraps the single byte 'value' in a temporary SV, recodes that SV
          * with sv_recode_to_utf8() when *encp is non-NULL, and decodes the
          * result into one code point.
          *
          * Returns that code point.  When the resulting string is empty, or
          * decoding one character does not consume all of it, returns
          * UNICODE_REPLACEMENT and sets *encp to NULL. */
8859     STRLEN numlen = 1;      /* input length; later the bytes consumed by the decode */
8860     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
         /* Without an encoding object the byte is used as it stands. */
8861     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
         /* Length is read after the (possible) recode above. */
8862     const STRLEN newlen = SvCUR(sv);
8863     UV uv = UNICODE_REPLACEMENT;
8864
8865     PERL_ARGS_ASSERT_REG_RECODE;
8866
8867     if (newlen)
8868         uv = SvUTF8(sv)
8869              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8870              : *(U8*)s;
8871
         /* Empty result, or the first character did not account for every
          * byte: not a single character. */
8872     if (!newlen || numlen != newlen) {
8873         uv = UNICODE_REPLACEMENT;
8874         *encp = NULL;
8875     }
8876     return uv;
8877 }
8878
8879
8880 /*
8881  - regatom - the lowest level
8882
8883    Try to identify anything special at the start of the pattern. If there
8884    is, then handle it as required. This may involve generating a single regop,
8885    such as for an assertion; or it may involve recursing, such as to
8886    handle a () structure.
8887
8888    If the string doesn't start with something special then we gobble up
8889    as much literal text as we can.
8890
8891    Once we have been able to handle whatever type of thing started the
8892    sequence, we return.
8893
8894    Note: we have to be careful with escapes, as they can be both literal
8895    and special, and in the case of \10 and friends can be either, depending
8896    on context. Specifically there are two separate switches for handling
8897    escape sequences, with the one for handling literal escapes requiring
8898    a dummy entry for all of the special escapes that are actually handled
8899    by the other.
8900 */
8901
8902 STATIC regnode *
8903 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8904 {
8905     dVAR;
8906     register regnode *ret = NULL;
8907     I32 flags;
8908     char *parse_start = RExC_parse;
8909     U8 op;
8910     GET_RE_DEBUG_FLAGS_DECL;
8911     DEBUG_PARSE("atom");
8912     *flagp = WORST;             /* Tentatively. */
8913
8914     PERL_ARGS_ASSERT_REGATOM;
8915
8916 tryagain:
8917     switch ((U8)*RExC_parse) {
8918     case '^':
8919         RExC_seen_zerolen++;
8920         nextchar(pRExC_state);
8921         if (RExC_flags & RXf_PMf_MULTILINE)
8922             ret = reg_node(pRExC_state, MBOL);
8923         else if (RExC_flags & RXf_PMf_SINGLELINE)
8924             ret = reg_node(pRExC_state, SBOL);
8925         else
8926             ret = reg_node(pRExC_state, BOL);
8927         Set_Node_Length(ret, 1); /* MJD */
8928         break;
8929     case '$':
8930         nextchar(pRExC_state);
8931         if (*RExC_parse)
8932             RExC_seen_zerolen++;
8933         if (RExC_flags & RXf_PMf_MULTILINE)
8934             ret = reg_node(pRExC_state, MEOL);
8935         else if (RExC_flags & RXf_PMf_SINGLELINE)
8936             ret = reg_node(pRExC_state, SEOL);
8937         else
8938             ret = reg_node(pRExC_state, EOL);
8939         Set_Node_Length(ret, 1); /* MJD */
8940         break;
8941     case '.':
8942         nextchar(pRExC_state);
8943         if (RExC_flags & RXf_PMf_SINGLELINE)
8944             ret = reg_node(pRExC_state, SANY);
8945         else
8946             ret = reg_node(pRExC_state, REG_ANY);
8947         *flagp |= HASWIDTH|SIMPLE;
8948         RExC_naughty++;
8949         Set_Node_Length(ret, 1); /* MJD */
8950         break;
8951     case '[':
8952     {
8953         char * const oregcomp_parse = ++RExC_parse;
8954         ret = regclass(pRExC_state,depth+1);
8955         if (*RExC_parse != ']') {
8956             RExC_parse = oregcomp_parse;
8957             vFAIL("Unmatched [");
8958         }
8959         nextchar(pRExC_state);
8960         *flagp |= HASWIDTH|SIMPLE;
8961         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8962         break;
8963     }
8964     case '(':
8965         nextchar(pRExC_state);
8966         ret = reg(pRExC_state, 1, &flags,depth+1);
8967         if (ret == NULL) {
8968                 if (flags & TRYAGAIN) {
8969                     if (RExC_parse == RExC_end) {
8970                          /* Make parent create an empty node if needed. */
8971                         *flagp |= TRYAGAIN;
8972                         return(NULL);
8973                     }
8974                     goto tryagain;
8975                 }
8976                 return(NULL);
8977         }
8978         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8979         break;
8980     case '|':
8981     case ')':
8982         if (flags & TRYAGAIN) {
8983             *flagp |= TRYAGAIN;
8984             return NULL;
8985         }
8986         vFAIL("Internal urp");
8987                                 /* Supposed to be caught earlier. */
8988         break;
8989     case '{':
8990         if (!regcurly(RExC_parse)) {
8991             RExC_parse++;
8992             goto defchar;
8993         }
8994         /* FALL THROUGH */
8995     case '?':
8996     case '+':
8997     case '*':
8998         RExC_parse++;
8999         vFAIL("Quantifier follows nothing");
9000         break;
9001     case '\\':
9002         /* Special Escapes
9003
9004            This switch handles escape sequences that resolve to some kind
9005            of special regop and not to literal text. Escape sequnces that
9006            resolve to literal text are handled below in the switch marked
9007            "Literal Escapes".
9008
9009            Every entry in this switch *must* have a corresponding entry
9010            in the literal escape switch. However, the opposite is not
9011            required, as the default for this switch is to jump to the
9012            literal text handling code.
9013         */
9014         switch ((U8)*++RExC_parse) {
9015         /* Special Escapes */
9016         case 'A':
9017             RExC_seen_zerolen++;
9018             ret = reg_node(pRExC_state, SBOL);
9019             *flagp |= SIMPLE;
9020             goto finish_meta_pat;
9021         case 'G':
9022             ret = reg_node(pRExC_state, GPOS);
9023             RExC_seen |= REG_SEEN_GPOS;
9024             *flagp |= SIMPLE;
9025             goto finish_meta_pat;
9026         case 'K':
9027             RExC_seen_zerolen++;
9028             ret = reg_node(pRExC_state, KEEPS);
9029             *flagp |= SIMPLE;
9030             /* XXX:dmq : disabling in-place substitution seems to
9031              * be necessary here to avoid cases of memory corruption, as
9032              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9033              */
9034             RExC_seen |= REG_SEEN_LOOKBEHIND;
9035             goto finish_meta_pat;
9036         case 'Z':
9037             ret = reg_node(pRExC_state, SEOL);
9038             *flagp |= SIMPLE;
9039             RExC_seen_zerolen++;                /* Do not optimize RE away */
9040             goto finish_meta_pat;
9041         case 'z':
9042             ret = reg_node(pRExC_state, EOS);
9043             *flagp |= SIMPLE;
9044             RExC_seen_zerolen++;                /* Do not optimize RE away */
9045             goto finish_meta_pat;
9046         case 'C':
9047             ret = reg_node(pRExC_state, CANY);
9048             RExC_seen |= REG_SEEN_CANY;
9049             *flagp |= HASWIDTH|SIMPLE;
9050             goto finish_meta_pat;
9051         case 'X':
9052             ret = reg_node(pRExC_state, CLUMP);
9053             *flagp |= HASWIDTH;
9054             goto finish_meta_pat;
9055         case 'w':
9056             switch (get_regex_charset(RExC_flags)) {
9057                 case REGEX_LOCALE_CHARSET:
9058                     op = ALNUML;
9059                     break;
9060                 case REGEX_UNICODE_CHARSET:
9061                     op = ALNUMU;
9062                     break;
9063                 case REGEX_ASCII_RESTRICTED_CHARSET:
9064                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9065                     op = ALNUMA;
9066                     break;
9067                 case REGEX_DEPENDS_CHARSET:
9068                     op = ALNUM;
9069                     break;
9070                 default:
9071                     goto bad_charset;
9072             }
9073             ret = reg_node(pRExC_state, op);
9074             *flagp |= HASWIDTH|SIMPLE;
9075             goto finish_meta_pat;
9076         case 'W':
9077             switch (get_regex_charset(RExC_flags)) {
9078                 case REGEX_LOCALE_CHARSET:
9079                     op = NALNUML;
9080                     break;
9081                 case REGEX_UNICODE_CHARSET:
9082                     op = NALNUMU;
9083                     break;
9084                 case REGEX_ASCII_RESTRICTED_CHARSET:
9085                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9086                     op = NALNUMA;
9087                     break;
9088                 case REGEX_DEPENDS_CHARSET:
9089                     op = NALNUM;
9090                     break;
9091                 default:
9092                     goto bad_charset;
9093             }
9094             ret = reg_node(pRExC_state, op);
9095             *flagp |= HASWIDTH|SIMPLE;
9096             goto finish_meta_pat;
9097         case 'b':
9098             RExC_seen_zerolen++;
9099             RExC_seen |= REG_SEEN_LOOKBEHIND;
9100             switch (get_regex_charset(RExC_flags)) {
9101                 case REGEX_LOCALE_CHARSET:
9102                     op = BOUNDL;
9103                     break;
9104                 case REGEX_UNICODE_CHARSET:
9105                     op = BOUNDU;
9106                     break;
9107                 case REGEX_ASCII_RESTRICTED_CHARSET:
9108                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9109                     op = BOUNDA;
9110                     break;
9111                 case REGEX_DEPENDS_CHARSET:
9112                     op = BOUND;
9113                     break;
9114                 default:
9115                     goto bad_charset;
9116             }
9117             ret = reg_node(pRExC_state, op);
9118             FLAGS(ret) = get_regex_charset(RExC_flags);
9119             *flagp |= SIMPLE;
9120             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9121                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
9122             }
9123             goto finish_meta_pat;
9124         case 'B':
9125             RExC_seen_zerolen++;
9126             RExC_seen |= REG_SEEN_LOOKBEHIND;
9127             switch (get_regex_charset(RExC_flags)) {
9128                 case REGEX_LOCALE_CHARSET:
9129                     op = NBOUNDL;
9130                     break;
9131                 case REGEX_UNICODE_CHARSET:
9132                     op = NBOUNDU;
9133                     break;
9134                 case REGEX_ASCII_RESTRICTED_CHARSET:
9135                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9136                     op = NBOUNDA;
9137                     break;
9138                 case REGEX_DEPENDS_CHARSET:
9139                     op = NBOUND;
9140                     break;
9141                 default:
9142                     goto bad_charset;
9143             }
9144             ret = reg_node(pRExC_state, op);
9145             FLAGS(ret) = get_regex_charset(RExC_flags);
9146             *flagp |= SIMPLE;
9147             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9148                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
9149             }
9150             goto finish_meta_pat;
9151         case 's':
9152             switch (get_regex_charset(RExC_flags)) {
9153                 case REGEX_LOCALE_CHARSET:
9154                     op = SPACEL;
9155                     break;
9156                 case REGEX_UNICODE_CHARSET:
9157                     op = SPACEU;
9158                     break;
9159                 case REGEX_ASCII_RESTRICTED_CHARSET:
9160                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9161                     op = SPACEA;
9162                     break;
9163                 case REGEX_DEPENDS_CHARSET:
9164                     op = SPACE;
9165                     break;
9166                 default:
9167                     goto bad_charset;
9168             }
9169             ret = reg_node(pRExC_state, op);
9170             *flagp |= HASWIDTH|SIMPLE;
9171             goto finish_meta_pat;
9172         case 'S':
9173             switch (get_regex_charset(RExC_flags)) {
9174                 case REGEX_LOCALE_CHARSET:
9175                     op = NSPACEL;
9176                     break;
9177                 case REGEX_UNICODE_CHARSET:
9178                     op = NSPACEU;
9179                     break;
9180                 case REGEX_ASCII_RESTRICTED_CHARSET:
9181                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9182                     op = NSPACEA;
9183                     break;
9184                 case REGEX_DEPENDS_CHARSET:
9185                     op = NSPACE;
9186                     break;
9187                 default:
9188                     goto bad_charset;
9189             }
9190             ret = reg_node(pRExC_state, op);
9191             *flagp |= HASWIDTH|SIMPLE;
9192             goto finish_meta_pat;
9193         case 'd':
9194             switch (get_regex_charset(RExC_flags)) {
9195                 case REGEX_LOCALE_CHARSET:
9196                     op = DIGITL;
9197                     break;
9198                 case REGEX_ASCII_RESTRICTED_CHARSET:
9199                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9200                     op = DIGITA;
9201                     break;
9202                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9203                 case REGEX_UNICODE_CHARSET:
9204                     op = DIGIT;
9205                     break;
9206                 default:
9207                     goto bad_charset;
9208             }
9209             ret = reg_node(pRExC_state, op);
9210             *flagp |= HASWIDTH|SIMPLE;
9211             goto finish_meta_pat;
9212         case 'D':
9213             switch (get_regex_charset(RExC_flags)) {
9214                 case REGEX_LOCALE_CHARSET:
9215                     op = NDIGITL;
9216                     break;
9217                 case REGEX_ASCII_RESTRICTED_CHARSET:
9218                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9219                     op = NDIGITA;
9220                     break;
9221                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9222                 case REGEX_UNICODE_CHARSET:
9223                     op = NDIGIT;
9224                     break;
9225                 default:
9226                     goto bad_charset;
9227             }
9228             ret = reg_node(pRExC_state, op);
9229             *flagp |= HASWIDTH|SIMPLE;
9230             goto finish_meta_pat;
9231         case 'R':
9232             ret = reg_node(pRExC_state, LNBREAK);
9233             *flagp |= HASWIDTH|SIMPLE;
9234             goto finish_meta_pat;
9235         case 'h':
9236             ret = reg_node(pRExC_state, HORIZWS);
9237             *flagp |= HASWIDTH|SIMPLE;
9238             goto finish_meta_pat;
9239         case 'H':
9240             ret = reg_node(pRExC_state, NHORIZWS);
9241             *flagp |= HASWIDTH|SIMPLE;
9242             goto finish_meta_pat;
9243         case 'v':
9244             ret = reg_node(pRExC_state, VERTWS);
9245             *flagp |= HASWIDTH|SIMPLE;
9246             goto finish_meta_pat;
9247         case 'V':
9248             ret = reg_node(pRExC_state, NVERTWS);
9249             *flagp |= HASWIDTH|SIMPLE;
9250          finish_meta_pat:
9251             nextchar(pRExC_state);
9252             Set_Node_Length(ret, 2); /* MJD */
9253             break;
9254         case 'p':
9255         case 'P':
9256             {
9257                 char* const oldregxend = RExC_end;
9258 #ifdef DEBUGGING
9259                 char* parse_start = RExC_parse - 2;
9260 #endif
9261
9262                 if (RExC_parse[1] == '{') {
9263                   /* a lovely hack--pretend we saw [\pX] instead */
9264                     RExC_end = strchr(RExC_parse, '}');
9265                     if (!RExC_end) {
9266                         const U8 c = (U8)*RExC_parse;
9267                         RExC_parse += 2;
9268                         RExC_end = oldregxend;
9269                         vFAIL2("Missing right brace on \\%c{}", c);
9270                     }
9271                     RExC_end++;
9272                 }
9273                 else {
9274                     RExC_end = RExC_parse + 2;
9275                     if (RExC_end > oldregxend)
9276                         RExC_end = oldregxend;
9277                 }
9278                 RExC_parse--;
9279
9280                 ret = regclass(pRExC_state,depth+1);
9281
9282                 RExC_end = oldregxend;
9283                 RExC_parse--;
9284
9285                 Set_Node_Offset(ret, parse_start + 2);
9286                 Set_Node_Cur_Length(ret);
9287                 nextchar(pRExC_state);
9288                 *flagp |= HASWIDTH|SIMPLE;
9289             }
9290             break;
9291         case 'N':
9292             /* Handle \N and \N{NAME} here and not below because it can be
9293             multicharacter. join_exact() will join them up later on.
9294             Also this makes sure that things like /\N{BLAH}+/ and
9295             \N{BLAH} being multi char Just Happen. dmq*/
9296             ++RExC_parse;
9297             ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9298             break;
9299         case 'k':    /* Handle \k<NAME> and \k'NAME' */
9300         parse_named_seq:
9301         {
9302             char ch= RExC_parse[1];
9303             if (ch != '<' && ch != '\'' && ch != '{') {
9304                 RExC_parse++;
9305                 vFAIL2("Sequence %.2s... not terminated",parse_start);
9306             } else {
9307                 /* this pretty much dupes the code for (?P=...) in reg(), if
9308                    you change this make sure you change that */
9309                 char* name_start = (RExC_parse += 2);
9310                 U32 num = 0;
9311                 SV *sv_dat = reg_scan_name(pRExC_state,
9312                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9313                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9314                 if (RExC_parse == name_start || *RExC_parse != ch)
9315                     vFAIL2("Sequence %.3s... not terminated",parse_start);
9316
9317                 if (!SIZE_ONLY) {
9318                     num = add_data( pRExC_state, 1, "S" );
9319                     RExC_rxi->data->data[num]=(void*)sv_dat;
9320                     SvREFCNT_inc_simple_void(sv_dat);
9321                 }
9322
9323                 RExC_sawback = 1;
9324                 ret = reganode(pRExC_state,
9325                                ((! FOLD)
9326                                  ? NREF
9327                                  : (MORE_ASCII_RESTRICTED)
9328                                    ? NREFFA
9329                                    : (AT_LEAST_UNI_SEMANTICS)
9330                                      ? NREFFU
9331                                      : (LOC)
9332                                        ? NREFFL
9333                                        : NREFF),
9334                                 num);
9335                 *flagp |= HASWIDTH;
9336
9337                 /* override incorrect value set in reganode MJD */
9338                 Set_Node_Offset(ret, parse_start+1);
9339                 Set_Node_Cur_Length(ret); /* MJD */
9340                 nextchar(pRExC_state);
9341
9342             }
9343             break;
9344         }
9345         case 'g':
9346         case '1': case '2': case '3': case '4':
9347         case '5': case '6': case '7': case '8': case '9':
9348             {
9349                 I32 num;
9350                 bool isg = *RExC_parse == 'g';
9351                 bool isrel = 0;
9352                 bool hasbrace = 0;
9353                 if (isg) {
9354                     RExC_parse++;
9355                     if (*RExC_parse == '{') {
9356                         RExC_parse++;
9357                         hasbrace = 1;
9358                     }
9359                     if (*RExC_parse == '-') {
9360                         RExC_parse++;
9361                         isrel = 1;
9362                     }
9363                     if (hasbrace && !isDIGIT(*RExC_parse)) {
9364                         if (isrel) RExC_parse--;
9365                         RExC_parse -= 2;
9366                         goto parse_named_seq;
9367                 }   }
9368                 num = atoi(RExC_parse);
9369                 if (isg && num == 0)
9370                     vFAIL("Reference to invalid group 0");
9371                 if (isrel) {
9372                     num = RExC_npar - num;
9373                     if (num < 1)
9374                         vFAIL("Reference to nonexistent or unclosed group");
9375                 }
9376                 if (!isg && num > 9 && num >= RExC_npar)
9377                     goto defchar;
9378                 else {
9379                     char * const parse_start = RExC_parse - 1; /* MJD */
9380                     while (isDIGIT(*RExC_parse))
9381                         RExC_parse++;
9382                     if (parse_start == RExC_parse - 1)
9383                         vFAIL("Unterminated \\g... pattern");
9384                     if (hasbrace) {
9385                         if (*RExC_parse != '}')
9386                             vFAIL("Unterminated \\g{...} pattern");
9387                         RExC_parse++;
9388                     }
9389                     if (!SIZE_ONLY) {
9390                         if (num > (I32)RExC_rx->nparens)
9391                             vFAIL("Reference to nonexistent group");
9392                     }
9393                     RExC_sawback = 1;
9394                     ret = reganode(pRExC_state,
9395                                    ((! FOLD)
9396                                      ? REF
9397                                      : (MORE_ASCII_RESTRICTED)
9398                                        ? REFFA
9399                                        : (AT_LEAST_UNI_SEMANTICS)
9400                                          ? REFFU
9401                                          : (LOC)
9402                                            ? REFFL
9403                                            : REFF),
9404                                     num);
9405                     *flagp |= HASWIDTH;
9406
9407                     /* override incorrect value set in reganode MJD */
9408                     Set_Node_Offset(ret, parse_start+1);
9409                     Set_Node_Cur_Length(ret); /* MJD */
9410                     RExC_parse--;
9411                     nextchar(pRExC_state);
9412                 }
9413             }
9414             break;
9415         case '\0':
9416             if (RExC_parse >= RExC_end)
9417                 FAIL("Trailing \\");
9418             /* FALL THROUGH */
9419         default:
9420             /* Do not generate "unrecognized" warnings here, we fall
9421                back into the quick-grab loop below */
9422             parse_start--;
9423             goto defchar;
9424         }
9425         break;
9426
9427     case '#':
9428         if (RExC_flags & RXf_PMf_EXTENDED) {
9429             if ( reg_skipcomment( pRExC_state ) )
9430                 goto tryagain;
9431         }
9432         /* FALL THROUGH */
9433
9434     default:
9435
9436             parse_start = RExC_parse - 1;
9437
9438             RExC_parse++;
9439
9440         defchar: {
9441             register STRLEN len;
9442             register UV ender;
9443             register char *p;
9444             char *s;
9445             STRLEN foldlen;
9446             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9447             U8 node_type;
9448
9449             /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
9450              * it is folded to 'ss' even if not utf8 */
9451             bool is_exactfu_sharp_s;
9452
9453             ender = 0;
9454             node_type = ((! FOLD) ? EXACT
9455                         : (LOC)
9456                           ? EXACTFL
9457                           : (MORE_ASCII_RESTRICTED)
9458                             ? EXACTFA
9459                             : (AT_LEAST_UNI_SEMANTICS)
9460                               ? EXACTFU
9461                               : EXACTF);
9462             ret = reg_node(pRExC_state, node_type);
9463             s = STRING(ret);
9464
9465             /* XXX The node can hold up to 255 bytes, yet this only goes to
9466              * 127.  I (khw) do not know why.  Keeping it somewhat less than
9467              * 255 allows us to not have to worry about overflow due to
9468              * converting to utf8 and fold expansion, but that value is
9469              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
9470              * split up by this limit into a single one using the real max of
9471              * 255.  Even at 127, this breaks under rare circumstances.  If
9472              * folding, we do not want to split a node at a character that is a
9473              * non-final in a multi-char fold, as an input string could just
9474              * happen to want to match across the node boundary.  The join
9475              * would solve that problem if the join actually happens.  But a
9476              * series of more than two nodes in a row each of 127 would cause
9477              * the first join to succeed to get to 254, but then there wouldn't
9478              * be room for the next one, which could be one of those split
9479              * multi-char folds.  I don't know of any fool-proof solution.  One
9480              * could back off to end with only a code point that isn't such a
9481              * non-final, but it is possible for there not to be any in the
9482              * entire node. */
9483             for (len = 0, p = RExC_parse - 1;
9484                  len < 127 && p < RExC_end;
9485                  len++)
9486             {
9487                 char * const oldp = p;
9488
9489                 if (RExC_flags & RXf_PMf_EXTENDED)
9490                     p = regwhite( pRExC_state, p );
9491                 switch ((U8)*p) {
9492                 case '^':
9493                 case '$':
9494                 case '.':
9495                 case '[':
9496                 case '(':
9497                 case ')':
9498                 case '|':
9499                     goto loopdone;
9500                 case '\\':
9501                     /* Literal Escapes Switch
9502
9503                        This switch is meant to handle escape sequences that
9504                        resolve to a literal character.
9505
9506                        Every escape sequence that represents something
9507                        else, like an assertion or a char class, is handled
9508                        in the switch marked 'Special Escapes' above in this
9509                        routine, but also has an entry here as anything that
9510                        isn't explicitly mentioned here will be treated as
9511                        an unescaped equivalent literal.
9512                     */
9513
9514                     switch ((U8)*++p) {
9515                     /* These are all the special escapes. */
9516                     case 'A':             /* Start assertion */
9517                     case 'b': case 'B':   /* Word-boundary assertion*/
9518                     case 'C':             /* Single char !DANGEROUS! */
9519                     case 'd': case 'D':   /* digit class */
9520                     case 'g': case 'G':   /* generic-backref, pos assertion */
9521                     case 'h': case 'H':   /* HORIZWS */
9522                     case 'k': case 'K':   /* named backref, keep marker */
9523                     case 'N':             /* named char sequence */
9524                     case 'p': case 'P':   /* Unicode property */
9525                               case 'R':   /* LNBREAK */
9526                     case 's': case 'S':   /* space class */
9527                     case 'v': case 'V':   /* VERTWS */
9528                     case 'w': case 'W':   /* word class */
9529                     case 'X':             /* eXtended Unicode "combining character sequence" */
9530                     case 'z': case 'Z':   /* End of line/string assertion */
9531                         --p;
9532                         goto loopdone;
9533
9534                     /* Anything after here is an escape that resolves to a
9535                        literal. (Except digits, which may or may not)
9536                      */
9537                     case 'n':
9538                         ender = '\n';
9539                         p++;
9540                         break;
9541                     case 'r':
9542                         ender = '\r';
9543                         p++;
9544                         break;
9545                     case 't':
9546                         ender = '\t';
9547                         p++;
9548                         break;
9549                     case 'f':
9550                         ender = '\f';
9551                         p++;
9552                         break;
9553                     case 'e':
9554                           ender = ASCII_TO_NATIVE('\033');
9555                         p++;
9556                         break;
9557                     case 'a':
9558                           ender = ASCII_TO_NATIVE('\007');
9559                         p++;
9560                         break;
9561                     case 'o':
9562                         {
9563                             STRLEN brace_len = len;
9564                             UV result;
9565                             const char* error_msg;
9566
9567                             bool valid = grok_bslash_o(p,
9568                                                        &result,
9569                                                        &brace_len,
9570                                                        &error_msg,
9571                                                        1);
9572                             p += brace_len;
9573                             if (! valid) {
9574                                 RExC_parse = p; /* going to die anyway; point
9575                                                    to exact spot of failure */
9576                                 vFAIL(error_msg);
9577                             }
9578                             else
9579                             {
9580                                 ender = result;
9581                             }
9582                             if (PL_encoding && ender < 0x100) {
9583                                 goto recode_encoding;
9584                             }
9585                             if (ender > 0xff) {
9586                                 REQUIRE_UTF8;
9587                             }
9588                             break;
9589                         }
9590                     case 'x':
9591                         if (*++p == '{') {
9592                             char* const e = strchr(p, '}');
9593
9594                             if (!e) {
9595                                 RExC_parse = p + 1;
9596                                 vFAIL("Missing right brace on \\x{}");
9597                             }
9598                             else {
9599                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9600                                     | PERL_SCAN_DISALLOW_PREFIX;
9601                                 STRLEN numlen = e - p - 1;
9602                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9603                                 if (ender > 0xff)
9604                                     REQUIRE_UTF8;
9605                                 p = e + 1;
9606                             }
9607                         }
9608                         else {
9609                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9610                             STRLEN numlen = 2;
9611                             ender = grok_hex(p, &numlen, &flags, NULL);
9612                             p += numlen;
9613                         }
9614                         if (PL_encoding && ender < 0x100)
9615                             goto recode_encoding;
9616                         break;
9617                     case 'c':
9618                         p++;
9619                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9620                         break;
9621                     case '0': case '1': case '2': case '3':case '4':
9622                     case '5': case '6': case '7': case '8':case '9':
9623                         if (*p == '0' ||
9624                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9625                         {
9626                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9627                             STRLEN numlen = 3;
9628                             ender = grok_oct(p, &numlen, &flags, NULL);
9629                             if (ender > 0xff) {
9630                                 REQUIRE_UTF8;
9631                             }
9632                             p += numlen;
9633                         }
9634                         else {
9635                             --p;
9636                             goto loopdone;
9637                         }
9638                         if (PL_encoding && ender < 0x100)
9639                             goto recode_encoding;
9640                         break;
9641                     recode_encoding:
9642                         if (! RExC_override_recoding) {
9643                             SV* enc = PL_encoding;
9644                             ender = reg_recode((const char)(U8)ender, &enc);
9645                             if (!enc && SIZE_ONLY)
9646                                 ckWARNreg(p, "Invalid escape in the specified encoding");
9647                             REQUIRE_UTF8;
9648                         }
9649                         break;
9650                     case '\0':
9651                         if (p >= RExC_end)
9652                             FAIL("Trailing \\");
9653                         /* FALL THROUGH */
9654                     default:
9655                         if (!SIZE_ONLY&& isALPHA(*p)) {
9656                             /* Include any { following the alpha to emphasize
9657                              * that it could be part of an escape at some point
9658                              * in the future */
9659                             int len = (*(p + 1) == '{') ? 2 : 1;
9660                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
9661                         }
9662                         goto normal_default;
9663                     }
9664                     break;
9665                 default:
9666                   normal_default:
9667                     if (UTF8_IS_START(*p) && UTF) {
9668                         STRLEN numlen;
9669                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9670                                                &numlen, UTF8_ALLOW_DEFAULT);
9671                         p += numlen;
9672                     }
9673                     else
9674                         ender = (U8) *p++;
9675                     break;
9676                 } /* End of switch on the literal */
9677
9678                 is_exactfu_sharp_s = (node_type == EXACTFU
9679                                       && ender == LATIN_SMALL_LETTER_SHARP_S);
9680                 if ( RExC_flags & RXf_PMf_EXTENDED)
9681                     p = regwhite( pRExC_state, p );
9682                 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9683                     /* Prime the casefolded buffer.  Locale rules, which apply
9684                      * only to code points < 256, aren't known until execution,
9685                      * so for them, just output the original character using
9686                      * utf8.  If we start to fold non-UTF patterns, be sure to
9687                      * update join_exact() */
9688                     if (LOC && ender < 256) {
9689                         if (UNI_IS_INVARIANT(ender)) {
9690                             *tmpbuf = (U8) ender;
9691                             foldlen = 1;
9692                         } else {
9693                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9694                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9695                             foldlen = 2;
9696                         }
9697                     }
9698                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
9699                                                  */
9700                         ender = toLOWER(ender);
9701                         *tmpbuf = (U8) ender;
9702                         foldlen = 1;
9703                     }
9704                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9705
9706                         /* Locale and /aa require more selectivity about the
9707                          * fold, so are handled below.  Otherwise, here, just
9708                          * use the fold */
9709                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9710                     }
9711                     else {
9712                         /* Under locale rules or /aa we are not to mix,
9713                          * respectively, ords < 256 or ASCII with non-.  So
9714                          * reject folds that mix them, using only the
9715                          * non-folded code point.  So do the fold to a
9716                          * temporary, and inspect each character in it. */
9717                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9718                         U8* s = trialbuf;
9719                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9720                         U8* e = s + foldlen;
9721                         bool fold_ok = TRUE;
9722
9723                         while (s < e) {
9724                             if (isASCII(*s)
9725                                 || (LOC && (UTF8_IS_INVARIANT(*s)
9726                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
9727                             {
9728                                 fold_ok = FALSE;
9729                                 break;
9730                             }
9731                             s += UTF8SKIP(s);
9732                         }
9733                         if (fold_ok) {
9734                             Copy(trialbuf, tmpbuf, foldlen, U8);
9735                             ender = tmpender;
9736                         }
9737                         else {
9738                             uvuni_to_utf8(tmpbuf, ender);
9739                             foldlen = UNISKIP(ender);
9740                         }
9741                     }
9742                 }
9743                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9744                     if (len)
9745                         p = oldp;
9746                     else if (UTF || is_exactfu_sharp_s) {
9747                          if (FOLD) {
9748                               /* Emit all the Unicode characters. */
9749                               STRLEN numlen;
9750                               for (foldbuf = tmpbuf;
9751                                    foldlen;
9752                                    foldlen -= numlen) {
9753                                    ender = utf8_to_uvchr(foldbuf, &numlen);
9754                                    if (numlen > 0) {
9755                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
9756                                         s       += unilen;
9757                                         len     += unilen;
9758                                         /* In EBCDIC the numlen
9759                                          * and unilen can differ. */
9760                                         foldbuf += numlen;
9761                                         if (numlen >= foldlen)
9762                                              break;
9763                                    }
9764                                    else
9765                                         break; /* "Can't happen." */
9766                               }
9767                          }
9768                          else {
9769                               const STRLEN unilen = reguni(pRExC_state, ender, s);
9770                               if (unilen > 0) {
9771                                    s   += unilen;
9772                                    len += unilen;
9773                               }
9774                          }
9775                     }
9776                     else {
9777                         len++;
9778                         REGC((char)ender, s++);
9779                     }
9780                     break;
9781                 }
9782                 if (UTF || is_exactfu_sharp_s) {
9783                      if (FOLD) {
9784                           /* Emit all the Unicode characters. */
9785                           STRLEN numlen;
9786                           for (foldbuf = tmpbuf;
9787                                foldlen;
9788                                foldlen -= numlen) {
9789                                ender = utf8_to_uvchr(foldbuf, &numlen);
9790                                if (numlen > 0) {
9791                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
9792                                     len     += unilen;
9793                                     s       += unilen;
9794                                     /* In EBCDIC the numlen
9795                                      * and unilen can differ. */
9796                                     foldbuf += numlen;
9797                                     if (numlen >= foldlen)
9798                                          break;
9799                                }
9800                                else
9801                                     break;
9802                           }
9803                      }
9804                      else {
9805                           const STRLEN unilen = reguni(pRExC_state, ender, s);
9806                           if (unilen > 0) {
9807                                s   += unilen;
9808                                len += unilen;
9809                           }
9810                      }
9811                      len--;
9812                 }
9813                 else {
9814                     REGC((char)ender, s++);
9815                 }
9816             }
9817         loopdone:   /* Jumped to when encounters something that shouldn't be in
9818                        the node */
9819             RExC_parse = p - 1;
9820             Set_Node_Cur_Length(ret); /* MJD */
9821             nextchar(pRExC_state);
9822             {
9823                 /* len is STRLEN which is unsigned, need to copy to signed */
9824                 IV iv = len;
9825                 if (iv < 0)
9826                     vFAIL("Internal disaster");
9827             }
9828             if (len > 0)
9829                 *flagp |= HASWIDTH;
9830             if (len == 1 && UNI_IS_INVARIANT(ender))
9831                 *flagp |= SIMPLE;
9832
9833             if (SIZE_ONLY)
9834                 RExC_size += STR_SZ(len);
9835             else {
9836                 STR_LEN(ret) = len;
9837                 RExC_emit += STR_SZ(len);
9838             }
9839         }
9840         break;
9841     }
9842
9843     return(ret);
9844
9845 /* Jumped to when an unrecognized character set is encountered */
9846 bad_charset:
9847     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9848     return(NULL);
9849 }
9850
9851 /* Advance p over any run of whitespace and '#'-to-newline comments and return
9852  * the new position; an unterminated comment sets REG_SEEN_RUN_ON_COMMENT. */
9853 STATIC char *
9854 S_regwhite( RExC_state_t *pRExC_state, char *p )
9855 {
9856     const char * const limit = RExC_end;    /* scanning never passes this */
9857     PERL_ARGS_ASSERT_REGWHITE;
9858
9859     for (;;) {
9860         if (p >= limit)
9861             break;
9862         if (isSPACE(*p)) {
9863             p++;
9864             continue;
9865         }
9866         if (*p != '#')
9867             break;
9868         while (p < limit && *p != '\n')     /* discard the comment text */
9869             p++;
9870         if (p < limit)
9871             p++;                            /* and the newline closing it */
9872         else    /* no newline before the end: record a run-on comment */
9873             RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9874     }
9875     return p;
9876 }
9877
9878 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9879    Character classes ([:foo:]) can also be negated ([:^foo:]).
9880    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9881    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9882    but trigger failures because they are currently unimplemented. */
9883
9884 #define POSIXCC_DONE(c)   ((c) == ':')
9885 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9886 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9887
9888 STATIC I32
9889 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9890 {
9891     dVAR;
9892     I32 namedclass = OOB_NAMEDCLASS;
9893
9894     PERL_ARGS_ASSERT_REGPPOSIXCC;
9895
9896     if (value == '[' && RExC_parse + 1 < RExC_end &&
9897         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9898         POSIXCC(UCHARAT(RExC_parse))) {
9899         const char c = UCHARAT(RExC_parse);
9900         char* const s = RExC_parse++;
9901
9902         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9903             RExC_parse++;
9904         if (RExC_parse == RExC_end)
9905             /* Grandfather lone [:, [=, [. */
9906             RExC_parse = s;
9907         else {
9908             const char* const t = RExC_parse++; /* skip over the c */
9909             assert(*t == c);
9910
9911             if (UCHARAT(RExC_parse) == ']') {
9912                 const char *posixcc = s + 1;
9913                 RExC_parse++; /* skip over the ending ] */
9914
9915                 if (*s == ':') {
9916                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9917                     const I32 skip = t - posixcc;
9918
9919                     /* Initially switch on the length of the name.  */
9920                     switch (skip) {
9921                     case 4:
9922                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9923                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9924                         break;
9925                     case 5:
9926                         /* Names all of length 5.  */
9927                         /* alnum alpha ascii blank cntrl digit graph lower
9928                            print punct space upper  */
9929                         /* Offset 4 gives the best switch position.  */
9930                         switch (posixcc[4]) {
9931                         case 'a':
9932                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9933                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9934                             break;
9935                         case 'e':
9936                             if (memEQ(posixcc, "spac", 4)) /* space */
9937                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9938                             break;
9939                         case 'h':
9940                             if (memEQ(posixcc, "grap", 4)) /* graph */
9941                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9942                             break;
9943                         case 'i':
9944                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9945                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9946                             break;
9947                         case 'k':
9948                             if (memEQ(posixcc, "blan", 4)) /* blank */
9949                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9950                             break;
9951                         case 'l':
9952                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9953                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9954                             break;
9955                         case 'm':
9956                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9957                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9958                             break;
9959                         case 'r':
9960                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9961                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9962                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9963                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9964                             break;
9965                         case 't':
9966                             if (memEQ(posixcc, "digi", 4)) /* digit */
9967                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9968                             else if (memEQ(posixcc, "prin", 4)) /* print */
9969                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9970                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9971                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9972                             break;
9973                         }
9974                         break;
9975                     case 6:
9976                         if (memEQ(posixcc, "xdigit", 6))
9977                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9978                         break;
9979                     }
9980
9981                     if (namedclass == OOB_NAMEDCLASS)
9982                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9983                                       t - s - 1, s + 1);
9984                     assert (posixcc[skip] == ':');
9985                     assert (posixcc[skip+1] == ']');
9986                 } else if (!SIZE_ONLY) {
9987                     /* [[=foo=]] and [[.foo.]] are still future. */
9988
9989                     /* adjust RExC_parse so the warning shows after
9990                        the class closes */
9991                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9992                         RExC_parse++;
9993                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9994                 }
9995             } else {
9996                 /* Maternal grandfather:
9997                  * "[:" ending in ":" but not in ":]" */
9998                 RExC_parse = s;
9999             }
10000         }
10001     }
10002
10003     return namedclass;
10004 }
10005
10006 STATIC void
10007 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10008 {
10009     dVAR;
10010
10011     PERL_ARGS_ASSERT_CHECKPOSIXCC;
10012
10013     if (POSIXCC(UCHARAT(RExC_parse))) {
10014         const char *s = RExC_parse;
10015         const char  c = *s++;
10016
10017         while (isALNUM(*s))
10018             s++;
10019         if (*s && c == *s && s[1] == ']') {
10020             ckWARN3reg(s+2,
10021                        "POSIX syntax [%c %c] belongs inside character classes",
10022                        c, c);
10023
10024             /* [[=foo=]] and [[.foo.]] are still future. */
10025             if (POSIXCC_NOTYET(c)) {
10026                 /* adjust RExC_parse so the error shows after
10027                    the class closes */
10028                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10029                     NOOP;
10030                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10031             }
10032         }
10033     }
10034 }
10035
/* Emit the statements that add a full posix character <class> to the
 * bracketed character class being built.
 *  node         the ANYOF node; it is touched only under locale rules, where
 *               <class> is set in it
 *  destlist     the inversion list (an lvalue) that accumulates the
 *               non-locale code points of the bracketed class
 *  sourcelist   the ASCII-range inversion list, used under /a rules
 *  Xsourcelist  the full Unicode range inversion list, used otherwise */
#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
    if (! LOC) {                                                           \
        /* Not locale: merge the applicable list into what is there */     \
        _invlist_union(destlist,                                           \
                       (AT_LEAST_ASCII_RESTRICTED)                         \
                           ? sourcelist                                    \
                           : Xsourcelist,                                  \
                       &destlist);                                         \
    }                                                                      \
    else {                                                                 \
        SV* scratch_list = NULL;                                           \
                                                                           \
        /* Under locale the class itself is resolved at run time, so       \
         * record it in the node */                                        \
        ANYOF_CLASS_SET(node, class);                                      \
                                                                           \
        /* Code points above Latin1 come from the full Unicode list */     \
        _invlist_intersection(PL_AboveLatin1,                              \
                              Xsourcelist,                                 \
                              &scratch_list);                              \
                                                                           \
        /* Merge into an existing <destlist> and release the temporary;    \
         * with no existing list, the temporary simply becomes             \
         * <destlist>, which avoids an extra clone. */                     \
        if (destlist) {                                                    \
            _invlist_union(destlist, scratch_list, &destlist);             \
            SvREFCNT_dec(scratch_list);                                    \
        }                                                                  \
        else {                                                             \
            destlist = scratch_list;                                       \
        }                                                                  \
    }
10074
/* The complemented counterpart of DO_POSIX: adds everything that is NOT in
 * <sourcelist> (under /a rules) or <Xsourcelist> (otherwise) to <destlist>.
 * The parameters have the same meanings as for DO_POSIX. */
#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
    if (! LOC) {                                                           \
        _invlist_union_complement_2nd(destlist,                            \
                                    (AT_LEAST_ASCII_RESTRICTED)            \
                                        ? sourcelist                       \
                                        : Xsourcelist,                     \
                                    &destlist);                            \
        /* Under /d, the whole upper half of the Latin1 range matches      \
         * this complement */                                              \
        if (DEPENDS_SEMANTICS) {                                           \
            ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
        }                                                                  \
    }                                                                      \
    else {                                                                 \
        SV* scratch_list = NULL;                                           \
                                                                           \
        /* Locale: the class is recorded in the node, and only the above   \
         * Latin1 code points outside <Xsourcelist> are added here */      \
        ANYOF_CLASS_SET(node, class);                                      \
        _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
                                                                           \
        if (destlist) {                                                    \
            _invlist_union(destlist, scratch_list, &destlist);             \
            SvREFCNT_dec(scratch_list);                                    \
        }                                                                  \
        else {                                                             \
            destlist = scratch_list;                                       \
        }                                                                  \
    }
10102
/* Emit the statements that add a posix character <class> to the bracketed
 * character class being built, for classes where only the Latin1 members are
 * known at compile time.  It is essentially DO_POSIX with the above-Latin1
 * part deferred to run time.
 *  node           the ANYOF node; touched only under locale rules, where
 *                 <class> is set in it
 *  destlist       the inversion list (an lvalue) accumulating the non-locale
 *                 code points of the bracketed class
 *  sourcelist     the ASCII-range inversion list, used under /a rules
 *  l1_sourcelist  the Latin1-range inversion list, used otherwise
 *  Xpropertyname  name of the property giving the code points above Latin1,
 *                 which have to be determined at run time
 *  run_time_list  an SV* holding the text names of properties to be computed
 *                 at run time; "+utf8::<Xpropertyname>\n" is appended to it
 *                 unless /a rules are in effect
 *
 * Note that this expands to two consecutive statements. */
#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
                              l1_sourcelist, Xpropertyname, run_time_list) \
    /* Without /a there are code points whose membership has to be looked  \
     * up at run time */                                                   \
    if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
        Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
    }                                                                      \
    if (! LOC) {                                                           \
        _invlist_union(destlist,                                           \
                       (AT_LEAST_ASCII_RESTRICTED)                         \
                           ? sourcelist                                    \
                           : l1_sourcelist,                                \
                       &destlist);                                         \
    }                                                                      \
    else {                                                                 \
        ANYOF_CLASS_SET(node, class);                                      \
    }
10134
/* The complemented counterpart of DO_POSIX_LATIN1_ONLY_KNOWN; a combination
 * of that macro and DO_N_POSIX.
 *  node           the ANYOF node; <class> is set in it under locale rules,
 *                 and ANYOF_NON_UTF8_LATIN1_ALL is set in its flags under /d
 *  class          the named class to record in <node> under locale rules
 *  destlist       the inversion list (an lvalue) accumulating the non-locale
 *                 code points of the bracketed class
 *  sourcelist     the ASCII-range inversion list whose complement is added
 *                 under /a rules
 *  l1_sourcelist  the Latin1-range inversion list; otherwise the Latin1 code
 *                 points not in it are added
 *  Xpropertyname  name of the property for the code points above Latin1;
 *                 "!utf8::<Xpropertyname>\n" is appended to <run_time_list>
 *                 unless /a rules are in effect
 *  run_time_list  an SV* holding the text names of properties to be computed
 *                 at run time */
#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
                              l1_sourcelist, Xpropertyname, run_time_list) \
    if (AT_LEAST_ASCII_RESTRICTED) {                                       \
        _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
    }                                                                      \
    else {                                                                 \
        Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
        if (LOC) {                                                         \
            /* Use the macro's own <class> parameter rather than relying   \
             * on a variable that happens to exist at the call site */     \
            ANYOF_CLASS_SET(node, class);                                  \
        }                                                                  \
        else {                                                             \
            SV* scratch_list = NULL;                                       \
            _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
            /* Adopt the temporary if there is no list yet; otherwise      \
             * merge it in and release it */                               \
            if (! destlist) {                                              \
                destlist = scratch_list;                                   \
            }                                                              \
            else {                                                         \
                _invlist_union(destlist, scratch_list, &destlist);         \
                SvREFCNT_dec(scratch_list);                                \
            }                                                              \
            /* Under /d, everything in the upper half of the Latin1 range  \
             * matches this complement */                                  \
            if (DEPENDS_SEMANTICS) {                                       \
                ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
            }                                                              \
        }                                                                  \
    }
10162
10163 STATIC U8
10164 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10165 {
10166
10167     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10168      * Locale folding is done at run-time, so this function should not be
10169      * called for nodes that are for locales.
10170      *
10171      * This function sets the bit corresponding to the fold of the input
10172      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
10173      * 'F' is 'f'.
10174      *
10175      * It also knows about the characters that are in the bitmap that have
10176      * folds that are matchable only outside it, and sets the appropriate lists
10177      * and flags.
10178      *
10179      * It returns the number of bits that actually changed from 0 to 1 */
10180
10181     U8 stored = 0;
10182     U8 fold;
10183
10184     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10185
10186     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10187                                     : PL_fold[value];
10188
10189     /* It assumes the bit for 'value' has already been set */
10190     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10191         ANYOF_BITMAP_SET(node, fold);
10192         stored++;
10193     }
10194     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10195         /* Certain Latin1 characters have matches outside the bitmap.  To get
10196          * here, 'value' is one of those characters.   None of these matches is
10197          * valid for ASCII characters under /aa, which have been excluded by
10198          * the 'if' above.  The matches fall into three categories:
10199          * 1) They are singly folded-to or -from an above 255 character, as
10200          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10201          *    WITH DIAERESIS;
10202          * 2) They are part of a multi-char fold with another character in the
10203          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10204          * 3) They are part of a multi-char fold with a character not in the
10205          *    bitmap, such as various ligatures.
10206          * We aren't dealing fully with multi-char folds, except we do deal
10207          * with the pattern containing a character that has a multi-char fold
10208          * (not so much the inverse).
10209          * For types 1) and 3), the matches only happen when the target string
10210          * is utf8; that's not true for 2), and we set a flag for it.
10211          *
10212          * The code below adds to the passed in inversion list the single fold
10213          * closures for 'value'.  The values are hard-coded here so that an
10214          * innocent-looking character class, like /[ks]/i won't have to go out
10215          * to disk to find the possible matches.  XXX It would be better to
10216          * generate these via regen, in case a new version of the Unicode
10217          * standard adds new mappings, though that is not really likely. */
10218         switch (value) {
10219             case 'k':
10220             case 'K':
10221                 /* KELVIN SIGN */
10222                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10223                 break;
10224             case 's':
10225             case 'S':
10226                 /* LATIN SMALL LETTER LONG S */
10227                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10228                 break;
10229             case MICRO_SIGN:
10230                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10231                                                  GREEK_SMALL_LETTER_MU);
10232                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10233                                                  GREEK_CAPITAL_LETTER_MU);
10234                 break;
10235             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10236             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10237                 /* ANGSTROM SIGN */
10238                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10239                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
10240                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10241                                                      PL_fold_latin1[value]);
10242                 }
10243                 break;
10244             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10245                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10246                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10247                 break;
10248             case LATIN_SMALL_LETTER_SHARP_S:
10249                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10250                                         LATIN_CAPITAL_LETTER_SHARP_S);
10251
10252                 /* Under /a, /d, and /u, this can match the two chars "ss" */
10253                 if (! MORE_ASCII_RESTRICTED) {
10254                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
10255
10256                     /* And under /u or /a, it can match even if the target is
10257                      * not utf8 */
10258                     if (AT_LEAST_UNI_SEMANTICS) {
10259                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10260                     }
10261                 }
10262                 break;
10263             case 'F': case 'f':
10264             case 'I': case 'i':
10265             case 'L': case 'l':
10266             case 'T': case 't':
10267             case 'A': case 'a':
10268             case 'H': case 'h':
10269             case 'J': case 'j':
10270             case 'N': case 'n':
10271             case 'W': case 'w':
10272             case 'Y': case 'y':
10273                 /* These all are targets of multi-character folds from code
10274                  * points that require UTF8 to express, so they can't match
10275                  * unless the target string is in UTF-8, so no action here is
10276                  * necessary, as regexec.c properly handles the general case
10277                  * for UTF-8 matching */
10278                 break;
10279             default:
10280                 /* Use deprecated warning to increase the chances of this
10281                  * being output */
10282                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10283                 break;
10284         }
10285     }
10286     else if (DEPENDS_SEMANTICS
10287             && ! isASCII(value)
10288             && PL_fold_latin1[value] != value)
10289     {
10290            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10291             * folds only when the target string is in UTF-8.  We add the fold
10292             * here to the list of things to match outside the bitmap, which
10293             * won't be looked at unless it is UTF8 (or else if something else
10294             * says to look even if not utf8, but those things better not happen
10295             * under DEPENDS semantics. */
10296         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10297     }
10298
10299     return stored;
10300 }
10301
10302
10303 PERL_STATIC_INLINE U8
10304 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10305 {
10306     /* This inline function sets a bit in the bitmap if not already set, and if
10307      * appropriate, its fold, returning the number of bits that actually
10308      * changed from 0 to 1 */
10309
10310     U8 stored;
10311
10312     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10313
10314     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
10315         return 0;
10316     }
10317
10318     ANYOF_BITMAP_SET(node, value);
10319     stored = 1;
10320
10321     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
10322         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10323     }
10324
10325     return stored;
10326 }
10327
10328 STATIC void
10329 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10330 {
10331     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10332      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
10333      * the multi-character folds of characters in the node */
10334     SV *sv;
10335
10336     PERL_ARGS_ASSERT_ADD_ALTERNATE;
10337
10338     if (! *alternate_ptr) {
10339         *alternate_ptr = newAV();
10340     }
10341     sv = newSVpvn_utf8((char*)string, len, TRUE);
10342     av_push(*alternate_ptr, sv);
10343     return;
10344 }
10345
10346 /*
10347    parse a class specification and produce either an ANYOF node that
10348    matches the pattern or perhaps will be optimized into an EXACTish node
10349    instead. The node contains a bit map for the first 256 characters, with the
10350    corresponding bit set if that character is in the list.  For characters
10351    above 255, a range list is used */
10352
10353 STATIC regnode *
10354 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10355 {
10356     dVAR;
10357     register UV nextvalue;
10358     register IV prevvalue = OOB_UNICODE;
10359     register IV range = 0;
10360     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10361     register regnode *ret;
10362     STRLEN numlen;
10363     IV namedclass;
10364     char *rangebegin = NULL;
10365     bool need_class = 0;
10366     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
10367     SV *listsv = NULL;
10368     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10369                                       than just initialized.  */
10370     SV* properties = NULL;    /* Code points that match \p{} \P{} */
10371     UV element_count = 0;   /* Number of distinct elements in the class.
10372                                Optimizations may be possible if this is tiny */
10373     UV n;
10374
10375     /* Unicode properties are stored in a swash; this holds the current one
10376      * being parsed.  If this swash is the only above-latin1 component of the
10377      * character class, an optimization is to pass it directly on to the
10378      * execution engine.  Otherwise, it is set to NULL to indicate that there
10379      * are other things in the class that have to be dealt with at execution
10380      * time */
10381     SV* swash = NULL;           /* Code points that match \p{} \P{} */
10382
10383     /* Set if a component of this character class is user-defined; just passed
10384      * on to the engine */
10385     UV has_user_defined_property = 0;
10386
10387     /* code points this node matches that can't be stored in the bitmap */
10388     SV* nonbitmap = NULL;
10389
10390     /* The items that are to match that aren't stored in the bitmap, but are a
10391      * result of things that are stored there.  This is the fold closure of
10392      * such a character, either because it has DEPENDS semantics and shouldn't
10393      * be matched unless the target string is utf8, or is a code point that is
10394      * too large for the bit map, as for example, the fold of the MICRO SIGN is
10395      * above 255.  This all is solely for performance reasons.  By having this
10396      * code know the outside-the-bitmap folds that the bitmapped characters are
10397      * involved with, we don't have to go out to disk to find the list of
10398      * matches, unless the character class includes code points that aren't
10399      * storable in the bit map.  That means that a character class with an 's'
10400      * in it, for example, doesn't need to go out to disk to find everything
10401      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
10402      * empty unless there is something whose fold we don't know about, and will
10403      * have to go out to the disk to find. */
10404     SV* l1_fold_invlist = NULL;
10405
10406     /* List of multi-character folds that are matched by this node */
10407     AV* unicode_alternate  = NULL;
10408 #ifdef EBCDIC
10409     UV literal_endpoint = 0;
10410 #endif
10411     UV stored = 0;  /* how many chars stored in the bitmap */
10412
10413     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10414         case we need to change the emitted regop to an EXACT. */
10415     const char * orig_parse = RExC_parse;
10416     GET_RE_DEBUG_FLAGS_DECL;
10417
10418     PERL_ARGS_ASSERT_REGCLASS;
10419 #ifndef DEBUGGING
10420     PERL_UNUSED_ARG(depth);
10421 #endif
10422
10423     DEBUG_PARSE("clas");
10424
10425     /* Assume we are going to generate an ANYOF node. */
10426     ret = reganode(pRExC_state, ANYOF, 0);
10427
10428
10429     if (!SIZE_ONLY) {
10430         ANYOF_FLAGS(ret) = 0;
10431     }
10432
10433     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
10434         RExC_naughty++;
10435         RExC_parse++;
10436         if (!SIZE_ONLY)
10437             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10438
10439         /* We have decided to not allow multi-char folds in inverted character
10440          * classes, due to the confusion that can happen, especially with
10441          * classes that are designed for a non-Unicode world:  You have the
10442          * peculiar case that:
10443             "s s" =~ /^[^\xDF]+$/i => Y
10444             "ss"  =~ /^[^\xDF]+$/i => N
10445          *
10446          * See [perl #89750] */
10447         allow_full_fold = FALSE;
10448     }
10449
10450     if (SIZE_ONLY) {
10451         RExC_size += ANYOF_SKIP;
10452         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10453     }
10454     else {
10455         RExC_emit += ANYOF_SKIP;
10456         if (LOC) {
10457             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10458         }
10459         ANYOF_BITMAP_ZERO(ret);
10460         listsv = newSVpvs("# comment\n");
10461         initial_listsv_len = SvCUR(listsv);
10462     }
10463
10464     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10465
10466     if (!SIZE_ONLY && POSIXCC(nextvalue))
10467         checkposixcc(pRExC_state);
10468
10469     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10470     if (UCHARAT(RExC_parse) == ']')
10471         goto charclassloop;
10472
10473 parseit:
10474     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10475
10476     charclassloop:
10477
10478         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10479
10480         if (!range) {
10481             rangebegin = RExC_parse;
10482             element_count++;
10483         }
10484         if (UTF) {
10485             value = utf8n_to_uvchr((U8*)RExC_parse,
10486                                    RExC_end - RExC_parse,
10487                                    &numlen, UTF8_ALLOW_DEFAULT);
10488             RExC_parse += numlen;
10489         }
10490         else
10491             value = UCHARAT(RExC_parse++);
10492
10493         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10494         if (value == '[' && POSIXCC(nextvalue))
10495             namedclass = regpposixcc(pRExC_state, value);
10496         else if (value == '\\') {
10497             if (UTF) {
10498                 value = utf8n_to_uvchr((U8*)RExC_parse,
10499                                    RExC_end - RExC_parse,
10500                                    &numlen, UTF8_ALLOW_DEFAULT);
10501                 RExC_parse += numlen;
10502             }
10503             else
10504                 value = UCHARAT(RExC_parse++);
10505             /* Some compilers cannot handle switching on 64-bit integer
10506              * values, therefore value cannot be an UV.  Yes, this will
10507              * be a problem later if we want switch on Unicode.
10508              * A similar issue a little bit later when switching on
10509              * namedclass. --jhi */
10510             switch ((I32)value) {
10511             case 'w':   namedclass = ANYOF_ALNUM;       break;
10512             case 'W':   namedclass = ANYOF_NALNUM;      break;
10513             case 's':   namedclass = ANYOF_SPACE;       break;
10514             case 'S':   namedclass = ANYOF_NSPACE;      break;
10515             case 'd':   namedclass = ANYOF_DIGIT;       break;
10516             case 'D':   namedclass = ANYOF_NDIGIT;      break;
10517             case 'v':   namedclass = ANYOF_VERTWS;      break;
10518             case 'V':   namedclass = ANYOF_NVERTWS;     break;
10519             case 'h':   namedclass = ANYOF_HORIZWS;     break;
10520             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
10521             case 'N':  /* Handle \N{NAME} in class */
10522                 {
10523                     /* We only pay attention to the first char of
10524                     multichar strings being returned. I kinda wonder
10525                     if this makes sense as it does change the behaviour
10526                     from earlier versions, OTOH that behaviour was broken
10527                     as well. */
10528                     UV v; /* value is register so we cant & it /grrr */
10529                     if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10530                         goto parseit;
10531                     }
10532                     value= v;
10533                 }
10534                 break;
10535             case 'p':
10536             case 'P':
10537                 {
10538                 char *e;
10539                 if (RExC_parse >= RExC_end)
10540                     vFAIL2("Empty \\%c{}", (U8)value);
10541                 if (*RExC_parse == '{') {
10542                     const U8 c = (U8)value;
10543                     e = strchr(RExC_parse++, '}');
10544                     if (!e)
10545                         vFAIL2("Missing right brace on \\%c{}", c);
10546                     while (isSPACE(UCHARAT(RExC_parse)))
10547                         RExC_parse++;
10548                     if (e == RExC_parse)
10549                         vFAIL2("Empty \\%c{}", c);
10550                     n = e - RExC_parse;
10551                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10552                         n--;
10553                 }
10554                 else {
10555                     e = RExC_parse;
10556                     n = 1;
10557                 }
10558                 if (!SIZE_ONLY) {
10559                     SV** invlistsvp;
10560                     SV* invlist;
10561                     char* name;
10562                     if (UCHARAT(RExC_parse) == '^') {
10563                          RExC_parse++;
10564                          n--;
10565                          value = value == 'p' ? 'P' : 'p'; /* toggle */
10566                          while (isSPACE(UCHARAT(RExC_parse))) {
10567                               RExC_parse++;
10568                               n--;
10569                          }
10570                     }
10571                     /* Try to get the definition of the property into
10572                      * <invlist>.  If /i is in effect, the effective property
10573                      * will have its name be <__NAME_i>.  The design is
10574                      * discussed in commit
10575                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10576                     Newx(name, n + sizeof("_i__\n"), char);
10577
10578                     sprintf(name, "%s%.*s%s\n",
10579                                     (FOLD) ? "__" : "",
10580                                     (int)n,
10581                                     RExC_parse,
10582                                     (FOLD) ? "_i" : ""
10583                     );
10584
10585                     /* Look up the property name, and get its swash and
10586                      * inversion list, if the property is found  */
10587                     if (swash) {
10588                         SvREFCNT_dec(swash);
10589                     }
10590                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
10591                                              1, /* binary */
10592                                              0, /* not tr/// */
10593                                              TRUE, /* this routine will handle
10594                                                       undefined properties */
10595                                              NULL, FALSE /* No inversion list */
10596                                             );
10597                     if (   ! swash
10598                         || ! SvROK(swash)
10599                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10600                         || ! (invlistsvp =
10601                                 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10602                                 "INVLIST", FALSE))
10603                         || ! (invlist = *invlistsvp))
10604                     {
10605                         if (swash) {
10606                             SvREFCNT_dec(swash);
10607                             swash = NULL;
10608                         }
10609
10610                         /* Here didn't find it.  It could be a user-defined
10611                          * property that will be available at run-time.  Add it
10612                          * to the list to look up then */
10613                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10614                                         (value == 'p' ? '+' : '!'),
10615                                         name);
10616                         has_user_defined_property = 1;
10617
10618                         /* We don't know yet, so have to assume that the
10619                          * property could match something in the Latin1 range,
10620                          * hence something that isn't utf8 */
10621                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10622                     }
10623                     else {
10624
10625                         /* Here, did get the swash and its inversion list.  If
10626                          * the swash is from a user-defined property, then this
10627                          * whole character class should be regarded as such */
10628                         SV** user_defined_svp =
10629                                             hv_fetchs(MUTABLE_HV(SvRV(swash)),
10630                                                         "USER_DEFINED", FALSE);
10631                         if (user_defined_svp) {
10632                             has_user_defined_property
10633                                                     |= SvUV(*user_defined_svp);
10634                         }
10635
10636                         /* Invert if asking for the complement */
10637                         if (value == 'P') {
10638                             _invlist_union_complement_2nd(properties, invlist, &properties);
10639
10640                             /* The swash can't be used as-is, because we've
10641                              * inverted things; delay removing it until here, after
10642                              * we have copied its invlist above */
10643                             SvREFCNT_dec(swash);
10644                             swash = NULL;
10645                         }
10646                         else {
10647                             _invlist_union(properties, invlist, &properties);
10648                         }
10649                     }
10650                     Safefree(name);
10651                 }
10652                 RExC_parse = e + 1;
10653                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
10654
10655                 /* \p means they want Unicode semantics */
10656                 RExC_uni_semantics = 1;
10657                 }
10658                 break;
10659             case 'n':   value = '\n';                   break;
10660             case 'r':   value = '\r';                   break;
10661             case 't':   value = '\t';                   break;
10662             case 'f':   value = '\f';                   break;
10663             case 'b':   value = '\b';                   break;
10664             case 'e':   value = ASCII_TO_NATIVE('\033');break;
10665             case 'a':   value = ASCII_TO_NATIVE('\007');break;
10666             case 'o':
10667                 RExC_parse--;   /* function expects to be pointed at the 'o' */
10668                 {
10669                     const char* error_msg;
10670                     bool valid = grok_bslash_o(RExC_parse,
10671                                                &value,
10672                                                &numlen,
10673                                                &error_msg,
10674                                                SIZE_ONLY);
10675                     RExC_parse += numlen;
10676                     if (! valid) {
10677                         vFAIL(error_msg);
10678                     }
10679                 }
10680                 if (PL_encoding && value < 0x100) {
10681                     goto recode_encoding;
10682                 }
10683                 break;
10684             case 'x':
10685                 if (*RExC_parse == '{') {
10686                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10687                         | PERL_SCAN_DISALLOW_PREFIX;
10688                     char * const e = strchr(RExC_parse++, '}');
10689                     if (!e)
10690                         vFAIL("Missing right brace on \\x{}");
10691
10692                     numlen = e - RExC_parse;
10693                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10694                     RExC_parse = e + 1;
10695                 }
10696                 else {
10697                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10698                     numlen = 2;
10699                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10700                     RExC_parse += numlen;
10701                 }
10702                 if (PL_encoding && value < 0x100)
10703                     goto recode_encoding;
10704                 break;
10705             case 'c':
10706                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10707                 break;
10708             case '0': case '1': case '2': case '3': case '4':
10709             case '5': case '6': case '7':
10710                 {
10711                     /* Take 1-3 octal digits */
10712                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10713                     numlen = 3;
10714                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10715                     RExC_parse += numlen;
10716                     if (PL_encoding && value < 0x100)
10717                         goto recode_encoding;
10718                     break;
10719                 }
10720             recode_encoding:
10721                 if (! RExC_override_recoding) {
10722                     SV* enc = PL_encoding;
10723                     value = reg_recode((const char)(U8)value, &enc);
10724                     if (!enc && SIZE_ONLY)
10725                         ckWARNreg(RExC_parse,
10726                                   "Invalid escape in the specified encoding");
10727                     break;
10728                 }
10729             default:
10730                 /* Allow \_ to not give an error */
10731                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10732                     ckWARN2reg(RExC_parse,
10733                                "Unrecognized escape \\%c in character class passed through",
10734                                (int)value);
10735                 }
10736                 break;
10737             }
10738         } /* end of \blah */
10739 #ifdef EBCDIC
10740         else
10741             literal_endpoint++;
10742 #endif
10743
10744         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10745
10746             /* What matches in a locale is not known until runtime, so need to
10747              * (one time per class) allocate extra space to pass to regexec.
10748              * The space will contain a bit for each named class that is to be
10749              * matched against.  This isn't needed for \p{} and pseudo-classes,
10750              * as they are not affected by locale, and hence are dealt with
10751              * separately */
10752             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10753                 need_class = 1;
10754                 if (SIZE_ONLY) {
10755                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10756                 }
10757                 else {
10758                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10759                     ANYOF_CLASS_ZERO(ret);
10760                 }
10761                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10762             }
10763
10764             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
10765              * literal, as is the character that began the false range, i.e.
10766              * the 'a' in the examples */
10767             if (range) {
10768                 if (!SIZE_ONLY) {
10769                     const int w =
10770                         RExC_parse >= rangebegin ?
10771                         RExC_parse - rangebegin : 0;
10772                     ckWARN4reg(RExC_parse,
10773                                "False [] range \"%*.*s\"",
10774                                w, w, rangebegin);
10775
10776                     stored +=
10777                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10778                     if (prevvalue < 256) {
10779                         stored +=
10780                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10781                     }
10782                     else {
10783                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10784                     }
10785                 }
10786
10787                 range = 0; /* this was not a true range */
10788             }
10789
10790             if (!SIZE_ONLY) {
10791
10792                 /* Possible truncation here but in some 64-bit environments
10793                  * the compiler gets heartburn about switch on 64-bit values.
10794                  * A similar issue a little earlier when switching on value.
10795                  * --jhi */
10796                 switch ((I32)namedclass) {
10797
10798                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10799                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10800                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10801                     break;
10802                 case ANYOF_NALNUMC:
10803                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10804                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10805                     break;
10806                 case ANYOF_ALPHA:
10807                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10808                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10809                     break;
10810                 case ANYOF_NALPHA:
10811                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10812                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10813                     break;
10814                 case ANYOF_ASCII:
10815                     if (LOC) {
10816                         ANYOF_CLASS_SET(ret, namedclass);
10817                     }
10818                     else {
10819                         _invlist_union(properties, PL_ASCII, &properties);
10820                     }
10821                     break;
10822                 case ANYOF_NASCII:
10823                     if (LOC) {
10824                         ANYOF_CLASS_SET(ret, namedclass);
10825                     }
10826                     else {
10827                         _invlist_union_complement_2nd(properties,
10828                                                     PL_ASCII, &properties);
10829                         if (DEPENDS_SEMANTICS) {
10830                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10831                         }
10832                     }
10833                     break;
10834                 case ANYOF_BLANK:
10835                     DO_POSIX(ret, namedclass, properties,
10836                                             PL_PosixBlank, PL_XPosixBlank);
10837                     break;
10838                 case ANYOF_NBLANK:
10839                     DO_N_POSIX(ret, namedclass, properties,
10840                                             PL_PosixBlank, PL_XPosixBlank);
10841                     break;
10842                 case ANYOF_CNTRL:
10843                     DO_POSIX(ret, namedclass, properties,
10844                                             PL_PosixCntrl, PL_XPosixCntrl);
10845                     break;
10846                 case ANYOF_NCNTRL:
10847                     DO_N_POSIX(ret, namedclass, properties,
10848                                             PL_PosixCntrl, PL_XPosixCntrl);
10849                     break;
10850                 case ANYOF_DIGIT:
10851                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10852                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10853                     break;
10854                 case ANYOF_NDIGIT:
10855                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10856                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10857                     break;
10858                 case ANYOF_GRAPH:
10859                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10860                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10861                     break;
10862                 case ANYOF_NGRAPH:
10863                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10864                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10865                     break;
10866                 case ANYOF_HORIZWS:
10867                     /* For these, we use the nonbitmap, as /d doesn't make a
10868                      * difference in what these match.  There would be problems
10869                      * if these characters had folds other than themselves, as
10870                      * nonbitmap is subject to folding.  It turns out that \h
10871                      * is just a synonym for XPosixBlank */
10872                     _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
10873                     break;
10874                 case ANYOF_NHORIZWS:
10875                     _invlist_union_complement_2nd(nonbitmap,
10876                                                  PL_XPosixBlank, &nonbitmap);
10877                     break;
10878                 case ANYOF_LOWER:
10879                 case ANYOF_NLOWER:
10880                 {   /* These require special handling, as they differ under
10881                        folding, matching Cased there (which in the ASCII range
10882                        is the same as Alpha) */
10883
10884                     SV* ascii_source;
10885                     SV* l1_source;
10886                     const char *Xname;
10887
10888                     if (FOLD && ! LOC) {
10889                         ascii_source = PL_PosixAlpha;
10890                         l1_source = PL_L1Cased;
10891                         Xname = "Cased";
10892                     }
10893                     else {
10894                         ascii_source = PL_PosixLower;
10895                         l1_source = PL_L1PosixLower;
10896                         Xname = "XPosixLower";
10897                     }
10898                     if (namedclass == ANYOF_LOWER) {
10899                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10900                                     ascii_source, l1_source, Xname, listsv);
10901                     }
10902                     else {
10903                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10904                             properties, ascii_source, l1_source, Xname, listsv);
10905                     }
10906                     break;
10907                 }
10908                 case ANYOF_PRINT:
10909                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10910                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10911                     break;
10912                 case ANYOF_NPRINT:
10913                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10914                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10915                     break;
10916                 case ANYOF_PUNCT:
10917                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10918                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10919                     break;
10920                 case ANYOF_NPUNCT:
10921                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10922                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10923                     break;
10924                 case ANYOF_PSXSPC:
10925                     DO_POSIX(ret, namedclass, properties,
10926                                             PL_PosixSpace, PL_XPosixSpace);
10927                     break;
10928                 case ANYOF_NPSXSPC:
10929                     DO_N_POSIX(ret, namedclass, properties,
10930                                             PL_PosixSpace, PL_XPosixSpace);
10931                     break;
10932                 case ANYOF_SPACE:
10933                     DO_POSIX(ret, namedclass, properties,
10934                                             PL_PerlSpace, PL_XPerlSpace);
10935                     break;
10936                 case ANYOF_NSPACE:
10937                     DO_N_POSIX(ret, namedclass, properties,
10938                                             PL_PerlSpace, PL_XPerlSpace);
10939                     break;
10940                 case ANYOF_UPPER:   /* Same as LOWER, above */
10941                 case ANYOF_NUPPER:
10942                 {
10943                     SV* ascii_source;
10944                     SV* l1_source;
10945                     const char *Xname;
10946
10947                     if (FOLD && ! LOC) {
10948                         ascii_source = PL_PosixAlpha;
10949                         l1_source = PL_L1Cased;
10950                         Xname = "Cased";
10951                     }
10952                     else {
10953                         ascii_source = PL_PosixUpper;
10954                         l1_source = PL_L1PosixUpper;
10955                         Xname = "XPosixUpper";
10956                     }
10957                     if (namedclass == ANYOF_UPPER) {
10958                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10959                                     ascii_source, l1_source, Xname, listsv);
10960                     }
10961                     else {
10962                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10963                         properties, ascii_source, l1_source, Xname, listsv);
10964                     }
10965                     break;
10966                 }
10967                 case ANYOF_ALNUM:   /* Really is 'Word' */
10968                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10969                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10970                     break;
10971                 case ANYOF_NALNUM:
10972                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10973                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10974                     break;
10975                 case ANYOF_VERTWS:
10976                     /* For these, we use the nonbitmap, as /d doesn't make a
10977                      * difference in what these match.  There would be problems
10978                      * if these characters had folds other than themselves, as
10979                      * nonbitmap is subject to folding */
10980                     _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
10981                     break;
10982                 case ANYOF_NVERTWS:
10983                     _invlist_union_complement_2nd(nonbitmap,
10984                                                     PL_VertSpace, &nonbitmap);
10985                     break;
10986                 case ANYOF_XDIGIT:
10987                     DO_POSIX(ret, namedclass, properties,
10988                                             PL_PosixXDigit, PL_XPosixXDigit);
10989                     break;
10990                 case ANYOF_NXDIGIT:
10991                     DO_N_POSIX(ret, namedclass, properties,
10992                                             PL_PosixXDigit, PL_XPosixXDigit);
10993                     break;
10994                 case ANYOF_MAX:
10995                     /* this is to handle \p and \P */
10996                     break;
10997                 default:
10998                     vFAIL("Invalid [::] class");
10999                     break;
11000                 }
11001
11002                 continue;
11003             }
11004         } /* end of namedclass \blah */
11005
11006         if (range) {
11007             if (prevvalue > (IV)value) /* b-a */ {
11008                 const int w = RExC_parse - rangebegin;
11009                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11010                 range = 0; /* not a valid range */
11011             }
11012         }
11013         else {
11014             prevvalue = value; /* save the beginning of the range */
11015             if (RExC_parse+1 < RExC_end
11016                 && *RExC_parse == '-'
11017                 && RExC_parse[1] != ']')
11018             {
11019                 RExC_parse++;
11020
11021                 /* a bad range like \w-, [:word:]- ? */
11022                 if (namedclass > OOB_NAMEDCLASS) {
11023                     if (ckWARN(WARN_REGEXP)) {
11024                         const int w =
11025                             RExC_parse >= rangebegin ?
11026                             RExC_parse - rangebegin : 0;
11027                         vWARN4(RExC_parse,
11028                                "False [] range \"%*.*s\"",
11029                                w, w, rangebegin);
11030                     }
11031                     if (!SIZE_ONLY)
11032                         stored +=
11033                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11034                 } else
11035                     range = 1;  /* yeah, it's a range! */
11036                 continue;       /* but do it the next time */
11037             }
11038         }
11039
11040         /* non-Latin1 code point implies unicode semantics.  Must be set in
11041          * pass1 so is there for the whole of pass 2 */
11042         if (value > 255) {
11043             RExC_uni_semantics = 1;
11044         }
11045
11046         /* now is the next time */
11047         if (!SIZE_ONLY) {
11048             if (prevvalue < 256) {
11049                 const IV ceilvalue = value < 256 ? value : 255;
11050                 IV i;
11051 #ifdef EBCDIC
11052                 /* In EBCDIC [\x89-\x91] should include
11053                  * the \x8e but [i-j] should not. */
11054                 if (literal_endpoint == 2 &&
11055                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11056                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11057                 {
11058                     if (isLOWER(prevvalue)) {
11059                         for (i = prevvalue; i <= ceilvalue; i++)
11060                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11061                                 stored +=
11062                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11063                             }
11064                     } else {
11065                         for (i = prevvalue; i <= ceilvalue; i++)
11066                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11067                                 stored +=
11068                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11069                             }
11070                     }
11071                 }
11072                 else
11073 #endif
11074                       for (i = prevvalue; i <= ceilvalue; i++) {
11075                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11076                       }
11077           }
11078           if (value > 255) {
11079             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
11080             const UV natvalue      = NATIVE_TO_UNI(value);
11081             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11082         }
11083 #ifdef EBCDIC
11084             literal_endpoint = 0;
11085 #endif
11086         }
11087
11088         range = 0; /* this range (if it was one) is done now */
11089     }
11090
11091
11092
11093     if (SIZE_ONLY)
11094         return ret;
11095     /****** !SIZE_ONLY AFTER HERE *********/
11096
11097     /* If folding and there are code points above 255, we calculate all
11098      * characters that could fold to or from the ones already on the list */
11099     if (FOLD && nonbitmap) {
11100         UV start, end;  /* End points of code point ranges */
11101
11102         SV* fold_intersection = NULL;
11103
11104         /* This is a list of all the characters that participate in folds
11105             * (except marks, etc. in multi-char folds) */
11106         if (! PL_utf8_foldable) {
11107             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11108             PL_utf8_foldable = _swash_to_invlist(swash);
11109             SvREFCNT_dec(swash);
11110         }
11111
11112         /* This is a hash that for a particular fold gives all characters
11113             * that are involved in it */
11114         if (! PL_utf8_foldclosures) {
11115
11116             /* If we were unable to find any folds, then we likely won't be
11117              * able to find the closures.  So just create an empty list.
11118              * Folding will effectively be restricted to the non-Unicode rules
11119              * hard-coded into Perl.  (This case happens legitimately during
11120              * compilation of Perl itself before the Unicode tables are
11121              * generated) */
11122             if (invlist_len(PL_utf8_foldable) == 0) {
11123                 PL_utf8_foldclosures = newHV();
11124             } else {
11125                 /* If the folds haven't been read in, call a fold function
11126                     * to force that */
11127                 if (! PL_utf8_tofold) {
11128                     U8 dummy[UTF8_MAXBYTES+1];
11129                     STRLEN dummy_len;
11130
11131                     /* This particular string is above \xff in both UTF-8 and
11132                      * UTFEBCDIC */
11133                     to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11134                     assert(PL_utf8_tofold); /* Verify that worked */
11135                 }
11136                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11137             }
11138         }
11139
11140         /* Only the characters in this class that participate in folds need be
11141          * checked.  Get the intersection of this class and all the possible
11142          * characters that are foldable.  This can quickly narrow down a large
11143          * class */
11144         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11145
11146         /* Now look at the foldable characters in this class individually */
11147         invlist_iterinit(fold_intersection);
11148         while (invlist_iternext(fold_intersection, &start, &end)) {
11149             UV j;
11150
11151             /* Look at every character in the range */
11152             for (j = start; j <= end; j++) {
11153
11154                 /* Get its fold */
11155                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11156                 STRLEN foldlen;
11157                 const UV f =
11158                     _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
11159
11160                 if (foldlen > (STRLEN)UNISKIP(f)) {
11161
11162                     /* Any multicharacter foldings (disallowed in lookbehind
11163                      * patterns) require the following transform: [ABCDEF] ->
11164                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11165                      * folds into "rst", all other characters fold to single
11166                      * characters.  We save away these multicharacter foldings,
11167                      * to be later saved as part of the additional "s" data. */
11168                     if (! RExC_in_lookbehind) {
11169                         U8* loc = foldbuf;
11170                         U8* e = foldbuf + foldlen;
11171
11172                         /* If any of the folded characters of this are in the
11173                          * Latin1 range, tell the regex engine that this can
11174                          * match a non-utf8 target string.  The only multi-byte
11175                          * fold whose source is in the Latin1 range (U+00DF)
11176                          * applies only when the target string is utf8, or
11177                          * under unicode rules */
11178                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11179                             while (loc < e) {
11180
11181                                 /* Can't mix ascii with non- under /aa */
11182                                 if (MORE_ASCII_RESTRICTED
11183                                     && (isASCII(*loc) != isASCII(j)))
11184                                 {
11185                                     goto end_multi_fold;
11186                                 }
11187                                 if (UTF8_IS_INVARIANT(*loc)
11188                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
11189                                 {
11190                                     /* Can't mix above and below 256 under LOC
11191                                      */
11192                                     if (LOC) {
11193                                         goto end_multi_fold;
11194                                     }
11195                                     ANYOF_FLAGS(ret)
11196                                             |= ANYOF_NONBITMAP_NON_UTF8;
11197                                     break;
11198                                 }
11199                                 loc += UTF8SKIP(loc);
11200                             }
11201                         }
11202
11203                         add_alternate(&unicode_alternate, foldbuf, foldlen);
11204                     end_multi_fold: ;
11205                     }
11206
11207                     /* This is special-cased, as it is the only letter which
11208                      * has both a multi-fold and single-fold in Latin1.  All
11209                      * the other chars that have single and multi-folds are
11210                      * always in utf8, and the utf8 folding algorithm catches
11211                      * them */
11212                     if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11213                         stored += set_regclass_bit(pRExC_state,
11214                                         ret,
11215                                         LATIN_SMALL_LETTER_SHARP_S,
11216                                         &l1_fold_invlist, &unicode_alternate);
11217                     }
11218                 }
11219                 else {
11220                     /* Single character fold.  Add everything in its fold
11221                      * closure to the list that this node should match */
11222                     SV** listp;
11223
11224                     /* The fold closures data structure is a hash with the keys
11225                      * being every character that is folded to, like 'k', and
11226                      * the values each an array of everything that folds to its
11227                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
11228                     if ((listp = hv_fetch(PL_utf8_foldclosures,
11229                                     (char *) foldbuf, foldlen, FALSE)))
11230                     {
11231                         AV* list = (AV*) *listp;
11232                         IV k;
11233                         for (k = 0; k <= av_len(list); k++) {
11234                             SV** c_p = av_fetch(list, k, FALSE);
11235                             UV c;
11236                             if (c_p == NULL) {
11237                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11238                             }
11239                             c = SvUV(*c_p);
11240
11241                             /* /aa doesn't allow folds between ASCII and non-;
11242                              * /l doesn't allow them between above and below
11243                              * 256 */
11244                             if ((MORE_ASCII_RESTRICTED
11245                                  && (isASCII(c) != isASCII(j)))
11246                                     || (LOC && ((c < 256) != (j < 256))))
11247                             {
11248                                 continue;
11249                             }
11250
11251                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11252                                 stored += set_regclass_bit(pRExC_state,
11253                                         ret,
11254                                         (U8) c,
11255                                         &l1_fold_invlist, &unicode_alternate);
11256                             }
11257                                 /* It may be that the code point is already in
11258                                  * this range or already in the bitmap, in
11259                                  * which case we need do nothing */
11260                             else if ((c < start || c > end)
11261                                         && (c > 255
11262                                             || ! ANYOF_BITMAP_TEST(ret, c)))
11263                             {
11264                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11265                             }
11266                         }
11267                     }
11268                 }
11269             }
11270         }
11271         SvREFCNT_dec(fold_intersection);
11272     }
11273
11274     /* Combine the two lists into one. */
11275     if (l1_fold_invlist) {
11276         if (nonbitmap) {
11277             _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11278             SvREFCNT_dec(l1_fold_invlist);
11279         }
11280         else {
11281             nonbitmap = l1_fold_invlist;
11282         }
11283     }
11284
11285     /* And combine the result (if any) with any inversion list from properties.
11286      * The lists are kept separate up to now because we don't want to fold the
11287      * properties */
11288     if (properties) {
11289         if (nonbitmap) {
11290             _invlist_union(nonbitmap, properties, &nonbitmap);
11291             SvREFCNT_dec(properties);
11292         }
11293         else {
11294             nonbitmap = properties;
11295         }
11296     }
11297
11298     /* Here, <nonbitmap> contains all the code points we can determine at
11299      * compile time that we haven't put into the bitmap.  Go through it, and
11300      * for things that belong in the bitmap, put them there, and delete from
11301      * <nonbitmap> */
11302     if (nonbitmap) {
11303
11304         /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11305          * possibly only should match when the target string is UTF-8 */
11306         UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11307
11308         /* This gets set if we actually need to modify things */
11309         bool change_invlist = FALSE;
11310
11311         UV start, end;
11312
11313         /* Start looking through <nonbitmap> */
11314         invlist_iterinit(nonbitmap);
11315         while (invlist_iternext(nonbitmap, &start, &end)) {
11316             UV high;
11317             int i;
11318
11319             /* Quit if are above what we should change */
11320             if (start > max_cp_to_set) {
11321                 break;
11322             }
11323
11324             change_invlist = TRUE;
11325
11326             /* Set all the bits in the range, up to the max that we are doing */
11327             high = (end < max_cp_to_set) ? end : max_cp_to_set;
11328             for (i = start; i <= (int) high; i++) {
11329                 if (! ANYOF_BITMAP_TEST(ret, i)) {
11330                     ANYOF_BITMAP_SET(ret, i);
11331                     stored++;
11332                     prevvalue = value;
11333                     value = i;
11334                 }
11335             }
11336         }
11337
11338         /* Done with loop; remove any code points that are in the bitmap from
11339          * <nonbitmap> */
11340         if (change_invlist) {
11341             _invlist_subtract(nonbitmap,
11342                               (DEPENDS_SEMANTICS)
11343                                 ? PL_ASCII
11344                                 : PL_Latin1,
11345                               &nonbitmap);
11346         }
11347
11348         /* If have completely emptied it, remove it completely */
11349         if (invlist_len(nonbitmap) == 0) {
11350             SvREFCNT_dec(nonbitmap);
11351             nonbitmap = NULL;
11352         }
11353     }
11354
11355     /* Here, we have calculated what code points should be in the character
11356      * class.  <nonbitmap> does not overlap the bitmap except possibly in the
11357      * case of DEPENDS rules.
11358      *
11359      * Now we can see about various optimizations.  Fold calculation (which we
11360      * did above) needs to take place before inversion.  Otherwise /[^k]/i
11361      * would invert to include K, which under /i would match k, which it
11362      * shouldn't. */
11363
11364     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
11365      * set the FOLD flag yet, so this does optimize those.  It doesn't
11366      * optimize locale.  Doing so perhaps could be done as long as there is
11367      * nothing like \w in it; some thought also would have to be given to the
11368      * interaction with above 0x100 chars */
11369     if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11370         && ! LOC
11371         && ! unicode_alternate
11372         /* In case of /d, there are some things that should match only when in
11373          * not in the bitmap, i.e., they require UTF8 to match.  These are
11374          * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11375          * case, they don't require UTF8, so can invert here */
11376         && (! nonbitmap
11377             || ! DEPENDS_SEMANTICS
11378             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11379         && SvCUR(listsv) == initial_listsv_len)
11380     {
11381         int i;
11382         if (! nonbitmap) {
11383             for (i = 0; i < 256; ++i) {
11384                 if (ANYOF_BITMAP_TEST(ret, i)) {
11385                     ANYOF_BITMAP_CLEAR(ret, i);
11386                 }
11387                 else {
11388                     ANYOF_BITMAP_SET(ret, i);
11389                     prevvalue = value;
11390                     value = i;
11391                 }
11392             }
11393             /* The inversion means that everything above 255 is matched */
11394             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11395         }
11396         else {
11397             /* Here, also has things outside the bitmap that may overlap with
11398              * the bitmap.  We have to sync them up, so that they get inverted
11399              * in both places.  Earlier, we removed all overlaps except in the
11400              * case of /d rules, so no syncing is needed except for this case
11401              */
11402             SV *remove_list = NULL;
11403
11404             if (DEPENDS_SEMANTICS) {
11405                 UV start, end;
11406
11407                 /* Set the bits that correspond to the ones that aren't in the
11408                  * bitmap.  Otherwise, when we invert, we'll miss these.
11409                  * Earlier, we removed from the nonbitmap all code points
11410                  * < 128, so there is no extra work here */
11411                 invlist_iterinit(nonbitmap);
11412                 while (invlist_iternext(nonbitmap, &start, &end)) {
11413                     if (start > 255) {  /* The bit map goes to 255 */
11414                         break;
11415                     }
11416                     if (end > 255) {
11417                         end = 255;
11418                     }
11419                     for (i = start; i <= (int) end; ++i) {
11420                         ANYOF_BITMAP_SET(ret, i);
11421                         prevvalue = value;
11422                         value = i;
11423                     }
11424                 }
11425             }
11426
11427             /* Now invert both the bitmap and the nonbitmap.  Anything in the
11428              * bitmap has to also be removed from the non-bitmap, but again,
11429              * there should not be overlap unless is /d rules. */
11430             _invlist_invert(nonbitmap);
11431
11432             /* Any swash can't be used as-is, because we've inverted things */
11433             if (swash) {
11434                 SvREFCNT_dec(swash);
11435                 swash = NULL;
11436             }
11437
11438             for (i = 0; i < 256; ++i) {
11439                 if (ANYOF_BITMAP_TEST(ret, i)) {
11440                     ANYOF_BITMAP_CLEAR(ret, i);
11441                     if (DEPENDS_SEMANTICS) {
11442                         if (! remove_list) {
11443                             remove_list = _new_invlist(2);
11444                         }
11445                         remove_list = add_cp_to_invlist(remove_list, i);
11446                     }
11447                 }
11448                 else {
11449                     ANYOF_BITMAP_SET(ret, i);
11450                     prevvalue = value;
11451                     value = i;
11452                 }
11453             }
11454
11455             /* And do the removal */
11456             if (DEPENDS_SEMANTICS) {
11457                 if (remove_list) {
11458                     _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11459                     SvREFCNT_dec(remove_list);
11460                 }
11461             }
11462             else {
11463                 /* There is no overlap for non-/d, so just delete anything
11464                  * below 256 */
11465                 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
11466             }
11467         }
11468
11469         stored = 256 - stored;
11470
11471         /* Clear the invert flag since have just done it here */
11472         ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11473     }
11474
11475     /* Folding in the bitmap is taken care of above, but not for locale (for
11476      * which we have to wait to see what folding is in effect at runtime), and
11477      * for some things not in the bitmap (only the upper latin folds in this
11478      * case, as all other single-char folding has been set above).  Set
11479      * run-time fold flag for these */
11480     if (FOLD && (LOC
11481                 || (DEPENDS_SEMANTICS
11482                     && nonbitmap
11483                     && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11484                 || unicode_alternate))
11485     {
11486         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11487     }
11488
11489     /* A single character class can be "optimized" into an EXACTish node.
11490      * Note that since we don't currently count how many characters there are
11491      * outside the bitmap, we are XXX missing optimization possibilities for
11492      * them.  This optimization can't happen unless this is a truly single
11493      * character class, which means that it can't be an inversion into a
11494      * many-character class, and there must be no possibility of there being
11495      * things outside the bitmap.  'stored' (only) for locales doesn't include
11496      * \w, etc, so have to make a special test that they aren't present
11497      *
11498      * Similarly A 2-character class of the very special form like [bB] can be
11499      * optimized into an EXACTFish node, but only for non-locales, and for
11500      * characters which only have the two folds; so things like 'fF' and 'Ii'
11501      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11502      * FI'. */
11503     if (! nonbitmap
11504         && ! unicode_alternate
11505         && SvCUR(listsv) == initial_listsv_len
11506         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11507         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11508                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11509             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11510                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11511                                  /* If the latest code point has a fold whose
11512                                   * bit is set, it must be the only other one */
11513                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11514                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11515     {
11516         /* Note that the information needed to decide to do this optimization
11517          * is not currently available until the 2nd pass, and that the actually
11518          * used EXACTish node takes less space than the calculated ANYOF node,
11519          * and hence the amount of space calculated in the first pass is larger
11520          * than actually used, so this optimization doesn't gain us any space.
11521          * But an EXACT node is faster than an ANYOF node, and can be combined
11522          * with any adjacent EXACT nodes later by the optimizer for further
11523          * gains.  The speed of executing an EXACTF is similar to an ANYOF
11524          * node, so the optimization advantage comes from the ability to join
11525          * it to adjacent EXACT nodes */
11526
11527         const char * cur_parse= RExC_parse;
11528         U8 op;
11529         RExC_emit = (regnode *)orig_emit;
11530         RExC_parse = (char *)orig_parse;
11531
11532         if (stored == 1) {
11533
11534             /* A locale node with one point can be folded; all the other cases
11535              * with folding will have two points, since we calculate them above
11536              */
11537             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11538                  op = EXACTFL;
11539             }
11540             else {
11541                 op = EXACT;
11542             }
11543         }
11544         else {   /* else 2 chars in the bit map: the folds of each other */
11545
11546             /* Use the folded value, which for the cases where we get here,
11547              * is just the lower case of the current one (which may resolve to
11548              * itself, or to the other one */
11549             value = toLOWER_LATIN1(value);
11550
11551             /* To join adjacent nodes, they must be the exact EXACTish type.
11552              * Try to use the most likely type, by using EXACTFA if possible,
11553              * then EXACTFU if the regex calls for it, or is required because
11554              * the character is non-ASCII.  (If <value> is ASCII, its fold is
11555              * also ASCII for the cases where we get here.) */
11556             if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11557                 op = EXACTFA;
11558             }
11559             else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11560                 op = EXACTFU;
11561             }
11562             else {    /* Otherwise, more likely to be EXACTF type */
11563                 op = EXACTF;
11564             }
11565         }
11566
11567         ret = reg_node(pRExC_state, op);
11568         RExC_parse = (char *)cur_parse;
11569         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11570             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11571             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11572             STR_LEN(ret)= 2;
11573             RExC_emit += STR_SZ(2);
11574         }
11575         else {
11576             *STRING(ret)= (char)value;
11577             STR_LEN(ret)= 1;
11578             RExC_emit += STR_SZ(1);
11579         }
11580         SvREFCNT_dec(listsv);
11581         return ret;
11582     }
11583
11584     /* If there is a swash and more than one element, we can't use the swash in
11585      * the optimization below. */
11586     if (swash && element_count > 1) {
11587         SvREFCNT_dec(swash);
11588         swash = NULL;
11589     }
11590     if (! nonbitmap
11591         && SvCUR(listsv) == initial_listsv_len
11592         && ! unicode_alternate)
11593     {
11594         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11595         SvREFCNT_dec(listsv);
11596         SvREFCNT_dec(unicode_alternate);
11597     }
11598     else {
11599         /* av[0] stores the character class description in its textual form:
11600          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
11601          *       appropriate swash, and is also useful for dumping the regnode.
11602          * av[1] if NULL, is a placeholder to later contain the swash computed
11603          *       from av[0].  But if no further computation need be done, the
11604          *       swash is stored there now.
11605          * av[2] stores the multicharacter foldings, used later in
11606          *       regexec.c:S_reginclass().
11607          * av[3] stores the nonbitmap inversion list for use in addition or
11608          *       instead of av[0]; not used if av[1] isn't NULL
11609          * av[4] is set if any component of the class is from a user-defined
11610          *       property; not used if av[1] isn't NULL */
11611         AV * const av = newAV();
11612         SV *rv;
11613
11614         av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11615                         ? &PL_sv_undef
11616                         : listsv);
11617         if (swash) {
11618             av_store(av, 1, swash);
11619             SvREFCNT_dec(nonbitmap);
11620         }
11621         else {
11622             av_store(av, 1, NULL);
11623             if (nonbitmap) {
11624                 av_store(av, 3, nonbitmap);
11625                 av_store(av, 4, newSVuv(has_user_defined_property));
11626             }
11627         }
11628
11629         /* Store any computed multi-char folds only if we are allowing
11630          * them */
11631         if (allow_full_fold) {
11632             av_store(av, 2, MUTABLE_SV(unicode_alternate));
11633             if (unicode_alternate) { /* This node is variable length */
11634                 OP(ret) = ANYOFV;
11635             }
11636         }
11637         else {
11638             av_store(av, 2, NULL);
11639         }
11640         rv = newRV_noinc(MUTABLE_SV(av));
11641         n = add_data(pRExC_state, 1, "s");
11642         RExC_rxi->data->data[n] = (void*)rv;
11643         ARG_SET(ret, n);
11644     }
11645     return ret;
11646 }
11647
11648
11649 /* reg_skipcomment()
11650
11651    Absorbs an /x style # comments from the input stream.
11652    Returns true if there is more text remaining in the stream.
11653    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11654    terminates the pattern without including a newline.
11655
11656    Note its the callers responsibility to ensure that we are
11657    actually in /x mode
11658
11659 */
11660
11661 STATIC bool
11662 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11663 {
11664     bool ended = 0;
11665
11666     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11667
11668     while (RExC_parse < RExC_end)
11669         if (*RExC_parse++ == '\n') {
11670             ended = 1;
11671             break;
11672         }
11673     if (!ended) {
11674         /* we ran off the end of the pattern without ending
11675            the comment, so we have to add an \n when wrapping */
11676         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11677         return 0;
11678     } else
11679         return 1;
11680 }
11681
11682 /* nextchar()
11683
11684    Advances the parse position, and optionally absorbs
11685    "whitespace" from the inputstream.
11686
11687    Without /x "whitespace" means (?#...) style comments only,
11688    with /x this means (?#...) and # comments and whitespace proper.
11689
11690    Returns the RExC_parse point from BEFORE the scan occurs.
11691
11692    This is the /x friendly way of saying RExC_parse++.
11693 */
11694
11695 STATIC char*
11696 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11697 {
11698     char* const retval = RExC_parse++;
11699
11700     PERL_ARGS_ASSERT_NEXTCHAR;
11701
11702     for (;;) {
11703         if (RExC_end - RExC_parse >= 3
11704             && *RExC_parse == '('
11705             && RExC_parse[1] == '?'
11706             && RExC_parse[2] == '#')
11707         {
11708             while (*RExC_parse != ')') {
11709                 if (RExC_parse == RExC_end)
11710                     FAIL("Sequence (?#... not terminated");
11711                 RExC_parse++;
11712             }
11713             RExC_parse++;
11714             continue;
11715         }
11716         if (RExC_flags & RXf_PMf_EXTENDED) {
11717             if (isSPACE(*RExC_parse)) {
11718                 RExC_parse++;
11719                 continue;
11720             }
11721             else if (*RExC_parse == '#') {
11722                 if ( reg_skipcomment( pRExC_state ) )
11723                     continue;
11724             }
11725         }
11726         return retval;
11727     }
11728 }
11729
11730 /*
11731 - reg_node - emit a node
11732 */
11733 STATIC regnode *                        /* Location. */
11734 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11735 {
11736     dVAR;
11737     register regnode *ptr;
11738     regnode * const ret = RExC_emit;
11739     GET_RE_DEBUG_FLAGS_DECL;
11740
11741     PERL_ARGS_ASSERT_REG_NODE;
11742
11743     if (SIZE_ONLY) {
11744         SIZE_ALIGN(RExC_size);
11745         RExC_size += 1;
11746         return(ret);
11747     }
11748     if (RExC_emit >= RExC_emit_bound)
11749         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11750                    op, RExC_emit, RExC_emit_bound);
11751
11752     NODE_ALIGN_FILL(ret);
11753     ptr = ret;
11754     FILL_ADVANCE_NODE(ptr, op);
11755 #ifdef RE_TRACK_PATTERN_OFFSETS
11756     if (RExC_offsets) {         /* MJD */
11757         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11758               "reg_node", __LINE__,
11759               PL_reg_name[op],
11760               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11761                 ? "Overwriting end of array!\n" : "OK",
11762               (UV)(RExC_emit - RExC_emit_start),
11763               (UV)(RExC_parse - RExC_start),
11764               (UV)RExC_offsets[0]));
11765         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11766     }
11767 #endif
11768     RExC_emit = ptr;
11769     return(ret);
11770 }
11771
11772 /*
11773 - reganode - emit a node with an argument
11774 */
11775 STATIC regnode *                        /* Location. */
11776 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11777 {
11778     dVAR;
11779     register regnode *ptr;
11780     regnode * const ret = RExC_emit;
11781     GET_RE_DEBUG_FLAGS_DECL;
11782
11783     PERL_ARGS_ASSERT_REGANODE;
11784
11785     if (SIZE_ONLY) {
11786         SIZE_ALIGN(RExC_size);
11787         RExC_size += 2;
11788         /*
11789            We can't do this:
11790
11791            assert(2==regarglen[op]+1);
11792
11793            Anything larger than this has to allocate the extra amount.
11794            If we changed this to be:
11795
11796            RExC_size += (1 + regarglen[op]);
11797
11798            then it wouldn't matter. Its not clear what side effect
11799            might come from that so its not done so far.
11800            -- dmq
11801         */
11802         return(ret);
11803     }
11804     if (RExC_emit >= RExC_emit_bound)
11805         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11806                    op, RExC_emit, RExC_emit_bound);
11807
11808     NODE_ALIGN_FILL(ret);
11809     ptr = ret;
11810     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11811 #ifdef RE_TRACK_PATTERN_OFFSETS
11812     if (RExC_offsets) {         /* MJD */
11813         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11814               "reganode",
11815               __LINE__,
11816               PL_reg_name[op],
11817               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11818               "Overwriting end of array!\n" : "OK",
11819               (UV)(RExC_emit - RExC_emit_start),
11820               (UV)(RExC_parse - RExC_start),
11821               (UV)RExC_offsets[0]));
11822         Set_Cur_Node_Offset;
11823     }
11824 #endif
11825     RExC_emit = ptr;
11826     return(ret);
11827 }
11828
11829 /*
11830 - reguni - emit (if appropriate) a Unicode character
11831 */
11832 STATIC STRLEN
11833 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11834 {
11835     dVAR;
11836
11837     PERL_ARGS_ASSERT_REGUNI;
11838
11839     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11840 }
11841
11842 /*
11843 - reginsert - insert an operator in front of already-emitted operand
11844 *
11845 * Means relocating the operand.
11846 */
11847 STATIC void
11848 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11849 {
11850     dVAR;
11851     register regnode *src;
11852     register regnode *dst;
11853     register regnode *place;
11854     const int offset = regarglen[(U8)op];
11855     const int size = NODE_STEP_REGNODE + offset;
11856     GET_RE_DEBUG_FLAGS_DECL;
11857
11858     PERL_ARGS_ASSERT_REGINSERT;
11859     PERL_UNUSED_ARG(depth);
11860 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11861     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11862     if (SIZE_ONLY) {
11863         RExC_size += size;
11864         return;
11865     }
11866
11867     src = RExC_emit;
11868     RExC_emit += size;
11869     dst = RExC_emit;
11870     if (RExC_open_parens) {
11871         int paren;
11872         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11873         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11874             if ( RExC_open_parens[paren] >= opnd ) {
11875                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11876                 RExC_open_parens[paren] += size;
11877             } else {
11878                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11879             }
11880             if ( RExC_close_parens[paren] >= opnd ) {
11881                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11882                 RExC_close_parens[paren] += size;
11883             } else {
11884                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11885             }
11886         }
11887     }
11888
11889     while (src > opnd) {
11890         StructCopy(--src, --dst, regnode);
11891 #ifdef RE_TRACK_PATTERN_OFFSETS
11892         if (RExC_offsets) {     /* MJD 20010112 */
11893             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11894                   "reg_insert",
11895                   __LINE__,
11896                   PL_reg_name[op],
11897                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11898                     ? "Overwriting end of array!\n" : "OK",
11899                   (UV)(src - RExC_emit_start),
11900                   (UV)(dst - RExC_emit_start),
11901                   (UV)RExC_offsets[0]));
11902             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
11903             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
11904         }
11905 #endif
11906     }
11907
11908
11909     place = opnd;               /* Op node, where operand used to be. */
11910 #ifdef RE_TRACK_PATTERN_OFFSETS
11911     if (RExC_offsets) {         /* MJD */
11912         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11913               "reginsert",
11914               __LINE__,
11915               PL_reg_name[op],
11916               (UV)(place - RExC_emit_start) > RExC_offsets[0]
11917               ? "Overwriting end of array!\n" : "OK",
11918               (UV)(place - RExC_emit_start),
11919               (UV)(RExC_parse - RExC_start),
11920               (UV)RExC_offsets[0]));
11921         Set_Node_Offset(place, RExC_parse);
11922         Set_Node_Length(place, 1);
11923     }
11924 #endif
11925     src = NEXTOPER(place);
11926     FILL_ADVANCE_NODE(place, op);
11927     Zero(src, offset, regnode);
11928 }
11929
11930 /*
11931 - regtail - set the next-pointer at the end of a node chain of p to val.
11932 - SEE ALSO: regtail_study
11933 */
11934 /* TODO: All three parms should be const */
11935 STATIC void
11936 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11937 {
11938     dVAR;
11939     register regnode *scan;
11940     GET_RE_DEBUG_FLAGS_DECL;
11941
11942     PERL_ARGS_ASSERT_REGTAIL;
11943 #ifndef DEBUGGING
11944     PERL_UNUSED_ARG(depth);
11945 #endif
11946
11947     if (SIZE_ONLY)
11948         return;
11949
11950     /* Find last node. */
11951     scan = p;
11952     for (;;) {
11953         regnode * const temp = regnext(scan);
11954         DEBUG_PARSE_r({
11955             SV * const mysv=sv_newmortal();
11956             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
11957             regprop(RExC_rx, mysv, scan);
11958             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
11959                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
11960                     (temp == NULL ? "->" : ""),
11961                     (temp == NULL ? PL_reg_name[OP(val)] : "")
11962             );
11963         });
11964         if (temp == NULL)
11965             break;
11966         scan = temp;
11967     }
11968
11969     if (reg_off_by_arg[OP(scan)]) {
11970         ARG_SET(scan, val - scan);
11971     }
11972     else {
11973         NEXT_OFF(scan) = val - scan;
11974     }
11975 }
11976
11977 #ifdef DEBUGGING
11978 /*
11979 - regtail_study - set the next-pointer at the end of a node chain of p to val.
11980 - Look for optimizable sequences at the same time.
11981 - currently only looks for EXACT chains.
11982
11983 This is experimental code. The idea is to use this routine to perform
11984 in place optimizations on branches and groups as they are constructed,
11985 with the long term intention of removing optimization from study_chunk so
11986 that it is purely analytical.
11987
11988 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
11989 to control which is which.
11990
11991 */
11992 /* TODO: All four parms should be const */
11993
11994 STATIC U8
11995 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11996 {
11997     dVAR;
11998     register regnode *scan;
11999     U8 exact = PSEUDO;
12000 #ifdef EXPERIMENTAL_INPLACESCAN
12001     I32 min = 0;
12002 #endif
12003     GET_RE_DEBUG_FLAGS_DECL;
12004
12005     PERL_ARGS_ASSERT_REGTAIL_STUDY;
12006
12007
12008     if (SIZE_ONLY)
12009         return exact;
12010
12011     /* Find last node. */
12012
12013     scan = p;
12014     for (;;) {
12015         regnode * const temp = regnext(scan);
12016 #ifdef EXPERIMENTAL_INPLACESCAN
12017         if (PL_regkind[OP(scan)] == EXACT) {
12018             bool has_exactf_sharp_s;    /* Unexamined in this routine */
12019             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12020                 return EXACT;
12021         }
12022 #endif
12023         if ( exact ) {
12024             switch (OP(scan)) {
12025                 case EXACT:
12026                 case EXACTF:
12027                 case EXACTFA:
12028                 case EXACTFU:
12029                 case EXACTFU_SS:
12030                 case EXACTFU_NO_TRIE:
12031                 case EXACTFL:
12032                         if( exact == PSEUDO )
12033                             exact= OP(scan);
12034                         else if ( exact != OP(scan) )
12035                             exact= 0;
12036                 case NOTHING:
12037                     break;
12038                 default:
12039                     exact= 0;
12040             }
12041         }
12042         DEBUG_PARSE_r({
12043             SV * const mysv=sv_newmortal();
12044             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12045             regprop(RExC_rx, mysv, scan);
12046             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12047                 SvPV_nolen_const(mysv),
12048                 REG_NODE_NUM(scan),
12049                 PL_reg_name[exact]);
12050         });
12051         if (temp == NULL)
12052             break;
12053         scan = temp;
12054     }
12055     DEBUG_PARSE_r({
12056         SV * const mysv_val=sv_newmortal();
12057         DEBUG_PARSE_MSG("");
12058         regprop(RExC_rx, mysv_val, val);
12059         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12060                       SvPV_nolen_const(mysv_val),
12061                       (IV)REG_NODE_NUM(val),
12062                       (IV)(val - scan)
12063         );
12064     });
12065     if (reg_off_by_arg[OP(scan)]) {
12066         ARG_SET(scan, val - scan);
12067     }
12068     else {
12069         NEXT_OFF(scan) = val - scan;
12070     }
12071
12072     return exact;
12073 }
12074 #endif
12075
12076 /*
12077  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12078  */
12079 #ifdef DEBUGGING
12080 static void
12081 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12082 {
12083     int bit;
12084     int set=0;
12085     regex_charset cs;
12086
12087     for (bit=0; bit<32; bit++) {
12088         if (flags & (1<<bit)) {
12089             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
12090                 continue;
12091             }
12092             if (!set++ && lead)
12093                 PerlIO_printf(Perl_debug_log, "%s",lead);
12094             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12095         }
12096     }
12097     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12098             if (!set++ && lead) {
12099                 PerlIO_printf(Perl_debug_log, "%s",lead);
12100             }
12101             switch (cs) {
12102                 case REGEX_UNICODE_CHARSET:
12103                     PerlIO_printf(Perl_debug_log, "UNICODE");
12104                     break;
12105                 case REGEX_LOCALE_CHARSET:
12106                     PerlIO_printf(Perl_debug_log, "LOCALE");
12107                     break;
12108                 case REGEX_ASCII_RESTRICTED_CHARSET:
12109                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12110                     break;
12111                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12112                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12113                     break;
12114                 default:
12115                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12116                     break;
12117             }
12118     }
12119     if (lead)  {
12120         if (set)
12121             PerlIO_printf(Perl_debug_log, "\n");
12122         else
12123             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12124     }
12125 }
12126 #endif
12127
12128 void
12129 Perl_regdump(pTHX_ const regexp *r)
12130 {
12131 #ifdef DEBUGGING
12132     dVAR;
12133     SV * const sv = sv_newmortal();
12134     SV *dsv= sv_newmortal();
12135     RXi_GET_DECL(r,ri);
12136     GET_RE_DEBUG_FLAGS_DECL;
12137
12138     PERL_ARGS_ASSERT_REGDUMP;
12139
12140     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12141
12142     /* Header fields of interest. */
12143     if (r->anchored_substr) {
12144         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12145             RE_SV_DUMPLEN(r->anchored_substr), 30);
12146         PerlIO_printf(Perl_debug_log,
12147                       "anchored %s%s at %"IVdf" ",
12148                       s, RE_SV_TAIL(r->anchored_substr),
12149                       (IV)r->anchored_offset);
12150     } else if (r->anchored_utf8) {
12151         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12152             RE_SV_DUMPLEN(r->anchored_utf8), 30);
12153         PerlIO_printf(Perl_debug_log,
12154                       "anchored utf8 %s%s at %"IVdf" ",
12155                       s, RE_SV_TAIL(r->anchored_utf8),
12156                       (IV)r->anchored_offset);
12157     }
12158     if (r->float_substr) {
12159         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12160             RE_SV_DUMPLEN(r->float_substr), 30);
12161         PerlIO_printf(Perl_debug_log,
12162                       "floating %s%s at %"IVdf"..%"UVuf" ",
12163                       s, RE_SV_TAIL(r->float_substr),
12164                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12165     } else if (r->float_utf8) {
12166         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12167             RE_SV_DUMPLEN(r->float_utf8), 30);
12168         PerlIO_printf(Perl_debug_log,
12169                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12170                       s, RE_SV_TAIL(r->float_utf8),
12171                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12172     }
12173     if (r->check_substr || r->check_utf8)
12174         PerlIO_printf(Perl_debug_log,
12175                       (const char *)
12176                       (r->check_substr == r->float_substr
12177                        && r->check_utf8 == r->float_utf8
12178                        ? "(checking floating" : "(checking anchored"));
12179     if (r->extflags & RXf_NOSCAN)
12180         PerlIO_printf(Perl_debug_log, " noscan");
12181     if (r->extflags & RXf_CHECK_ALL)
12182         PerlIO_printf(Perl_debug_log, " isall");
12183     if (r->check_substr || r->check_utf8)
12184         PerlIO_printf(Perl_debug_log, ") ");
12185
12186     if (ri->regstclass) {
12187         regprop(r, sv, ri->regstclass);
12188         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12189     }
12190     if (r->extflags & RXf_ANCH) {
12191         PerlIO_printf(Perl_debug_log, "anchored");
12192         if (r->extflags & RXf_ANCH_BOL)
12193             PerlIO_printf(Perl_debug_log, "(BOL)");
12194         if (r->extflags & RXf_ANCH_MBOL)
12195             PerlIO_printf(Perl_debug_log, "(MBOL)");
12196         if (r->extflags & RXf_ANCH_SBOL)
12197             PerlIO_printf(Perl_debug_log, "(SBOL)");
12198         if (r->extflags & RXf_ANCH_GPOS)
12199             PerlIO_printf(Perl_debug_log, "(GPOS)");
12200         PerlIO_putc(Perl_debug_log, ' ');
12201     }
12202     if (r->extflags & RXf_GPOS_SEEN)
12203         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12204     if (r->intflags & PREGf_SKIP)
12205         PerlIO_printf(Perl_debug_log, "plus ");
12206     if (r->intflags & PREGf_IMPLICIT)
12207         PerlIO_printf(Perl_debug_log, "implicit ");
12208     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12209     if (r->extflags & RXf_EVAL_SEEN)
12210         PerlIO_printf(Perl_debug_log, "with eval ");
12211     PerlIO_printf(Perl_debug_log, "\n");
12212     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12213 #else
12214     PERL_ARGS_ASSERT_REGDUMP;
12215     PERL_UNUSED_CONTEXT;
12216     PERL_UNUSED_ARG(r);
12217 #endif  /* DEBUGGING */
12218 }
12219
12220 /*
12221 - regprop - printable representation of opcode
12222 */
/* If do_sep is true, close the current bracketed group in sv and open a new
 * one ("][" wrapped in the colour strings), repeat the "^" when flags has
 * ANYOF_INVERT set, and reset do_sep to 0.  do_sep must be an lvalue.
 * The arguments are parenthesized so that compound expressions can be
 * passed safely. */
#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
STMT_START { \
        if (do_sep) {                           \
            Perl_sv_catpvf(aTHX_ (sv),"%s][%s",PL_colors[1],PL_colors[0]); \
            if ((flags) & ANYOF_INVERT)         \
                /*make sure the invert info is in each */ \
                sv_catpvs((sv), "^");           \
            (do_sep) = 0;                       \
        }                                       \
} STMT_END
12233
12234 void
12235 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12236 {
12237 #ifdef DEBUGGING
12238     dVAR;
12239     register int k;
12240     RXi_GET_DECL(prog,progi);
12241     GET_RE_DEBUG_FLAGS_DECL;
12242
12243     PERL_ARGS_ASSERT_REGPROP;
12244
12245     sv_setpvs(sv, "");
12246
12247     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
12248         /* It would be nice to FAIL() here, but this may be called from
12249            regexec.c, and it would be hard to supply pRExC_state. */
12250         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12251     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12252
12253     k = PL_regkind[OP(o)];
12254
12255     if (k == EXACT) {
12256         sv_catpvs(sv, " ");
12257         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12258          * is a crude hack but it may be the best for now since
12259          * we have no flag "this EXACTish node was UTF-8"
12260          * --jhi */
12261         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12262                   PERL_PV_ESCAPE_UNI_DETECT |
12263                   PERL_PV_ESCAPE_NONASCII   |
12264                   PERL_PV_PRETTY_ELLIPSES   |
12265                   PERL_PV_PRETTY_LTGT       |
12266                   PERL_PV_PRETTY_NOCLEAR
12267                   );
12268     } else if (k == TRIE) {
12269         /* print the details of the trie in dumpuntil instead, as
12270          * progi->data isn't available here */
12271         const char op = OP(o);
12272         const U32 n = ARG(o);
12273         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12274                (reg_ac_data *)progi->data->data[n] :
12275                NULL;
12276         const reg_trie_data * const trie
12277             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12278
12279         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12280         DEBUG_TRIE_COMPILE_r(
12281             Perl_sv_catpvf(aTHX_ sv,
12282                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12283                 (UV)trie->startstate,
12284                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12285                 (UV)trie->wordcount,
12286                 (UV)trie->minlen,
12287                 (UV)trie->maxlen,
12288                 (UV)TRIE_CHARCOUNT(trie),
12289                 (UV)trie->uniquecharcount
12290             )
12291         );
12292         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12293             int i;
12294             int rangestart = -1;
12295             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12296             sv_catpvs(sv, "[");
12297             for (i = 0; i <= 256; i++) {
12298                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12299                     if (rangestart == -1)
12300                         rangestart = i;
12301                 } else if (rangestart != -1) {
12302                     if (i <= rangestart + 3)
12303                         for (; rangestart < i; rangestart++)
12304                             put_byte(sv, rangestart);
12305                     else {
12306                         put_byte(sv, rangestart);
12307                         sv_catpvs(sv, "-");
12308                         put_byte(sv, i - 1);
12309                     }
12310                     rangestart = -1;
12311                 }
12312             }
12313             sv_catpvs(sv, "]");
12314         }
12315
12316     } else if (k == CURLY) {
12317         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12318             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12319         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12320     }
12321     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
12322         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12323     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12324         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
12325         if ( RXp_PAREN_NAMES(prog) ) {
12326             if ( k != REF || (OP(o) < NREF)) {
12327                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12328                 SV **name= av_fetch(list, ARG(o), 0 );
12329                 if (name)
12330                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12331             }
12332             else {
12333                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12334                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12335                 I32 *nums=(I32*)SvPVX(sv_dat);
12336                 SV **name= av_fetch(list, nums[0], 0 );
12337                 I32 n;
12338                 if (name) {
12339                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
12340                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12341                                     (n ? "," : ""), (IV)nums[n]);
12342                     }
12343                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12344                 }
12345             }
12346         }
12347     } else if (k == GOSUB)
12348         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12349     else if (k == VERB) {
12350         if (!o->flags)
12351             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12352                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12353     } else if (k == LOGICAL)
12354         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
12355     else if (k == ANYOF) {
12356         int i, rangestart = -1;
12357         const U8 flags = ANYOF_FLAGS(o);
12358         int do_sep = 0;
12359
12360         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12361         static const char * const anyofs[] = {
12362             "\\w",
12363             "\\W",
12364             "\\s",
12365             "\\S",
12366             "\\d",
12367             "\\D",
12368             "[:alnum:]",
12369             "[:^alnum:]",
12370             "[:alpha:]",
12371             "[:^alpha:]",
12372             "[:ascii:]",
12373             "[:^ascii:]",
12374             "[:cntrl:]",
12375             "[:^cntrl:]",
12376             "[:graph:]",
12377             "[:^graph:]",
12378             "[:lower:]",
12379             "[:^lower:]",
12380             "[:print:]",
12381             "[:^print:]",
12382             "[:punct:]",
12383             "[:^punct:]",
12384             "[:upper:]",
12385             "[:^upper:]",
12386             "[:xdigit:]",
12387             "[:^xdigit:]",
12388             "[:space:]",
12389             "[:^space:]",
12390             "[:blank:]",
12391             "[:^blank:]"
12392         };
12393
12394         if (flags & ANYOF_LOCALE)
12395             sv_catpvs(sv, "{loc}");
12396         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12397             sv_catpvs(sv, "{i}");
12398         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12399         if (flags & ANYOF_INVERT)
12400             sv_catpvs(sv, "^");
12401
12402         /* output what the standard cp 0-255 bitmap matches */
12403         for (i = 0; i <= 256; i++) {
12404             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12405                 if (rangestart == -1)
12406                     rangestart = i;
12407             } else if (rangestart != -1) {
12408                 if (i <= rangestart + 3)
12409                     for (; rangestart < i; rangestart++)
12410                         put_byte(sv, rangestart);
12411                 else {
12412                     put_byte(sv, rangestart);
12413                     sv_catpvs(sv, "-");
12414                     put_byte(sv, i - 1);
12415                 }
12416                 do_sep = 1;
12417                 rangestart = -1;
12418             }
12419         }
12420
12421         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12422         /* output any special charclass tests (used entirely under use locale) */
12423         if (ANYOF_CLASS_TEST_ANY_SET(o))
12424             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12425                 if (ANYOF_CLASS_TEST(o,i)) {
12426                     sv_catpv(sv, anyofs[i]);
12427                     do_sep = 1;
12428                 }
12429
12430         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12431
12432         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12433             sv_catpvs(sv, "{non-utf8-latin1-all}");
12434         }
12435
12436         /* output information about the unicode matching */
12437         if (flags & ANYOF_UNICODE_ALL)
12438             sv_catpvs(sv, "{unicode_all}");
12439         else if (ANYOF_NONBITMAP(o))
12440             sv_catpvs(sv, "{unicode}");
12441         if (flags & ANYOF_NONBITMAP_NON_UTF8)
12442             sv_catpvs(sv, "{outside bitmap}");
12443
12444         if (ANYOF_NONBITMAP(o)) {
12445             SV *lv; /* Set if there is something outside the bit map */
12446             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12447             bool byte_output = FALSE;   /* If something in the bitmap has been
12448                                            output */
12449
12450             if (lv && lv != &PL_sv_undef) {
12451                 if (sw) {
12452                     U8 s[UTF8_MAXBYTES_CASE+1];
12453
12454                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12455                         uvchr_to_utf8(s, i);
12456
12457                         if (i < 256
12458                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
12459                                                                things already
12460                                                                output as part
12461                                                                of the bitmap */
12462                             && swash_fetch(sw, s, TRUE))
12463                         {
12464                             if (rangestart == -1)
12465                                 rangestart = i;
12466                         } else if (rangestart != -1) {
12467                             byte_output = TRUE;
12468                             if (i <= rangestart + 3)
12469                                 for (; rangestart < i; rangestart++) {
12470                                     put_byte(sv, rangestart);
12471                                 }
12472                             else {
12473                                 put_byte(sv, rangestart);
12474                                 sv_catpvs(sv, "-");
12475                                 put_byte(sv, i-1);
12476                             }
12477                             rangestart = -1;
12478                         }
12479                     }
12480                 }
12481
12482                 {
12483                     char *s = savesvpv(lv);
12484                     char * const origs = s;
12485
12486                     while (*s && *s != '\n')
12487                         s++;
12488
12489                     if (*s == '\n') {
12490                         const char * const t = ++s;
12491
12492                         if (byte_output) {
12493                             sv_catpvs(sv, " ");
12494                         }
12495
12496                         while (*s) {
12497                             if (*s == '\n') {
12498
12499                                 /* Truncate very long output */
12500                                 if (s - origs > 256) {
12501                                     Perl_sv_catpvf(aTHX_ sv,
12502                                                    "%.*s...",
12503                                                    (int) (s - origs - 1),
12504                                                    t);
12505                                     goto out_dump;
12506                                 }
12507                                 *s = ' ';
12508                             }
12509                             else if (*s == '\t') {
12510                                 *s = '-';
12511                             }
12512                             s++;
12513                         }
12514                         if (s[-1] == ' ')
12515                             s[-1] = 0;
12516
12517                         sv_catpv(sv, t);
12518                     }
12519
12520                 out_dump:
12521
12522                     Safefree(origs);
12523                 }
12524                 SvREFCNT_dec(lv);
12525             }
12526         }
12527
12528         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12529     }
12530     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12531         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12532 #else
12533     PERL_UNUSED_CONTEXT;
12534     PERL_UNUSED_ARG(sv);
12535     PERL_UNUSED_ARG(o);
12536     PERL_UNUSED_ARG(prog);
12537 #endif  /* DEBUGGING */
12538 }
12539
12540 SV *
12541 Perl_re_intuit_string(pTHX_ REGEXP * const r)
12542 {                               /* Assume that RE_INTUIT is set */
12543     dVAR;
12544     struct regexp *const prog = (struct regexp *)SvANY(r);
12545     GET_RE_DEBUG_FLAGS_DECL;
12546
12547     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12548     PERL_UNUSED_CONTEXT;
12549
12550     DEBUG_COMPILE_r(
12551         {
12552             const char * const s = SvPV_nolen_const(prog->check_substr
12553                       ? prog->check_substr : prog->check_utf8);
12554
12555             if (!PL_colorset) reginitcolors();
12556             PerlIO_printf(Perl_debug_log,
12557                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12558                       PL_colors[4],
12559                       prog->check_substr ? "" : "utf8 ",
12560                       PL_colors[5],PL_colors[0],
12561                       s,
12562                       PL_colors[1],
12563                       (strlen(s) > 60 ? "..." : ""));
12564         } );
12565
12566     return prog->check_substr ? prog->check_substr : prog->check_utf8;
12567 }
12568
12569 /*
12570    pregfree()
12571
12572    handles refcounting and freeing the perl core regexp structure. When
12573    it is necessary to actually free the structure the first thing it
12574    does is call the 'free' method of the regexp_engine associated to
12575    the regexp, allowing the handling of the void *pprivate; member
12576    first. (This routine is not overridable by extensions, which is why
12577    the extension's free is called first.)
12578
12579    See regdupe and regdupe_internal if you change anything here.
12580 */
12581 #ifndef PERL_IN_XSUB_RE
12582 void
12583 Perl_pregfree(pTHX_ REGEXP *r)
12584 {
12585     SvREFCNT_dec(r);
12586 }
12587
12588 void
12589 Perl_pregfree2(pTHX_ REGEXP *rx)
12590 {
12591     dVAR;
12592     struct regexp *const r = (struct regexp *)SvANY(rx);
12593     GET_RE_DEBUG_FLAGS_DECL;
12594
12595     PERL_ARGS_ASSERT_PREGFREE2;
12596
12597     if (r->mother_re) {
12598         ReREFCNT_dec(r->mother_re);
12599     } else {
12600         CALLREGFREE_PVT(rx); /* free the private data */
12601         SvREFCNT_dec(RXp_PAREN_NAMES(r));
12602     }
12603     if (r->substrs) {
12604         SvREFCNT_dec(r->anchored_substr);
12605         SvREFCNT_dec(r->anchored_utf8);
12606         SvREFCNT_dec(r->float_substr);
12607         SvREFCNT_dec(r->float_utf8);
12608         Safefree(r->substrs);
12609     }
12610     RX_MATCH_COPY_FREE(rx);
12611 #ifdef PERL_OLD_COPY_ON_WRITE
12612     SvREFCNT_dec(r->saved_copy);
12613 #endif
12614     Safefree(r->offs);
12615 }
12616
12617 /*  reg_temp_copy()
12618
12619     This is a hacky workaround to the structural issue of match results
12620     being stored in the regexp structure which is in turn stored in
12621     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12622     could be PL_curpm in multiple contexts, and could require multiple
12623     result sets being associated with the pattern simultaneously, such
12624     as when doing a recursive match with (??{$qr})
12625
12626     The solution is to make a lightweight copy of the regexp structure
12627     when a qr// is returned from the code executed by (??{$qr}) this
12628     lightweight copy doesn't actually own any of its data except for
12629     the starp/end and the actual regexp structure itself.
12630
12631 */
12632
12633
12634 REGEXP *
12635 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12636 {
12637     struct regexp *ret;
12638     struct regexp *const r = (struct regexp *)SvANY(rx);
12639     register const I32 npar = r->nparens+1;
12640
12641     PERL_ARGS_ASSERT_REG_TEMP_COPY;
12642
12643     if (!ret_x)
12644         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12645     ret = (struct regexp *)SvANY(ret_x);
12646
12647     (void)ReREFCNT_inc(rx);
12648     /* We can take advantage of the existing "copied buffer" mechanism in SVs
12649        by pointing directly at the buffer, but flagging that the allocated
12650        space in the copy is zero. As we've just done a struct copy, it's now
12651        a case of zero-ing that, rather than copying the current length.  */
12652     SvPV_set(ret_x, RX_WRAPPED(rx));
12653     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12654     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12655            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12656     SvLEN_set(ret_x, 0);
12657     SvSTASH_set(ret_x, NULL);
12658     SvMAGIC_set(ret_x, NULL);
12659     Newx(ret->offs, npar, regexp_paren_pair);
12660     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12661     if (r->substrs) {
12662         Newx(ret->substrs, 1, struct reg_substr_data);
12663         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12664
12665         SvREFCNT_inc_void(ret->anchored_substr);
12666         SvREFCNT_inc_void(ret->anchored_utf8);
12667         SvREFCNT_inc_void(ret->float_substr);
12668         SvREFCNT_inc_void(ret->float_utf8);
12669
12670         /* check_substr and check_utf8, if non-NULL, point to either their
12671            anchored or float namesakes, and don't hold a second reference.  */
12672     }
12673     RX_MATCH_COPIED_off(ret_x);
12674 #ifdef PERL_OLD_COPY_ON_WRITE
12675     ret->saved_copy = NULL;
12676 #endif
12677     ret->mother_re = rx;
12678
12679     return ret_x;
12680 }
12681 #endif
12682
12683 /* regfree_internal()
12684
12685    Free the private data in a regexp. This is overloadable by
12686    extensions. Perl takes care of the regexp structure in pregfree(),
12687    this covers the *pprivate pointer which technically perl doesn't
12688    know about, however of course we have to handle the
12689    regexp_internal structure when no extension is in use.
12690
12691    Note this is called before freeing anything in the regexp
12692    structure.
12693  */
12694
12695 void
12696 Perl_regfree_internal(pTHX_ REGEXP * const rx)
12697 {
12698     dVAR;
12699     struct regexp *const r = (struct regexp *)SvANY(rx);
12700     RXi_GET_DECL(r,ri);
12701     GET_RE_DEBUG_FLAGS_DECL;
12702
12703     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12704
12705     DEBUG_COMPILE_r({
12706         if (!PL_colorset)
12707             reginitcolors();
12708         {
12709             SV *dsv= sv_newmortal();
12710             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12711                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12712             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12713                 PL_colors[4],PL_colors[5],s);
12714         }
12715     });
12716 #ifdef RE_TRACK_PATTERN_OFFSETS
12717     if (ri->u.offsets)
12718         Safefree(ri->u.offsets);             /* 20010421 MJD */
12719 #endif
12720     if (ri->data) {
12721         int n = ri->data->count;
12722         PAD* new_comppad = NULL;
12723         PAD* old_comppad;
12724         PADOFFSET refcnt;
12725
12726         while (--n >= 0) {
12727           /* If you add a ->what type here, update the comment in regcomp.h */
12728             switch (ri->data->what[n]) {
12729             case 'a':
12730             case 's':
12731             case 'S':
12732             case 'u':
12733                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12734                 break;
12735             case 'f':
12736                 Safefree(ri->data->data[n]);
12737                 break;
12738             case 'p':
12739                 new_comppad = MUTABLE_AV(ri->data->data[n]);
12740                 break;
12741             case 'o':
12742                 if (new_comppad == NULL)
12743                     Perl_croak(aTHX_ "panic: pregfree comppad");
12744                 PAD_SAVE_LOCAL(old_comppad,
12745                     /* Watch out for global destruction's random ordering. */
12746                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12747                 );
12748                 OP_REFCNT_LOCK;
12749                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12750                 OP_REFCNT_UNLOCK;
12751                 if (!refcnt)
12752                     op_free((OP_4tree*)ri->data->data[n]);
12753
12754                 PAD_RESTORE_LOCAL(old_comppad);
12755                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12756                 new_comppad = NULL;
12757                 break;
12758             case 'n':
12759                 break;
12760             case 'T':
12761                 { /* Aho Corasick add-on structure for a trie node.
12762                      Used in stclass optimization only */
12763                     U32 refcount;
12764                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12765                     OP_REFCNT_LOCK;
12766                     refcount = --aho->refcount;
12767                     OP_REFCNT_UNLOCK;
12768                     if ( !refcount ) {
12769                         PerlMemShared_free(aho->states);
12770                         PerlMemShared_free(aho->fail);
12771                          /* do this last!!!! */
12772                         PerlMemShared_free(ri->data->data[n]);
12773                         PerlMemShared_free(ri->regstclass);
12774                     }
12775                 }
12776                 break;
12777             case 't':
12778                 {
12779                     /* trie structure. */
12780                     U32 refcount;
12781                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12782                     OP_REFCNT_LOCK;
12783                     refcount = --trie->refcount;
12784                     OP_REFCNT_UNLOCK;
12785                     if ( !refcount ) {
12786                         PerlMemShared_free(trie->charmap);
12787                         PerlMemShared_free(trie->states);
12788                         PerlMemShared_free(trie->trans);
12789                         if (trie->bitmap)
12790                             PerlMemShared_free(trie->bitmap);
12791                         if (trie->jump)
12792                             PerlMemShared_free(trie->jump);
12793                         PerlMemShared_free(trie->wordinfo);
12794                         /* do this last!!!! */
12795                         PerlMemShared_free(ri->data->data[n]);
12796                     }
12797                 }
12798                 break;
12799             default:
12800                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12801             }
12802         }
12803         Safefree(ri->data->what);
12804         Safefree(ri->data);
12805     }
12806
12807     Safefree(ri);
12808 }
12809
/* Typed wrappers around sv_dup_inc() for values that are really AVs or HVs.
 * The arguments are parenthesised so that an expression argument is cast
 * to (const SV *) as a whole rather than only its first operand. */
#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)(s),(t)))
#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)(s),(t)))
/* savepvn() that yields NULL for a NULL p.  Note: p is evaluated twice. */
#define SAVEPVN(p,n)    ((p) ? savepvn((p),(n)) : NULL)
12813
12814 /*
12815    re_dup - duplicate a regexp.
12816
12817    This routine is expected to clone a given regexp structure. It is only
12818    compiled under USE_ITHREADS.
12819
   After all of the core data stored in struct regexp is duplicated
   the regexp_engine.dupe method is used to copy any private data
   stored in the *pprivate pointer. This allows extensions to handle
   any duplication they need to do.
12824
12825    See pregfree() and regfree_internal() if you change anything here.
12826 */
12827 #if defined(USE_ITHREADS)
12828 #ifndef PERL_IN_XSUB_RE
12829 void
12830 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12831 {
12832     dVAR;
12833     I32 npar;
12834     const struct regexp *r = (const struct regexp *)SvANY(sstr);
12835     struct regexp *ret = (struct regexp *)SvANY(dstr);
12836
12837     PERL_ARGS_ASSERT_RE_DUP_GUTS;
12838
12839     npar = r->nparens+1;
12840     Newx(ret->offs, npar, regexp_paren_pair);
12841     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12842     if(ret->swap) {
12843         /* no need to copy these */
12844         Newx(ret->swap, npar, regexp_paren_pair);
12845     }
12846
12847     if (ret->substrs) {
12848         /* Do it this way to avoid reading from *r after the StructCopy().
12849            That way, if any of the sv_dup_inc()s dislodge *r from the L1
12850            cache, it doesn't matter.  */
12851         const bool anchored = r->check_substr
12852             ? r->check_substr == r->anchored_substr
12853             : r->check_utf8 == r->anchored_utf8;
12854         Newx(ret->substrs, 1, struct reg_substr_data);
12855         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12856
12857         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12858         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12859         ret->float_substr = sv_dup_inc(ret->float_substr, param);
12860         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12861
12862         /* check_substr and check_utf8, if non-NULL, point to either their
12863            anchored or float namesakes, and don't hold a second reference.  */
12864
12865         if (ret->check_substr) {
12866             if (anchored) {
12867                 assert(r->check_utf8 == r->anchored_utf8);
12868                 ret->check_substr = ret->anchored_substr;
12869                 ret->check_utf8 = ret->anchored_utf8;
12870             } else {
12871                 assert(r->check_substr == r->float_substr);
12872                 assert(r->check_utf8 == r->float_utf8);
12873                 ret->check_substr = ret->float_substr;
12874                 ret->check_utf8 = ret->float_utf8;
12875             }
12876         } else if (ret->check_utf8) {
12877             if (anchored) {
12878                 ret->check_utf8 = ret->anchored_utf8;
12879             } else {
12880                 ret->check_utf8 = ret->float_utf8;
12881             }
12882         }
12883     }
12884
12885     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12886
12887     if (ret->pprivate)
12888         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12889
12890     if (RX_MATCH_COPIED(dstr))
12891         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
12892     else
12893         ret->subbeg = NULL;
12894 #ifdef PERL_OLD_COPY_ON_WRITE
12895     ret->saved_copy = NULL;
12896 #endif
12897
12898     if (ret->mother_re) {
12899         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12900             /* Our storage points directly to our mother regexp, but that's
12901                1: a buffer in a different thread
12902                2: something we no longer hold a reference on
12903                so we need to copy it locally.  */
12904             /* Note we need to use SvCUR(), rather than
12905                SvLEN(), on our mother_re, because it, in
12906                turn, may well be pointing to its own mother_re.  */
12907             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
12908                                    SvCUR(ret->mother_re)+1));
12909             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
12910         }
12911         ret->mother_re      = NULL;
12912     }
12913     ret->gofs = 0;
12914 }
12915 #endif /* PERL_IN_XSUB_RE */
12916
12917 /*
12918    regdupe_internal()
12919
12920    This is the internal complement to regdupe() which is used to copy
12921    the structure pointed to by the *pprivate pointer in the regexp.
12922    This is the core version of the extension overridable cloning hook.
   The regexp structure being duplicated will be copied by perl prior
   to this and will be provided as the rx argument, however
   with the /old/ structure's pprivate pointer value. Thus this routine
   may override any copying normally done by perl.
12927
12928    It returns a pointer to the new regexp_internal structure.
12929 */
12930
12931 void *
12932 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
12933 {
12934     dVAR;
12935     struct regexp *const r = (struct regexp *)SvANY(rx);
12936     regexp_internal *reti;
12937     int len;
12938     RXi_GET_DECL(r,ri);
12939
12940     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
12941
12942     len = ProgLen(ri);
12943
12944     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
12945     Copy(ri->program, reti->program, len+1, regnode);
12946
12947
12948     reti->regstclass = NULL;
12949
12950     if (ri->data) {
12951         struct reg_data *d;
12952         const int count = ri->data->count;
12953         int i;
12954
12955         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
12956                 char, struct reg_data);
12957         Newx(d->what, count, U8);
12958
12959         d->count = count;
12960         for (i = 0; i < count; i++) {
12961             d->what[i] = ri->data->what[i];
12962             switch (d->what[i]) {
12963                 /* legal options are one of: sSfpontTua
12964                    see also regcomp.h and pregfree() */
12965             case 'a': /* actually an AV, but the dup function is identical.  */
12966             case 's':
12967             case 'S':
12968             case 'p': /* actually an AV, but the dup function is identical.  */
12969             case 'u': /* actually an HV, but the dup function is identical.  */
12970                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
12971                 break;
12972             case 'f':
12973                 /* This is cheating. */
12974                 Newx(d->data[i], 1, struct regnode_charclass_class);
12975                 StructCopy(ri->data->data[i], d->data[i],
12976                             struct regnode_charclass_class);
12977                 reti->regstclass = (regnode*)d->data[i];
12978                 break;
12979             case 'o':
12980                 /* Compiled op trees are readonly and in shared memory,
12981                    and can thus be shared without duplication. */
12982                 OP_REFCNT_LOCK;
12983                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
12984                 OP_REFCNT_UNLOCK;
12985                 break;
12986             case 'T':
12987                 /* Trie stclasses are readonly and can thus be shared
12988                  * without duplication. We free the stclass in pregfree
12989                  * when the corresponding reg_ac_data struct is freed.
12990                  */
12991                 reti->regstclass= ri->regstclass;
12992                 /* Fall through */
12993             case 't':
12994                 OP_REFCNT_LOCK;
12995                 ((reg_trie_data*)ri->data->data[i])->refcount++;
12996                 OP_REFCNT_UNLOCK;
12997                 /* Fall through */
12998             case 'n':
12999                 d->data[i] = ri->data->data[i];
13000                 break;
13001             default:
13002                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13003             }
13004         }
13005
13006         reti->data = d;
13007     }
13008     else
13009         reti->data = NULL;
13010
13011     reti->name_list_idx = ri->name_list_idx;
13012
13013 #ifdef RE_TRACK_PATTERN_OFFSETS
13014     if (ri->u.offsets) {
13015         Newx(reti->u.offsets, 2*len+1, U32);
13016         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13017     }
13018 #else
13019     SetProgLen(reti,len);
13020 #endif
13021
13022     return (void*)reti;
13023 }
13024
13025 #endif    /* USE_ITHREADS */
13026
13027 #ifndef PERL_IN_XSUB_RE
13028
13029 /*
13030  - regnext - dig the "next" pointer out of a node
13031  */
13032 regnode *
13033 Perl_regnext(pTHX_ register regnode *p)
13034 {
13035     dVAR;
13036     register I32 offset;
13037
13038     if (!p)
13039         return(NULL);
13040
13041     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
13042         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13043     }
13044
13045     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13046     if (offset == 0)
13047         return(NULL);
13048
13049     return(p+offset);
13050 }
13051 #endif
13052
13053 STATIC void
13054 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13055 {
13056     va_list args;
13057     STRLEN l1 = strlen(pat1);
13058     STRLEN l2 = strlen(pat2);
13059     char buf[512];
13060     SV *msv;
13061     const char *message;
13062
13063     PERL_ARGS_ASSERT_RE_CROAK2;
13064
13065     if (l1 > 510)
13066         l1 = 510;
13067     if (l1 + l2 > 510)
13068         l2 = 510 - l1;
13069     Copy(pat1, buf, l1 , char);
13070     Copy(pat2, buf + l1, l2 , char);
13071     buf[l1 + l2] = '\n';
13072     buf[l1 + l2 + 1] = '\0';
13073 #ifdef I_STDARG
13074     /* ANSI variant takes additional second argument */
13075     va_start(args, pat2);
13076 #else
13077     va_start(args);
13078 #endif
13079     msv = vmess(buf, &args);
13080     va_end(args);
13081     message = SvPV_const(msv,l1);
13082     if (l1 > 512)
13083         l1 = 512;
13084     Copy(message, buf, l1 , char);
13085     buf[l1-1] = '\0';                   /* Overwrite \n */
13086     Perl_croak(aTHX_ "%s", buf);
13087 }
13088
13089 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13090
13091 #ifndef PERL_IN_XSUB_RE
13092 void
13093 Perl_save_re_context(pTHX)
13094 {
13095     dVAR;
13096
13097     struct re_save_state *state;
13098
13099     SAVEVPTR(PL_curcop);
13100     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13101
13102     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13103     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13104     SSPUSHUV(SAVEt_RE_STATE);
13105
13106     Copy(&PL_reg_state, state, 1, struct re_save_state);
13107
13108     PL_reg_start_tmp = 0;
13109     PL_reg_start_tmpl = 0;
13110     PL_reg_oldsaved = NULL;
13111     PL_reg_oldsavedlen = 0;
13112     PL_reg_maxiter = 0;
13113     PL_reg_leftiter = 0;
13114     PL_reg_poscache = NULL;
13115     PL_reg_poscache_size = 0;
13116 #ifdef PERL_OLD_COPY_ON_WRITE
13117     PL_nrs = NULL;
13118 #endif
13119
13120     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13121     if (PL_curpm) {
13122         const REGEXP * const rx = PM_GETRE(PL_curpm);
13123         if (rx) {
13124             U32 i;
13125             for (i = 1; i <= RX_NPARENS(rx); i++) {
13126                 char digits[TYPE_CHARS(long)];
13127                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13128                 GV *const *const gvp
13129                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13130
13131                 if (gvp) {
13132                     GV * const gv = *gvp;
13133                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13134                         save_scalar(gv);
13135                 }
13136             }
13137         }
13138     }
13139 }
13140 #endif
13141
13142 static void
13143 clear_re(pTHX_ void *r)
13144 {
13145     dVAR;
13146     ReREFCNT_dec((REGEXP *)r);
13147 }
13148
13149 #ifdef DEBUGGING
13150
13151 STATIC void
13152 S_put_byte(pTHX_ SV *sv, int c)
13153 {
13154     PERL_ARGS_ASSERT_PUT_BYTE;
13155
13156     /* Our definition of isPRINT() ignores locales, so only bytes that are
13157        not part of UTF-8 are considered printable. I assume that the same
13158        holds for UTF-EBCDIC.
13159        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13160        which Wikipedia says:
13161
13162        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13163        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13164        identical, to the ASCII delete (DEL) or rubout control character.
13165        ) So the old condition can be simplified to !isPRINT(c)  */
13166     if (!isPRINT(c)) {
13167         if (c < 256) {
13168             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13169         }
13170         else {
13171             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13172         }
13173     }
13174     else {
13175         const char string = c;
13176         if (c == '-' || c == ']' || c == '\\' || c == '^')
13177             sv_catpvs(sv, "\\");
13178         sv_catpvn(sv, &string, 1);
13179     }
13180 }
13181
13182
/* Close a pending run of optimised-away nodes: when optstart is set,
 * report how many nodes the run covered (under DEBUG_OPTIMISE_r) and
 * reset optstart.  Wrapped as a single statement so that it cannot
 * capture an "else" that follows it. */
#define CLEAR_OPTSTART \
    STMT_START { \
        if (optstart) { \
            DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
            optstart=NULL; \
        } \
    } STMT_END

/* Recurse into dumpuntil() for the nodes from b up to e, one level
 * deeper, and continue from the node it returns.  Wrapped as a single
 * statement; write a ';' after each use. */
#define DUMPUNTIL(b,e) \
    STMT_START { \
        CLEAR_OPTSTART; \
        node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); \
    } STMT_END
13190
13191 STATIC const regnode *
13192 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13193             const regnode *last, const regnode *plast,
13194             SV* sv, I32 indent, U32 depth)
13195 {
13196     dVAR;
13197     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
13198     register const regnode *next;
13199     const regnode *optstart= NULL;
13200
13201     RXi_GET_DECL(r,ri);
13202     GET_RE_DEBUG_FLAGS_DECL;
13203
13204     PERL_ARGS_ASSERT_DUMPUNTIL;
13205
13206 #ifdef DEBUG_DUMPUNTIL
13207     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13208         last ? last-start : 0,plast ? plast-start : 0);
13209 #endif
13210
13211     if (plast && plast < last)
13212         last= plast;
13213
13214     while (PL_regkind[op] != END && (!last || node < last)) {
13215         /* While that wasn't END last time... */
13216         NODE_ALIGN(node);
13217         op = OP(node);
13218         if (op == CLOSE || op == WHILEM)
13219             indent--;
13220         next = regnext((regnode *)node);
13221
13222         /* Where, what. */
13223         if (OP(node) == OPTIMIZED) {
13224             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13225                 optstart = node;
13226             else
13227                 goto after_print;
13228         } else
13229             CLEAR_OPTSTART;
13230
13231         regprop(r, sv, node);
13232         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13233                       (int)(2*indent + 1), "", SvPVX_const(sv));
13234
13235         if (OP(node) != OPTIMIZED) {
13236             if (next == NULL)           /* Next ptr. */
13237                 PerlIO_printf(Perl_debug_log, " (0)");
13238             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13239                 PerlIO_printf(Perl_debug_log, " (FAIL)");
13240             else
13241                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13242             (void)PerlIO_putc(Perl_debug_log, '\n');
13243         }
13244
13245       after_print:
13246         if (PL_regkind[(U8)op] == BRANCHJ) {
13247             assert(next);
13248             {
13249                 register const regnode *nnode = (OP(next) == LONGJMP
13250                                              ? regnext((regnode *)next)
13251                                              : next);
13252                 if (last && nnode > last)
13253                     nnode = last;
13254                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13255             }
13256         }
13257         else if (PL_regkind[(U8)op] == BRANCH) {
13258             assert(next);
13259             DUMPUNTIL(NEXTOPER(node), next);
13260         }
13261         else if ( PL_regkind[(U8)op]  == TRIE ) {
13262             const regnode *this_trie = node;
13263             const char op = OP(node);
13264             const U32 n = ARG(node);
13265             const reg_ac_data * const ac = op>=AHOCORASICK ?
13266                (reg_ac_data *)ri->data->data[n] :
13267                NULL;
13268             const reg_trie_data * const trie =
13269                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13270 #ifdef DEBUGGING
13271             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13272 #endif
13273             const regnode *nextbranch= NULL;
13274             I32 word_idx;
13275             sv_setpvs(sv, "");
13276             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13277                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13278
13279                 PerlIO_printf(Perl_debug_log, "%*s%s ",
13280                    (int)(2*(indent+3)), "",
13281                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13282                             PL_colors[0], PL_colors[1],
13283                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13284                             PERL_PV_PRETTY_ELLIPSES    |
13285                             PERL_PV_PRETTY_LTGT
13286                             )
13287                             : "???"
13288                 );
13289                 if (trie->jump) {
13290                     U16 dist= trie->jump[word_idx+1];
13291                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13292                                   (UV)((dist ? this_trie + dist : next) - start));
13293                     if (dist) {
13294                         if (!nextbranch)
13295                             nextbranch= this_trie + trie->jump[0];
13296                         DUMPUNTIL(this_trie + dist, nextbranch);
13297                     }
13298                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13299                         nextbranch= regnext((regnode *)nextbranch);
13300                 } else {
13301                     PerlIO_printf(Perl_debug_log, "\n");
13302                 }
13303             }
13304             if (last && next > last)
13305                 node= last;
13306             else
13307                 node= next;
13308         }
13309         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
13310             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13311                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13312         }
13313         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13314             assert(next);
13315             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13316         }
13317         else if ( op == PLUS || op == STAR) {
13318             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13319         }
13320         else if (PL_regkind[(U8)op] == ANYOF) {
13321             /* arglen 1 + class block */
13322             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13323                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13324             node = NEXTOPER(node);
13325         }
13326         else if (PL_regkind[(U8)op] == EXACT) {
13327             /* Literal string, where present. */
13328             node += NODE_SZ_STR(node) - 1;
13329             node = NEXTOPER(node);
13330         }
13331         else {
13332             node = NEXTOPER(node);
13333             node += regarglen[(U8)op];
13334         }
13335         if (op == CURLYX || op == OPEN)
13336             indent++;
13337     }
13338     CLEAR_OPTSTART;
13339 #ifdef DEBUG_DUMPUNTIL
13340     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13341 #endif
13342     return node;
13343 }
13344
13345 #endif  /* DEBUGGING */
13346
13347 /*
13348  * Local variables:
13349  * c-indentation-style: bsd
13350  * c-basic-offset: 4
13351  * indent-tabs-mode: t
13352  * End:
13353  *
13354  * ex: set ts=8 sts=4 sw=4 noet:
13355  */