src/5015008/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89 #ifndef PERL_IN_XSUB_RE
  90 #  include "charclass_invlists.h"
  91 #endif
  92
  93 #ifdef op
  94 #undef op
  95 #endif /* op */
  96
  97 #ifdef MSDOS
  98 #  if defined(BUGGY_MSC6)
  99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 100 #    pragma optimize("a",off)
 101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 102 #    pragma optimize("w",on )
 103 #  endif /* BUGGY_MSC6 */
 104 #endif /* MSDOS */
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 typedef struct RExC_state_t {
 111     U32         flags;                  /* are we folding, multilining? */
 112     char        *precomp;               /* uncompiled string. */
 113     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 114     regexp      *rx;                    /* perl core regexp structure */
 115     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 116     char        *start;                 /* Start of input for compile */
 117     char        *end;                   /* End of input for compile */
 118     char        *parse;                 /* Input-scan pointer. */
 119     I32         whilem_seen;            /* number of WHILEM in this expr */
 120     regnode     *emit_start;            /* Start of emitted-code area */
 121     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 122     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 123     I32         naughty;                /* How bad is this pattern? */
 124     I32         sawback;                /* Did we see \1, ...? */
 125     U32         seen;
 126     I32         size;                   /* Code size. */
 127     I32         npar;                   /* Capture buffer count, (OPEN). */
 128     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 129     I32         nestroot;               /* root parens we are in - used by accept */
 130     I32         extralen;
 131     I32         seen_zerolen;
 132     I32         seen_evals;
 133     regnode     **open_parens;          /* pointers to open parens */
 134     regnode     **close_parens;         /* pointers to close parens */
 135     regnode     *opend;                 /* END node in program */
 136     I32         utf8;           /* whether the pattern is utf8 or not */
 137     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 138                                 /* XXX use this for future optimisation of case
 139                                  * where pattern must be upgraded to utf8. */
 140     I32         uni_semantics;  /* If a d charset modifier should use unicode
 141                                    rules, even if the pattern is not in
 142                                    utf8 */
 143     HV          *paren_names;           /* Paren names */
 144
 145     regnode     **recurse;              /* Recurse regops */
 146     I32         recurse_count;          /* Number of recurse regops */
 147     I32         in_lookbehind;
 148     I32         contains_locale;
 149     I32         override_recoding;
 150 #if ADD_TO_REGEXEC
 151     char        *starttry;              /* -Dr: where regtry was called. */
 152 #define RExC_starttry   (pRExC_state->starttry)
 153 #endif
 154 #ifdef DEBUGGING
 155     const char  *lastparse;
 156     I32         lastnum;
 157     AV          *paren_name_list;       /* idx -> name */
 158 #define RExC_lastparse  (pRExC_state->lastparse)
 159 #define RExC_lastnum    (pRExC_state->lastnum)
 160 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 161 #endif
 162 } RExC_state_t;
 163
 164 #define RExC_flags      (pRExC_state->flags)
 165 #define RExC_precomp    (pRExC_state->precomp)
 166 #define RExC_rx_sv      (pRExC_state->rx_sv)
 167 #define RExC_rx         (pRExC_state->rx)
 168 #define RExC_rxi        (pRExC_state->rxi)
 169 #define RExC_start      (pRExC_state->start)
 170 #define RExC_end        (pRExC_state->end)
 171 #define RExC_parse      (pRExC_state->parse)
 172 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 173 #ifdef RE_TRACK_PATTERN_OFFSETS
 174 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 175 #endif
 176 #define RExC_emit       (pRExC_state->emit)
 177 #define RExC_emit_start (pRExC_state->emit_start)
 178 #define RExC_emit_bound (pRExC_state->emit_bound)
 179 #define RExC_naughty    (pRExC_state->naughty)
 180 #define RExC_sawback    (pRExC_state->sawback)
 181 #define RExC_seen       (pRExC_state->seen)
 182 #define RExC_size       (pRExC_state->size)
 183 #define RExC_npar       (pRExC_state->npar)
 184 #define RExC_nestroot   (pRExC_state->nestroot)
 185 #define RExC_extralen   (pRExC_state->extralen)
 186 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 187 #define RExC_seen_evals (pRExC_state->seen_evals)
 188 #define RExC_utf8       (pRExC_state->utf8)
 189 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 190 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 191 #define RExC_open_parens        (pRExC_state->open_parens)
 192 #define RExC_close_parens       (pRExC_state->close_parens)
 193 #define RExC_opend      (pRExC_state->opend)
 194 #define RExC_paren_names        (pRExC_state->paren_names)
 195 #define RExC_recurse    (pRExC_state->recurse)
 196 #define RExC_recurse_count      (pRExC_state->recurse_count)
 197 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 198 #define RExC_contains_locale    (pRExC_state->contains_locale)
 199 #define RExC_override_recoding  (pRExC_state->override_recoding)
 200
 201
 202 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 203 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 204         ((*s) == '{' && regcurly(s)))
 205
 206 #ifdef SPSTART
 207 #undef SPSTART          /* dratted cpp namespace... */
 208 #endif
 209 /*
 210  * Flags to be passed up and down.
 211  */
 212 #define WORST           0       /* Worst case. */
 213 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 214
 215 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 216  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 217 #define SIMPLE          0x02
 218 #define SPSTART         0x04    /* Starts with * or +. */
 219 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 220 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 221
 222 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 223
 224 /* whether trie related optimizations are enabled */
 225 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 226 #define TRIE_STUDY_OPT
 227 #define FULL_TRIE_STUDY
 228 #define TRIE_STCLASS
 229 #endif
 230
 231
 232
 233 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 234 #define PBITVAL(paren) (1 << ((paren) & 7))
 235 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 236 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 237 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 238
 239 /* If not already in utf8, do a longjmp back to the beginning */
 240 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 241 #define REQUIRE_UTF8    STMT_START {                                       \
 242                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 243                         } STMT_END
 244
 245 /* About scan_data_t.
 246
 247   During optimisation we recurse through the regexp program performing
 248   various inplace (keyhole style) optimisations. In addition study_chunk
 249   and scan_commit populate this data structure with information about
 250   what strings MUST appear in the pattern. We look for the longest
 251   string that must appear at a fixed location, and we look for the
 252   longest string that may appear at a floating location. So for instance
 253   in the pattern:
 254
 255     /FOO[xX]A.*B[xX]BAR/
 256
 257   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 258   strings (because they follow a .* construct). study_chunk will identify
 259   both FOO and BAR as being the longest fixed and floating strings respectively.
 260
 261   The strings can be composites, for instance
 262
 263      /(f)(o)(o)/
 264
 265   will result in a composite fixed substring 'foo'.
 266
 267   For each string some basic information is maintained:
 268
 269   - offset or min_offset
 270     This is the position the string must appear at, or not before.
 271     It also implicitly (when combined with minlenp) tells us how many
 272     characters must match before the string we are searching for.
 273     Likewise when combined with minlenp and the length of the string it
 274     tells us how many characters must appear after the string we have
 275     found.
 276
 277   - max_offset
 278     Only used for floating strings. This is the rightmost point that
 279     the string can appear at. If set to I32 max it indicates that the
 280     string can occur infinitely far to the right.
 281
 282   - minlenp
 283     A pointer to the minimum length of the pattern that the string
 284     was found inside. This is important as in the case of positive
 285     lookahead or positive lookbehind we can have multiple patterns
 286     involved. Consider
 287
 288     /(?=FOO).*F/
 289
 290     The minimum length of the pattern overall is 3, the minimum length
 291     of the lookahead part is 3, but the minimum length of the part that
 292     will actually match is 1. So 'FOO's minimum length is 3, but the
 293     minimum length for the F is 1. This is important as the minimum length
 294     is used to determine offsets in front of and behind the string being
 295     looked for.  Since strings can be composites this is the length of the
 296     pattern at the time it was committed with a scan_commit. Note that
 297     the length is calculated by study_chunk, so that the minimum lengths
 298     are not known until the full pattern has been compiled, thus the
 299     pointer to the value.
 300
 301   - lookbehind
 302
 303     In the case of lookbehind the string being searched for can be
 304     offset past the start point of the final matching string.
 305     If this value was just blithely removed from the min_offset it would
 306     invalidate some of the calculations for how many chars must match
 307     before or after (as they are derived from min_offset and minlen and
 308     the length of the string being searched for).
 309     When the final pattern is compiled and the data is moved from the
 310     scan_data_t structure into the regexp structure the information
 311     about lookbehind is factored in, with the information that would
 312     have been lost precalculated in the end_shift field for the
 313     associated string.
 314
 315   The fields pos_min and pos_delta are used to store the minimum offset
 316   and the delta to the maximum offset at the current point in the pattern.
 317
 318 */
 319
 320 typedef struct scan_data_t {
 321     /*I32 len_min;      unused */
 322     /*I32 len_delta;    unused */
 323     I32 pos_min;
 324     I32 pos_delta;
 325     SV *last_found;
 326     I32 last_end;           /* min value, <0 unless valid. */
 327     I32 last_start_min;
 328     I32 last_start_max;
 329     SV **longest;           /* Either &l_fixed, or &l_float. */
 330     SV *longest_fixed;      /* longest fixed string found in pattern */
 331     I32 offset_fixed;       /* offset where it starts */
 332     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 333     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 334     SV *longest_float;      /* longest floating string found in pattern */
 335     I32 offset_float_min;   /* earliest point in string it can appear */
 336     I32 offset_float_max;   /* latest point in string it can appear */
 337     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 338     I32 lookbehind_float;   /* is the position of the string modified by LB */
 339     I32 flags;
 340     I32 whilem_c;
 341     I32 *last_closep;
 342     struct regnode_charclass_class *start_class;
 343 } scan_data_t;
 344
 345 /*
 346  * Forward declarations for pregcomp()'s friends.
 347  */
 348
 349 static const scan_data_t zero_scan_data =
 350   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 351
 352 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 353 #define SF_BEFORE_SEOL          0x0001
 354 #define SF_BEFORE_MEOL          0x0002
 355 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 356 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 357
 358 #ifdef NO_UNARY_PLUS
 359 #  define SF_FIX_SHIFT_EOL      (0+2)
 360 #  define SF_FL_SHIFT_EOL               (0+4)
 361 #else
 362 #  define SF_FIX_SHIFT_EOL      (+2)
 363 #  define SF_FL_SHIFT_EOL               (+4)
 364 #endif
 365
 366 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 367 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 368
 369 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 370 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 371 #define SF_IS_INF               0x0040
 372 #define SF_HAS_PAR              0x0080
 373 #define SF_IN_PAR               0x0100
 374 #define SF_HAS_EVAL             0x0200
 375 #define SCF_DO_SUBSTR           0x0400
 376 #define SCF_DO_STCLASS_AND      0x0800
 377 #define SCF_DO_STCLASS_OR       0x1000
 378 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 379 #define SCF_WHILEM_VISITED_POS  0x2000
 380
 381 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 382 #define SCF_SEEN_ACCEPT         0x8000
 383
 384 #define UTF cBOOL(RExC_utf8)
 385 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 386 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 387 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 388 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 389 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 390 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 391 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 392
 393 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 394
 395 #define OOB_UNICODE             12345678
 396 #define OOB_NAMEDCLASS          -1
 397
 398 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 399 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 400
 401
 402 /* length of regex to show in messages that don't mark a position within */
 403 #define RegexLengthToShowInErrorMessages 127
 404
 405 /*
 406  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 407  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 408  * op/pragma/warn/regcomp.
 409  */
 410 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 411 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 412
 413 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 414
 415 /*
 416  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 417  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 418  * "...".
 419  */
 420 #define _FAIL(code) STMT_START {                                        \
 421     const char *ellipses = "";                                          \
 422     IV len = RExC_end - RExC_precomp;                                   \
 423                                                                         \
 424     if (!SIZE_ONLY)                                                     \
 425         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 426     if (len > RegexLengthToShowInErrorMessages) {                       \
 427         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 428         len = RegexLengthToShowInErrorMessages - 10;                    \
 429         ellipses = "...";                                               \
 430     }                                                                   \
 431     code;                                                               \
 432 } STMT_END
 433
 434 #define FAIL(msg) _FAIL(                            \
 435     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 436             msg, (int)len, RExC_precomp, ellipses))
 437
 438 #define FAIL2(msg,arg) _FAIL(                       \
 439     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 440             arg, (int)len, RExC_precomp, ellipses))
 441
 442 /*
 443  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 444  */
 445 #define Simple_vFAIL(m) STMT_START {                                    \
 446     const IV offset = RExC_parse - RExC_precomp;                        \
 447     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 448             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 449 } STMT_END
 450
 451 /*
 452  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 453  */
 454 #define vFAIL(m) STMT_START {                           \
 455     if (!SIZE_ONLY)                                     \
 456         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 457     Simple_vFAIL(m);                                    \
 458 } STMT_END
 459
 460 /*
 461  * Like Simple_vFAIL(), but accepts two arguments.
 462  */
 463 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 464     const IV offset = RExC_parse - RExC_precomp;                        \
 465     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 466             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 467 } STMT_END
 468
 469 /*
 470  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 471  */
 472 #define vFAIL2(m,a1) STMT_START {                       \
 473     if (!SIZE_ONLY)                                     \
 474         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 475     Simple_vFAIL2(m, a1);                               \
 476 } STMT_END
 477
 478
 479 /*
 480  * Like Simple_vFAIL(), but accepts three arguments.
 481  */
 482 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 483     const IV offset = RExC_parse - RExC_precomp;                \
 484     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 485             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 486 } STMT_END
 487
 488 /*
 489  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 490  */
 491 #define vFAIL3(m,a1,a2) STMT_START {                    \
 492     if (!SIZE_ONLY)                                     \
 493         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 494     Simple_vFAIL3(m, a1, a2);                           \
 495 } STMT_END
 496
 497 /*
 498  * Like Simple_vFAIL(), but accepts four arguments.
 499  */
 500 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 501     const IV offset = RExC_parse - RExC_precomp;                \
 502     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 503             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 504 } STMT_END
 505
 506 #define ckWARNreg(loc,m) STMT_START {                                   \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 509             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 510 } STMT_END
 511
 512 #define ckWARNregdep(loc,m) STMT_START {                                \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 515             m REPORT_LOCATION,                                          \
 516             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 517 } STMT_END
 518
 519 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 520     const IV offset = loc - RExC_precomp;                               \
 521     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 522             m REPORT_LOCATION,                                          \
 523             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 524 } STMT_END
 525
 526 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 527     const IV offset = loc - RExC_precomp;                               \
 528     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 529             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 530 } STMT_END
 531
 532 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 533     const IV offset = loc - RExC_precomp;                               \
 534     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 535             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 536 } STMT_END
 537
 538 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 539     const IV offset = loc - RExC_precomp;                               \
 540     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 541             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 542 } STMT_END
 543
 544 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 545     const IV offset = loc - RExC_precomp;                               \
 546     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 547             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 548 } STMT_END
 549
 550 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 551     const IV offset = loc - RExC_precomp;                               \
 552     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 553             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 554 } STMT_END
 555
 556 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 557     const IV offset = loc - RExC_precomp;                               \
 558     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 559             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 560 } STMT_END
 561
 562
 563 /* Allow for side effects in s */
 564 #define REGC(c,s) STMT_START {                  \
 565     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 566 } STMT_END
 567
 568 /* Macros for recording node offsets.   20001227 mjd@plover.com
 569  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 570  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 571  * Element 0 holds the number n.
 572  * Position is 1 indexed.
 573  */
 574 #ifndef RE_TRACK_PATTERN_OFFSETS
 575 #define Set_Node_Offset_To_R(node,byte)
 576 #define Set_Node_Offset(node,byte)
 577 #define Set_Cur_Node_Offset
 578 #define Set_Node_Length_To_R(node,len)
 579 #define Set_Node_Length(node,len)
 580 #define Set_Node_Cur_Length(node)
 581 #define Node_Offset(n)
 582 #define Node_Length(n)
 583 #define Set_Node_Offset_Length(node,offset,len)
 584 #define ProgLen(ri) ri->u.proglen
 585 #define SetProgLen(ri,x) ri->u.proglen = x
 586 #else
 587 #define ProgLen(ri) ri->u.offsets[0]
 588 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 589 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 590     if (! SIZE_ONLY) {                                                  \
 591         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 592                     __LINE__, (int)(node), (int)(byte)));               \
 593         if((node) < 0) {                                                \
 594             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 595         } else {                                                        \
 596             RExC_offsets[2*(node)-1] = (byte);                          \
 597         }                                                               \
 598     }                                                                   \
 599 } STMT_END
 600
 601 #define Set_Node_Offset(node,byte) \
 602     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 603 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 604
 605 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 606     if (! SIZE_ONLY) {                                                  \
 607         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 608                 __LINE__, (int)(node), (int)(len)));                    \
 609         if((node) < 0) {                                                \
 610             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 611         } else {                                                        \
 612             RExC_offsets[2*(node)] = (len);                             \
 613         }                                                               \
 614     }                                                                   \
 615 } STMT_END
 616
 617 #define Set_Node_Length(node,len) \
 618     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 619 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 620 #define Set_Node_Cur_Length(node) \
 621     Set_Node_Length(node, RExC_parse - parse_start)
 622
 623 /* Get offsets and lengths */
 624 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 625 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 626
 627 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 628     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 629     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 630 } STMT_END
 631 #endif
 632
 633 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 634 #define EXPERIMENTAL_INPLACESCAN
 635 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 636
 637 #define DEBUG_STUDYDATA(str,data,depth)                              \
 638 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 639     PerlIO_printf(Perl_debug_log,                                    \
 640         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 641         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 642         (int)(depth)*2, "",                                          \
 643         (IV)((data)->pos_min),                                       \
 644         (IV)((data)->pos_delta),                                     \
 645         (UV)((data)->flags),                                         \
 646         (IV)((data)->whilem_c),                                      \
 647         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 648         is_inf ? "INF " : ""                                         \
 649     );                                                               \
 650     if ((data)->last_found)                                          \
 651         PerlIO_printf(Perl_debug_log,                                \
 652             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 653             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 654             SvPVX_const((data)->last_found),                         \
 655             (IV)((data)->last_end),                                  \
 656             (IV)((data)->last_start_min),                            \
 657             (IV)((data)->last_start_max),                            \
 658             ((data)->longest &&                                      \
 659              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 660             SvPVX_const((data)->longest_fixed),                      \
 661             (IV)((data)->offset_fixed),                              \
 662             ((data)->longest &&                                      \
 663              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 664             SvPVX_const((data)->longest_float),                      \
 665             (IV)((data)->offset_float_min),                          \
 666             (IV)((data)->offset_float_max)                           \
 667         );                                                           \
 668     PerlIO_printf(Perl_debug_log,"\n");                              \
 669 });
 670
 671 static void clear_re(pTHX_ void *r);
 672
 673 /* Mark that we cannot extend a found fixed substring at this point.
 674    Update the longest found anchored substring and the longest found
 675    floating substrings if needed. */
 676
 677 STATIC void
 678 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 679 {
 680     const STRLEN l = CHR_SVLEN(data->last_found);
 681     const STRLEN old_l = CHR_SVLEN(*data->longest);
 682     GET_RE_DEBUG_FLAGS_DECL;
 683
 684     PERL_ARGS_ASSERT_SCAN_COMMIT;
 685
 686     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 687         SvSetMagicSV(*data->longest, data->last_found);
 688         if (*data->longest == data->longest_fixed) {
 689             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 690             if (data->flags & SF_BEFORE_EOL)
 691                 data->flags
 692                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 693             else
 694                 data->flags &= ~SF_FIX_BEFORE_EOL;
 695             data->minlen_fixed=minlenp;
 696             data->lookbehind_fixed=0;
 697         }
 698         else { /* *data->longest == data->longest_float */
 699             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 700             data->offset_float_max = (l
 701                                       ? data->last_start_max
 702                                       : data->pos_min + data->pos_delta);
 703             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 704                 data->offset_float_max = I32_MAX;
 705             if (data->flags & SF_BEFORE_EOL)
 706                 data->flags
 707                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 708             else
 709                 data->flags &= ~SF_FL_BEFORE_EOL;
 710             data->minlen_float=minlenp;
 711             data->lookbehind_float=0;
 712         }
 713     }
 714     SvCUR_set(data->last_found, 0);
 715     {
 716         SV * const sv = data->last_found;
 717         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 718             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 719             if (mg)
 720                 mg->mg_len = 0;
 721         }
 722     }
 723     data->last_end = -1;
 724     data->flags &= ~SF_BEFORE_EOL;
 725     DEBUG_STUDYDATA("commit: ",data,0);
 726 }
 727
 728 /* Can match anything (initialization) */
 729 STATIC void
 730 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 731 {
 732     PERL_ARGS_ASSERT_CL_ANYTHING;
 733
 734     ANYOF_BITMAP_SETALL(cl);
 735     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 736                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 737
 738     /* If any portion of the regex is to operate under locale rules,
 739      * initialization includes it.  The reason this isn't done for all regexes
 740      * is that the optimizer was written under the assumption that locale was
 741      * all-or-nothing.  Given the complexity and lack of documentation in the
 742      * optimizer, and that there are inadequate test cases for locale, so many
 743      * parts of it may not work properly, it is safest to avoid locale unless
 744      * necessary. */
 745     if (RExC_contains_locale) {
 746         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 747         cl->flags |= ANYOF_LOCALE;
 748     }
 749     else {
 750         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 751     }
 752 }
 753
 754 /* Can match anything (initialization) */
 755 STATIC int
 756 S_cl_is_anything(const struct regnode_charclass_class *cl)
 757 {
 758     int value;
 759
 760     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 761
 762     for (value = 0; value <= ANYOF_MAX; value += 2)
 763         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 764             return 1;
 765     if (!(cl->flags & ANYOF_UNICODE_ALL))
 766         return 0;
 767     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 768         return 0;
 769     return 1;
 770 }
 771
 772 /* Can match anything (initialization) */
 773 STATIC void
 774 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 775 {
 776     PERL_ARGS_ASSERT_CL_INIT;
 777
 778     Zero(cl, 1, struct regnode_charclass_class);
 779     cl->type = ANYOF;
 780     cl_anything(pRExC_state, cl);
 781     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 782 }
 783
 784 /* These two functions currently do the exact same thing */
 785 #define cl_init_zero            S_cl_init
 786
 787 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 788  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 789  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 790 STATIC void
 791 S_cl_and(struct regnode_charclass_class *cl,
 792         const struct regnode_charclass_class *and_with)
 793 {
 794     PERL_ARGS_ASSERT_CL_AND;
 795
 796     assert(and_with->type == ANYOF);
 797
 798     /* I (khw) am not sure all these restrictions are necessary XXX */
 799     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 800         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 801         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 802         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 803         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 804         int i;
 805
 806         if (and_with->flags & ANYOF_INVERT)
 807             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 808                 cl->bitmap[i] &= ~and_with->bitmap[i];
 809         else
 810             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 811                 cl->bitmap[i] &= and_with->bitmap[i];
 812     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 813
 814     if (and_with->flags & ANYOF_INVERT) {
 815
 816         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 817          * aren't affected by the inversion.  Those that are affected are
 818          * handled individually below */
 819         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 820         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 821         cl->flags |= affected_flags;
 822
 823         /* We currently don't know how to deal with things that aren't in the
 824          * bitmap, but we know that the intersection is no greater than what
 825          * is already in cl, so let there be false positives that get sorted
 826          * out after the synthetic start class succeeds, and the node is
 827          * matched for real. */
 828
 829         /* The inversion of these two flags indicate that the resulting
 830          * intersection doesn't have them */
 831         if (and_with->flags & ANYOF_UNICODE_ALL) {
 832             cl->flags &= ~ANYOF_UNICODE_ALL;
 833         }
 834         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 835             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 836         }
 837     }
 838     else {   /* and'd node is not inverted */
 839         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 840
 841         if (! ANYOF_NONBITMAP(and_with)) {
 842
 843             /* Here 'and_with' doesn't match anything outside the bitmap
 844              * (except possibly ANYOF_UNICODE_ALL), which means the
 845              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 846              * which case we don't know what the intersection is, but it's no
 847              * greater than what cl already has, so can just leave it alone,
 848              * with possible false positives */
 849             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 850                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 851                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 852             }
 853         }
 854         else if (! ANYOF_NONBITMAP(cl)) {
 855
 856             /* Here, 'and_with' does match something outside the bitmap, and cl
 857              * doesn't have a list of things to match outside the bitmap.  If
 858              * cl can match all code points above 255, the intersection will
 859              * be those above-255 code points that 'and_with' matches.  If cl
 860              * can't match all Unicode code points, it means that it can't
 861              * match anything outside the bitmap (since the 'if' that got us
 862              * into this block tested for that), so we leave the bitmap empty.
 863              */
 864             if (cl->flags & ANYOF_UNICODE_ALL) {
 865                 ARG_SET(cl, ARG(and_with));
 866
 867                 /* and_with's ARG may match things that don't require UTF8.
 868                  * And now cl's will too, in spite of this being an 'and'.  See
 869                  * the comments below about the kludge */
 870                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 871             }
 872         }
 873         else {
 874             /* Here, both 'and_with' and cl match something outside the
 875              * bitmap.  Currently we do not do the intersection, so just match
 876              * whatever cl had at the beginning.  */
 877         }
 878
 879
 880         /* Take the intersection of the two sets of flags.  However, the
 881          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 882          * kludge around the fact that this flag is not treated like the others
 883          * which are initialized in cl_anything().  The way the optimizer works
 884          * is that the synthetic start class (SSC) is initialized to match
 885          * anything, and then the first time a real node is encountered, its
 886          * values are AND'd with the SSC's with the result being the values of
 887          * the real node.  However, there are paths through the optimizer where
 888          * the AND never gets called, so those initialized bits are set
 889          * inappropriately, which is not usually a big deal, as they just cause
 890          * false positives in the SSC, which will just mean a probably
 891          * imperceptible slow down in execution.  However this bit has a
 892          * higher false positive consequence in that it can cause utf8.pm,
 893          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 894          * bigger slowdown and also causes significant extra memory to be used.
 895          * In order to prevent this, the code now takes a different tack.  The
 896          * bit isn't set unless some part of the regular expression needs it,
 897          * but once set it won't get cleared.  This means that these extra
 898          * modules won't get loaded unless there was some path through the
 899          * pattern that would have required them anyway, and  so any false
 900          * positives that occur by not ANDing them out when they could be
 901          * aren't as severe as they would be if we treated this bit like all
 902          * the others */
 903         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 904                                       & ANYOF_NONBITMAP_NON_UTF8;
 905         cl->flags &= and_with->flags;
 906         cl->flags |= outside_bitmap_but_not_utf8;
 907     }
 908 }
 909
 910 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 911  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 912  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 913 STATIC void
 914 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 915 {
 916     PERL_ARGS_ASSERT_CL_OR;
 917
 918     if (or_with->flags & ANYOF_INVERT) {
 919
 920         /* Here, the or'd node is to be inverted.  This means we take the
 921          * complement of everything not in the bitmap, but currently we don't
 922          * know what that is, so give up and match anything */
 923         if (ANYOF_NONBITMAP(or_with)) {
 924             cl_anything(pRExC_state, cl);
 925         }
 926         /* We do not use
 927          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 928          *   <= (B1 | !B2) | (CL1 | !CL2)
 929          * which is wasteful if CL2 is small, but we ignore CL2:
 930          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 931          * XXXX Can we handle case-fold?  Unclear:
 932          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 933          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 934          */
 935         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 936              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 937              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 938             int i;
 939
 940             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 941                 cl->bitmap[i] |= ~or_with->bitmap[i];
 942         } /* XXXX: logic is complicated otherwise */
 943         else {
 944             cl_anything(pRExC_state, cl);
 945         }
 946
 947         /* And, we can just take the union of the flags that aren't affected
 948          * by the inversion */
 949         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 950
 951         /* For the remaining flags:
 952             ANYOF_UNICODE_ALL and inverted means to not match anything above
 953                     255, which means that the union with cl should just be
 954                     what cl has in it, so can ignore this flag
 955             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 956                     is 127-255 to match them, but then invert that, so the
 957                     union with cl should just be what cl has in it, so can
 958                     ignore this flag
 959          */
 960     } else {    /* 'or_with' is not inverted */
 961         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 962         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 963              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 964                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 965             int i;
 966
 967             /* OR char bitmap and class bitmap separately */
 968             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 969                 cl->bitmap[i] |= or_with->bitmap[i];
 970             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 971                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 972                     cl->classflags[i] |= or_with->classflags[i];
 973                 cl->flags |= ANYOF_CLASS;
 974             }
 975         }
 976         else { /* XXXX: logic is complicated, leave it along for a moment. */
 977             cl_anything(pRExC_state, cl);
 978         }
 979
 980         if (ANYOF_NONBITMAP(or_with)) {
 981
 982             /* Use the added node's outside-the-bit-map match if there isn't a
 983              * conflict.  If there is a conflict (both nodes match something
 984              * outside the bitmap, but what they match outside is not the same
 985              * pointer, and hence not easily compared until XXX we extend
 986              * inversion lists this far), give up and allow the start class to
 987              * match everything outside the bitmap.  If that stuff is all above
 988              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 989             if (! ANYOF_NONBITMAP(cl)) {
 990                 ARG_SET(cl, ARG(or_with));
 991             }
 992             else if (ARG(cl) != ARG(or_with)) {
 993
 994                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
 995                     cl_anything(pRExC_state, cl);
 996                 }
 997                 else {
 998                     cl->flags |= ANYOF_UNICODE_ALL;
 999                 }
1000             }
1001         }
1002
1003         /* Take the union */
1004         cl->flags |= or_with->flags;
1005     }
1006 }
1007
1008 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1009 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1010 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1011 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1012
1013
1014 #ifdef DEBUGGING
1015 /*
1016    dump_trie(trie,widecharmap,revcharmap)
1017    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1018    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1019
1020    These routines dump out a trie in a somewhat readable format.
1021    The _interim_ variants are used for debugging the interim
1022    tables that are used to generate the final compressed
1023    representation which is what dump_trie expects.
1024
1025    Part of the reason for their existence is to provide a form
1026    of documentation as to how the different representations function.
1027
1028 */
1029
1030 /*
1031   Dumps the final compressed table form of the trie to Perl_debug_log.
1032   Used for debugging make_trie().
1033 */
1034
1035 STATIC void
1036 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1037             AV *revcharmap, U32 depth)
1038 {
1039     U32 state;
1040     SV *sv=sv_newmortal();
1041     int colwidth= widecharmap ? 6 : 4;
1042     U16 word;
1043     GET_RE_DEBUG_FLAGS_DECL;
1044
1045     PERL_ARGS_ASSERT_DUMP_TRIE;
1046
1047     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1048         (int)depth * 2 + 2,"",
1049         "Match","Base","Ofs" );
1050
1051     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1052         SV ** const tmp = av_fetch( revcharmap, state, 0);
1053         if ( tmp ) {
1054             PerlIO_printf( Perl_debug_log, "%*s",
1055                 colwidth,
1056                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1057                             PL_colors[0], PL_colors[1],
1058                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1059                             PERL_PV_ESCAPE_FIRSTCHAR
1060                 )
1061             );
1062         }
1063     }
1064     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1065         (int)depth * 2 + 2,"");
1066
1067     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1068         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1069     PerlIO_printf( Perl_debug_log, "\n");
1070
1071     for( state = 1 ; state < trie->statecount ; state++ ) {
1072         const U32 base = trie->states[ state ].trans.base;
1073
1074         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1075
1076         if ( trie->states[ state ].wordnum ) {
1077             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1078         } else {
1079             PerlIO_printf( Perl_debug_log, "%6s", "" );
1080         }
1081
1082         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1083
1084         if ( base ) {
1085             U32 ofs = 0;
1086
1087             while( ( base + ofs  < trie->uniquecharcount ) ||
1088                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1089                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1090                     ofs++;
1091
1092             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1093
1094             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1095                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1096                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1097                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1098                 {
1099                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1100                     colwidth,
1101                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1102                 } else {
1103                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1104                 }
1105             }
1106
1107             PerlIO_printf( Perl_debug_log, "]");
1108
1109         }
1110         PerlIO_printf( Perl_debug_log, "\n" );
1111     }
1112     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1113     for (word=1; word <= trie->wordcount; word++) {
1114         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1115             (int)word, (int)(trie->wordinfo[word].prev),
1116             (int)(trie->wordinfo[word].len));
1117     }
1118     PerlIO_printf(Perl_debug_log, "\n" );
1119 }
1120 /*
1121   Dumps a fully constructed but uncompressed trie in list form.
1122   List tries normally only are used for construction when the number of
1123   possible chars (trie->uniquecharcount) is very high.
1124   Used for debugging make_trie().
1125 */
1126 STATIC void
1127 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1128                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1129                          U32 depth)
1130 {
1131     U32 state;
1132     SV *sv=sv_newmortal();
1133     int colwidth= widecharmap ? 6 : 4;
1134     GET_RE_DEBUG_FLAGS_DECL;
1135
1136     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1137
1138     /* print out the table precompression.  */
1139     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1140         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1141         "------:-----+-----------------\n" );
1142
1143     for( state=1 ; state < next_alloc ; state ++ ) {
1144         U16 charid;
1145
1146         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1147             (int)depth * 2 + 2,"", (UV)state  );
1148         if ( ! trie->states[ state ].wordnum ) {
1149             PerlIO_printf( Perl_debug_log, "%5s| ","");
1150         } else {
1151             PerlIO_printf( Perl_debug_log, "W%4x| ",
1152                 trie->states[ state ].wordnum
1153             );
1154         }
1155         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1156             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1157             if ( tmp ) {
1158                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1159                     colwidth,
1160                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1161                             PL_colors[0], PL_colors[1],
1162                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1163                             PERL_PV_ESCAPE_FIRSTCHAR
1164                     ) ,
1165                     TRIE_LIST_ITEM(state,charid).forid,
1166                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1167                 );
1168                 if (!(charid % 10))
1169                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1170                         (int)((depth * 2) + 14), "");
1171             }
1172         }
1173         PerlIO_printf( Perl_debug_log, "\n");
1174     }
1175 }
1176
1177 /*
1178   Dumps a fully constructed but uncompressed trie in table form.
1179   This is the normal DFA style state transition table, with a few
1180   twists to facilitate compression later.
1181   Used for debugging make_trie().
1182 */
1183 STATIC void
1184 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1185                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1186                           U32 depth)
1187 {
1188     U32 state;
1189     U16 charid;
1190     SV *sv=sv_newmortal();
1191     int colwidth= widecharmap ? 6 : 4;
1192     GET_RE_DEBUG_FLAGS_DECL;
1193
1194     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1195
1196     /*
1197        print out the table precompression so that we can do a visual check
1198        that they are identical.
1199      */
1200
1201     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1202
1203     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1205         if ( tmp ) {
1206             PerlIO_printf( Perl_debug_log, "%*s",
1207                 colwidth,
1208                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1209                             PL_colors[0], PL_colors[1],
1210                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1211                             PERL_PV_ESCAPE_FIRSTCHAR
1212                 )
1213             );
1214         }
1215     }
1216
1217     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1218
1219     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1220         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1221     }
1222
1223     PerlIO_printf( Perl_debug_log, "\n" );
1224
1225     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1226
1227         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1228             (int)depth * 2 + 2,"",
1229             (UV)TRIE_NODENUM( state ) );
1230
1231         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1232             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1233             if (v)
1234                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1235             else
1236                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1237         }
1238         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1239             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1240         } else {
1241             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1242             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1243         }
1244     }
1245 }
1246
1247 #endif
1248
1249
1250 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1251   startbranch: the first branch in the whole branch sequence
1252   first      : start branch of sequence of branch-exact nodes.
1253                May be the same as startbranch
1254   last       : Thing following the last branch.
1255                May be the same as tail.
1256   tail       : item following the branch sequence
1257   count      : words in the sequence
1258   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1259   depth      : indent depth
1260
1261 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1262
1263 A trie is an N'ary tree where the branches are determined by digital
1264 decomposition of the key. IE, at the root node you look up the 1st character and
1265 follow that branch repeat until you find the end of the branches. Nodes can be
1266 marked as "accepting" meaning they represent a complete word. Eg:
1267
1268   /he|she|his|hers/
1269
1270 would convert into the following structure. Numbers represent states, letters
1271 following numbers represent valid transitions on the letter from that state, if
1272 the number is in square brackets it represents an accepting state, otherwise it
1273 will be in parenthesis.
1274
1275       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1276       |    |
1277       |   (2)
1278       |    |
1279      (1)   +-i->(6)-+-s->[7]
1280       |
1281       +-s->(3)-+-h->(4)-+-e->[5]
1282
1283       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1284
1285 This shows that when matching against the string 'hers' we will begin at state 1
1286 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1287 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1288 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1289 single traverse. We store a mapping from accepting to state to which word was
1290 matched, and then when we have multiple possibilities we try to complete the
1291 rest of the regex in the order in which they occured in the alternation.
1292
1293 The only prior NFA like behaviour that would be changed by the TRIE support is
1294 the silent ignoring of duplicate alternations which are of the form:
1295
1296  / (DUPE|DUPE) X? (?{ ... }) Y /x
1297
1298 Thus EVAL blocks following a trie may be called a different number of times with
1299 and without the optimisation. With the optimisations dupes will be silently
1300 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1301 the following demonstrates:
1302
1303  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1304
1305 which prints out 'word' three times, but
1306
1307  'words'=~/(word|word|word)(?{ print $1 })S/
1308
1309 which doesnt print it out at all. This is due to other optimisations kicking in.
1310
1311 Example of what happens on a structural level:
1312
1313 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1314
1315    1: CURLYM[1] {1,32767}(18)
1316    5:   BRANCH(8)
1317    6:     EXACT <ac>(16)
1318    8:   BRANCH(11)
1319    9:     EXACT <ad>(16)
1320   11:   BRANCH(14)
1321   12:     EXACT <ab>(16)
1322   16:   SUCCEED(0)
1323   17:   NOTHING(18)
1324   18: END(0)
1325
1326 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1327 and should turn into:
1328
1329    1: CURLYM[1] {1,32767}(18)
1330    5:   TRIE(16)
1331         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1332           <ac>
1333           <ad>
1334           <ab>
1335   16:   SUCCEED(0)
1336   17:   NOTHING(18)
1337   18: END(0)
1338
1339 Cases where tail != last would be like /(?foo|bar)baz/:
1340
1341    1: BRANCH(4)
1342    2:   EXACT <foo>(8)
1343    4: BRANCH(7)
1344    5:   EXACT <bar>(8)
1345    7: TAIL(8)
1346    8: EXACT <baz>(10)
1347   10: END(0)
1348
1349 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1350 and would end up looking like:
1351
1352     1: TRIE(8)
1353       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1354         <foo>
1355         <bar>
1356    7: TAIL(8)
1357    8: EXACT <baz>(10)
1358   10: END(0)
1359
1360     d = uvuni_to_utf8_flags(d, uv, 0);
1361
1362 is the recommended Unicode-aware way of saying
1363
1364     *(d++) = uv;
1365 */
1366
1367 #define TRIE_STORE_REVCHAR                                                 \
1368     STMT_START {                                                           \
1369         if (UTF) {                                                         \
1370             SV *zlopp = newSV(2);                                          \
1371             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1372             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1373             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1374             SvPOK_on(zlopp);                                               \
1375             SvUTF8_on(zlopp);                                              \
1376             av_push(revcharmap, zlopp);                                    \
1377         } else {                                                           \
1378             char ooooff = (char)uvc;                                               \
1379             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1380         }                                                                  \
1381         } STMT_END
1382
1383 #define TRIE_READ_CHAR STMT_START {                                           \
1384     wordlen++;                                                                \
1385     if ( UTF ) {                                                              \
1386         if ( folder ) {                                                       \
1387             if ( foldlen > 0 ) {                                              \
1388                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1389                foldlen -= len;                                                \
1390                scan += len;                                                   \
1391                len = 0;                                                       \
1392             } else {                                                          \
1393                 len = UTF8SKIP(uc);\
1394                 uvc = to_utf8_fold( uc, foldbuf, &foldlen);                   \
1395                 foldlen -= UNISKIP( uvc );                                    \
1396                 scan = foldbuf + UNISKIP( uvc );                              \
1397             }                                                                 \
1398         } else {                                                              \
1399             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1400         }                                                                     \
1401     } else {                                                                  \
1402         uvc = (U32)*uc;                                                       \
1403         len = 1;                                                              \
1404     }                                                                         \
1405 } STMT_END
1406
1407
1408
1409 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1410     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1411         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1412         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1413     }                                                           \
1414     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1415     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1416     TRIE_LIST_CUR( state )++;                                   \
1417 } STMT_END
1418
1419 #define TRIE_LIST_NEW(state) STMT_START {                       \
1420     Newxz( trie->states[ state ].trans.list,               \
1421         4, reg_trie_trans_le );                                 \
1422      TRIE_LIST_CUR( state ) = 1;                                \
1423      TRIE_LIST_LEN( state ) = 4;                                \
1424 } STMT_END
1425
1426 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1427     U16 dupe= trie->states[ state ].wordnum;                    \
1428     regnode * const noper_next = regnext( noper );              \
1429                                                                 \
1430     DEBUG_r({                                                   \
1431         /* store the word for dumping */                        \
1432         SV* tmp;                                                \
1433         if (OP(noper) != NOTHING)                               \
1434             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1435         else                                                    \
1436             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1437         av_push( trie_words, tmp );                             \
1438     });                                                         \
1439                                                                 \
1440     curword++;                                                  \
1441     trie->wordinfo[curword].prev   = 0;                         \
1442     trie->wordinfo[curword].len    = wordlen;                   \
1443     trie->wordinfo[curword].accept = state;                     \
1444                                                                 \
1445     if ( noper_next < tail ) {                                  \
1446         if (!trie->jump)                                        \
1447             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1448         trie->jump[curword] = (U16)(noper_next - convert);      \
1449         if (!jumper)                                            \
1450             jumper = noper_next;                                \
1451         if (!nextbranch)                                        \
1452             nextbranch= regnext(cur);                           \
1453     }                                                           \
1454                                                                 \
1455     if ( dupe ) {                                               \
1456         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1457         /* chain, so that when the bits of chain are later    */\
1458         /* linked together, the dups appear in the chain      */\
1459         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1460         trie->wordinfo[dupe].prev = curword;                    \
1461     } else {                                                    \
1462         /* we haven't inserted this word yet.                */ \
1463         trie->states[ state ].wordnum = curword;                \
1464     }                                                           \
1465 } STMT_END
1466
1467
1468 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1469      ( ( base + charid >=  ucharcount                                   \
1470          && base + charid < ubound                                      \
1471          && state == trie->trans[ base - ucharcount + charid ].check    \
1472          && trie->trans[ base - ucharcount + charid ].next )            \
1473            ? trie->trans[ base - ucharcount + charid ].next             \
1474            : ( state==1 ? special : 0 )                                 \
1475       )
1476
1477 #define MADE_TRIE       1
1478 #define MADE_JUMP_TRIE  2
1479 #define MADE_EXACT_TRIE 4
1480
1481 STATIC I32
1482 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1483 {
1484     dVAR;
1485     /* first pass, loop through and scan words */
1486     reg_trie_data *trie;
1487     HV *widecharmap = NULL;
1488     AV *revcharmap = newAV();
1489     regnode *cur;
1490     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1491     STRLEN len = 0;
1492     UV uvc = 0;
1493     U16 curword = 0;
1494     U32 next_alloc = 0;
1495     regnode *jumper = NULL;
1496     regnode *nextbranch = NULL;
1497     regnode *convert = NULL;
1498     U32 *prev_states; /* temp array mapping each state to previous one */
1499     /* we just use folder as a flag in utf8 */
1500     const U8 * folder = NULL;
1501
1502 #ifdef DEBUGGING
1503     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1504     AV *trie_words = NULL;
1505     /* along with revcharmap, this only used during construction but both are
1506      * useful during debugging so we store them in the struct when debugging.
1507      */
1508 #else
1509     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1510     STRLEN trie_charcount=0;
1511 #endif
1512     SV *re_trie_maxbuff;
1513     GET_RE_DEBUG_FLAGS_DECL;
1514
1515     PERL_ARGS_ASSERT_MAKE_TRIE;
1516 #ifndef DEBUGGING
1517     PERL_UNUSED_ARG(depth);
1518 #endif
1519
1520     switch (flags) {
1521         case EXACT: break;
1522         case EXACTFA:
1523         case EXACTFU: folder = PL_fold_latin1; break;
1524         case EXACTF:  folder = PL_fold; break;
1525         case EXACTFL: folder = PL_fold_locale; break;
1526         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
1527     }
1528
1529     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1530     trie->refcount = 1;
1531     trie->startstate = 1;
1532     trie->wordcount = word_count;
1533     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1534     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1535     if (!(UTF && folder))
1536         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1537     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1538                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1539
1540     DEBUG_r({
1541         trie_words = newAV();
1542     });
1543
1544     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1545     if (!SvIOK(re_trie_maxbuff)) {
1546         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1547     }
1548     DEBUG_OPTIMISE_r({
1549                 PerlIO_printf( Perl_debug_log,
1550                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1551                   (int)depth * 2 + 2, "",
1552                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1553                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1554                   (int)depth);
1555     });
1556
1557    /* Find the node we are going to overwrite */
1558     if ( first == startbranch && OP( last ) != BRANCH ) {
1559         /* whole branch chain */
1560         convert = first;
1561     } else {
1562         /* branch sub-chain */
1563         convert = NEXTOPER( first );
1564     }
1565
1566     /*  -- First loop and Setup --
1567
1568        We first traverse the branches and scan each word to determine if it
1569        contains widechars, and how many unique chars there are, this is
1570        important as we have to build a table with at least as many columns as we
1571        have unique chars.
1572
1573        We use an array of integers to represent the character codes 0..255
1574        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1575        native representation of the character value as the key and IV's for the
1576        coded index.
1577
1578        *TODO* If we keep track of how many times each character is used we can
1579        remap the columns so that the table compression later on is more
1580        efficient in terms of memory by ensuring the most common value is in the
1581        middle and the least common are on the outside.  IMO this would be better
1582        than a most to least common mapping as theres a decent chance the most
1583        common letter will share a node with the least common, meaning the node
1584        will not be compressible. With a middle is most common approach the worst
1585        case is when we have the least common nodes twice.
1586
1587      */
1588
1589     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1590         regnode * const noper = NEXTOPER( cur );
1591         const U8 *uc = (U8*)STRING( noper );
1592         const U8 * const e  = uc + STR_LEN( noper );
1593         STRLEN foldlen = 0;
1594         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1595         const U8 *scan = (U8*)NULL;
1596         U32 wordlen      = 0;         /* required init */
1597         STRLEN chars = 0;
1598         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1599
1600         if (OP(noper) == NOTHING) {
1601             trie->minlen= 0;
1602             continue;
1603         }
1604         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1605             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1606                                           regardless of encoding */
1607
1608         for ( ; uc < e ; uc += len ) {
1609             TRIE_CHARCOUNT(trie)++;
1610             TRIE_READ_CHAR;
1611             chars++;
1612             if ( uvc < 256 ) {
1613                 if ( !trie->charmap[ uvc ] ) {
1614                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1615                     if ( folder )
1616                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1617                     TRIE_STORE_REVCHAR;
1618                 }
1619                 if ( set_bit ) {
1620                     /* store the codepoint in the bitmap, and its folded
1621                      * equivalent. */
1622                     TRIE_BITMAP_SET(trie,uvc);
1623
1624                     /* store the folded codepoint */
1625                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1626
1627                     if ( !UTF ) {
1628                         /* store first byte of utf8 representation of
1629                            variant codepoints */
1630                         if (! UNI_IS_INVARIANT(uvc)) {
1631                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1632                         }
1633                     }
1634                     set_bit = 0; /* We've done our bit :-) */
1635                 }
1636             } else {
1637                 SV** svpp;
1638                 if ( !widecharmap )
1639                     widecharmap = newHV();
1640
1641                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1642
1643                 if ( !svpp )
1644                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1645
1646                 if ( !SvTRUE( *svpp ) ) {
1647                     sv_setiv( *svpp, ++trie->uniquecharcount );
1648                     TRIE_STORE_REVCHAR;
1649                 }
1650             }
1651         }
1652         if( cur == first ) {
1653             trie->minlen=chars;
1654             trie->maxlen=chars;
1655         } else if (chars < trie->minlen) {
1656             trie->minlen=chars;
1657         } else if (chars > trie->maxlen) {
1658             trie->maxlen=chars;
1659         }
1660
1661     } /* end first pass */
1662     DEBUG_TRIE_COMPILE_r(
1663         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1664                 (int)depth * 2 + 2,"",
1665                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1666                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1667                 (int)trie->minlen, (int)trie->maxlen )
1668     );
1669
1670     /*
1671         We now know what we are dealing with in terms of unique chars and
1672         string sizes so we can calculate how much memory a naive
1673         representation using a flat table  will take. If it's over a reasonable
1674         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1675         conservative but potentially much slower representation using an array
1676         of lists.
1677
1678         At the end we convert both representations into the same compressed
1679         form that will be used in regexec.c for matching with. The latter
1680         is a form that cannot be used to construct with but has memory
1681         properties similar to the list form and access properties similar
1682         to the table form making it both suitable for fast searches and
1683         small enough that its feasable to store for the duration of a program.
1684
1685         See the comment in the code where the compressed table is produced
1686         inplace from the flat tabe representation for an explanation of how
1687         the compression works.
1688
1689     */
1690
1691
1692     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1693     prev_states[1] = 0;
1694
1695     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1696         /*
1697             Second Pass -- Array Of Lists Representation
1698
1699             Each state will be represented by a list of charid:state records
1700             (reg_trie_trans_le) the first such element holds the CUR and LEN
1701             points of the allocated array. (See defines above).
1702
1703             We build the initial structure using the lists, and then convert
1704             it into the compressed table form which allows faster lookups
1705             (but cant be modified once converted).
1706         */
1707
1708         STRLEN transcount = 1;
1709
1710         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1711             "%*sCompiling trie using list compiler\n",
1712             (int)depth * 2 + 2, ""));
1713
1714         trie->states = (reg_trie_state *)
1715             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1716                                   sizeof(reg_trie_state) );
1717         TRIE_LIST_NEW(1);
1718         next_alloc = 2;
1719
1720         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1721
1722             regnode * const noper = NEXTOPER( cur );
1723             U8 *uc           = (U8*)STRING( noper );
1724             const U8 * const e = uc + STR_LEN( noper );
1725             U32 state        = 1;         /* required init */
1726             U16 charid       = 0;         /* sanity init */
1727             U8 *scan         = (U8*)NULL; /* sanity init */
1728             STRLEN foldlen   = 0;         /* required init */
1729             U32 wordlen      = 0;         /* required init */
1730             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1731
1732             if (OP(noper) != NOTHING) {
1733                 for ( ; uc < e ; uc += len ) {
1734
1735                     TRIE_READ_CHAR;
1736
1737                     if ( uvc < 256 ) {
1738                         charid = trie->charmap[ uvc ];
1739                     } else {
1740                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1741                         if ( !svpp ) {
1742                             charid = 0;
1743                         } else {
1744                             charid=(U16)SvIV( *svpp );
1745                         }
1746                     }
1747                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1748                     if ( charid ) {
1749
1750                         U16 check;
1751                         U32 newstate = 0;
1752
1753                         charid--;
1754                         if ( !trie->states[ state ].trans.list ) {
1755                             TRIE_LIST_NEW( state );
1756                         }
1757                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1758                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1759                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1760                                 break;
1761                             }
1762                         }
1763                         if ( ! newstate ) {
1764                             newstate = next_alloc++;
1765                             prev_states[newstate] = state;
1766                             TRIE_LIST_PUSH( state, charid, newstate );
1767                             transcount++;
1768                         }
1769                         state = newstate;
1770                     } else {
1771                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1772                     }
1773                 }
1774             }
1775             TRIE_HANDLE_WORD(state);
1776
1777         } /* end second pass */
1778
1779         /* next alloc is the NEXT state to be allocated */
1780         trie->statecount = next_alloc;
1781         trie->states = (reg_trie_state *)
1782             PerlMemShared_realloc( trie->states,
1783                                    next_alloc
1784                                    * sizeof(reg_trie_state) );
1785
1786         /* and now dump it out before we compress it */
1787         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1788                                                          revcharmap, next_alloc,
1789                                                          depth+1)
1790         );
1791
1792         trie->trans = (reg_trie_trans *)
1793             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1794         {
1795             U32 state;
1796             U32 tp = 0;
1797             U32 zp = 0;
1798
1799
1800             for( state=1 ; state < next_alloc ; state ++ ) {
1801                 U32 base=0;
1802
1803                 /*
1804                 DEBUG_TRIE_COMPILE_MORE_r(
1805                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1806                 );
1807                 */
1808
1809                 if (trie->states[state].trans.list) {
1810                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1811                     U16 maxid=minid;
1812                     U16 idx;
1813
1814                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1815                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1816                         if ( forid < minid ) {
1817                             minid=forid;
1818                         } else if ( forid > maxid ) {
1819                             maxid=forid;
1820                         }
1821                     }
1822                     if ( transcount < tp + maxid - minid + 1) {
1823                         transcount *= 2;
1824                         trie->trans = (reg_trie_trans *)
1825                             PerlMemShared_realloc( trie->trans,
1826                                                      transcount
1827                                                      * sizeof(reg_trie_trans) );
1828                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1829                     }
1830                     base = trie->uniquecharcount + tp - minid;
1831                     if ( maxid == minid ) {
1832                         U32 set = 0;
1833                         for ( ; zp < tp ; zp++ ) {
1834                             if ( ! trie->trans[ zp ].next ) {
1835                                 base = trie->uniquecharcount + zp - minid;
1836                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1837                                 trie->trans[ zp ].check = state;
1838                                 set = 1;
1839                                 break;
1840                             }
1841                         }
1842                         if ( !set ) {
1843                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1844                             trie->trans[ tp ].check = state;
1845                             tp++;
1846                             zp = tp;
1847                         }
1848                     } else {
1849                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1850                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1851                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1852                             trie->trans[ tid ].check = state;
1853                         }
1854                         tp += ( maxid - minid + 1 );
1855                     }
1856                     Safefree(trie->states[ state ].trans.list);
1857                 }
1858                 /*
1859                 DEBUG_TRIE_COMPILE_MORE_r(
1860                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1861                 );
1862                 */
1863                 trie->states[ state ].trans.base=base;
1864             }
1865             trie->lasttrans = tp + 1;
1866         }
1867     } else {
1868         /*
1869            Second Pass -- Flat Table Representation.
1870
1871            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1872            We know that we will need Charcount+1 trans at most to store the data
1873            (one row per char at worst case) So we preallocate both structures
1874            assuming worst case.
1875
1876            We then construct the trie using only the .next slots of the entry
1877            structs.
1878
1879            We use the .check field of the first entry of the node temporarily to
1880            make compression both faster and easier by keeping track of how many non
1881            zero fields are in the node.
1882
1883            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1884            transition.
1885
1886            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1887            number representing the first entry of the node, and state as a
1888            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1889            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1890            are 2 entrys per node. eg:
1891
1892              A B       A B
1893           1. 2 4    1. 3 7
1894           2. 0 3    3. 0 5
1895           3. 0 0    5. 0 0
1896           4. 0 0    7. 0 0
1897
1898            The table is internally in the right hand, idx form. However as we also
1899            have to deal with the states array which is indexed by nodenum we have to
1900            use TRIE_NODENUM() to convert.
1901
1902         */
1903         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1904             "%*sCompiling trie using table compiler\n",
1905             (int)depth * 2 + 2, ""));
1906
1907         trie->trans = (reg_trie_trans *)
1908             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1909                                   * trie->uniquecharcount + 1,
1910                                   sizeof(reg_trie_trans) );
1911         trie->states = (reg_trie_state *)
1912             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1913                                   sizeof(reg_trie_state) );
1914         next_alloc = trie->uniquecharcount + 1;
1915
1916
1917         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1918
1919             regnode * const noper   = NEXTOPER( cur );
1920             const U8 *uc     = (U8*)STRING( noper );
1921             const U8 * const e = uc + STR_LEN( noper );
1922
1923             U32 state        = 1;         /* required init */
1924
1925             U16 charid       = 0;         /* sanity init */
1926             U32 accept_state = 0;         /* sanity init */
1927             U8 *scan         = (U8*)NULL; /* sanity init */
1928
1929             STRLEN foldlen   = 0;         /* required init */
1930             U32 wordlen      = 0;         /* required init */
1931             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1932
1933             if ( OP(noper) != NOTHING ) {
1934                 for ( ; uc < e ; uc += len ) {
1935
1936                     TRIE_READ_CHAR;
1937
1938                     if ( uvc < 256 ) {
1939                         charid = trie->charmap[ uvc ];
1940                     } else {
1941                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1942                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1943                     }
1944                     if ( charid ) {
1945                         charid--;
1946                         if ( !trie->trans[ state + charid ].next ) {
1947                             trie->trans[ state + charid ].next = next_alloc;
1948                             trie->trans[ state ].check++;
1949                             prev_states[TRIE_NODENUM(next_alloc)]
1950                                     = TRIE_NODENUM(state);
1951                             next_alloc += trie->uniquecharcount;
1952                         }
1953                         state = trie->trans[ state + charid ].next;
1954                     } else {
1955                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1956                     }
1957                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1958                 }
1959             }
1960             accept_state = TRIE_NODENUM( state );
1961             TRIE_HANDLE_WORD(accept_state);
1962
1963         } /* end second pass */
1964
1965         /* and now dump it out before we compress it */
1966         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1967                                                           revcharmap,
1968                                                           next_alloc, depth+1));
1969
1970         {
1971         /*
1972            * Inplace compress the table.*
1973
1974            For sparse data sets the table constructed by the trie algorithm will
1975            be mostly 0/FAIL transitions or to put it another way mostly empty.
1976            (Note that leaf nodes will not contain any transitions.)
1977
1978            This algorithm compresses the tables by eliminating most such
1979            transitions, at the cost of a modest bit of extra work during lookup:
1980
1981            - Each states[] entry contains a .base field which indicates the
1982            index in the state[] array wheres its transition data is stored.
1983
1984            - If .base is 0 there are no valid transitions from that node.
1985
1986            - If .base is nonzero then charid is added to it to find an entry in
1987            the trans array.
1988
1989            -If trans[states[state].base+charid].check!=state then the
1990            transition is taken to be a 0/Fail transition. Thus if there are fail
1991            transitions at the front of the node then the .base offset will point
1992            somewhere inside the previous nodes data (or maybe even into a node
1993            even earlier), but the .check field determines if the transition is
1994            valid.
1995
1996            XXX - wrong maybe?
1997            The following process inplace converts the table to the compressed
1998            table: We first do not compress the root node 1,and mark all its
1999            .check pointers as 1 and set its .base pointer as 1 as well. This
2000            allows us to do a DFA construction from the compressed table later,
2001            and ensures that any .base pointers we calculate later are greater
2002            than 0.
2003
2004            - We set 'pos' to indicate the first entry of the second node.
2005
2006            - We then iterate over the columns of the node, finding the first and
2007            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2008            and set the .check pointers accordingly, and advance pos
2009            appropriately and repreat for the next node. Note that when we copy
2010            the next pointers we have to convert them from the original
2011            NODEIDX form to NODENUM form as the former is not valid post
2012            compression.
2013
2014            - If a node has no transitions used we mark its base as 0 and do not
2015            advance the pos pointer.
2016
2017            - If a node only has one transition we use a second pointer into the
2018            structure to fill in allocated fail transitions from other states.
2019            This pointer is independent of the main pointer and scans forward
2020            looking for null transitions that are allocated to a state. When it
2021            finds one it writes the single transition into the "hole".  If the
2022            pointer doesnt find one the single transition is appended as normal.
2023
2024            - Once compressed we can Renew/realloc the structures to release the
2025            excess space.
2026
2027            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2028            specifically Fig 3.47 and the associated pseudocode.
2029
2030            demq
2031         */
2032         const U32 laststate = TRIE_NODENUM( next_alloc );
2033         U32 state, charid;
2034         U32 pos = 0, zp=0;
2035         trie->statecount = laststate;
2036
2037         for ( state = 1 ; state < laststate ; state++ ) {
2038             U8 flag = 0;
2039             const U32 stateidx = TRIE_NODEIDX( state );
2040             const U32 o_used = trie->trans[ stateidx ].check;
2041             U32 used = trie->trans[ stateidx ].check;
2042             trie->trans[ stateidx ].check = 0;
2043
2044             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2045                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2046                     if ( trie->trans[ stateidx + charid ].next ) {
2047                         if (o_used == 1) {
2048                             for ( ; zp < pos ; zp++ ) {
2049                                 if ( ! trie->trans[ zp ].next ) {
2050                                     break;
2051                                 }
2052                             }
2053                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2054                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2055                             trie->trans[ zp ].check = state;
2056                             if ( ++zp > pos ) pos = zp;
2057                             break;
2058                         }
2059                         used--;
2060                     }
2061                     if ( !flag ) {
2062                         flag = 1;
2063                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2064                     }
2065                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2066                     trie->trans[ pos ].check = state;
2067                     pos++;
2068                 }
2069             }
2070         }
2071         trie->lasttrans = pos + 1;
2072         trie->states = (reg_trie_state *)
2073             PerlMemShared_realloc( trie->states, laststate
2074                                    * sizeof(reg_trie_state) );
2075         DEBUG_TRIE_COMPILE_MORE_r(
2076                 PerlIO_printf( Perl_debug_log,
2077                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2078                     (int)depth * 2 + 2,"",
2079                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2080                     (IV)next_alloc,
2081                     (IV)pos,
2082                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2083             );
2084
2085         } /* end table compress */
2086     }
2087     DEBUG_TRIE_COMPILE_MORE_r(
2088             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2089                 (int)depth * 2 + 2, "",
2090                 (UV)trie->statecount,
2091                 (UV)trie->lasttrans)
2092     );
2093     /* resize the trans array to remove unused space */
2094     trie->trans = (reg_trie_trans *)
2095         PerlMemShared_realloc( trie->trans, trie->lasttrans
2096                                * sizeof(reg_trie_trans) );
2097
2098     {   /* Modify the program and insert the new TRIE node */
2099         U8 nodetype =(U8)(flags & 0xFF);
2100         char *str=NULL;
2101
2102 #ifdef DEBUGGING
2103         regnode *optimize = NULL;
2104 #ifdef RE_TRACK_PATTERN_OFFSETS
2105
2106         U32 mjd_offset = 0;
2107         U32 mjd_nodelen = 0;
2108 #endif /* RE_TRACK_PATTERN_OFFSETS */
2109 #endif /* DEBUGGING */
2110         /*
2111            This means we convert either the first branch or the first Exact,
2112            depending on whether the thing following (in 'last') is a branch
2113            or not and whther first is the startbranch (ie is it a sub part of
2114            the alternation or is it the whole thing.)
2115            Assuming its a sub part we convert the EXACT otherwise we convert
2116            the whole branch sequence, including the first.
2117          */
2118         /* Find the node we are going to overwrite */
2119         if ( first != startbranch || OP( last ) == BRANCH ) {
2120             /* branch sub-chain */
2121             NEXT_OFF( first ) = (U16)(last - first);
2122 #ifdef RE_TRACK_PATTERN_OFFSETS
2123             DEBUG_r({
2124                 mjd_offset= Node_Offset((convert));
2125                 mjd_nodelen= Node_Length((convert));
2126             });
2127 #endif
2128             /* whole branch chain */
2129         }
2130 #ifdef RE_TRACK_PATTERN_OFFSETS
2131         else {
2132             DEBUG_r({
2133                 const  regnode *nop = NEXTOPER( convert );
2134                 mjd_offset= Node_Offset((nop));
2135                 mjd_nodelen= Node_Length((nop));
2136             });
2137         }
2138         DEBUG_OPTIMISE_r(
2139             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2140                 (int)depth * 2 + 2, "",
2141                 (UV)mjd_offset, (UV)mjd_nodelen)
2142         );
2143 #endif
2144         /* But first we check to see if there is a common prefix we can
2145            split out as an EXACT and put in front of the TRIE node.  */
2146         trie->startstate= 1;
2147         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2148             U32 state;
2149             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2150                 U32 ofs = 0;
2151                 I32 idx = -1;
2152                 U32 count = 0;
2153                 const U32 base = trie->states[ state ].trans.base;
2154
2155                 if ( trie->states[state].wordnum )
2156                         count = 1;
2157
2158                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2159                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2160                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2161                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2162                     {
2163                         if ( ++count > 1 ) {
2164                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2165                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2166                             if ( state == 1 ) break;
2167                             if ( count == 2 ) {
2168                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2169                                 DEBUG_OPTIMISE_r(
2170                                     PerlIO_printf(Perl_debug_log,
2171                                         "%*sNew Start State=%"UVuf" Class: [",
2172                                         (int)depth * 2 + 2, "",
2173                                         (UV)state));
2174                                 if (idx >= 0) {
2175                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2176                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2177
2178                                     TRIE_BITMAP_SET(trie,*ch);
2179                                     if ( folder )
2180                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2181                                     DEBUG_OPTIMISE_r(
2182                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2183                                     );
2184                                 }
2185                             }
2186                             TRIE_BITMAP_SET(trie,*ch);
2187                             if ( folder )
2188                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2189                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2190                         }
2191                         idx = ofs;
2192                     }
2193                 }
2194                 if ( count == 1 ) {
2195                     SV **tmp = av_fetch( revcharmap, idx, 0);
2196                     STRLEN len;
2197                     char *ch = SvPV( *tmp, len );
2198                     DEBUG_OPTIMISE_r({
2199                         SV *sv=sv_newmortal();
2200                         PerlIO_printf( Perl_debug_log,
2201                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2202                             (int)depth * 2 + 2, "",
2203                             (UV)state, (UV)idx,
2204                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2205                                 PL_colors[0], PL_colors[1],
2206                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2207                                 PERL_PV_ESCAPE_FIRSTCHAR
2208                             )
2209                         );
2210                     });
2211                     if ( state==1 ) {
2212                         OP( convert ) = nodetype;
2213                         str=STRING(convert);
2214                         STR_LEN(convert)=0;
2215                     }
2216                     STR_LEN(convert) += len;
2217                     while (len--)
2218                         *str++ = *ch++;
2219                 } else {
2220 #ifdef DEBUGGING
2221                     if (state>1)
2222                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2223 #endif
2224                     break;
2225                 }
2226             }
2227             trie->prefixlen = (state-1);
2228             if (str) {
2229                 regnode *n = convert+NODE_SZ_STR(convert);
2230                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2231                 trie->startstate = state;
2232                 trie->minlen -= (state - 1);
2233                 trie->maxlen -= (state - 1);
2234 #ifdef DEBUGGING
2235                /* At least the UNICOS C compiler choked on this
2236                 * being argument to DEBUG_r(), so let's just have
2237                 * it right here. */
2238                if (
2239 #ifdef PERL_EXT_RE_BUILD
2240                    1
2241 #else
2242                    DEBUG_r_TEST
2243 #endif
2244                    ) {
2245                    regnode *fix = convert;
2246                    U32 word = trie->wordcount;
2247                    mjd_nodelen++;
2248                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2249                    while( ++fix < n ) {
2250                        Set_Node_Offset_Length(fix, 0, 0);
2251                    }
2252                    while (word--) {
2253                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2254                        if (tmp) {
2255                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2256                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2257                            else
2258                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2259                        }
2260                    }
2261                }
2262 #endif
2263                 if (trie->maxlen) {
2264                     convert = n;
2265                 } else {
2266                     NEXT_OFF(convert) = (U16)(tail - convert);
2267                     DEBUG_r(optimize= n);
2268                 }
2269             }
2270         }
2271         if (!jumper)
2272             jumper = last;
2273         if ( trie->maxlen ) {
2274             NEXT_OFF( convert ) = (U16)(tail - convert);
2275             ARG_SET( convert, data_slot );
2276             /* Store the offset to the first unabsorbed branch in
2277                jump[0], which is otherwise unused by the jump logic.
2278                We use this when dumping a trie and during optimisation. */
2279             if (trie->jump)
2280                 trie->jump[0] = (U16)(nextbranch - convert);
2281
2282             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2283              *   and there is a bitmap
2284              *   and the first "jump target" node we found leaves enough room
2285              * then convert the TRIE node into a TRIEC node, with the bitmap
2286              * embedded inline in the opcode - this is hypothetically faster.
2287              */
2288             if ( !trie->states[trie->startstate].wordnum
2289                  && trie->bitmap
2290                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2291             {
2292                 OP( convert ) = TRIEC;
2293                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2294                 PerlMemShared_free(trie->bitmap);
2295                 trie->bitmap= NULL;
2296             } else
2297                 OP( convert ) = TRIE;
2298
2299             /* store the type in the flags */
2300             convert->flags = nodetype;
2301             DEBUG_r({
2302             optimize = convert
2303                       + NODE_STEP_REGNODE
2304                       + regarglen[ OP( convert ) ];
2305             });
2306             /* XXX We really should free up the resource in trie now,
2307                    as we won't use them - (which resources?) dmq */
2308         }
2309         /* needed for dumping*/
2310         DEBUG_r(if (optimize) {
2311             regnode *opt = convert;
2312
2313             while ( ++opt < optimize) {
2314                 Set_Node_Offset_Length(opt,0,0);
2315             }
2316             /*
2317                 Try to clean up some of the debris left after the
2318                 optimisation.
2319              */
2320             while( optimize < jumper ) {
2321                 mjd_nodelen += Node_Length((optimize));
2322                 OP( optimize ) = OPTIMIZED;
2323                 Set_Node_Offset_Length(optimize,0,0);
2324                 optimize++;
2325             }
2326             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2327         });
2328     } /* end node insert */
2329     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2330
2331     /*  Finish populating the prev field of the wordinfo array.  Walk back
2332      *  from each accept state until we find another accept state, and if
2333      *  so, point the first word's .prev field at the second word. If the
2334      *  second already has a .prev field set, stop now. This will be the
2335      *  case either if we've already processed that word's accept state,
2336      *  or that state had multiple words, and the overspill words were
2337      *  already linked up earlier.
2338      */
2339     {
2340         U16 word;
2341         U32 state;
2342         U16 prev;
2343
2344         for (word=1; word <= trie->wordcount; word++) {
2345             prev = 0;
2346             if (trie->wordinfo[word].prev)
2347                 continue;
2348             state = trie->wordinfo[word].accept;
2349             while (state) {
2350                 state = prev_states[state];
2351                 if (!state)
2352                     break;
2353                 prev = trie->states[state].wordnum;
2354                 if (prev)
2355                     break;
2356             }
2357             trie->wordinfo[word].prev = prev;
2358         }
2359         Safefree(prev_states);
2360     }
2361
2362
2363     /* and now dump out the compressed format */
2364     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2365
2366     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2367 #ifdef DEBUGGING
2368     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2369     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2370 #else
2371     SvREFCNT_dec(revcharmap);
2372 #endif
2373     return trie->jump
2374            ? MADE_JUMP_TRIE
2375            : trie->startstate>1
2376              ? MADE_EXACT_TRIE
2377              : MADE_TRIE;
2378 }
2379
2380 STATIC void
2381 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2382 {
2383 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2384
2385    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2386    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2387    ISBN 0-201-10088-6
2388
2389    We find the fail state for each state in the trie, this state is the longest proper
2390    suffix of the current state's 'word' that is also a proper prefix of another word in our
2391    trie. State 1 represents the word '' and is thus the default fail state. This allows
2392    the DFA not to have to restart after its tried and failed a word at a given point, it
2393    simply continues as though it had been matching the other word in the first place.
2394    Consider
2395       'abcdgu'=~/abcdefg|cdgu/
2396    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2397    fail, which would bring us to the state representing 'd' in the second word where we would
2398    try 'g' and succeed, proceeding to match 'cdgu'.
2399  */
2400  /* add a fail transition */
2401     const U32 trie_offset = ARG(source);
2402     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2403     U32 *q;
2404     const U32 ucharcount = trie->uniquecharcount;
2405     const U32 numstates = trie->statecount;
2406     const U32 ubound = trie->lasttrans + ucharcount;
2407     U32 q_read = 0;
2408     U32 q_write = 0;
2409     U32 charid;
2410     U32 base = trie->states[ 1 ].trans.base;
2411     U32 *fail;
2412     reg_ac_data *aho;
2413     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2414     GET_RE_DEBUG_FLAGS_DECL;
2415
2416     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2417 #ifndef DEBUGGING
2418     PERL_UNUSED_ARG(depth);
2419 #endif
2420
2421
2422     ARG_SET( stclass, data_slot );
2423     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2424     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2425     aho->trie=trie_offset;
2426     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2427     Copy( trie->states, aho->states, numstates, reg_trie_state );
2428     Newxz( q, numstates, U32);
2429     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2430     aho->refcount = 1;
2431     fail = aho->fail;
2432     /* initialize fail[0..1] to be 1 so that we always have
2433        a valid final fail state */
2434     fail[ 0 ] = fail[ 1 ] = 1;
2435
2436     for ( charid = 0; charid < ucharcount ; charid++ ) {
2437         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2438         if ( newstate ) {
2439             q[ q_write ] = newstate;
2440             /* set to point at the root */
2441             fail[ q[ q_write++ ] ]=1;
2442         }
2443     }
2444     while ( q_read < q_write) {
2445         const U32 cur = q[ q_read++ % numstates ];
2446         base = trie->states[ cur ].trans.base;
2447
2448         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2449             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2450             if (ch_state) {
2451                 U32 fail_state = cur;
2452                 U32 fail_base;
2453                 do {
2454                     fail_state = fail[ fail_state ];
2455                     fail_base = aho->states[ fail_state ].trans.base;
2456                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2457
2458                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2459                 fail[ ch_state ] = fail_state;
2460                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2461                 {
2462                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2463                 }
2464                 q[ q_write++ % numstates] = ch_state;
2465             }
2466         }
2467     }
2468     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2469        when we fail in state 1, this allows us to use the
2470        charclass scan to find a valid start char. This is based on the principle
2471        that theres a good chance the string being searched contains lots of stuff
2472        that cant be a start char.
2473      */
2474     fail[ 0 ] = fail[ 1 ] = 0;
2475     DEBUG_TRIE_COMPILE_r({
2476         PerlIO_printf(Perl_debug_log,
2477                       "%*sStclass Failtable (%"UVuf" states): 0",
2478                       (int)(depth * 2), "", (UV)numstates
2479         );
2480         for( q_read=1; q_read<numstates; q_read++ ) {
2481             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2482         }
2483         PerlIO_printf(Perl_debug_log, "\n");
2484     });
2485     Safefree(q);
2486     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2487 }
2488
2489
2490 /*
2491  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2492  * These need to be revisited when a newer toolchain becomes available.
2493  */
2494 #if defined(__sparc64__) && defined(__GNUC__)
2495 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2496 #       undef  SPARC64_GCC_WORKAROUND
2497 #       define SPARC64_GCC_WORKAROUND 1
2498 #   endif
2499 #endif
2500
2501 #define DEBUG_PEEP(str,scan,depth) \
2502     DEBUG_OPTIMISE_r({if (scan){ \
2503        SV * const mysv=sv_newmortal(); \
2504        regnode *Next = regnext(scan); \
2505        regprop(RExC_rx, mysv, scan); \
2506        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2507        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2508        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2509    }});
2510
2511
2512 /* The below joins as many adjacent EXACTish nodes as possible into a single
2513  * one, and looks for problematic sequences of characters whose folds vs.
2514  * non-folds have sufficiently different lengths, that the optimizer would be
2515  * fooled into rejecting legitimate matches of them, and the trie construction
2516  * code can't cope with them.  The joining is only done if:
2517  * 1) there is room in the current conglomerated node to entirely contain the
2518  *    next one.
2519  * 2) they are the exact same node type
2520  *
2521  * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2522  * these get optimized out
2523  *
2524  * If there are problematic code sequences, *min_subtract is set to the delta
2525  * that the minimum size of the node can be less than its actual size.  And,
2526  * the node type of the result is changed to reflect that it contains these
2527  * sequences.
2528  *
2529  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2530  * and contains LATIN SMALL LETTER SHARP S
2531  *
2532  * This is as good a place as any to discuss the design of handling these
2533  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2534  * are three code points in Unicode whose folded lengths differ so much from
2535  * the un-folded lengths that it causes problems for the optimizer and trie
2536  * construction.  Why only these are problematic, and not others where lengths
2537  * also differ is something I (khw) do not understand.  New versions of Unicode
2538  * might add more such code points.  Hopefully the logic in fold_grind.t that
2539  * figures out what to test (in part by verifying that each size-combination
2540  * gets tested) will catch any that do come along, so they can be added to the
2541  * special handling below.  The chances of new ones are actually rather small,
2542  * as most, if not all, of the world's scripts that have casefolding have
2543  * already been encoded by Unicode.  Also, a number of Unicode's decisions were
2544  * made to allow compatibility with pre-existing standards, and almost all of
2545  * those have already been dealt with.  These would otherwise be the most
2546  * likely candidates for generating further tricky sequences.  In other words,
2547  * Unicode by itself is unlikely to add new ones unless it is for compatibility
2548  * with pre-existing standards, and there aren't many of those left.
2549  *
2550  * The previous designs for dealing with these involved assigning a special
2551  * node for them.  This approach doesn't work, as evidenced by this example:
2552  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2553  * Both these fold to "sss", but if the pattern is parsed to create a node of
2554  * that would match just the \xDF, it won't be able to handle the case where a
2555  * successful match would have to cross the node's boundary.  The new approach
2556  * that hopefully generally solves the problem generates an EXACTFU_SS node
2557  * that is "sss".
2558  *
2559  * There are a number of components to the approach (a lot of work for just
2560  * three code points!):
2561  * 1)   This routine examines each EXACTFish node that could contain the
2562  *      problematic sequences.  It returns in *min_subtract how much to
2563  *      subtract from the the actual length of the string to get a real minimum
2564  *      for one that could match it.  This number is usually 0 except for the
2565  *      problematic sequences.  This delta is used by the caller to adjust the
2566  *      min length of the match, and the delta between min and max, so that the
2567  *      optimizer doesn't reject these possibilities based on size constraints.
2568  * 2)   These sequences are not currently correctly handled by the trie code
2569  *      either, so it changes the joined node type to ops that are not handled
2570  *      by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
2571  * 3)   This is sufficient for the two Greek sequences (described below), but
2572  *      the one involving the Sharp s (\xDF) needs more.  The node type
2573  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2574  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2575  *      case where there is a possible fold length change.  That means that a
2576  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2577  *      itself with length changes, and so can be processed faster.  regexec.c
2578  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2579  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2580  *      However, probably mostly for historical reasons, the pre-folding isn't
2581  *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2582  *      nodes, as what they fold to isn't known until runtime.)  The fold
2583  *      possibilities for the non-UTF8 patterns are quite simple, except for
2584  *      the sharp s.  All the ones that don't involve a UTF-8 target string
2585  *      are members of a fold-pair, and arrays are set up for all of them
2586  *      that quickly find the other member of the pair.  It might actually
2587  *      be faster to pre-fold these, but it isn't currently done, except for
2588  *      the sharp s.  Code elsewhere in this file makes sure that it gets
2589  *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
2590  *      issues described in the next item.
2591  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2592  *      'ss' or not is not knowable at compile time.  It will match iff the
2593  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2594  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2595  *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
2596  *      described in item 3).  An assumption that the optimizer part of
2597  *      regexec.c (probably unwittingly) makes is that a character in the
2598  *      pattern corresponds to at most a single character in the target string.
2599  *      (And I do mean character, and not byte here, unlike other parts of the
2600  *      documentation that have never been updated to account for multibyte
2601  *      Unicode.)  This assumption is wrong only in this case, as all other
2602  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2603  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2604  *      reluctant to try to change this assumption, so instead the code punts.
2605  *      This routine examines EXACTF nodes for the sharp s, and returns a
2606  *      boolean indicating whether or not the node is an EXACTF node that
2607  *      contains a sharp s.  When it is true, the caller sets a flag that later
2608  *      causes the optimizer in this file to not set values for the floating
2609  *      and fixed string lengths, and thus avoids the optimizer code in
2610  *      regexec.c that makes the invalid assumption.  Thus, there is no
2611  *      optimization based on string lengths for EXACTF nodes that contain the
2612  *      sharp s.  This only happens for /id rules (which means the pattern
2613  *      isn't in UTF-8).
2614  */
2615
2616 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2617     if (PL_regkind[OP(scan)] == EXACT) \
2618         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2619
2620 STATIC U32
2621 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2622     /* Merge several consecutive EXACTish nodes into one. */
2623     regnode *n = regnext(scan);
2624     U32 stringok = 1;
2625     regnode *next = scan + NODE_SZ_STR(scan);
2626     U32 merged = 0;
2627     U32 stopnow = 0;
2628 #ifdef DEBUGGING
2629     regnode *stop = scan;
2630     GET_RE_DEBUG_FLAGS_DECL;
2631 #else
2632     PERL_UNUSED_ARG(depth);
2633 #endif
2634
2635     PERL_ARGS_ASSERT_JOIN_EXACT;
2636 #ifndef EXPERIMENTAL_INPLACESCAN
2637     PERL_UNUSED_ARG(flags);
2638     PERL_UNUSED_ARG(val);
2639 #endif
2640     DEBUG_PEEP("join",scan,depth);
2641
2642     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2643      * EXACT ones that are mergeable to the current one. */
2644     while (n
2645            && (PL_regkind[OP(n)] == NOTHING
2646                || (stringok && OP(n) == OP(scan)))
2647            && NEXT_OFF(n)
2648            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2649     {
2650
2651         if (OP(n) == TAIL || n > next)
2652             stringok = 0;
2653         if (PL_regkind[OP(n)] == NOTHING) {
2654             DEBUG_PEEP("skip:",n,depth);
2655             NEXT_OFF(scan) += NEXT_OFF(n);
2656             next = n + NODE_STEP_REGNODE;
2657 #ifdef DEBUGGING
2658             if (stringok)
2659                 stop = n;
2660 #endif
2661             n = regnext(n);
2662         }
2663         else if (stringok) {
2664             const unsigned int oldl = STR_LEN(scan);
2665             regnode * const nnext = regnext(n);
2666
2667             if (oldl + STR_LEN(n) > U8_MAX)
2668                 break;
2669
2670             DEBUG_PEEP("merg",n,depth);
2671             merged++;
2672
2673             NEXT_OFF(scan) += NEXT_OFF(n);
2674             STR_LEN(scan) += STR_LEN(n);
2675             next = n + NODE_SZ_STR(n);
2676             /* Now we can overwrite *n : */
2677             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2678 #ifdef DEBUGGING
2679             stop = next - 1;
2680 #endif
2681             n = nnext;
2682             if (stopnow) break;
2683         }
2684
2685 #ifdef EXPERIMENTAL_INPLACESCAN
2686         if (flags && !NEXT_OFF(n)) {
2687             DEBUG_PEEP("atch", val, depth);
2688             if (reg_off_by_arg[OP(n)]) {
2689                 ARG_SET(n, val - n);
2690             }
2691             else {
2692                 NEXT_OFF(n) = val - n;
2693             }
2694             stopnow = 1;
2695         }
2696 #endif
2697     }
2698
2699     *min_subtract = 0;
2700     *has_exactf_sharp_s = FALSE;
2701
2702     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2703      * can now analyze for sequences of problematic code points.  (Prior to
2704      * this final joining, sequences could have been split over boundaries, and
2705      * hence missed).  The sequences only happen in folding, hence for any
2706      * non-EXACT EXACTish node */
2707     if (OP(scan) != EXACT) {
2708         U8 *s;
2709         U8 * s0 = (U8*) STRING(scan);
2710         U8 * const s_end = s0 + STR_LEN(scan);
2711
2712         /* The below is perhaps overboard, but this allows us to save a test
2713          * each time through the loop at the expense of a mask.  This is
2714          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2715          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2716          * This uses an exclusive 'or' to find that bit and then inverts it to
2717          * form a mask, with just a single 0, in the bit position where 'S' and
2718          * 's' differ. */
2719         const U8 S_or_s_mask = ~ ('S' ^ 's');
2720         const U8 s_masked = 's' & S_or_s_mask;
2721
2722         /* One pass is made over the node's string looking for all the
2723          * possibilities.  to avoid some tests in the loop, there are two main
2724          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2725          * non-UTF-8 */
2726         if (UTF) {
2727
2728             /* There are two problematic Greek code points in Unicode
2729              * casefolding
2730              *
2731              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2732              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2733              *
2734              * which casefold to
2735              *
2736              * Unicode                      UTF-8
2737              *
2738              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2739              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2740              *
2741              * This means that in case-insensitive matching (or "loose
2742              * matching", as Unicode calls it), an EXACTF of length six (the
2743              * UTF-8 encoded byte length of the above casefolded versions) can
2744              * match a target string of length two (the byte length of UTF-8
2745              * encoded U+0390 or U+03B0).  This would rather mess up the
2746              * minimum length computation.  (there are other code points that
2747              * also fold to these two sequences, but the delta is smaller)
2748              *
2749              * If these sequences are found, the minimum length is decreased by
2750              * four (six minus two).
2751              *
2752              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2753              * LETTER SHARP S.  We decrease the min length by 1 for each
2754              * occurrence of 'ss' found */
2755
2756 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2757 #           define U390_first_byte 0xb4
2758             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2759 #           define U3B0_first_byte 0xb5
2760             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2761 #else
2762 #           define U390_first_byte 0xce
2763             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2764 #           define U3B0_first_byte 0xcf
2765             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2766 #endif
2767             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2768                                                  yields a net of 0 */
2769             /* Examine the string for one of the problematic sequences */
2770             for (s = s0;
2771                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2772                                  * sequence we are looking for is 2 */
2773                  s += UTF8SKIP(s))
2774             {
2775
2776                 /* Look for the first byte in each problematic sequence */
2777                 switch (*s) {
2778                     /* We don't have to worry about other things that fold to
2779                      * 's' (such as the long s, U+017F), as all above-latin1
2780                      * code points have been pre-folded */
2781                     case 's':
2782                     case 'S':
2783
2784                         /* Current character is an 's' or 'S'.  If next one is
2785                          * as well, we have the dreaded sequence */
2786                         if (((*(s+1) & S_or_s_mask) == s_masked)
2787                             /* These two node types don't have special handling
2788                              * for 'ss' */
2789                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2790                         {
2791                             *min_subtract += 1;
2792                             OP(scan) = EXACTFU_SS;
2793                             s++;    /* No need to look at this character again */
2794                         }
2795                         break;
2796
2797                     case U390_first_byte:
2798                         if (s_end - s >= len
2799
2800                             /* The 1's are because are skipping comparing the
2801                              * first byte */
2802                             && memEQ(s + 1, U390_tail, len - 1))
2803                         {
2804                             goto greek_sequence;
2805                         }
2806                         break;
2807
2808                     case U3B0_first_byte:
2809                         if (! (s_end - s >= len
2810                                && memEQ(s + 1, U3B0_tail, len - 1)))
2811                         {
2812                             break;
2813                         }
2814                       greek_sequence:
2815                         *min_subtract += 4;
2816
2817                         /* This can't currently be handled by trie's, so change
2818                          * the node type to indicate this.  If EXACTFA and
2819                          * EXACTFL were ever to be handled by trie's, this
2820                          * would have to be changed.  If this node has already
2821                          * been changed to EXACTFU_SS in this loop, leave it as
2822                          * is.  (I (khw) think it doesn't matter in regexec.c
2823                          * for UTF patterns, but no need to change it */
2824                         if (OP(scan) == EXACTFU) {
2825                             OP(scan) = EXACTFU_NO_TRIE;
2826                         }
2827                         s += 6; /* We already know what this sequence is.  Skip
2828                                    the rest of it */
2829                         break;
2830                 }
2831             }
2832         }
2833         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2834
2835             /* Here, the pattern is not UTF-8.  We need to look only for the
2836              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2837              * in the final position.  Otherwise we can stop looking 1 byte
2838              * earlier because have to find both the first and second 's' */
2839             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2840
2841             for (s = s0; s < upper; s++) {
2842                 switch (*s) {
2843                     case 'S':
2844                     case 's':
2845                         if (s_end - s > 1
2846                             && ((*(s+1) & S_or_s_mask) == s_masked))
2847                         {
2848                             *min_subtract += 1;
2849
2850                             /* EXACTF nodes need to know that the minimum
2851                              * length changed so that a sharp s in the string
2852                              * can match this ss in the pattern, but they
2853                              * remain EXACTF nodes, as they are not trie'able,
2854                              * so don't have to invent a new node type to
2855                              * exclude them from the trie code */
2856                             if (OP(scan) != EXACTF) {
2857                                 OP(scan) = EXACTFU_SS;
2858                             }
2859                             s++;
2860                         }
2861                         break;
2862                     case LATIN_SMALL_LETTER_SHARP_S:
2863                         if (OP(scan) == EXACTF) {
2864                             *has_exactf_sharp_s = TRUE;
2865                         }
2866                         break;
2867                 }
2868             }
2869         }
2870     }
2871
2872 #ifdef DEBUGGING
2873     /* Allow dumping but overwriting the collection of skipped
2874      * ops and/or strings with fake optimized ops */
2875     n = scan + NODE_SZ_STR(scan);
2876     while (n <= stop) {
2877         OP(n) = OPTIMIZED;
2878         FLAGS(n) = 0;
2879         NEXT_OFF(n) = 0;
2880         n++;
2881     }
2882 #endif
2883     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2884     return stopnow;
2885 }
2886
2887 /* REx optimizer.  Converts nodes into quicker variants "in place".
2888    Finds fixed substrings.  */
2889
2890 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2891    to the position after last scanned or to NULL. */
2892
2893 #define INIT_AND_WITHP \
2894     assert(!and_withp); \
2895     Newx(and_withp,1,struct regnode_charclass_class); \
2896     SAVEFREEPV(and_withp)
2897
2898 /* this is a chain of data about sub patterns we are processing that
2899    need to be handled separately/specially in study_chunk. Its so
2900    we can simulate recursion without losing state.  */
2901 struct scan_frame;
2902 typedef struct scan_frame {
2903     regnode *last;  /* last node to process in this frame */
2904     regnode *next;  /* next node to process when last is reached */
2905     struct scan_frame *prev; /*previous frame*/
2906     I32 stop; /* what stopparen do we use */
2907 } scan_frame;
2908
2909
2910 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2911
2912 #define CASE_SYNST_FNC(nAmE)                                       \
2913 case nAmE:                                                         \
2914     if (flags & SCF_DO_STCLASS_AND) {                              \
2915             for (value = 0; value < 256; value++)                  \
2916                 if (!is_ ## nAmE ## _cp(value))                       \
2917                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2918     }                                                              \
2919     else {                                                         \
2920             for (value = 0; value < 256; value++)                  \
2921                 if (is_ ## nAmE ## _cp(value))                        \
2922                     ANYOF_BITMAP_SET(data->start_class, value);    \
2923     }                                                              \
2924     break;                                                         \
2925 case N ## nAmE:                                                    \
2926     if (flags & SCF_DO_STCLASS_AND) {                              \
2927             for (value = 0; value < 256; value++)                   \
2928                 if (is_ ## nAmE ## _cp(value))                         \
2929                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2930     }                                                               \
2931     else {                                                          \
2932             for (value = 0; value < 256; value++)                   \
2933                 if (!is_ ## nAmE ## _cp(value))                        \
2934                     ANYOF_BITMAP_SET(data->start_class, value);     \
2935     }                                                               \
2936     break
2937
2938
2939
2940 STATIC I32
2941 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2942                         I32 *minlenp, I32 *deltap,
2943                         regnode *last,
2944                         scan_data_t *data,
2945                         I32 stopparen,
2946                         U8* recursed,
2947                         struct regnode_charclass_class *and_withp,
2948                         U32 flags, U32 depth)
2949                         /* scanp: Start here (read-write). */
2950                         /* deltap: Write maxlen-minlen here. */
2951                         /* last: Stop before this one. */
2952                         /* data: string data about the pattern */
2953                         /* stopparen: treat close N as END */
2954                         /* recursed: which subroutines have we recursed into */
2955                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2956 {
2957     dVAR;
2958     I32 min = 0, pars = 0, code;
2959     regnode *scan = *scanp, *next;
2960     I32 delta = 0;
2961     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2962     int is_inf_internal = 0;            /* The studied chunk is infinite */
2963     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2964     scan_data_t data_fake;
2965     SV *re_trie_maxbuff = NULL;
2966     regnode *first_non_open = scan;
2967     I32 stopmin = I32_MAX;
2968     scan_frame *frame = NULL;
2969     GET_RE_DEBUG_FLAGS_DECL;
2970
2971     PERL_ARGS_ASSERT_STUDY_CHUNK;
2972
2973 #ifdef DEBUGGING
2974     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2975 #endif
2976
2977     if ( depth == 0 ) {
2978         while (first_non_open && OP(first_non_open) == OPEN)
2979             first_non_open=regnext(first_non_open);
2980     }
2981
2982
2983   fake_study_recurse:
2984     while ( scan && OP(scan) != END && scan < last ){
2985         UV min_subtract = 0;    /* How much to subtract from the minimum node
2986                                    length to get a real minimum (because the
2987                                    folded version may be shorter) */
2988         bool has_exactf_sharp_s = FALSE;
2989         /* Peephole optimizer: */
2990         DEBUG_STUDYDATA("Peep:", data,depth);
2991         DEBUG_PEEP("Peep",scan,depth);
2992
2993         /* Its not clear to khw or hv why this is done here, and not in the
2994          * clauses that deal with EXACT nodes.  khw's guess is that it's
2995          * because of a previous design */
2996         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
2997
2998         /* Follow the next-chain of the current node and optimize
2999            away all the NOTHINGs from it.  */
3000         if (OP(scan) != CURLYX) {
3001             const int max = (reg_off_by_arg[OP(scan)]
3002                        ? I32_MAX
3003                        /* I32 may be smaller than U16 on CRAYs! */
3004                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3005             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3006             int noff;
3007             regnode *n = scan;
3008
3009             /* Skip NOTHING and LONGJMP. */
3010             while ((n = regnext(n))
3011                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3012                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3013                    && off + noff < max)
3014                 off += noff;
3015             if (reg_off_by_arg[OP(scan)])
3016                 ARG(scan) = off;
3017             else
3018                 NEXT_OFF(scan) = off;
3019         }
3020
3021
3022
3023         /* The principal pseudo-switch.  Cannot be a switch, since we
3024            look into several different things.  */
3025         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3026                    || OP(scan) == IFTHEN) {
3027             next = regnext(scan);
3028             code = OP(scan);
3029             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3030
3031             if (OP(next) == code || code == IFTHEN) {
3032                 /* NOTE - There is similar code to this block below for handling
3033                    TRIE nodes on a re-study.  If you change stuff here check there
3034                    too. */
3035                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3036                 struct regnode_charclass_class accum;
3037                 regnode * const startbranch=scan;
3038
3039                 if (flags & SCF_DO_SUBSTR)
3040                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3041                 if (flags & SCF_DO_STCLASS)
3042                     cl_init_zero(pRExC_state, &accum);
3043
3044                 while (OP(scan) == code) {
3045                     I32 deltanext, minnext, f = 0, fake;
3046                     struct regnode_charclass_class this_class;
3047
3048                     num++;
3049                     data_fake.flags = 0;
3050                     if (data) {
3051                         data_fake.whilem_c = data->whilem_c;
3052                         data_fake.last_closep = data->last_closep;
3053                     }
3054                     else
3055                         data_fake.last_closep = &fake;
3056
3057                     data_fake.pos_delta = delta;
3058                     next = regnext(scan);
3059                     scan = NEXTOPER(scan);
3060                     if (code != BRANCH)
3061                         scan = NEXTOPER(scan);
3062                     if (flags & SCF_DO_STCLASS) {
3063                         cl_init(pRExC_state, &this_class);
3064                         data_fake.start_class = &this_class;
3065                         f = SCF_DO_STCLASS_AND;
3066                     }
3067                     if (flags & SCF_WHILEM_VISITED_POS)
3068                         f |= SCF_WHILEM_VISITED_POS;
3069
3070                     /* we suppose the run is continuous, last=next...*/
3071                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3072                                           next, &data_fake,
3073                                           stopparen, recursed, NULL, f,depth+1);
3074                     if (min1 > minnext)
3075                         min1 = minnext;
3076                     if (max1 < minnext + deltanext)
3077                         max1 = minnext + deltanext;
3078                     if (deltanext == I32_MAX)
3079                         is_inf = is_inf_internal = 1;
3080                     scan = next;
3081                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3082                         pars++;
3083                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3084                         if ( stopmin > minnext)
3085                             stopmin = min + min1;
3086                         flags &= ~SCF_DO_SUBSTR;
3087                         if (data)
3088                             data->flags |= SCF_SEEN_ACCEPT;
3089                     }
3090                     if (data) {
3091                         if (data_fake.flags & SF_HAS_EVAL)
3092                             data->flags |= SF_HAS_EVAL;
3093                         data->whilem_c = data_fake.whilem_c;
3094                     }
3095                     if (flags & SCF_DO_STCLASS)
3096                         cl_or(pRExC_state, &accum, &this_class);
3097                 }
3098                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3099                     min1 = 0;
3100                 if (flags & SCF_DO_SUBSTR) {
3101                     data->pos_min += min1;
3102                     data->pos_delta += max1 - min1;
3103                     if (max1 != min1 || is_inf)
3104                         data->longest = &(data->longest_float);
3105                 }
3106                 min += min1;
3107                 delta += max1 - min1;
3108                 if (flags & SCF_DO_STCLASS_OR) {
3109                     cl_or(pRExC_state, data->start_class, &accum);
3110                     if (min1) {
3111                         cl_and(data->start_class, and_withp);
3112                         flags &= ~SCF_DO_STCLASS;
3113                     }
3114                 }
3115                 else if (flags & SCF_DO_STCLASS_AND) {
3116                     if (min1) {
3117                         cl_and(data->start_class, &accum);
3118                         flags &= ~SCF_DO_STCLASS;
3119                     }
3120                     else {
3121                         /* Switch to OR mode: cache the old value of
3122                          * data->start_class */
3123                         INIT_AND_WITHP;
3124                         StructCopy(data->start_class, and_withp,
3125                                    struct regnode_charclass_class);
3126                         flags &= ~SCF_DO_STCLASS_AND;
3127                         StructCopy(&accum, data->start_class,
3128                                    struct regnode_charclass_class);
3129                         flags |= SCF_DO_STCLASS_OR;
3130                         data->start_class->flags |= ANYOF_EOS;
3131                     }
3132                 }
3133
3134                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3135                 /* demq.
3136
3137                    Assuming this was/is a branch we are dealing with: 'scan' now
3138                    points at the item that follows the branch sequence, whatever
3139                    it is. We now start at the beginning of the sequence and look
3140                    for subsequences of
3141
3142                    BRANCH->EXACT=>x1
3143                    BRANCH->EXACT=>x2
3144                    tail
3145
3146                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3147
3148                    If we can find such a subsequence we need to turn the first
3149                    element into a trie and then add the subsequent branch exact
3150                    strings to the trie.
3151
3152                    We have two cases
3153
3154                      1. patterns where the whole set of branches can be converted.
3155
3156                      2. patterns where only a subset can be converted.
3157
3158                    In case 1 we can replace the whole set with a single regop
3159                    for the trie. In case 2 we need to keep the start and end
3160                    branches so
3161
3162                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3163                      becomes BRANCH TRIE; BRANCH X;
3164
3165                   There is an additional case, that being where there is a
3166                   common prefix, which gets split out into an EXACT like node
3167                   preceding the TRIE node.
3168
3169                   If x(1..n)==tail then we can do a simple trie, if not we make
3170                   a "jump" trie, such that when we match the appropriate word
3171                   we "jump" to the appropriate tail node. Essentially we turn
3172                   a nested if into a case structure of sorts.
3173
3174                 */
3175
3176                     int made=0;
3177                     if (!re_trie_maxbuff) {
3178                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3179                         if (!SvIOK(re_trie_maxbuff))
3180                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3181                     }
3182                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3183                         regnode *cur;
3184                         regnode *first = (regnode *)NULL;
3185                         regnode *last = (regnode *)NULL;
3186                         regnode *tail = scan;
3187                         U8 optype = 0;
3188                         U32 count=0;
3189
3190 #ifdef DEBUGGING
3191                         SV * const mysv = sv_newmortal();       /* for dumping */
3192 #endif
3193                         /* var tail is used because there may be a TAIL
3194                            regop in the way. Ie, the exacts will point to the
3195                            thing following the TAIL, but the last branch will
3196                            point at the TAIL. So we advance tail. If we
3197                            have nested (?:) we may have to move through several
3198                            tails.
3199                          */
3200
3201                         while ( OP( tail ) == TAIL ) {
3202                             /* this is the TAIL generated by (?:) */
3203                             tail = regnext( tail );
3204                         }
3205
3206
3207                         DEBUG_OPTIMISE_r({
3208                             regprop(RExC_rx, mysv, tail );
3209                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3210                                 (int)depth * 2 + 2, "",
3211                                 "Looking for TRIE'able sequences. Tail node is: ",
3212                                 SvPV_nolen_const( mysv )
3213                             );
3214                         });
3215
3216                         /*
3217
3218                            step through the branches, cur represents each
3219                            branch, noper is the first thing to be matched
3220                            as part of that branch and noper_next is the
3221                            regnext() of that node. if noper is an EXACT
3222                            and noper_next is the same as scan (our current
3223                            position in the regex) then the EXACT branch is
3224                            a possible optimization target. Once we have
3225                            two or more consecutive such branches we can
3226                            create a trie of the EXACT's contents and stich
3227                            it in place. If the sequence represents all of
3228                            the branches we eliminate the whole thing and
3229                            replace it with a single TRIE. If it is a
3230                            subsequence then we need to stitch it in. This
3231                            means the first branch has to remain, and needs
3232                            to be repointed at the item on the branch chain
3233                            following the last branch optimized. This could
3234                            be either a BRANCH, in which case the
3235                            subsequence is internal, or it could be the
3236                            item following the branch sequence in which
3237                            case the subsequence is at the end.
3238
3239                         */
3240
3241                         /* dont use tail as the end marker for this traverse */
3242                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3243                             regnode * const noper = NEXTOPER( cur );
3244 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3245                             regnode * const noper_next = regnext( noper );
3246 #endif
3247
3248                             DEBUG_OPTIMISE_r({
3249                                 regprop(RExC_rx, mysv, cur);
3250                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3251                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3252
3253                                 regprop(RExC_rx, mysv, noper);
3254                                 PerlIO_printf( Perl_debug_log, " -> %s",
3255                                     SvPV_nolen_const(mysv));
3256
3257                                 if ( noper_next ) {
3258                                   regprop(RExC_rx, mysv, noper_next );
3259                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3260                                     SvPV_nolen_const(mysv));
3261                                 }
3262                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3263                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3264                             });
3265                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3266                                          : PL_regkind[ OP( noper ) ] == EXACT )
3267                                   || OP(noper) == NOTHING )
3268 #ifdef NOJUMPTRIE
3269                                   && noper_next == tail
3270 #endif
3271                                   && count < U16_MAX)
3272                             {
3273                                 count++;
3274                                 if ( !first || optype == NOTHING ) {
3275                                     if (!first) first = cur;
3276                                     optype = OP( noper );
3277                                 } else {
3278                                     last = cur;
3279                                 }
3280                             } else {
3281 /*
3282     Currently the trie logic handles case insensitive matching properly only
3283     when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
3284     semantics).
3285
3286     If/when this is fixed the following define can be swapped
3287     in below to fully enable trie logic.
3288
3289 #define TRIE_TYPE_IS_SAFE 1
3290
3291 Note that join_exact() assumes that the other types of EXACTFish nodes are not
3292 used in tries, so that would have to be updated if this changed
3293
3294 */
3295 #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
3296
3297                                 if ( last && TRIE_TYPE_IS_SAFE ) {
3298                                     make_trie( pRExC_state,
3299                                             startbranch, first, cur, tail, count,
3300                                             optype, depth+1 );
3301                                 }
3302                                 if ( PL_regkind[ OP( noper ) ] == EXACT
3303 #ifdef NOJUMPTRIE
3304                                      && noper_next == tail
3305 #endif
3306                                 ){
3307                                     count = 1;
3308                                     first = cur;
3309                                     optype = OP( noper );
3310                                 } else {
3311                                     count = 0;
3312                                     first = NULL;
3313                                     optype = 0;
3314                                 }
3315                                 last = NULL;
3316                             }
3317                         }
3318                         DEBUG_OPTIMISE_r({
3319                             regprop(RExC_rx, mysv, cur);
3320                             PerlIO_printf( Perl_debug_log,
3321                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3322                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3323
3324                         });
3325
3326                         if ( last && TRIE_TYPE_IS_SAFE ) {
3327                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3328 #ifdef TRIE_STUDY_OPT
3329                             if ( ((made == MADE_EXACT_TRIE &&
3330                                  startbranch == first)
3331                                  || ( first_non_open == first )) &&
3332                                  depth==0 ) {
3333                                 flags |= SCF_TRIE_RESTUDY;
3334                                 if ( startbranch == first
3335                                      && scan == tail )
3336                                 {
3337                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3338                                 }
3339                             }
3340 #endif
3341                         }
3342                     }
3343
3344                 } /* do trie */
3345
3346             }
3347             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3348                 scan = NEXTOPER(NEXTOPER(scan));
3349             } else                      /* single branch is optimized. */
3350                 scan = NEXTOPER(scan);
3351             continue;
3352         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3353             scan_frame *newframe = NULL;
3354             I32 paren;
3355             regnode *start;
3356             regnode *end;
3357
3358             if (OP(scan) != SUSPEND) {
3359             /* set the pointer */
3360                 if (OP(scan) == GOSUB) {
3361                     paren = ARG(scan);
3362                     RExC_recurse[ARG2L(scan)] = scan;
3363                     start = RExC_open_parens[paren-1];
3364                     end   = RExC_close_parens[paren-1];
3365                 } else {
3366                     paren = 0;
3367                     start = RExC_rxi->program + 1;
3368                     end   = RExC_opend;
3369                 }
3370                 if (!recursed) {
3371                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3372                     SAVEFREEPV(recursed);
3373                 }
3374                 if (!PAREN_TEST(recursed,paren+1)) {
3375                     PAREN_SET(recursed,paren+1);
3376                     Newx(newframe,1,scan_frame);
3377                 } else {
3378                     if (flags & SCF_DO_SUBSTR) {
3379                         SCAN_COMMIT(pRExC_state,data,minlenp);
3380                         data->longest = &(data->longest_float);
3381                     }
3382                     is_inf = is_inf_internal = 1;
3383                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3384                         cl_anything(pRExC_state, data->start_class);
3385                     flags &= ~SCF_DO_STCLASS;
3386                 }
3387             } else {
3388                 Newx(newframe,1,scan_frame);
3389                 paren = stopparen;
3390                 start = scan+2;
3391                 end = regnext(scan);
3392             }
3393             if (newframe) {
3394                 assert(start);
3395                 assert(end);
3396                 SAVEFREEPV(newframe);
3397                 newframe->next = regnext(scan);
3398                 newframe->last = last;
3399                 newframe->stop = stopparen;
3400                 newframe->prev = frame;
3401
3402                 frame = newframe;
3403                 scan =  start;
3404                 stopparen = paren;
3405                 last = end;
3406
3407                 continue;
3408             }
3409         }
3410         else if (OP(scan) == EXACT) {
3411             I32 l = STR_LEN(scan);
3412             UV uc;
3413             if (UTF) {
3414                 const U8 * const s = (U8*)STRING(scan);
3415                 l = utf8_length(s, s + l);
3416                 uc = utf8_to_uvchr(s, NULL);
3417             } else {
3418                 uc = *((U8*)STRING(scan));
3419             }
3420             min += l;
3421             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3422                 /* The code below prefers earlier match for fixed
3423                    offset, later match for variable offset.  */
3424                 if (data->last_end == -1) { /* Update the start info. */
3425                     data->last_start_min = data->pos_min;
3426                     data->last_start_max = is_inf
3427                         ? I32_MAX : data->pos_min + data->pos_delta;
3428                 }
3429                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3430                 if (UTF)
3431                     SvUTF8_on(data->last_found);
3432                 {
3433                     SV * const sv = data->last_found;
3434                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3435                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3436                     if (mg && mg->mg_len >= 0)
3437                         mg->mg_len += utf8_length((U8*)STRING(scan),
3438                                                   (U8*)STRING(scan)+STR_LEN(scan));
3439                 }
3440                 data->last_end = data->pos_min + l;
3441                 data->pos_min += l; /* As in the first entry. */
3442                 data->flags &= ~SF_BEFORE_EOL;
3443             }
3444             if (flags & SCF_DO_STCLASS_AND) {
3445                 /* Check whether it is compatible with what we know already! */
3446                 int compat = 1;
3447
3448
3449                 /* If compatible, we or it in below.  It is compatible if is
3450                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3451                  * it's for a locale.  Even if there isn't unicode semantics
3452                  * here, at runtime there may be because of matching against a
3453                  * utf8 string, so accept a possible false positive for
3454                  * latin1-range folds */
3455                 if (uc >= 0x100 ||
3456                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3457                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3458                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3459                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3460                     )
3461                 {
3462                     compat = 0;
3463                 }
3464                 ANYOF_CLASS_ZERO(data->start_class);
3465                 ANYOF_BITMAP_ZERO(data->start_class);
3466                 if (compat)
3467                     ANYOF_BITMAP_SET(data->start_class, uc);
3468                 else if (uc >= 0x100) {
3469                     int i;
3470
3471                     /* Some Unicode code points fold to the Latin1 range; as
3472                      * XXX temporary code, instead of figuring out if this is
3473                      * one, just assume it is and set all the start class bits
3474                      * that could be some such above 255 code point's fold
3475                      * which will generate fals positives.  As the code
3476                      * elsewhere that does compute the fold settles down, it
3477                      * can be extracted out and re-used here */
3478                     for (i = 0; i < 256; i++){
3479                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3480                             ANYOF_BITMAP_SET(data->start_class, i);
3481                         }
3482                     }
3483                 }
3484                 data->start_class->flags &= ~ANYOF_EOS;
3485                 if (uc < 0x100)
3486                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3487             }
3488             else if (flags & SCF_DO_STCLASS_OR) {
3489                 /* false positive possible if the class is case-folded */
3490                 if (uc < 0x100)
3491                     ANYOF_BITMAP_SET(data->start_class, uc);
3492                 else
3493                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3494                 data->start_class->flags &= ~ANYOF_EOS;
3495                 cl_and(data->start_class, and_withp);
3496             }
3497             flags &= ~SCF_DO_STCLASS;
3498         }
3499         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3500             I32 l = STR_LEN(scan);
3501             UV uc = *((U8*)STRING(scan));
3502
3503             /* Search for fixed substrings supports EXACT only. */
3504             if (flags & SCF_DO_SUBSTR) {
3505                 assert(data);
3506                 SCAN_COMMIT(pRExC_state, data, minlenp);
3507             }
3508             if (UTF) {
3509                 const U8 * const s = (U8 *)STRING(scan);
3510                 l = utf8_length(s, s + l);
3511                 uc = utf8_to_uvchr(s, NULL);
3512             }
3513             else if (has_exactf_sharp_s) {
3514                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3515             }
3516             min += l - min_subtract;
3517             if (min < 0) {
3518                 min = 0;
3519             }
3520             delta += min_subtract;
3521             if (flags & SCF_DO_SUBSTR) {
3522                 data->pos_min += l - min_subtract;
3523                 if (data->pos_min < 0) {
3524                     data->pos_min = 0;
3525                 }
3526                 data->pos_delta += min_subtract;
3527                 if (min_subtract) {
3528                     data->longest = &(data->longest_float);
3529                 }
3530             }
3531             if (flags & SCF_DO_STCLASS_AND) {
3532                 /* Check whether it is compatible with what we know already! */
3533                 int compat = 1;
3534                 if (uc >= 0x100 ||
3535                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3536                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3537                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3538                 {
3539                     compat = 0;
3540                 }
3541                 ANYOF_CLASS_ZERO(data->start_class);
3542                 ANYOF_BITMAP_ZERO(data->start_class);
3543                 if (compat) {
3544                     ANYOF_BITMAP_SET(data->start_class, uc);
3545                     data->start_class->flags &= ~ANYOF_EOS;
3546                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3547                     if (OP(scan) == EXACTFL) {
3548                         /* XXX This set is probably no longer necessary, and
3549                          * probably wrong as LOCALE now is on in the initial
3550                          * state */
3551                         data->start_class->flags |= ANYOF_LOCALE;
3552                     }
3553                     else {
3554
3555                         /* Also set the other member of the fold pair.  In case
3556                          * that unicode semantics is called for at runtime, use
3557                          * the full latin1 fold.  (Can't do this for locale,
3558                          * because not known until runtime) */
3559                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3560
3561                         /* All other (EXACTFL handled above) folds except under
3562                          * /iaa that include s, S, and sharp_s also may include
3563                          * the others */
3564                         if (OP(scan) != EXACTFA) {
3565                             if (uc == 's' || uc == 'S') {
3566                                 ANYOF_BITMAP_SET(data->start_class,
3567                                                  LATIN_SMALL_LETTER_SHARP_S);
3568                             }
3569                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3570                                 ANYOF_BITMAP_SET(data->start_class, 's');
3571                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3572                             }
3573                         }
3574                     }
3575                 }
3576                 else if (uc >= 0x100) {
3577                     int i;
3578                     for (i = 0; i < 256; i++){
3579                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3580                             ANYOF_BITMAP_SET(data->start_class, i);
3581                         }
3582                     }
3583                 }
3584             }
3585             else if (flags & SCF_DO_STCLASS_OR) {
3586                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3587                     /* false positive possible if the class is case-folded.
3588                        Assume that the locale settings are the same... */
3589                     if (uc < 0x100) {
3590                         ANYOF_BITMAP_SET(data->start_class, uc);
3591                         if (OP(scan) != EXACTFL) {
3592
3593                             /* And set the other member of the fold pair, but
3594                              * can't do that in locale because not known until
3595                              * run-time */
3596                             ANYOF_BITMAP_SET(data->start_class,
3597                                              PL_fold_latin1[uc]);
3598
3599                             /* All folds except under /iaa that include s, S,
3600                              * and sharp_s also may include the others */
3601                             if (OP(scan) != EXACTFA) {
3602                                 if (uc == 's' || uc == 'S') {
3603                                     ANYOF_BITMAP_SET(data->start_class,
3604                                                    LATIN_SMALL_LETTER_SHARP_S);
3605                                 }
3606                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3607                                     ANYOF_BITMAP_SET(data->start_class, 's');
3608                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3609                                 }
3610                             }
3611                         }
3612                     }
3613                     data->start_class->flags &= ~ANYOF_EOS;
3614                 }
3615                 cl_and(data->start_class, and_withp);
3616             }
3617             flags &= ~SCF_DO_STCLASS;
3618         }
3619         else if (REGNODE_VARIES(OP(scan))) {
3620             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3621             I32 f = flags, pos_before = 0;
3622             regnode * const oscan = scan;
3623             struct regnode_charclass_class this_class;
3624             struct regnode_charclass_class *oclass = NULL;
3625             I32 next_is_eval = 0;
3626
3627             switch (PL_regkind[OP(scan)]) {
3628             case WHILEM:                /* End of (?:...)* . */
3629                 scan = NEXTOPER(scan);
3630                 goto finish;
3631             case PLUS:
3632                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3633                     next = NEXTOPER(scan);
3634                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3635                         mincount = 1;
3636                         maxcount = REG_INFTY;
3637                         next = regnext(scan);
3638                         scan = NEXTOPER(scan);
3639                         goto do_curly;
3640                     }
3641                 }
3642                 if (flags & SCF_DO_SUBSTR)
3643                     data->pos_min++;
3644                 min++;
3645                 /* Fall through. */
3646             case STAR:
3647                 if (flags & SCF_DO_STCLASS) {
3648                     mincount = 0;
3649                     maxcount = REG_INFTY;
3650                     next = regnext(scan);
3651                     scan = NEXTOPER(scan);
3652                     goto do_curly;
3653                 }
3654                 is_inf = is_inf_internal = 1;
3655                 scan = regnext(scan);
3656                 if (flags & SCF_DO_SUBSTR) {
3657                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3658                     data->longest = &(data->longest_float);
3659                 }
3660                 goto optimize_curly_tail;
3661             case CURLY:
3662                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3663                     && (scan->flags == stopparen))
3664                 {
3665                     mincount = 1;
3666                     maxcount = 1;
3667                 } else {
3668                     mincount = ARG1(scan);
3669                     maxcount = ARG2(scan);
3670                 }
3671                 next = regnext(scan);
3672                 if (OP(scan) == CURLYX) {
3673                     I32 lp = (data ? *(data->last_closep) : 0);
3674                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3675                 }
3676                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3677                 next_is_eval = (OP(scan) == EVAL);
3678               do_curly:
3679                 if (flags & SCF_DO_SUBSTR) {
3680                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3681                     pos_before = data->pos_min;
3682                 }
3683                 if (data) {
3684                     fl = data->flags;
3685                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3686                     if (is_inf)
3687                         data->flags |= SF_IS_INF;
3688                 }
3689                 if (flags & SCF_DO_STCLASS) {
3690                     cl_init(pRExC_state, &this_class);
3691                     oclass = data->start_class;
3692                     data->start_class = &this_class;
3693                     f |= SCF_DO_STCLASS_AND;
3694                     f &= ~SCF_DO_STCLASS_OR;
3695                 }
3696                 /* Exclude from super-linear cache processing any {n,m}
3697                    regops for which the combination of input pos and regex
3698                    pos is not enough information to determine if a match
3699                    will be possible.
3700
3701                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3702                    regex pos at the \s*, the prospects for a match depend not
3703                    only on the input position but also on how many (bar\s*)
3704                    repeats into the {4,8} we are. */
3705                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3706                     f &= ~SCF_WHILEM_VISITED_POS;
3707
3708                 /* This will finish on WHILEM, setting scan, or on NULL: */
3709                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3710                                       last, data, stopparen, recursed, NULL,
3711                                       (mincount == 0
3712                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3713
3714                 if (flags & SCF_DO_STCLASS)
3715                     data->start_class = oclass;
3716                 if (mincount == 0 || minnext == 0) {
3717                     if (flags & SCF_DO_STCLASS_OR) {
3718                         cl_or(pRExC_state, data->start_class, &this_class);
3719                     }
3720                     else if (flags & SCF_DO_STCLASS_AND) {
3721                         /* Switch to OR mode: cache the old value of
3722                          * data->start_class */
3723                         INIT_AND_WITHP;
3724                         StructCopy(data->start_class, and_withp,
3725                                    struct regnode_charclass_class);
3726                         flags &= ~SCF_DO_STCLASS_AND;
3727                         StructCopy(&this_class, data->start_class,
3728                                    struct regnode_charclass_class);
3729                         flags |= SCF_DO_STCLASS_OR;
3730                         data->start_class->flags |= ANYOF_EOS;
3731                     }
3732                 } else {                /* Non-zero len */
3733                     if (flags & SCF_DO_STCLASS_OR) {
3734                         cl_or(pRExC_state, data->start_class, &this_class);
3735                         cl_and(data->start_class, and_withp);
3736                     }
3737                     else if (flags & SCF_DO_STCLASS_AND)
3738                         cl_and(data->start_class, &this_class);
3739                     flags &= ~SCF_DO_STCLASS;
3740                 }
3741                 if (!scan)              /* It was not CURLYX, but CURLY. */
3742                     scan = next;
3743                 if ( /* ? quantifier ok, except for (?{ ... }) */
3744                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3745                     && (minnext == 0) && (deltanext == 0)
3746                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3747                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3748                 {
3749                     ckWARNreg(RExC_parse,
3750                               "Quantifier unexpected on zero-length expression");
3751                 }
3752
3753                 min += minnext * mincount;
3754                 is_inf_internal |= ((maxcount == REG_INFTY
3755                                      && (minnext + deltanext) > 0)
3756                                     || deltanext == I32_MAX);
3757                 is_inf |= is_inf_internal;
3758                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3759
3760                 /* Try powerful optimization CURLYX => CURLYN. */
3761                 if (  OP(oscan) == CURLYX && data
3762                       && data->flags & SF_IN_PAR
3763                       && !(data->flags & SF_HAS_EVAL)
3764                       && !deltanext && minnext == 1 ) {
3765                     /* Try to optimize to CURLYN.  */
3766                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3767                     regnode * const nxt1 = nxt;
3768 #ifdef DEBUGGING
3769                     regnode *nxt2;
3770 #endif
3771
3772                     /* Skip open. */
3773                     nxt = regnext(nxt);
3774                     if (!REGNODE_SIMPLE(OP(nxt))
3775                         && !(PL_regkind[OP(nxt)] == EXACT
3776                              && STR_LEN(nxt) == 1))
3777                         goto nogo;
3778 #ifdef DEBUGGING
3779                     nxt2 = nxt;
3780 #endif
3781                     nxt = regnext(nxt);
3782                     if (OP(nxt) != CLOSE)
3783                         goto nogo;
3784                     if (RExC_open_parens) {
3785                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3786                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3787                     }
3788                     /* Now we know that nxt2 is the only contents: */
3789                     oscan->flags = (U8)ARG(nxt);
3790                     OP(oscan) = CURLYN;
3791                     OP(nxt1) = NOTHING; /* was OPEN. */
3792
3793 #ifdef DEBUGGING
3794                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3795                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3796                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3797                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3798                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3799                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3800 #endif
3801                 }
3802               nogo:
3803
3804                 /* Try optimization CURLYX => CURLYM. */
3805                 if (  OP(oscan) == CURLYX && data
3806                       && !(data->flags & SF_HAS_PAR)
3807                       && !(data->flags & SF_HAS_EVAL)
3808                       && !deltanext     /* atom is fixed width */
3809                       && minnext != 0   /* CURLYM can't handle zero width */
3810                 ) {
3811                     /* XXXX How to optimize if data == 0? */
3812                     /* Optimize to a simpler form.  */
3813                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3814                     regnode *nxt2;
3815
3816                     OP(oscan) = CURLYM;
3817                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3818                             && (OP(nxt2) != WHILEM))
3819                         nxt = nxt2;
3820                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3821                     /* Need to optimize away parenths. */
3822                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3823                         /* Set the parenth number.  */
3824                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3825
3826                         oscan->flags = (U8)ARG(nxt);
3827                         if (RExC_open_parens) {
3828                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3829                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3830                         }
3831                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3832                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3833
3834 #ifdef DEBUGGING
3835                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3836                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3837                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3838                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3839 #endif
3840 #if 0
3841                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3842                             regnode *nnxt = regnext(nxt1);
3843                             if (nnxt == nxt) {
3844                                 if (reg_off_by_arg[OP(nxt1)])
3845                                     ARG_SET(nxt1, nxt2 - nxt1);
3846                                 else if (nxt2 - nxt1 < U16_MAX)
3847                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3848                                 else
3849                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3850                             }
3851                             nxt1 = nnxt;
3852                         }
3853 #endif
3854                         /* Optimize again: */
3855                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3856                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3857                     }
3858                     else
3859                         oscan->flags = 0;
3860                 }
3861                 else if ((OP(oscan) == CURLYX)
3862                          && (flags & SCF_WHILEM_VISITED_POS)
3863                          /* See the comment on a similar expression above.
3864                             However, this time it's not a subexpression
3865                             we care about, but the expression itself. */
3866                          && (maxcount == REG_INFTY)
3867                          && data && ++data->whilem_c < 16) {
3868                     /* This stays as CURLYX, we can put the count/of pair. */
3869                     /* Find WHILEM (as in regexec.c) */
3870                     regnode *nxt = oscan + NEXT_OFF(oscan);
3871
3872                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3873                         nxt += ARG(nxt);
3874                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3875                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3876                 }
3877                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3878                     pars++;
3879                 if (flags & SCF_DO_SUBSTR) {
3880                     SV *last_str = NULL;
3881                     int counted = mincount != 0;
3882
3883                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3884 #if defined(SPARC64_GCC_WORKAROUND)
3885                         I32 b = 0;
3886                         STRLEN l = 0;
3887                         const char *s = NULL;
3888                         I32 old = 0;
3889
3890                         if (pos_before >= data->last_start_min)
3891                             b = pos_before;
3892                         else
3893                             b = data->last_start_min;
3894
3895                         l = 0;
3896                         s = SvPV_const(data->last_found, l);
3897                         old = b - data->last_start_min;
3898
3899 #else
3900                         I32 b = pos_before >= data->last_start_min
3901                             ? pos_before : data->last_start_min;
3902                         STRLEN l;
3903                         const char * const s = SvPV_const(data->last_found, l);
3904                         I32 old = b - data->last_start_min;
3905 #endif
3906
3907                         if (UTF)
3908                             old = utf8_hop((U8*)s, old) - (U8*)s;
3909                         l -= old;
3910                         /* Get the added string: */
3911                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3912                         if (deltanext == 0 && pos_before == b) {
3913                             /* What was added is a constant string */
3914                             if (mincount > 1) {
3915                                 SvGROW(last_str, (mincount * l) + 1);
3916                                 repeatcpy(SvPVX(last_str) + l,
3917                                           SvPVX_const(last_str), l, mincount - 1);
3918                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3919                                 /* Add additional parts. */
3920                                 SvCUR_set(data->last_found,
3921                                           SvCUR(data->last_found) - l);
3922                                 sv_catsv(data->last_found, last_str);
3923                                 {
3924                                     SV * sv = data->last_found;
3925                                     MAGIC *mg =
3926                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3927                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3928                                     if (mg && mg->mg_len >= 0)
3929                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3930                                 }
3931                                 data->last_end += l * (mincount - 1);
3932                             }
3933                         } else {
3934                             /* start offset must point into the last copy */
3935                             data->last_start_min += minnext * (mincount - 1);
3936                             data->last_start_max += is_inf ? I32_MAX
3937                                 : (maxcount - 1) * (minnext + data->pos_delta);
3938                         }
3939                     }
3940                     /* It is counted once already... */
3941                     data->pos_min += minnext * (mincount - counted);
3942                     data->pos_delta += - counted * deltanext +
3943                         (minnext + deltanext) * maxcount - minnext * mincount;
3944                     if (mincount != maxcount) {
3945                          /* Cannot extend fixed substrings found inside
3946                             the group.  */
3947                         SCAN_COMMIT(pRExC_state,data,minlenp);
3948                         if (mincount && last_str) {
3949                             SV * const sv = data->last_found;
3950                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3951                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3952
3953                             if (mg)
3954                                 mg->mg_len = -1;
3955                             sv_setsv(sv, last_str);
3956                             data->last_end = data->pos_min;
3957                             data->last_start_min =
3958                                 data->pos_min - CHR_SVLEN(last_str);
3959                             data->last_start_max = is_inf
3960                                 ? I32_MAX
3961                                 : data->pos_min + data->pos_delta
3962                                 - CHR_SVLEN(last_str);
3963                         }
3964                         data->longest = &(data->longest_float);
3965                     }
3966                     SvREFCNT_dec(last_str);
3967                 }
3968                 if (data && (fl & SF_HAS_EVAL))
3969                     data->flags |= SF_HAS_EVAL;
3970               optimize_curly_tail:
3971                 if (OP(oscan) != CURLYX) {
3972                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3973                            && NEXT_OFF(next))
3974                         NEXT_OFF(oscan) += NEXT_OFF(next);
3975                 }
3976                 continue;
3977             default:                    /* REF, ANYOFV, and CLUMP only? */
3978                 if (flags & SCF_DO_SUBSTR) {
3979                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3980                     data->longest = &(data->longest_float);
3981                 }
3982                 is_inf = is_inf_internal = 1;
3983                 if (flags & SCF_DO_STCLASS_OR)
3984                     cl_anything(pRExC_state, data->start_class);
3985                 flags &= ~SCF_DO_STCLASS;
3986                 break;
3987             }
3988         }
3989         else if (OP(scan) == LNBREAK) {
3990             if (flags & SCF_DO_STCLASS) {
3991                 int value = 0;
3992                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3993                 if (flags & SCF_DO_STCLASS_AND) {
3994                     for (value = 0; value < 256; value++)
3995                         if (!is_VERTWS_cp(value))
3996                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3997                 }
3998                 else {
3999                     for (value = 0; value < 256; value++)
4000                         if (is_VERTWS_cp(value))
4001                             ANYOF_BITMAP_SET(data->start_class, value);
4002                 }
4003                 if (flags & SCF_DO_STCLASS_OR)
4004                     cl_and(data->start_class, and_withp);
4005                 flags &= ~SCF_DO_STCLASS;
4006             }
4007             min += 1;
4008             delta += 1;
4009             if (flags & SCF_DO_SUBSTR) {
4010                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4011                 data->pos_min += 1;
4012                 data->pos_delta += 1;
4013                 data->longest = &(data->longest_float);
4014             }
4015         }
4016         else if (REGNODE_SIMPLE(OP(scan))) {
4017             int value = 0;
4018
4019             if (flags & SCF_DO_SUBSTR) {
4020                 SCAN_COMMIT(pRExC_state,data,minlenp);
4021                 data->pos_min++;
4022             }
4023             min++;
4024             if (flags & SCF_DO_STCLASS) {
4025                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4026
4027                 /* Some of the logic below assumes that switching
4028                    locale on will only add false positives. */
4029                 switch (PL_regkind[OP(scan)]) {
4030                 case SANY:
4031                 default:
4032                   do_default:
4033                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4034                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4035                         cl_anything(pRExC_state, data->start_class);
4036                     break;
4037                 case REG_ANY:
4038                     if (OP(scan) == SANY)
4039                         goto do_default;
4040                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4041                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4042                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4043                         cl_anything(pRExC_state, data->start_class);
4044                     }
4045                     if (flags & SCF_DO_STCLASS_AND || !value)
4046                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4047                     break;
4048                 case ANYOF:
4049                     if (flags & SCF_DO_STCLASS_AND)
4050                         cl_and(data->start_class,
4051                                (struct regnode_charclass_class*)scan);
4052                     else
4053                         cl_or(pRExC_state, data->start_class,
4054                               (struct regnode_charclass_class*)scan);
4055                     break;
4056                 case ALNUM:
4057                     if (flags & SCF_DO_STCLASS_AND) {
4058                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4059                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4060                             if (OP(scan) == ALNUMU) {
4061                                 for (value = 0; value < 256; value++) {
4062                                     if (!isWORDCHAR_L1(value)) {
4063                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4064                                     }
4065                                 }
4066                             } else {
4067                                 for (value = 0; value < 256; value++) {
4068                                     if (!isALNUM(value)) {
4069                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4070                                     }
4071                                 }
4072                             }
4073                         }
4074                     }
4075                     else {
4076                         if (data->start_class->flags & ANYOF_LOCALE)
4077                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4078
4079                         /* Even if under locale, set the bits for non-locale
4080                          * in case it isn't a true locale-node.  This will
4081                          * create false positives if it truly is locale */
4082                         if (OP(scan) == ALNUMU) {
4083                             for (value = 0; value < 256; value++) {
4084                                 if (isWORDCHAR_L1(value)) {
4085                                     ANYOF_BITMAP_SET(data->start_class, value);
4086                                 }
4087                             }
4088                         } else {
4089                             for (value = 0; value < 256; value++) {
4090                                 if (isALNUM(value)) {
4091                                     ANYOF_BITMAP_SET(data->start_class, value);
4092                                 }
4093                             }
4094                         }
4095                     }
4096                     break;
4097                 case NALNUM:
4098                     if (flags & SCF_DO_STCLASS_AND) {
4099                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4100                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4101                             if (OP(scan) == NALNUMU) {
4102                                 for (value = 0; value < 256; value++) {
4103                                     if (isWORDCHAR_L1(value)) {
4104                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4105                                     }
4106                                 }
4107                             } else {
4108                                 for (value = 0; value < 256; value++) {
4109                                     if (isALNUM(value)) {
4110                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4111                                     }
4112                                 }
4113                             }
4114                         }
4115                     }
4116                     else {
4117                         if (data->start_class->flags & ANYOF_LOCALE)
4118                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4119
4120                         /* Even if under locale, set the bits for non-locale in
4121                          * case it isn't a true locale-node.  This will create
4122                          * false positives if it truly is locale */
4123                         if (OP(scan) == NALNUMU) {
4124                             for (value = 0; value < 256; value++) {
4125                                 if (! isWORDCHAR_L1(value)) {
4126                                     ANYOF_BITMAP_SET(data->start_class, value);
4127                                 }
4128                             }
4129                         } else {
4130                             for (value = 0; value < 256; value++) {
4131                                 if (! isALNUM(value)) {
4132                                     ANYOF_BITMAP_SET(data->start_class, value);
4133                                 }
4134                             }
4135                         }
4136                     }
4137                     break;
4138                 case SPACE:
4139                     if (flags & SCF_DO_STCLASS_AND) {
4140                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4141                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4142                             if (OP(scan) == SPACEU) {
4143                                 for (value = 0; value < 256; value++) {
4144                                     if (!isSPACE_L1(value)) {
4145                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4146                                     }
4147                                 }
4148                             } else {
4149                                 for (value = 0; value < 256; value++) {
4150                                     if (!isSPACE(value)) {
4151                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4152                                     }
4153                                 }
4154                             }
4155                         }
4156                     }
4157                     else {
4158                         if (data->start_class->flags & ANYOF_LOCALE) {
4159                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4160                         }
4161                         if (OP(scan) == SPACEU) {
4162                             for (value = 0; value < 256; value++) {
4163                                 if (isSPACE_L1(value)) {
4164                                     ANYOF_BITMAP_SET(data->start_class, value);
4165                                 }
4166                             }
4167                         } else {
4168                             for (value = 0; value < 256; value++) {
4169                                 if (isSPACE(value)) {
4170                                     ANYOF_BITMAP_SET(data->start_class, value);
4171                                 }
4172                             }
4173                         }
4174                     }
4175                     break;
4176                 case NSPACE:
4177                     if (flags & SCF_DO_STCLASS_AND) {
4178                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4179                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4180                             if (OP(scan) == NSPACEU) {
4181                                 for (value = 0; value < 256; value++) {
4182                                     if (isSPACE_L1(value)) {
4183                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4184                                     }
4185                                 }
4186                             } else {
4187                                 for (value = 0; value < 256; value++) {
4188                                     if (isSPACE(value)) {
4189                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4190                                     }
4191                                 }
4192                             }
4193                         }
4194                     }
4195                     else {
4196                         if (data->start_class->flags & ANYOF_LOCALE)
4197                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4198                         if (OP(scan) == NSPACEU) {
4199                             for (value = 0; value < 256; value++) {
4200                                 if (!isSPACE_L1(value)) {
4201                                     ANYOF_BITMAP_SET(data->start_class, value);
4202                                 }
4203                             }
4204                         }
4205                         else {
4206                             for (value = 0; value < 256; value++) {
4207                                 if (!isSPACE(value)) {
4208                                     ANYOF_BITMAP_SET(data->start_class, value);
4209                                 }
4210                             }
4211                         }
4212                     }
4213                     break;
4214                 case DIGIT:
4215                     if (flags & SCF_DO_STCLASS_AND) {
4216                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4217                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4218                             for (value = 0; value < 256; value++)
4219                                 if (!isDIGIT(value))
4220                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4221                         }
4222                     }
4223                     else {
4224                         if (data->start_class->flags & ANYOF_LOCALE)
4225                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4226                         for (value = 0; value < 256; value++)
4227                             if (isDIGIT(value))
4228                                 ANYOF_BITMAP_SET(data->start_class, value);
4229                     }
4230                     break;
4231                 case NDIGIT:
4232                     if (flags & SCF_DO_STCLASS_AND) {
4233                         if (!(data->start_class->flags & ANYOF_LOCALE))
4234                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4235                         for (value = 0; value < 256; value++)
4236                             if (isDIGIT(value))
4237                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4238                     }
4239                     else {
4240                         if (data->start_class->flags & ANYOF_LOCALE)
4241                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4242                         for (value = 0; value < 256; value++)
4243                             if (!isDIGIT(value))
4244                                 ANYOF_BITMAP_SET(data->start_class, value);
4245                     }
4246                     break;
4247                 CASE_SYNST_FNC(VERTWS);
4248                 CASE_SYNST_FNC(HORIZWS);
4249
4250                 }
4251                 if (flags & SCF_DO_STCLASS_OR)
4252                     cl_and(data->start_class, and_withp);
4253                 flags &= ~SCF_DO_STCLASS;
4254             }
4255         }
4256         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4257             data->flags |= (OP(scan) == MEOL
4258                             ? SF_BEFORE_MEOL
4259                             : SF_BEFORE_SEOL);
4260         }
4261         else if (  PL_regkind[OP(scan)] == BRANCHJ
4262                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4263                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4264                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4265             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4266                 || OP(scan) == UNLESSM )
4267             {
4268                 /* Negative Lookahead/lookbehind
4269                    In this case we can't do fixed string optimisation.
4270                 */
4271
4272                 I32 deltanext, minnext, fake = 0;
4273                 regnode *nscan;
4274                 struct regnode_charclass_class intrnl;
4275                 int f = 0;
4276
4277                 data_fake.flags = 0;
4278                 if (data) {
4279                     data_fake.whilem_c = data->whilem_c;
4280                     data_fake.last_closep = data->last_closep;
4281                 }
4282                 else
4283                     data_fake.last_closep = &fake;
4284                 data_fake.pos_delta = delta;
4285                 if ( flags & SCF_DO_STCLASS && !scan->flags
4286                      && OP(scan) == IFMATCH ) { /* Lookahead */
4287                     cl_init(pRExC_state, &intrnl);
4288                     data_fake.start_class = &intrnl;
4289                     f |= SCF_DO_STCLASS_AND;
4290                 }
4291                 if (flags & SCF_WHILEM_VISITED_POS)
4292                     f |= SCF_WHILEM_VISITED_POS;
4293                 next = regnext(scan);
4294                 nscan = NEXTOPER(NEXTOPER(scan));
4295                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4296                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4297                 if (scan->flags) {
4298                     if (deltanext) {
4299                         FAIL("Variable length lookbehind not implemented");
4300                     }
4301                     else if (minnext > (I32)U8_MAX) {
4302                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4303                     }
4304                     scan->flags = (U8)minnext;
4305                 }
4306                 if (data) {
4307                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4308                         pars++;
4309                     if (data_fake.flags & SF_HAS_EVAL)
4310                         data->flags |= SF_HAS_EVAL;
4311                     data->whilem_c = data_fake.whilem_c;
4312                 }
4313                 if (f & SCF_DO_STCLASS_AND) {
4314                     if (flags & SCF_DO_STCLASS_OR) {
4315                         /* OR before, AND after: ideally we would recurse with
4316                          * data_fake to get the AND applied by study of the
4317                          * remainder of the pattern, and then derecurse;
4318                          * *** HACK *** for now just treat as "no information".
4319                          * See [perl #56690].
4320                          */
4321                         cl_init(pRExC_state, data->start_class);
4322                     }  else {
4323                         /* AND before and after: combine and continue */
4324                         const int was = (data->start_class->flags & ANYOF_EOS);
4325
4326                         cl_and(data->start_class, &intrnl);
4327                         if (was)
4328                             data->start_class->flags |= ANYOF_EOS;
4329                     }
4330                 }
4331             }
4332 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4333             else {
4334                 /* Positive Lookahead/lookbehind
4335                    In this case we can do fixed string optimisation,
4336                    but we must be careful about it. Note in the case of
4337                    lookbehind the positions will be offset by the minimum
4338                    length of the pattern, something we won't know about
4339                    until after the recurse.
4340                 */
4341                 I32 deltanext, fake = 0;
4342                 regnode *nscan;
4343                 struct regnode_charclass_class intrnl;
4344                 int f = 0;
4345                 /* We use SAVEFREEPV so that when the full compile
4346                     is finished perl will clean up the allocated
4347                     minlens when it's all done. This way we don't
4348                     have to worry about freeing them when we know
4349                     they wont be used, which would be a pain.
4350                  */
4351                 I32 *minnextp;
4352                 Newx( minnextp, 1, I32 );
4353                 SAVEFREEPV(minnextp);
4354
4355                 if (data) {
4356                     StructCopy(data, &data_fake, scan_data_t);
4357                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4358                         f |= SCF_DO_SUBSTR;
4359                         if (scan->flags)
4360                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4361                         data_fake.last_found=newSVsv(data->last_found);
4362                     }
4363                 }
4364                 else
4365                     data_fake.last_closep = &fake;
4366                 data_fake.flags = 0;
4367                 data_fake.pos_delta = delta;
4368                 if (is_inf)
4369                     data_fake.flags |= SF_IS_INF;
4370                 if ( flags & SCF_DO_STCLASS && !scan->flags
4371                      && OP(scan) == IFMATCH ) { /* Lookahead */
4372                     cl_init(pRExC_state, &intrnl);
4373                     data_fake.start_class = &intrnl;
4374                     f |= SCF_DO_STCLASS_AND;
4375                 }
4376                 if (flags & SCF_WHILEM_VISITED_POS)
4377                     f |= SCF_WHILEM_VISITED_POS;
4378                 next = regnext(scan);
4379                 nscan = NEXTOPER(NEXTOPER(scan));
4380
4381                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4382                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4383                 if (scan->flags) {
4384                     if (deltanext) {
4385                         FAIL("Variable length lookbehind not implemented");
4386                     }
4387                     else if (*minnextp > (I32)U8_MAX) {
4388                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4389                     }
4390                     scan->flags = (U8)*minnextp;
4391                 }
4392
4393                 *minnextp += min;
4394
4395                 if (f & SCF_DO_STCLASS_AND) {
4396                     const int was = (data->start_class->flags & ANYOF_EOS);
4397
4398                     cl_and(data->start_class, &intrnl);
4399                     if (was)
4400                         data->start_class->flags |= ANYOF_EOS;
4401                 }
4402                 if (data) {
4403                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4404                         pars++;
4405                     if (data_fake.flags & SF_HAS_EVAL)
4406                         data->flags |= SF_HAS_EVAL;
4407                     data->whilem_c = data_fake.whilem_c;
4408                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4409                         if (RExC_rx->minlen<*minnextp)
4410                             RExC_rx->minlen=*minnextp;
4411                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4412                         SvREFCNT_dec(data_fake.last_found);
4413
4414                         if ( data_fake.minlen_fixed != minlenp )
4415                         {
4416                             data->offset_fixed= data_fake.offset_fixed;
4417                             data->minlen_fixed= data_fake.minlen_fixed;
4418                             data->lookbehind_fixed+= scan->flags;
4419                         }
4420                         if ( data_fake.minlen_float != minlenp )
4421                         {
4422                             data->minlen_float= data_fake.minlen_float;
4423                             data->offset_float_min=data_fake.offset_float_min;
4424                             data->offset_float_max=data_fake.offset_float_max;
4425                             data->lookbehind_float+= scan->flags;
4426                         }
4427                     }
4428                 }
4429
4430
4431             }
4432 #endif
4433         }
4434         else if (OP(scan) == OPEN) {
4435             if (stopparen != (I32)ARG(scan))
4436                 pars++;
4437         }
4438         else if (OP(scan) == CLOSE) {
4439             if (stopparen == (I32)ARG(scan)) {
4440                 break;
4441             }
4442             if ((I32)ARG(scan) == is_par) {
4443                 next = regnext(scan);
4444
4445                 if ( next && (OP(next) != WHILEM) && next < last)
4446                     is_par = 0;         /* Disable optimization */
4447             }
4448             if (data)
4449                 *(data->last_closep) = ARG(scan);
4450         }
4451         else if (OP(scan) == EVAL) {
4452                 if (data)
4453                     data->flags |= SF_HAS_EVAL;
4454         }
4455         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4456             if (flags & SCF_DO_SUBSTR) {
4457                 SCAN_COMMIT(pRExC_state,data,minlenp);
4458                 flags &= ~SCF_DO_SUBSTR;
4459             }
4460             if (data && OP(scan)==ACCEPT) {
4461                 data->flags |= SCF_SEEN_ACCEPT;
4462                 if (stopmin > min)
4463                     stopmin = min;
4464             }
4465         }
4466         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4467         {
4468                 if (flags & SCF_DO_SUBSTR) {
4469                     SCAN_COMMIT(pRExC_state,data,minlenp);
4470                     data->longest = &(data->longest_float);
4471                 }
4472                 is_inf = is_inf_internal = 1;
4473                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4474                     cl_anything(pRExC_state, data->start_class);
4475                 flags &= ~SCF_DO_STCLASS;
4476         }
4477         else if (OP(scan) == GPOS) {
4478             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4479                 !(delta || is_inf || (data && data->pos_delta)))
4480             {
4481                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4482                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4483                 if (RExC_rx->gofs < (U32)min)
4484                     RExC_rx->gofs = min;
4485             } else {
4486                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4487                 RExC_rx->gofs = 0;
4488             }
4489         }
4490 #ifdef TRIE_STUDY_OPT
4491 #ifdef FULL_TRIE_STUDY
4492         else if (PL_regkind[OP(scan)] == TRIE) {
4493             /* NOTE - There is similar code to this block above for handling
4494                BRANCH nodes on the initial study.  If you change stuff here
4495                check there too. */
4496             regnode *trie_node= scan;
4497             regnode *tail= regnext(scan);
4498             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4499             I32 max1 = 0, min1 = I32_MAX;
4500             struct regnode_charclass_class accum;
4501
4502             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4503                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4504             if (flags & SCF_DO_STCLASS)
4505                 cl_init_zero(pRExC_state, &accum);
4506
4507             if (!trie->jump) {
4508                 min1= trie->minlen;
4509                 max1= trie->maxlen;
4510             } else {
4511                 const regnode *nextbranch= NULL;
4512                 U32 word;
4513
4514                 for ( word=1 ; word <= trie->wordcount ; word++)
4515                 {
4516                     I32 deltanext=0, minnext=0, f = 0, fake;
4517                     struct regnode_charclass_class this_class;
4518
4519                     data_fake.flags = 0;
4520                     if (data) {
4521                         data_fake.whilem_c = data->whilem_c;
4522                         data_fake.last_closep = data->last_closep;
4523                     }
4524                     else
4525                         data_fake.last_closep = &fake;
4526                     data_fake.pos_delta = delta;
4527                     if (flags & SCF_DO_STCLASS) {
4528                         cl_init(pRExC_state, &this_class);
4529                         data_fake.start_class = &this_class;
4530                         f = SCF_DO_STCLASS_AND;
4531                     }
4532                     if (flags & SCF_WHILEM_VISITED_POS)
4533                         f |= SCF_WHILEM_VISITED_POS;
4534
4535                     if (trie->jump[word]) {
4536                         if (!nextbranch)
4537                             nextbranch = trie_node + trie->jump[0];
4538                         scan= trie_node + trie->jump[word];
4539                         /* We go from the jump point to the branch that follows
4540                            it. Note this means we need the vestigal unused branches
4541                            even though they arent otherwise used.
4542                          */
4543                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4544                             &deltanext, (regnode *)nextbranch, &data_fake,
4545                             stopparen, recursed, NULL, f,depth+1);
4546                     }
4547                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4548                         nextbranch= regnext((regnode*)nextbranch);
4549
4550                     if (min1 > (I32)(minnext + trie->minlen))
4551                         min1 = minnext + trie->minlen;
4552                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4553                         max1 = minnext + deltanext + trie->maxlen;
4554                     if (deltanext == I32_MAX)
4555                         is_inf = is_inf_internal = 1;
4556
4557                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4558                         pars++;
4559                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4560                         if ( stopmin > min + min1)
4561                             stopmin = min + min1;
4562                         flags &= ~SCF_DO_SUBSTR;
4563                         if (data)
4564                             data->flags |= SCF_SEEN_ACCEPT;
4565                     }
4566                     if (data) {
4567                         if (data_fake.flags & SF_HAS_EVAL)
4568                             data->flags |= SF_HAS_EVAL;
4569                         data->whilem_c = data_fake.whilem_c;
4570                     }
4571                     if (flags & SCF_DO_STCLASS)
4572                         cl_or(pRExC_state, &accum, &this_class);
4573                 }
4574             }
4575             if (flags & SCF_DO_SUBSTR) {
4576                 data->pos_min += min1;
4577                 data->pos_delta += max1 - min1;
4578                 if (max1 != min1 || is_inf)
4579                     data->longest = &(data->longest_float);
4580             }
4581             min += min1;
4582             delta += max1 - min1;
4583             if (flags & SCF_DO_STCLASS_OR) {
4584                 cl_or(pRExC_state, data->start_class, &accum);
4585                 if (min1) {
4586                     cl_and(data->start_class, and_withp);
4587                     flags &= ~SCF_DO_STCLASS;
4588                 }
4589             }
4590             else if (flags & SCF_DO_STCLASS_AND) {
4591                 if (min1) {
4592                     cl_and(data->start_class, &accum);
4593                     flags &= ~SCF_DO_STCLASS;
4594                 }
4595                 else {
4596                     /* Switch to OR mode: cache the old value of
4597                      * data->start_class */
4598                     INIT_AND_WITHP;
4599                     StructCopy(data->start_class, and_withp,
4600                                struct regnode_charclass_class);
4601                     flags &= ~SCF_DO_STCLASS_AND;
4602                     StructCopy(&accum, data->start_class,
4603                                struct regnode_charclass_class);
4604                     flags |= SCF_DO_STCLASS_OR;
4605                     data->start_class->flags |= ANYOF_EOS;
4606                 }
4607             }
4608             scan= tail;
4609             continue;
4610         }
4611 #else
4612         else if (PL_regkind[OP(scan)] == TRIE) {
4613             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4614             U8*bang=NULL;
4615
4616             min += trie->minlen;
4617             delta += (trie->maxlen - trie->minlen);
4618             flags &= ~SCF_DO_STCLASS; /* xxx */
4619             if (flags & SCF_DO_SUBSTR) {
4620                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4621                 data->pos_min += trie->minlen;
4622                 data->pos_delta += (trie->maxlen - trie->minlen);
4623                 if (trie->maxlen != trie->minlen)
4624                     data->longest = &(data->longest_float);
4625             }
4626             if (trie->jump) /* no more substrings -- for now /grr*/
4627                 flags &= ~SCF_DO_SUBSTR;
4628         }
4629 #endif /* old or new */
4630 #endif /* TRIE_STUDY_OPT */
4631
4632         /* Else: zero-length, ignore. */
4633         scan = regnext(scan);
4634     }
4635     if (frame) {
4636         last = frame->last;
4637         scan = frame->next;
4638         stopparen = frame->stop;
4639         frame = frame->prev;
4640         goto fake_study_recurse;
4641     }
4642
4643   finish:
4644     assert(!frame);
4645     DEBUG_STUDYDATA("pre-fin:",data,depth);
4646
4647     *scanp = scan;
4648     *deltap = is_inf_internal ? I32_MAX : delta;
4649     if (flags & SCF_DO_SUBSTR && is_inf)
4650         data->pos_delta = I32_MAX - data->pos_min;
4651     if (is_par > (I32)U8_MAX)
4652         is_par = 0;
4653     if (is_par && pars==1 && data) {
4654         data->flags |= SF_IN_PAR;
4655         data->flags &= ~SF_HAS_PAR;
4656     }
4657     else if (pars && data) {
4658         data->flags |= SF_HAS_PAR;
4659         data->flags &= ~SF_IN_PAR;
4660     }
4661     if (flags & SCF_DO_STCLASS_OR)
4662         cl_and(data->start_class, and_withp);
4663     if (flags & SCF_TRIE_RESTUDY)
4664         data->flags |=  SCF_TRIE_RESTUDY;
4665
4666     DEBUG_STUDYDATA("post-fin:",data,depth);
4667
4668     return min < stopmin ? min : stopmin;
4669 }
4670
4671 STATIC U32
4672 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4673 {
4674     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4675
4676     PERL_ARGS_ASSERT_ADD_DATA;
4677
4678     Renewc(RExC_rxi->data,
4679            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4680            char, struct reg_data);
4681     if(count)
4682         Renew(RExC_rxi->data->what, count + n, U8);
4683     else
4684         Newx(RExC_rxi->data->what, n, U8);
4685     RExC_rxi->data->count = count + n;
4686     Copy(s, RExC_rxi->data->what + count, n, U8);
4687     return count;
4688 }
4689
4690 /*XXX: todo make this not included in a non debugging perl */
4691 #ifndef PERL_IN_XSUB_RE
4692 void
4693 Perl_reginitcolors(pTHX)
4694 {
4695     dVAR;
4696     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4697     if (s) {
4698         char *t = savepv(s);
4699         int i = 0;
4700         PL_colors[0] = t;
4701         while (++i < 6) {
4702             t = strchr(t, '\t');
4703             if (t) {
4704                 *t = '\0';
4705                 PL_colors[i] = ++t;
4706             }
4707             else
4708                 PL_colors[i] = t = (char *)"";
4709         }
4710     } else {
4711         int i = 0;
4712         while (i < 6)
4713             PL_colors[i++] = (char *)"";
4714     }
4715     PL_colorset = 1;
4716 }
4717 #endif
4718
4719
4720 #ifdef TRIE_STUDY_OPT
4721 #define CHECK_RESTUDY_GOTO                                  \
4722         if (                                                \
4723               (data.flags & SCF_TRIE_RESTUDY)               \
4724               && ! restudied++                              \
4725         )     goto reStudy
4726 #else
4727 #define CHECK_RESTUDY_GOTO
4728 #endif
4729
4730 /*
4731  - pregcomp - compile a regular expression into internal code
4732  *
4733  * We can't allocate space until we know how big the compiled form will be,
4734  * but we can't compile it (and thus know how big it is) until we've got a
4735  * place to put the code.  So we cheat:  we compile it twice, once with code
4736  * generation turned off and size counting turned on, and once "for real".
4737  * This also means that we don't allocate space until we are sure that the
4738  * thing really will compile successfully, and we never have to move the
4739  * code and thus invalidate pointers into it.  (Note that it has to be in
4740  * one piece because free() must be able to free it all.) [NB: not true in perl]
4741  *
4742  * Beware that the optimization-preparation code in here knows about some
4743  * of the structure of the compiled regexp.  [I'll say.]
4744  */
4745
4746
4747
4748 #ifndef PERL_IN_XSUB_RE
4749 #define RE_ENGINE_PTR &reh_regexp_engine
4750 #else
4751 extern const struct regexp_engine my_reg_engine;
4752 #define RE_ENGINE_PTR &my_reg_engine
4753 #endif
4754
4755 #ifndef PERL_IN_XSUB_RE
4756 REGEXP *
4757 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4758 {
4759     dVAR;
4760     HV * const table = GvHV(PL_hintgv);
4761
4762     PERL_ARGS_ASSERT_PREGCOMP;
4763
4764     /* Dispatch a request to compile a regexp to correct
4765        regexp engine. */
4766     if (table) {
4767         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4768         GET_RE_DEBUG_FLAGS_DECL;
4769         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4770             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4771             DEBUG_COMPILE_r({
4772                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4773                     SvIV(*ptr));
4774             });
4775             return CALLREGCOMP_ENG(eng, pattern, flags);
4776         }
4777     }
4778     return Perl_re_compile(aTHX_ pattern, flags);
4779 }
4780 #endif
4781
4782 REGEXP *
4783 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4784 {
4785     dVAR;
4786     REGEXP *rx;
4787     struct regexp *r;
4788     register regexp_internal *ri;
4789     STRLEN plen;
4790     char* VOL exp;
4791     char* xend;
4792     regnode *scan;
4793     I32 flags;
4794     I32 minlen = 0;
4795     U32 pm_flags;
4796
4797     /* these are all flags - maybe they should be turned
4798      * into a single int with different bit masks */
4799     I32 sawlookahead = 0;
4800     I32 sawplus = 0;
4801     I32 sawopen = 0;
4802     bool used_setjump = FALSE;
4803     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4804
4805     U8 jump_ret = 0;
4806     dJMPENV;
4807     scan_data_t data;
4808     RExC_state_t RExC_state;
4809     RExC_state_t * const pRExC_state = &RExC_state;
4810 #ifdef TRIE_STUDY_OPT
4811     int restudied;
4812     RExC_state_t copyRExC_state;
4813 #endif
4814     GET_RE_DEBUG_FLAGS_DECL;
4815
4816     PERL_ARGS_ASSERT_RE_COMPILE;
4817
4818     DEBUG_r(if (!PL_colorset) reginitcolors());
4819
4820 #ifndef PERL_IN_XSUB_RE
4821     /* Initialize these here instead of as-needed, as is quick and avoids
4822      * having to test them each time otherwise */
4823     if (! PL_AboveLatin1) {
4824         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4825         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4826         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
4827
4828         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4829         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4830
4831         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4832         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4833
4834         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4835         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4836
4837         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4838
4839         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4840         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4841
4842         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4843
4844         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4845         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4846
4847         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4848         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4849
4850         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4851         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4852
4853         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4854         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4855
4856         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4857         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4858
4859         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4860         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4861
4862         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4863         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4864
4865         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4866         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4867
4868         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4869
4870         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4871         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4872
4873         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4874         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
4875     }
4876 #endif
4877
4878     exp = SvPV(pattern, plen);
4879
4880     if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4881         RExC_utf8 = RExC_orig_utf8 = 0;
4882     }
4883     else {
4884         RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4885     }
4886     RExC_uni_semantics = 0;
4887     RExC_contains_locale = 0;
4888
4889     /****************** LONG JUMP TARGET HERE***********************/
4890     /* Longjmp back to here if have to switch in midstream to utf8 */
4891     if (! RExC_orig_utf8) {
4892         JMPENV_PUSH(jump_ret);
4893         used_setjump = TRUE;
4894     }
4895
4896     if (jump_ret == 0) {    /* First time through */
4897         xend = exp + plen;
4898
4899         DEBUG_COMPILE_r({
4900             SV *dsv= sv_newmortal();
4901             RE_PV_QUOTED_DECL(s, RExC_utf8,
4902                 dsv, exp, plen, 60);
4903             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4904                            PL_colors[4],PL_colors[5],s);
4905         });
4906     }
4907     else {  /* longjumped back */
4908         STRLEN len = plen;
4909
4910         /* If the cause for the longjmp was other than changing to utf8, pop
4911          * our own setjmp, and longjmp to the correct handler */
4912         if (jump_ret != UTF8_LONGJMP) {
4913             JMPENV_POP;
4914             JMPENV_JUMP(jump_ret);
4915         }
4916
4917         GET_RE_DEBUG_FLAGS;
4918
4919         /* It's possible to write a regexp in ascii that represents Unicode
4920         codepoints outside of the byte range, such as via \x{100}. If we
4921         detect such a sequence we have to convert the entire pattern to utf8
4922         and then recompile, as our sizing calculation will have been based
4923         on 1 byte == 1 character, but we will need to use utf8 to encode
4924         at least some part of the pattern, and therefore must convert the whole
4925         thing.
4926         -- dmq */
4927         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4928             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4929         exp = (char*)Perl_bytes_to_utf8(aTHX_
4930                                         (U8*)SvPV_nomg(pattern, plen),
4931                                         &len);
4932         xend = exp + len;
4933         RExC_orig_utf8 = RExC_utf8 = 1;
4934         SAVEFREEPV(exp);
4935     }
4936
4937 #ifdef TRIE_STUDY_OPT
4938     restudied = 0;
4939 #endif
4940
4941     pm_flags = orig_pm_flags;
4942
4943     if (initial_charset == REGEX_LOCALE_CHARSET) {
4944         RExC_contains_locale = 1;
4945     }
4946     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4947
4948         /* Set to use unicode semantics if the pattern is in utf8 and has the
4949          * 'depends' charset specified, as it means unicode when utf8  */
4950         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4951     }
4952
4953     RExC_precomp = exp;
4954     RExC_flags = pm_flags;
4955     RExC_sawback = 0;
4956
4957     RExC_seen = 0;
4958     RExC_in_lookbehind = 0;
4959     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4960     RExC_seen_evals = 0;
4961     RExC_extralen = 0;
4962     RExC_override_recoding = 0;
4963
4964     /* First pass: determine size, legality. */
4965     RExC_parse = exp;
4966     RExC_start = exp;
4967     RExC_end = xend;
4968     RExC_naughty = 0;
4969     RExC_npar = 1;
4970     RExC_nestroot = 0;
4971     RExC_size = 0L;
4972     RExC_emit = &PL_regdummy;
4973     RExC_whilem_seen = 0;
4974     RExC_open_parens = NULL;
4975     RExC_close_parens = NULL;
4976     RExC_opend = NULL;
4977     RExC_paren_names = NULL;
4978 #ifdef DEBUGGING
4979     RExC_paren_name_list = NULL;
4980 #endif
4981     RExC_recurse = NULL;
4982     RExC_recurse_count = 0;
4983
4984 #if 0 /* REGC() is (currently) a NOP at the first pass.
4985        * Clever compilers notice this and complain. --jhi */
4986     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4987 #endif
4988     DEBUG_PARSE_r(
4989         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
4990         RExC_lastnum=0;
4991         RExC_lastparse=NULL;
4992     );
4993     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4994         RExC_precomp = NULL;
4995         return(NULL);
4996     }
4997
4998     /* Here, finished first pass.  Get rid of any added setjmp */
4999     if (used_setjump) {
5000         JMPENV_POP;
5001     }
5002
5003     DEBUG_PARSE_r({
5004         PerlIO_printf(Perl_debug_log,
5005             "Required size %"IVdf" nodes\n"
5006             "Starting second pass (creation)\n",
5007             (IV)RExC_size);
5008         RExC_lastnum=0;
5009         RExC_lastparse=NULL;
5010     });
5011
5012     /* The first pass could have found things that force Unicode semantics */
5013     if ((RExC_utf8 || RExC_uni_semantics)
5014          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5015     {
5016         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5017     }
5018
5019     /* Small enough for pointer-storage convention?
5020        If extralen==0, this means that we will not need long jumps. */
5021     if (RExC_size >= 0x10000L && RExC_extralen)
5022         RExC_size += RExC_extralen;
5023     else
5024         RExC_extralen = 0;
5025     if (RExC_whilem_seen > 15)
5026         RExC_whilem_seen = 15;
5027
5028     /* Allocate space and zero-initialize. Note, the two step process
5029        of zeroing when in debug mode, thus anything assigned has to
5030        happen after that */
5031     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5032     r = (struct regexp*)SvANY(rx);
5033     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5034          char, regexp_internal);
5035     if ( r == NULL || ri == NULL )
5036         FAIL("Regexp out of space");
5037 #ifdef DEBUGGING
5038     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5039     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5040 #else
5041     /* bulk initialize base fields with 0. */
5042     Zero(ri, sizeof(regexp_internal), char);
5043 #endif
5044
5045     /* non-zero initialization begins here */
5046     RXi_SET( r, ri );
5047     r->engine= RE_ENGINE_PTR;
5048     r->extflags = pm_flags;
5049     {
5050         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5051         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5052
5053         /* The caret is output if there are any defaults: if not all the STD
5054          * flags are set, or if no character set specifier is needed */
5055         bool has_default =
5056                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5057                     || ! has_charset);
5058         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5059         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5060                             >> RXf_PMf_STD_PMMOD_SHIFT);
5061         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5062         char *p;
5063         /* Allocate for the worst case, which is all the std flags are turned
5064          * on.  If more precision is desired, we could do a population count of
5065          * the flags set.  This could be done with a small lookup table, or by
5066          * shifting, masking and adding, or even, when available, assembly
5067          * language for a machine-language population count.
5068          * We never output a minus, as all those are defaults, so are
5069          * covered by the caret */
5070         const STRLEN wraplen = plen + has_p + has_runon
5071             + has_default       /* If needs a caret */
5072
5073                 /* If needs a character set specifier */
5074             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5075             + (sizeof(STD_PAT_MODS) - 1)
5076             + (sizeof("(?:)") - 1);
5077
5078         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5079         SvPOK_on(rx);
5080         SvFLAGS(rx) |= SvUTF8(pattern);
5081         *p++='('; *p++='?';
5082
5083         /* If a default, cover it using the caret */
5084         if (has_default) {
5085             *p++= DEFAULT_PAT_MOD;
5086         }
5087         if (has_charset) {
5088             STRLEN len;
5089             const char* const name = get_regex_charset_name(r->extflags, &len);
5090             Copy(name, p, len, char);
5091             p += len;
5092         }
5093         if (has_p)
5094             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5095         {
5096             char ch;
5097             while((ch = *fptr++)) {
5098                 if(reganch & 1)
5099                     *p++ = ch;
5100                 reganch >>= 1;
5101             }
5102         }
5103
5104         *p++ = ':';
5105         Copy(RExC_precomp, p, plen, char);
5106         assert ((RX_WRAPPED(rx) - p) < 16);
5107         r->pre_prefix = p - RX_WRAPPED(rx);
5108         p += plen;
5109         if (has_runon)
5110             *p++ = '\n';
5111         *p++ = ')';
5112         *p = 0;
5113         SvCUR_set(rx, p - SvPVX_const(rx));
5114     }
5115
5116     r->intflags = 0;
5117     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5118
5119     if (RExC_seen & REG_SEEN_RECURSE) {
5120         Newxz(RExC_open_parens, RExC_npar,regnode *);
5121         SAVEFREEPV(RExC_open_parens);
5122         Newxz(RExC_close_parens,RExC_npar,regnode *);
5123         SAVEFREEPV(RExC_close_parens);
5124     }
5125
5126     /* Useful during FAIL. */
5127 #ifdef RE_TRACK_PATTERN_OFFSETS
5128     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5129     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5130                           "%s %"UVuf" bytes for offset annotations.\n",
5131                           ri->u.offsets ? "Got" : "Couldn't get",
5132                           (UV)((2*RExC_size+1) * sizeof(U32))));
5133 #endif
5134     SetProgLen(ri,RExC_size);
5135     RExC_rx_sv = rx;
5136     RExC_rx = r;
5137     RExC_rxi = ri;
5138     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5139
5140     /* Second pass: emit code. */
5141     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
5142     RExC_parse = exp;
5143     RExC_end = xend;
5144     RExC_naughty = 0;
5145     RExC_npar = 1;
5146     RExC_emit_start = ri->program;
5147     RExC_emit = ri->program;
5148     RExC_emit_bound = ri->program + RExC_size + 1;
5149
5150     /* Store the count of eval-groups for security checks: */
5151     RExC_rx->seen_evals = RExC_seen_evals;
5152     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5153     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5154         ReREFCNT_dec(rx);
5155         return(NULL);
5156     }
5157     /* XXXX To minimize changes to RE engine we always allocate
5158        3-units-long substrs field. */
5159     Newx(r->substrs, 1, struct reg_substr_data);
5160     if (RExC_recurse_count) {
5161         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5162         SAVEFREEPV(RExC_recurse);
5163     }
5164
5165 reStudy:
5166     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5167     Zero(r->substrs, 1, struct reg_substr_data);
5168
5169 #ifdef TRIE_STUDY_OPT
5170     if (!restudied) {
5171         StructCopy(&zero_scan_data, &data, scan_data_t);
5172         copyRExC_state = RExC_state;
5173     } else {
5174         U32 seen=RExC_seen;
5175         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5176
5177         RExC_state = copyRExC_state;
5178         if (seen & REG_TOP_LEVEL_BRANCHES)
5179             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5180         else
5181             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5182         if (data.last_found) {
5183             SvREFCNT_dec(data.longest_fixed);
5184             SvREFCNT_dec(data.longest_float);
5185             SvREFCNT_dec(data.last_found);
5186         }
5187         StructCopy(&zero_scan_data, &data, scan_data_t);
5188     }
5189 #else
5190     StructCopy(&zero_scan_data, &data, scan_data_t);
5191 #endif
5192
5193     /* Dig out information for optimizations. */
5194     r->extflags = RExC_flags; /* was pm_op */
5195     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5196
5197     if (UTF)
5198         SvUTF8_on(rx);  /* Unicode in it? */
5199     ri->regstclass = NULL;
5200     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5201         r->intflags |= PREGf_NAUGHTY;
5202     scan = ri->program + 1;             /* First BRANCH. */
5203
5204     /* testing for BRANCH here tells us whether there is "must appear"
5205        data in the pattern. If there is then we can use it for optimisations */
5206     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5207         I32 fake;
5208         STRLEN longest_float_length, longest_fixed_length;
5209         struct regnode_charclass_class ch_class; /* pointed to by data */
5210         int stclass_flag;
5211         I32 last_close = 0; /* pointed to by data */
5212         regnode *first= scan;
5213         regnode *first_next= regnext(first);
5214         /*
5215          * Skip introductions and multiplicators >= 1
5216          * so that we can extract the 'meat' of the pattern that must
5217          * match in the large if() sequence following.
5218          * NOTE that EXACT is NOT covered here, as it is normally
5219          * picked up by the optimiser separately.
5220          *
5221          * This is unfortunate as the optimiser isnt handling lookahead
5222          * properly currently.
5223          *
5224          */
5225         while ((OP(first) == OPEN && (sawopen = 1)) ||
5226                /* An OR of *one* alternative - should not happen now. */
5227             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5228             /* for now we can't handle lookbehind IFMATCH*/
5229             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5230             (OP(first) == PLUS) ||
5231             (OP(first) == MINMOD) ||
5232                /* An {n,m} with n>0 */
5233             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5234             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5235         {
5236                 /*
5237                  * the only op that could be a regnode is PLUS, all the rest
5238                  * will be regnode_1 or regnode_2.
5239                  *
5240                  */
5241                 if (OP(first) == PLUS)
5242                     sawplus = 1;
5243                 else
5244                     first += regarglen[OP(first)];
5245
5246                 first = NEXTOPER(first);
5247                 first_next= regnext(first);
5248         }
5249
5250         /* Starting-point info. */
5251       again:
5252         DEBUG_PEEP("first:",first,0);
5253         /* Ignore EXACT as we deal with it later. */
5254         if (PL_regkind[OP(first)] == EXACT) {
5255             if (OP(first) == EXACT)
5256                 NOOP;   /* Empty, get anchored substr later. */
5257             else
5258                 ri->regstclass = first;
5259         }
5260 #ifdef TRIE_STCLASS
5261         else if (PL_regkind[OP(first)] == TRIE &&
5262                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5263         {
5264             regnode *trie_op;
5265             /* this can happen only on restudy */
5266             if ( OP(first) == TRIE ) {
5267                 struct regnode_1 *trieop = (struct regnode_1 *)
5268                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
5269                 StructCopy(first,trieop,struct regnode_1);
5270                 trie_op=(regnode *)trieop;
5271             } else {
5272                 struct regnode_charclass *trieop = (struct regnode_charclass *)
5273                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5274                 StructCopy(first,trieop,struct regnode_charclass);
5275                 trie_op=(regnode *)trieop;
5276             }
5277             OP(trie_op)+=2;
5278             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5279             ri->regstclass = trie_op;
5280         }
5281 #endif
5282         else if (REGNODE_SIMPLE(OP(first)))
5283             ri->regstclass = first;
5284         else if (PL_regkind[OP(first)] == BOUND ||
5285                  PL_regkind[OP(first)] == NBOUND)
5286             ri->regstclass = first;
5287         else if (PL_regkind[OP(first)] == BOL) {
5288             r->extflags |= (OP(first) == MBOL
5289                            ? RXf_ANCH_MBOL
5290                            : (OP(first) == SBOL
5291                               ? RXf_ANCH_SBOL
5292                               : RXf_ANCH_BOL));
5293             first = NEXTOPER(first);
5294             goto again;
5295         }
5296         else if (OP(first) == GPOS) {
5297             r->extflags |= RXf_ANCH_GPOS;
5298             first = NEXTOPER(first);
5299             goto again;
5300         }
5301         else if ((!sawopen || !RExC_sawback) &&
5302             (OP(first) == STAR &&
5303             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5304             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5305         {
5306             /* turn .* into ^.* with an implied $*=1 */
5307             const int type =
5308                 (OP(NEXTOPER(first)) == REG_ANY)
5309                     ? RXf_ANCH_MBOL
5310                     : RXf_ANCH_SBOL;
5311             r->extflags |= type;
5312             r->intflags |= PREGf_IMPLICIT;
5313             first = NEXTOPER(first);
5314             goto again;
5315         }
5316         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5317             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5318             /* x+ must match at the 1st pos of run of x's */
5319             r->intflags |= PREGf_SKIP;
5320
5321         /* Scan is after the zeroth branch, first is atomic matcher. */
5322 #ifdef TRIE_STUDY_OPT
5323         DEBUG_PARSE_r(
5324             if (!restudied)
5325                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5326                               (IV)(first - scan + 1))
5327         );
5328 #else
5329         DEBUG_PARSE_r(
5330             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5331                 (IV)(first - scan + 1))
5332         );
5333 #endif
5334
5335
5336         /*
5337         * If there's something expensive in the r.e., find the
5338         * longest literal string that must appear and make it the
5339         * regmust.  Resolve ties in favor of later strings, since
5340         * the regstart check works with the beginning of the r.e.
5341         * and avoiding duplication strengthens checking.  Not a
5342         * strong reason, but sufficient in the absence of others.
5343         * [Now we resolve ties in favor of the earlier string if
5344         * it happens that c_offset_min has been invalidated, since the
5345         * earlier string may buy us something the later one won't.]
5346         */
5347
5348         data.longest_fixed = newSVpvs("");
5349         data.longest_float = newSVpvs("");
5350         data.last_found = newSVpvs("");
5351         data.longest = &(data.longest_fixed);
5352         first = scan;
5353         if (!ri->regstclass) {
5354             cl_init(pRExC_state, &ch_class);
5355             data.start_class = &ch_class;
5356             stclass_flag = SCF_DO_STCLASS_AND;
5357         } else                          /* XXXX Check for BOUND? */
5358             stclass_flag = 0;
5359         data.last_closep = &last_close;
5360
5361         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5362             &data, -1, NULL, NULL,
5363             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5364
5365
5366         CHECK_RESTUDY_GOTO;
5367
5368
5369         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5370              && data.last_start_min == 0 && data.last_end > 0
5371              && !RExC_seen_zerolen
5372              && !(RExC_seen & REG_SEEN_VERBARG)
5373              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5374             r->extflags |= RXf_CHECK_ALL;
5375         scan_commit(pRExC_state, &data,&minlen,0);
5376         SvREFCNT_dec(data.last_found);
5377
5378         /* Note that code very similar to this but for anchored string
5379            follows immediately below, changes may need to be made to both.
5380            Be careful.
5381          */
5382         longest_float_length = CHR_SVLEN(data.longest_float);
5383         if (longest_float_length
5384             || (data.flags & SF_FL_BEFORE_EOL
5385                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5386                     || (RExC_flags & RXf_PMf_MULTILINE))))
5387         {
5388             I32 t,ml;
5389
5390             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5391             if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5392                 || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5393                     && data.offset_fixed == data.offset_float_min
5394                     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5395                     goto remove_float;          /* As in (a)+. */
5396
5397             /* copy the information about the longest float from the reg_scan_data
5398                over to the program. */
5399             if (SvUTF8(data.longest_float)) {
5400                 r->float_utf8 = data.longest_float;
5401                 r->float_substr = NULL;
5402             } else {
5403                 r->float_substr = data.longest_float;
5404                 r->float_utf8 = NULL;
5405             }
5406             /* float_end_shift is how many chars that must be matched that
5407                follow this item. We calculate it ahead of time as once the
5408                lookbehind offset is added in we lose the ability to correctly
5409                calculate it.*/
5410             ml = data.minlen_float ? *(data.minlen_float)
5411                                    : (I32)longest_float_length;
5412             r->float_end_shift = ml - data.offset_float_min
5413                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5414                 + data.lookbehind_float;
5415             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5416             r->float_max_offset = data.offset_float_max;
5417             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5418                 r->float_max_offset -= data.lookbehind_float;
5419
5420             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5421                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5422                            || (RExC_flags & RXf_PMf_MULTILINE)));
5423             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5424         }
5425         else {
5426           remove_float:
5427             r->float_substr = r->float_utf8 = NULL;
5428             SvREFCNT_dec(data.longest_float);
5429             longest_float_length = 0;
5430         }
5431
5432         /* Note that code very similar to this but for floating string
5433            is immediately above, changes may need to be made to both.
5434            Be careful.
5435          */
5436         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5437
5438         /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5439         if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5440             && (longest_fixed_length
5441                 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5442                     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5443                         || (RExC_flags & RXf_PMf_MULTILINE)))) )
5444         {
5445             I32 t,ml;
5446
5447             /* copy the information about the longest fixed
5448                from the reg_scan_data over to the program. */
5449             if (SvUTF8(data.longest_fixed)) {
5450                 r->anchored_utf8 = data.longest_fixed;
5451                 r->anchored_substr = NULL;
5452             } else {
5453                 r->anchored_substr = data.longest_fixed;
5454                 r->anchored_utf8 = NULL;
5455             }
5456             /* fixed_end_shift is how many chars that must be matched that
5457                follow this item. We calculate it ahead of time as once the
5458                lookbehind offset is added in we lose the ability to correctly
5459                calculate it.*/
5460             ml = data.minlen_fixed ? *(data.minlen_fixed)
5461                                    : (I32)longest_fixed_length;
5462             r->anchored_end_shift = ml - data.offset_fixed
5463                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5464                 + data.lookbehind_fixed;
5465             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5466
5467             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5468                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5469                      || (RExC_flags & RXf_PMf_MULTILINE)));
5470             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5471         }
5472         else {
5473             r->anchored_substr = r->anchored_utf8 = NULL;
5474             SvREFCNT_dec(data.longest_fixed);
5475             longest_fixed_length = 0;
5476         }
5477         if (ri->regstclass
5478             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5479             ri->regstclass = NULL;
5480
5481         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5482             && stclass_flag
5483             && !(data.start_class->flags & ANYOF_EOS)
5484             && !cl_is_anything(data.start_class))
5485         {
5486             const U32 n = add_data(pRExC_state, 1, "f");
5487             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5488
5489             Newx(RExC_rxi->data->data[n], 1,
5490                 struct regnode_charclass_class);
5491             StructCopy(data.start_class,
5492                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5493                        struct regnode_charclass_class);
5494             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5495             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5496             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5497                       regprop(r, sv, (regnode*)data.start_class);
5498                       PerlIO_printf(Perl_debug_log,
5499                                     "synthetic stclass \"%s\".\n",
5500                                     SvPVX_const(sv));});
5501         }
5502
5503         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5504         if (longest_fixed_length > longest_float_length) {
5505             r->check_end_shift = r->anchored_end_shift;
5506             r->check_substr = r->anchored_substr;
5507             r->check_utf8 = r->anchored_utf8;
5508             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5509             if (r->extflags & RXf_ANCH_SINGLE)
5510                 r->extflags |= RXf_NOSCAN;
5511         }
5512         else {
5513             r->check_end_shift = r->float_end_shift;
5514             r->check_substr = r->float_substr;
5515             r->check_utf8 = r->float_utf8;
5516             r->check_offset_min = r->float_min_offset;
5517             r->check_offset_max = r->float_max_offset;
5518         }
5519         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5520            This should be changed ASAP!  */
5521         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5522             r->extflags |= RXf_USE_INTUIT;
5523             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5524                 r->extflags |= RXf_INTUIT_TAIL;
5525         }
5526         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5527         if ( (STRLEN)minlen < longest_float_length )
5528             minlen= longest_float_length;
5529         if ( (STRLEN)minlen < longest_fixed_length )
5530             minlen= longest_fixed_length;
5531         */
5532     }
5533     else {
5534         /* Several toplevels. Best we can is to set minlen. */
5535         I32 fake;
5536         struct regnode_charclass_class ch_class;
5537         I32 last_close = 0;
5538
5539         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5540
5541         scan = ri->program + 1;
5542         cl_init(pRExC_state, &ch_class);
5543         data.start_class = &ch_class;
5544         data.last_closep = &last_close;
5545
5546
5547         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5548             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5549
5550         CHECK_RESTUDY_GOTO;
5551
5552         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5553                 = r->float_substr = r->float_utf8 = NULL;
5554
5555         if (!(data.start_class->flags & ANYOF_EOS)
5556             && !cl_is_anything(data.start_class))
5557         {
5558             const U32 n = add_data(pRExC_state, 1, "f");
5559             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5560
5561             Newx(RExC_rxi->data->data[n], 1,
5562                 struct regnode_charclass_class);
5563             StructCopy(data.start_class,
5564                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5565                        struct regnode_charclass_class);
5566             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5567             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5568             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5569                       regprop(r, sv, (regnode*)data.start_class);
5570                       PerlIO_printf(Perl_debug_log,
5571                                     "synthetic stclass \"%s\".\n",
5572                                     SvPVX_const(sv));});
5573         }
5574     }
5575
5576     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5577        the "real" pattern. */
5578     DEBUG_OPTIMISE_r({
5579         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5580                       (IV)minlen, (IV)r->minlen);
5581     });
5582     r->minlenret = minlen;
5583     if (r->minlen < minlen)
5584         r->minlen = minlen;
5585
5586     if (RExC_seen & REG_SEEN_GPOS)
5587         r->extflags |= RXf_GPOS_SEEN;
5588     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5589         r->extflags |= RXf_LOOKBEHIND_SEEN;
5590     if (RExC_seen & REG_SEEN_EVAL)
5591         r->extflags |= RXf_EVAL_SEEN;
5592     if (RExC_seen & REG_SEEN_CANY)
5593         r->extflags |= RXf_CANY_SEEN;
5594     if (RExC_seen & REG_SEEN_VERBARG)
5595         r->intflags |= PREGf_VERBARG_SEEN;
5596     if (RExC_seen & REG_SEEN_CUTGROUP)
5597         r->intflags |= PREGf_CUTGROUP_SEEN;
5598     if (RExC_paren_names)
5599         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5600     else
5601         RXp_PAREN_NAMES(r) = NULL;
5602
5603 #ifdef STUPID_PATTERN_CHECKS
5604     if (RX_PRELEN(rx) == 0)
5605         r->extflags |= RXf_NULL;
5606     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5607         /* XXX: this should happen BEFORE we compile */
5608         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5609     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5610         r->extflags |= RXf_WHITE;
5611     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5612         r->extflags |= RXf_START_ONLY;
5613 #else
5614     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5615             /* XXX: this should happen BEFORE we compile */
5616             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5617     else {
5618         regnode *first = ri->program + 1;
5619         U8 fop = OP(first);
5620
5621         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5622             r->extflags |= RXf_NULL;
5623         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5624             r->extflags |= RXf_START_ONLY;
5625         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5626                              && OP(regnext(first)) == END)
5627             r->extflags |= RXf_WHITE;
5628     }
5629 #endif
5630 #ifdef DEBUGGING
5631     if (RExC_paren_names) {
5632         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5633         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5634     } else
5635 #endif
5636         ri->name_list_idx = 0;
5637
5638     if (RExC_recurse_count) {
5639         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5640             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5641             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5642         }
5643     }
5644     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5645     /* assume we don't need to swap parens around before we match */
5646
5647     DEBUG_DUMP_r({
5648         PerlIO_printf(Perl_debug_log,"Final program:\n");
5649         regdump(r);
5650     });
5651 #ifdef RE_TRACK_PATTERN_OFFSETS
5652     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5653         const U32 len = ri->u.offsets[0];
5654         U32 i;
5655         GET_RE_DEBUG_FLAGS_DECL;
5656         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5657         for (i = 1; i <= len; i++) {
5658             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5659                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5660                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5661             }
5662         PerlIO_printf(Perl_debug_log, "\n");
5663     });
5664 #endif
5665     return rx;
5666 }
5667
5668 #undef RE_ENGINE_PTR
5669
5670
5671 SV*
5672 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5673                     const U32 flags)
5674 {
5675     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5676
5677     PERL_UNUSED_ARG(value);
5678
5679     if (flags & RXapif_FETCH) {
5680         return reg_named_buff_fetch(rx, key, flags);
5681     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5682         Perl_croak_no_modify(aTHX);
5683         return NULL;
5684     } else if (flags & RXapif_EXISTS) {
5685         return reg_named_buff_exists(rx, key, flags)
5686             ? &PL_sv_yes
5687             : &PL_sv_no;
5688     } else if (flags & RXapif_REGNAMES) {
5689         return reg_named_buff_all(rx, flags);
5690     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5691         return reg_named_buff_scalar(rx, flags);
5692     } else {
5693         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5694         return NULL;
5695     }
5696 }
5697
5698 SV*
5699 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5700                          const U32 flags)
5701 {
5702     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5703     PERL_UNUSED_ARG(lastkey);
5704
5705     if (flags & RXapif_FIRSTKEY)
5706         return reg_named_buff_firstkey(rx, flags);
5707     else if (flags & RXapif_NEXTKEY)
5708         return reg_named_buff_nextkey(rx, flags);
5709     else {
5710         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5711         return NULL;
5712     }
5713 }
5714
5715 SV*
5716 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5717                           const U32 flags)
5718 {
5719     AV *retarray = NULL;
5720     SV *ret;
5721     struct regexp *const rx = (struct regexp *)SvANY(r);
5722
5723     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5724
5725     if (flags & RXapif_ALL)
5726         retarray=newAV();
5727
5728     if (rx && RXp_PAREN_NAMES(rx)) {
5729         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5730         if (he_str) {
5731             IV i;
5732             SV* sv_dat=HeVAL(he_str);
5733             I32 *nums=(I32*)SvPVX(sv_dat);
5734             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5735                 if ((I32)(rx->nparens) >= nums[i]
5736                     && rx->offs[nums[i]].start != -1
5737                     && rx->offs[nums[i]].end != -1)
5738                 {
5739                     ret = newSVpvs("");
5740                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5741                     if (!retarray)
5742                         return ret;
5743                 } else {
5744                     if (retarray)
5745                         ret = newSVsv(&PL_sv_undef);
5746                 }
5747                 if (retarray)
5748                     av_push(retarray, ret);
5749             }
5750             if (retarray)
5751                 return newRV_noinc(MUTABLE_SV(retarray));
5752         }
5753     }
5754     return NULL;
5755 }
5756
5757 bool
5758 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5759                            const U32 flags)
5760 {
5761     struct regexp *const rx = (struct regexp *)SvANY(r);
5762
5763     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5764
5765     if (rx && RXp_PAREN_NAMES(rx)) {
5766         if (flags & RXapif_ALL) {
5767             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5768         } else {
5769             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5770             if (sv) {
5771                 SvREFCNT_dec(sv);
5772                 return TRUE;
5773             } else {
5774                 return FALSE;
5775             }
5776         }
5777     } else {
5778         return FALSE;
5779     }
5780 }
5781
5782 SV*
5783 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5784 {
5785     struct regexp *const rx = (struct regexp *)SvANY(r);
5786
5787     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5788
5789     if ( rx && RXp_PAREN_NAMES(rx) ) {
5790         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5791
5792         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5793     } else {
5794         return FALSE;
5795     }
5796 }
5797
5798 SV*
5799 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5800 {
5801     struct regexp *const rx = (struct regexp *)SvANY(r);
5802     GET_RE_DEBUG_FLAGS_DECL;
5803
5804     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5805
5806     if (rx && RXp_PAREN_NAMES(rx)) {
5807         HV *hv = RXp_PAREN_NAMES(rx);
5808         HE *temphe;
5809         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5810             IV i;
5811             IV parno = 0;
5812             SV* sv_dat = HeVAL(temphe);
5813             I32 *nums = (I32*)SvPVX(sv_dat);
5814             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5815                 if ((I32)(rx->lastparen) >= nums[i] &&
5816                     rx->offs[nums[i]].start != -1 &&
5817                     rx->offs[nums[i]].end != -1)
5818                 {
5819                     parno = nums[i];
5820                     break;
5821                 }
5822             }
5823             if (parno || flags & RXapif_ALL) {
5824                 return newSVhek(HeKEY_hek(temphe));
5825             }
5826         }
5827     }
5828     return NULL;
5829 }
5830
5831 SV*
5832 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5833 {
5834     SV *ret;
5835     AV *av;
5836     I32 length;
5837     struct regexp *const rx = (struct regexp *)SvANY(r);
5838
5839     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5840
5841     if (rx && RXp_PAREN_NAMES(rx)) {
5842         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5843             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5844         } else if (flags & RXapif_ONE) {
5845             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5846             av = MUTABLE_AV(SvRV(ret));
5847             length = av_len(av);
5848             SvREFCNT_dec(ret);
5849             return newSViv(length + 1);
5850         } else {
5851             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5852             return NULL;
5853         }
5854     }
5855     return &PL_sv_undef;
5856 }
5857
5858 SV*
5859 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5860 {
5861     struct regexp *const rx = (struct regexp *)SvANY(r);
5862     AV *av = newAV();
5863
5864     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5865
5866     if (rx && RXp_PAREN_NAMES(rx)) {
5867         HV *hv= RXp_PAREN_NAMES(rx);
5868         HE *temphe;
5869         (void)hv_iterinit(hv);
5870         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5871             IV i;
5872             IV parno = 0;
5873             SV* sv_dat = HeVAL(temphe);
5874             I32 *nums = (I32*)SvPVX(sv_dat);
5875             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5876                 if ((I32)(rx->lastparen) >= nums[i] &&
5877                     rx->offs[nums[i]].start != -1 &&
5878                     rx->offs[nums[i]].end != -1)
5879                 {
5880                     parno = nums[i];
5881                     break;
5882                 }
5883             }
5884             if (parno || flags & RXapif_ALL) {
5885                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5886             }
5887         }
5888     }
5889
5890     return newRV_noinc(MUTABLE_SV(av));
5891 }
5892
5893 void
5894 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5895                              SV * const sv)
5896 {
5897     struct regexp *const rx = (struct regexp *)SvANY(r);
5898     char *s = NULL;
5899     I32 i = 0;
5900     I32 s1, t1;
5901
5902     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5903
5904     if (!rx->subbeg) {
5905         sv_setsv(sv,&PL_sv_undef);
5906         return;
5907     }
5908     else
5909     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5910         /* $` */
5911         i = rx->offs[0].start;
5912         s = rx->subbeg;
5913     }
5914     else
5915     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5916         /* $' */
5917         s = rx->subbeg + rx->offs[0].end;
5918         i = rx->sublen - rx->offs[0].end;
5919     }
5920     else
5921     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5922         (s1 = rx->offs[paren].start) != -1 &&
5923         (t1 = rx->offs[paren].end) != -1)
5924     {
5925         /* $& $1 ... */
5926         i = t1 - s1;
5927         s = rx->subbeg + s1;
5928     } else {
5929         sv_setsv(sv,&PL_sv_undef);
5930         return;
5931     }
5932     assert(rx->sublen >= (s - rx->subbeg) + i );
5933     if (i >= 0) {
5934         const int oldtainted = PL_tainted;
5935         TAINT_NOT;
5936         sv_setpvn(sv, s, i);
5937         PL_tainted = oldtainted;
5938         if ( (rx->extflags & RXf_CANY_SEEN)
5939             ? (RXp_MATCH_UTF8(rx)
5940                         && (!i || is_utf8_string((U8*)s, i)))
5941             : (RXp_MATCH_UTF8(rx)) )
5942         {
5943             SvUTF8_on(sv);
5944         }
5945         else
5946             SvUTF8_off(sv);
5947         if (PL_tainting) {
5948             if (RXp_MATCH_TAINTED(rx)) {
5949                 if (SvTYPE(sv) >= SVt_PVMG) {
5950                     MAGIC* const mg = SvMAGIC(sv);
5951                     MAGIC* mgt;
5952                     PL_tainted = 1;
5953                     SvMAGIC_set(sv, mg->mg_moremagic);
5954                     SvTAINT(sv);
5955                     if ((mgt = SvMAGIC(sv))) {
5956                         mg->mg_moremagic = mgt;
5957                         SvMAGIC_set(sv, mg);
5958                     }
5959                 } else {
5960                     PL_tainted = 1;
5961                     SvTAINT(sv);
5962                 }
5963             } else
5964                 SvTAINTED_off(sv);
5965         }
5966     } else {
5967         sv_setsv(sv,&PL_sv_undef);
5968         return;
5969     }
5970 }
5971
5972 void
5973 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5974                                                          SV const * const value)
5975 {
5976     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5977
5978     PERL_UNUSED_ARG(rx);
5979     PERL_UNUSED_ARG(paren);
5980     PERL_UNUSED_ARG(value);
5981
5982     if (!PL_localizing)
5983         Perl_croak_no_modify(aTHX);
5984 }
5985
5986 I32
5987 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5988                               const I32 paren)
5989 {
5990     struct regexp *const rx = (struct regexp *)SvANY(r);
5991     I32 i;
5992     I32 s1, t1;
5993
5994     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5995
5996     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5997         switch (paren) {
5998       /* $` / ${^PREMATCH} */
5999       case RX_BUFF_IDX_PREMATCH:
6000         if (rx->offs[0].start != -1) {
6001                         i = rx->offs[0].start;
6002                         if (i > 0) {
6003                                 s1 = 0;
6004                                 t1 = i;
6005                                 goto getlen;
6006                         }
6007             }
6008         return 0;
6009       /* $' / ${^POSTMATCH} */
6010       case RX_BUFF_IDX_POSTMATCH:
6011             if (rx->offs[0].end != -1) {
6012                         i = rx->sublen - rx->offs[0].end;
6013                         if (i > 0) {
6014                                 s1 = rx->offs[0].end;
6015                                 t1 = rx->sublen;
6016                                 goto getlen;
6017                         }
6018             }
6019         return 0;
6020       /* $& / ${^MATCH}, $1, $2, ... */
6021       default:
6022             if (paren <= (I32)rx->nparens &&
6023             (s1 = rx->offs[paren].start) != -1 &&
6024             (t1 = rx->offs[paren].end) != -1)
6025             {
6026             i = t1 - s1;
6027             goto getlen;
6028         } else {
6029             if (ckWARN(WARN_UNINITIALIZED))
6030                 report_uninit((const SV *)sv);
6031             return 0;
6032         }
6033     }
6034   getlen:
6035     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6036         const char * const s = rx->subbeg + s1;
6037         const U8 *ep;
6038         STRLEN el;
6039
6040         i = t1 - s1;
6041         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6042                         i = el;
6043     }
6044     return i;
6045 }
6046
6047 SV*
6048 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6049 {
6050     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6051         PERL_UNUSED_ARG(rx);
6052         if (0)
6053             return NULL;
6054         else
6055             return newSVpvs("Regexp");
6056 }
6057
6058 /* Scans the name of a named buffer from the pattern.
6059  * If flags is REG_RSN_RETURN_NULL returns null.
6060  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6061  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6062  * to the parsed name as looked up in the RExC_paren_names hash.
6063  * If there is an error throws a vFAIL().. type exception.
6064  */
6065
6066 #define REG_RSN_RETURN_NULL    0
6067 #define REG_RSN_RETURN_NAME    1
6068 #define REG_RSN_RETURN_DATA    2
6069
6070 STATIC SV*
6071 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6072 {
6073     char *name_start = RExC_parse;
6074
6075     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6076
6077     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6078          /* skip IDFIRST by using do...while */
6079         if (UTF)
6080             do {
6081                 RExC_parse += UTF8SKIP(RExC_parse);
6082             } while (isALNUM_utf8((U8*)RExC_parse));
6083         else
6084             do {
6085                 RExC_parse++;
6086             } while (isALNUM(*RExC_parse));
6087     }
6088
6089     if ( flags ) {
6090         SV* sv_name
6091             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6092                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6093         if ( flags == REG_RSN_RETURN_NAME)
6094             return sv_name;
6095         else if (flags==REG_RSN_RETURN_DATA) {
6096             HE *he_str = NULL;
6097             SV *sv_dat = NULL;
6098             if ( ! sv_name )      /* should not happen*/
6099                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6100             if (RExC_paren_names)
6101                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6102             if ( he_str )
6103                 sv_dat = HeVAL(he_str);
6104             if ( ! sv_dat )
6105                 vFAIL("Reference to nonexistent named group");
6106             return sv_dat;
6107         }
6108         else {
6109             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6110                        (unsigned long) flags);
6111         }
6112         /* NOT REACHED */
6113     }
6114     return NULL;
6115 }
6116
6117 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6118     int rem=(int)(RExC_end - RExC_parse);                       \
6119     int cut;                                                    \
6120     int num;                                                    \
6121     int iscut=0;                                                \
6122     if (rem>10) {                                               \
6123         rem=10;                                                 \
6124         iscut=1;                                                \
6125     }                                                           \
6126     cut=10-rem;                                                 \
6127     if (RExC_lastparse!=RExC_parse)                             \
6128         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6129             rem, RExC_parse,                                    \
6130             cut + 4,                                            \
6131             iscut ? "..." : "<"                                 \
6132         );                                                      \
6133     else                                                        \
6134         PerlIO_printf(Perl_debug_log,"%16s","");                \
6135                                                                 \
6136     if (SIZE_ONLY)                                              \
6137        num = RExC_size + 1;                                     \
6138     else                                                        \
6139        num=REG_NODE_NUM(RExC_emit);                             \
6140     if (RExC_lastnum!=num)                                      \
6141        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6142     else                                                        \
6143        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6144     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6145         (int)((depth*2)), "",                                   \
6146         (funcname)                                              \
6147     );                                                          \
6148     RExC_lastnum=num;                                           \
6149     RExC_lastparse=RExC_parse;                                  \
6150 })
6151
6152
6153
6154 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6155     DEBUG_PARSE_MSG((funcname));                            \
6156     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6157 })
6158 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6159     DEBUG_PARSE_MSG((funcname));                            \
6160     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6161 })
6162
6163 /* This section of code defines the inversion list object and its methods.  The
6164  * interfaces are highly subject to change, so as much as possible is static to
6165  * this file.  An inversion list is here implemented as a malloc'd C UV array
6166  * with some added info that is placed as UVs at the beginning in a header
6167  * portion.  An inversion list for Unicode is an array of code points, sorted
6168  * by ordinal number.  The zeroth element is the first code point in the list.
6169  * The 1th element is the first element beyond that not in the list.  In other
6170  * words, the first range is
6171  *  invlist[0]..(invlist[1]-1)
6172  * The other ranges follow.  Thus every element whose index is divisible by two
6173  * marks the beginning of a range that is in the list, and every element not
6174  * divisible by two marks the beginning of a range not in the list.  A single
6175  * element inversion list that contains the single code point N generally
6176  * consists of two elements
6177  *  invlist[0] == N
6178  *  invlist[1] == N+1
6179  * (The exception is when N is the highest representable value on the
6180  * machine, in which case the list containing just it would be a single
6181  * element, itself.  By extension, if the last range in the list extends to
6182  * infinity, then the first element of that range will be in the inversion list
6183  * at a position that is divisible by two, and is the final element in the
6184  * list.)
6185  * Taking the complement (inverting) an inversion list is quite simple, if the
6186  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6187  * This implementation reserves an element at the beginning of each inversion list
6188  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6189  * beginning of the list is either that element if 0, or the next one if 1.
6190  *
6191  * More about inversion lists can be found in "Unicode Demystified"
6192  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6193  * More will be coming when functionality is added later.
6194  *
6195  * The inversion list data structure is currently implemented as an SV pointing
6196  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6197  * array of UV whose memory management is automatically handled by the existing
6198  * facilities for SV's.
6199  *
6200  * Some of the methods should always be private to the implementation, and some
6201  * should eventually be made public */
6202
6203 #define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
6204 #define INVLIST_ITER_OFFSET 1   /* Current iteration position */
6205
6206 /* This is a combination of a version and data structure type, so that one
6207  * being passed in can be validated to be an inversion list of the correct
6208  * vintage.  When the structure of the header is changed, a new random number
6209  * in the range 2**31-1 should be generated and the new() method changed to
6210  * insert that at this location.  Then, if an auxiliary program doesn't change
6211  * correspondingly, it will be discovered immediately */
6212 #define INVLIST_VERSION_ID_OFFSET 2
6213 #define INVLIST_VERSION_ID 1064334010
6214
6215 /* For safety, when adding new elements, remember to #undef them at the end of
6216  * the inversion list code section */
6217
6218 #define INVLIST_ZERO_OFFSET 3   /* 0 or 1; must be last element in header */
6219 /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
6220  * contains the code point U+00000, and begins here.  If 1, the inversion list
6221  * doesn't contain U+0000, and it begins at the next UV in the array.
6222  * Inverting an inversion list consists of adding or removing the 0 at the
6223  * beginning of it.  By reserving a space for that 0, inversion can be made
6224  * very fast */
6225
6226 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6227
6228 /* Internally things are UVs */
6229 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6230 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6231
6232 #define INVLIST_INITIAL_LEN 10
6233
6234 PERL_STATIC_INLINE UV*
6235 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6236 {
6237     /* Returns a pointer to the first element in the inversion list's array.
6238      * This is called upon initialization of an inversion list.  Where the
6239      * array begins depends on whether the list has the code point U+0000
6240      * in it or not.  The other parameter tells it whether the code that
6241      * follows this call is about to put a 0 in the inversion list or not.
6242      * The first element is either the element with 0, if 0, or the next one,
6243      * if 1 */
6244
6245     UV* zero = get_invlist_zero_addr(invlist);
6246
6247     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6248
6249     /* Must be empty */
6250     assert(! *get_invlist_len_addr(invlist));
6251
6252     /* 1^1 = 0; 1^0 = 1 */
6253     *zero = 1 ^ will_have_0;
6254     return zero + *zero;
6255 }
6256
6257 PERL_STATIC_INLINE UV*
6258 S_invlist_array(pTHX_ SV* const invlist)
6259 {
6260     /* Returns the pointer to the inversion list's array.  Every time the
6261      * length changes, this needs to be called in case malloc or realloc moved
6262      * it */
6263
6264     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6265
6266     /* Must not be empty.  If these fail, you probably didn't check for <len>
6267      * being non-zero before trying to get the array */
6268     assert(*get_invlist_len_addr(invlist));
6269     assert(*get_invlist_zero_addr(invlist) == 0
6270            || *get_invlist_zero_addr(invlist) == 1);
6271
6272     /* The array begins either at the element reserved for zero if the
6273      * list contains 0 (that element will be set to 0), or otherwise the next
6274      * element (in which case the reserved element will be set to 1). */
6275     return (UV *) (get_invlist_zero_addr(invlist)
6276                    + *get_invlist_zero_addr(invlist));
6277 }
6278
6279 PERL_STATIC_INLINE UV*
6280 S_get_invlist_len_addr(pTHX_ SV* invlist)
6281 {
6282     /* Return the address of the UV that contains the current number
6283      * of used elements in the inversion list */
6284
6285     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6286
6287     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6288 }
6289
6290 PERL_STATIC_INLINE UV
6291 S_invlist_len(pTHX_ SV* const invlist)
6292 {
6293     /* Returns the current number of elements stored in the inversion list's
6294      * array */
6295
6296     PERL_ARGS_ASSERT_INVLIST_LEN;
6297
6298     return *get_invlist_len_addr(invlist);
6299 }
6300
6301 PERL_STATIC_INLINE void
6302 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6303 {
6304     /* Sets the current number of elements stored in the inversion list */
6305
6306     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6307
6308     *get_invlist_len_addr(invlist) = len;
6309
6310     assert(len <= SvLEN(invlist));
6311
6312     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6313     /* If the list contains U+0000, that element is part of the header,
6314      * and should not be counted as part of the array.  It will contain
6315      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
6316      * subtract:
6317      *  SvCUR_set(invlist,
6318      *            TO_INTERNAL_SIZE(len
6319      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
6320      * But, this is only valid if len is not 0.  The consequences of not doing
6321      * this is that the memory allocation code may think that 1 more UV is
6322      * being used than actually is, and so might do an unnecessary grow.  That
6323      * seems worth not bothering to make this the precise amount.
6324      *
6325      * Note that when inverting, SvCUR shouldn't change */
6326 }
6327
6328 PERL_STATIC_INLINE UV
6329 S_invlist_max(pTHX_ SV* const invlist)
6330 {
6331     /* Returns the maximum number of elements storable in the inversion list's
6332      * array, without having to realloc() */
6333
6334     PERL_ARGS_ASSERT_INVLIST_MAX;
6335
6336     return FROM_INTERNAL_SIZE(SvLEN(invlist));
6337 }
6338
6339 PERL_STATIC_INLINE UV*
6340 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6341 {
6342     /* Return the address of the UV that is reserved to hold 0 if the inversion
6343      * list contains 0.  This has to be the last element of the heading, as the
6344      * list proper starts with either it if 0, or the next element if not.
6345      * (But we force it to contain either 0 or 1) */
6346
6347     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6348
6349     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6350 }
6351
6352 #ifndef PERL_IN_XSUB_RE
6353 SV*
6354 Perl__new_invlist(pTHX_ IV initial_size)
6355 {
6356
6357     /* Return a pointer to a newly constructed inversion list, with enough
6358      * space to store 'initial_size' elements.  If that number is negative, a
6359      * system default is used instead */
6360
6361     SV* new_list;
6362
6363     if (initial_size < 0) {
6364         initial_size = INVLIST_INITIAL_LEN;
6365     }
6366
6367     /* Allocate the initial space */
6368     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6369     invlist_set_len(new_list, 0);
6370
6371     /* Force iterinit() to be used to get iteration to work */
6372     *get_invlist_iter_addr(new_list) = UV_MAX;
6373
6374     /* This should force a segfault if a method doesn't initialize this
6375      * properly */
6376     *get_invlist_zero_addr(new_list) = UV_MAX;
6377
6378     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6379 #if HEADER_LENGTH != 4
6380 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6381 #endif
6382
6383     return new_list;
6384 }
6385 #endif
6386
6387 STATIC SV*
6388 S__new_invlist_C_array(pTHX_ UV* list)
6389 {
6390     /* Return a pointer to a newly constructed inversion list, initialized to
6391      * point to <list>, which has to be in the exact correct inversion list
6392      * form, including internal fields.  Thus this is a dangerous routine that
6393      * should not be used in the wrong hands */
6394
6395     SV* invlist = newSV_type(SVt_PV);
6396
6397     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6398
6399     SvPV_set(invlist, (char *) list);
6400     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
6401                                shouldn't touch it */
6402     SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6403
6404     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6405         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6406     }
6407
6408     return invlist;
6409 }
6410
6411 STATIC void
6412 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6413 {
6414     /* Grow the maximum size of an inversion list */
6415
6416     PERL_ARGS_ASSERT_INVLIST_EXTEND;
6417
6418     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6419 }
6420
6421 PERL_STATIC_INLINE void
6422 S_invlist_trim(pTHX_ SV* const invlist)
6423 {
6424     PERL_ARGS_ASSERT_INVLIST_TRIM;
6425
6426     /* Change the length of the inversion list to how many entries it currently
6427      * has */
6428
6429     SvPV_shrink_to_cur((SV *) invlist);
6430 }
6431
6432 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6433  * etc */
6434 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6435 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6436
6437 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6438
6439 #ifndef PERL_IN_XSUB_RE
6440 void
6441 Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6442 {
6443    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6444     * the end of the inversion list.  The range must be above any existing
6445     * ones. */
6446
6447     UV* array;
6448     UV max = invlist_max(invlist);
6449     UV len = invlist_len(invlist);
6450
6451     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6452
6453     if (len == 0) { /* Empty lists must be initialized */
6454         array = _invlist_array_init(invlist, start == 0);
6455     }
6456     else {
6457         /* Here, the existing list is non-empty. The current max entry in the
6458          * list is generally the first value not in the set, except when the
6459          * set extends to the end of permissible values, in which case it is
6460          * the first entry in that final set, and so this call is an attempt to
6461          * append out-of-order */
6462
6463         UV final_element = len - 1;
6464         array = invlist_array(invlist);
6465         if (array[final_element] > start
6466             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6467         {
6468             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6469                        array[final_element], start,
6470                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6471         }
6472
6473         /* Here, it is a legal append.  If the new range begins with the first
6474          * value not in the set, it is extending the set, so the new first
6475          * value not in the set is one greater than the newly extended range.
6476          * */
6477         if (array[final_element] == start) {
6478             if (end != UV_MAX) {
6479                 array[final_element] = end + 1;
6480             }
6481             else {
6482                 /* But if the end is the maximum representable on the machine,
6483                  * just let the range that this would extend to have no end */
6484                 invlist_set_len(invlist, len - 1);
6485             }
6486             return;
6487         }
6488     }
6489
6490     /* Here the new range doesn't extend any existing set.  Add it */
6491
6492     len += 2;   /* Includes an element each for the start and end of range */
6493
6494     /* If overflows the existing space, extend, which may cause the array to be
6495      * moved */
6496     if (max < len) {
6497         invlist_extend(invlist, len);
6498         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
6499                                            failure in invlist_array() */
6500         array = invlist_array(invlist);
6501     }
6502     else {
6503         invlist_set_len(invlist, len);
6504     }
6505
6506     /* The next item on the list starts the range, the one after that is
6507      * one past the new range.  */
6508     array[len - 2] = start;
6509     if (end != UV_MAX) {
6510         array[len - 1] = end + 1;
6511     }
6512     else {
6513         /* But if the end is the maximum representable on the machine, just let
6514          * the range have no end */
6515         invlist_set_len(invlist, len - 1);
6516     }
6517 }
6518
6519 STATIC IV
6520 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6521 {
6522     /* Searches the inversion list for the entry that contains the input code
6523      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
6524      * return value is the index into the list's array of the range that
6525      * contains <cp> */
6526
6527     IV low = 0;
6528     IV high = invlist_len(invlist);
6529     const UV * const array = invlist_array(invlist);
6530
6531     PERL_ARGS_ASSERT_INVLIST_SEARCH;
6532
6533     /* If list is empty or the code point is before the first element, return
6534      * failure. */
6535     if (high == 0 || cp < array[0]) {
6536         return -1;
6537     }
6538
6539     /* Binary search.  What we are looking for is <i> such that
6540      *  array[i] <= cp < array[i+1]
6541      * The loop below converges on the i+1. */
6542     while (low < high) {
6543         IV mid = (low + high) / 2;
6544         if (array[mid] <= cp) {
6545             low = mid + 1;
6546
6547             /* We could do this extra test to exit the loop early.
6548             if (cp < array[low]) {
6549                 return mid;
6550             }
6551             */
6552         }
6553         else { /* cp < array[mid] */
6554             high = mid;
6555         }
6556     }
6557
6558     return high - 1;
6559 }
6560
6561 void
6562 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6563 {
6564     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6565      * but is used when the swash has an inversion list.  This makes this much
6566      * faster, as it uses a binary search instead of a linear one.  This is
6567      * intimately tied to that function, and perhaps should be in utf8.c,
6568      * except it is intimately tied to inversion lists as well.  It assumes
6569      * that <swatch> is all 0's on input */
6570
6571     UV current = start;
6572     const IV len = invlist_len(invlist);
6573     IV i;
6574     const UV * array;
6575
6576     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6577
6578     if (len == 0) { /* Empty inversion list */
6579         return;
6580     }
6581
6582     array = invlist_array(invlist);
6583
6584     /* Find which element it is */
6585     i = invlist_search(invlist, start);
6586
6587     /* We populate from <start> to <end> */
6588     while (current < end) {
6589         UV upper;
6590
6591         /* The inversion list gives the results for every possible code point
6592          * after the first one in the list.  Only those ranges whose index is
6593          * even are ones that the inversion list matches.  For the odd ones,
6594          * and if the initial code point is not in the list, we have to skip
6595          * forward to the next element */
6596         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6597             i++;
6598             if (i >= len) { /* Finished if beyond the end of the array */
6599                 return;
6600             }
6601             current = array[i];
6602             if (current >= end) {   /* Finished if beyond the end of what we
6603                                        are populating */
6604                 return;
6605             }
6606         }
6607         assert(current >= start);
6608
6609         /* The current range ends one below the next one, except don't go past
6610          * <end> */
6611         i++;
6612         upper = (i < len && array[i] < end) ? array[i] : end;
6613
6614         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
6615          * for each code point in it */
6616         for (; current < upper; current++) {
6617             const STRLEN offset = (STRLEN)(current - start);
6618             swatch[offset >> 3] |= 1 << (offset & 7);
6619         }
6620
6621         /* Quit if at the end of the list */
6622         if (i >= len) {
6623
6624             /* But first, have to deal with the highest possible code point on
6625              * the platform.  The previous code assumes that <end> is one
6626              * beyond where we want to populate, but that is impossible at the
6627              * platform's infinity, so have to handle it specially */
6628             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6629             {
6630                 const STRLEN offset = (STRLEN)(end - start);
6631                 swatch[offset >> 3] |= 1 << (offset & 7);
6632             }
6633             return;
6634         }
6635
6636         /* Advance to the next range, which will be for code points not in the
6637          * inversion list */
6638         current = array[i];
6639     }
6640
6641     return;
6642 }
6643
6644
6645 void
6646 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
6647 {
6648     /* Take the union of two inversion lists and point <output> to it.  *output
6649      * should be defined upon input, and if it points to one of the two lists,
6650      * the reference count to that list will be decremented.  The first list,
6651      * <a>, may be NULL, in which case a copy of the second list is returned.
6652      * If <complement_b> is TRUE, the union is taken of the complement
6653      * (inversion) of <b> instead of b itself.
6654      *
6655      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6656      * Richard Gillam, published by Addison-Wesley, and explained at some
6657      * length there.  The preface says to incorporate its examples into your
6658      * code at your own risk.
6659      *
6660      * The algorithm is like a merge sort.
6661      *
6662      * XXX A potential performance improvement is to keep track as we go along
6663      * if only one of the inputs contributes to the result, meaning the other
6664      * is a subset of that one.  In that case, we can skip the final copy and
6665      * return the larger of the input lists, but then outside code might need
6666      * to keep track of whether to free the input list or not */
6667
6668     UV* array_a;    /* a's array */
6669     UV* array_b;
6670     UV len_a;       /* length of a's array */
6671     UV len_b;
6672
6673     SV* u;                      /* the resulting union */
6674     UV* array_u;
6675     UV len_u;
6676
6677     UV i_a = 0;             /* current index into a's array */
6678     UV i_b = 0;
6679     UV i_u = 0;
6680
6681     /* running count, as explained in the algorithm source book; items are
6682      * stopped accumulating and are output when the count changes to/from 0.
6683      * The count is incremented when we start a range that's in the set, and
6684      * decremented when we start a range that's not in the set.  So its range
6685      * is 0 to 2.  Only when the count is zero is something not in the set.
6686      */
6687     UV count = 0;
6688
6689     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
6690     assert(a != b);
6691
6692     /* If either one is empty, the union is the other one */
6693     if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
6694         if (*output == a) {
6695             if (a != NULL) {
6696                 SvREFCNT_dec(a);
6697             }
6698         }
6699         if (*output != b) {
6700             *output = invlist_clone(b);
6701             if (complement_b) {
6702                 _invlist_invert(*output);
6703             }
6704         } /* else *output already = b; */
6705         return;
6706     }
6707     else if ((len_b = invlist_len(b)) == 0) {
6708         if (*output == b) {
6709             SvREFCNT_dec(b);
6710         }
6711
6712         /* The complement of an empty list is a list that has everything in it,
6713          * so the union with <a> includes everything too */
6714         if (complement_b) {
6715             if (a == *output) {
6716                 SvREFCNT_dec(a);
6717             }
6718             *output = _new_invlist(1);
6719             _append_range_to_invlist(*output, 0, UV_MAX);
6720         }
6721         else if (*output != a) {
6722             *output = invlist_clone(a);
6723         }
6724         /* else *output already = a; */
6725         return;
6726     }
6727
6728     /* Here both lists exist and are non-empty */
6729     array_a = invlist_array(a);
6730     array_b = invlist_array(b);
6731
6732     /* If are to take the union of 'a' with the complement of b, set it
6733      * up so are looking at b's complement. */
6734     if (complement_b) {
6735
6736         /* To complement, we invert: if the first element is 0, remove it.  To
6737          * do this, we just pretend the array starts one later, and clear the
6738          * flag as we don't have to do anything else later */
6739         if (array_b[0] == 0) {
6740             array_b++;
6741             len_b--;
6742             complement_b = FALSE;
6743         }
6744         else {
6745
6746             /* But if the first element is not zero, we unshift a 0 before the
6747              * array.  The data structure reserves a space for that 0 (which
6748              * should be a '1' right now), so physical shifting is unneeded,
6749              * but temporarily change that element to 0.  Before exiting the
6750              * routine, we must restore the element to '1' */
6751             array_b--;
6752             len_b++;
6753             array_b[0] = 0;
6754         }
6755     }
6756
6757     /* Size the union for the worst case: that the sets are completely
6758      * disjoint */
6759     u = _new_invlist(len_a + len_b);
6760
6761     /* Will contain U+0000 if either component does */
6762     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6763                                       || (len_b > 0 && array_b[0] == 0));
6764
6765     /* Go through each list item by item, stopping when exhausted one of
6766      * them */
6767     while (i_a < len_a && i_b < len_b) {
6768         UV cp;      /* The element to potentially add to the union's array */
6769         bool cp_in_set;   /* is it in the the input list's set or not */
6770
6771         /* We need to take one or the other of the two inputs for the union.
6772          * Since we are merging two sorted lists, we take the smaller of the
6773          * next items.  In case of a tie, we take the one that is in its set
6774          * first.  If we took one not in the set first, it would decrement the
6775          * count, possibly to 0 which would cause it to be output as ending the
6776          * range, and the next time through we would take the same number, and
6777          * output it again as beginning the next range.  By doing it the
6778          * opposite way, there is no possibility that the count will be
6779          * momentarily decremented to 0, and thus the two adjoining ranges will
6780          * be seamlessly merged.  (In a tie and both are in the set or both not
6781          * in the set, it doesn't matter which we take first.) */
6782         if (array_a[i_a] < array_b[i_b]
6783             || (array_a[i_a] == array_b[i_b]
6784                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6785         {
6786             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6787             cp= array_a[i_a++];
6788         }
6789         else {
6790             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6791             cp= array_b[i_b++];
6792         }
6793
6794         /* Here, have chosen which of the two inputs to look at.  Only output
6795          * if the running count changes to/from 0, which marks the
6796          * beginning/end of a range in that's in the set */
6797         if (cp_in_set) {
6798             if (count == 0) {
6799                 array_u[i_u++] = cp;
6800             }
6801             count++;
6802         }
6803         else {
6804             count--;
6805             if (count == 0) {
6806                 array_u[i_u++] = cp;
6807             }
6808         }
6809     }
6810
6811     /* Here, we are finished going through at least one of the lists, which
6812      * means there is something remaining in at most one.  We check if the list
6813      * that hasn't been exhausted is positioned such that we are in the middle
6814      * of a range in its set or not.  (i_a and i_b point to the element beyond
6815      * the one we care about.) If in the set, we decrement 'count'; if 0, there
6816      * is potentially more to output.
6817      * There are four cases:
6818      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6819      *     in the union is entirely from the non-exhausted set.
6820      *  2) Both were in their sets, count is 2.  Nothing further should
6821      *     be output, as everything that remains will be in the exhausted
6822      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6823      *     that
6824      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6825      *     Nothing further should be output because the union includes
6826      *     everything from the exhausted set.  Not decrementing ensures that.
6827      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6828      *     decrementing to 0 insures that we look at the remainder of the
6829      *     non-exhausted set */
6830     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6831         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6832     {
6833         count--;
6834     }
6835
6836     /* The final length is what we've output so far, plus what else is about to
6837      * be output.  (If 'count' is non-zero, then the input list we exhausted
6838      * has everything remaining up to the machine's limit in its set, and hence
6839      * in the union, so there will be no further output. */
6840     len_u = i_u;
6841     if (count == 0) {
6842         /* At most one of the subexpressions will be non-zero */
6843         len_u += (len_a - i_a) + (len_b - i_b);
6844     }
6845
6846     /* Set result to final length, which can change the pointer to array_u, so
6847      * re-find it */
6848     if (len_u != invlist_len(u)) {
6849         invlist_set_len(u, len_u);
6850         invlist_trim(u);
6851         array_u = invlist_array(u);
6852     }
6853
6854     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6855      * the other) ended with everything above it not in its set.  That means
6856      * that the remaining part of the union is precisely the same as the
6857      * non-exhausted list, so can just copy it unchanged.  (If both list were
6858      * exhausted at the same time, then the operations below will be both 0.)
6859      */
6860     if (count == 0) {
6861         IV copy_count; /* At most one will have a non-zero copy count */
6862         if ((copy_count = len_a - i_a) > 0) {
6863             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6864         }
6865         else if ((copy_count = len_b - i_b) > 0) {
6866             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6867         }
6868     }
6869
6870     /*  We may be removing a reference to one of the inputs */
6871     if (a == *output || b == *output) {
6872         SvREFCNT_dec(*output);
6873     }
6874
6875     /* If we've changed b, restore it */
6876     if (complement_b) {
6877         array_b[0] = 1;
6878     }
6879
6880     *output = u;
6881     return;
6882 }
6883
6884 void
6885 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
6886 {
6887     /* Take the intersection of two inversion lists and point <i> to it.  *i
6888      * should be defined upon input, and if it points to one of the two lists,
6889      * the reference count to that list will be decremented.
6890      * If <complement_b> is TRUE, the result will be the intersection of <a>
6891      * and the complement (or inversion) of <b> instead of <b> directly.
6892      *
6893      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6894      * Richard Gillam, published by Addison-Wesley, and explained at some
6895      * length there.  The preface says to incorporate its examples into your
6896      * code at your own risk.  In fact, it had bugs
6897      *
6898      * The algorithm is like a merge sort, and is essentially the same as the
6899      * union above
6900      */
6901
6902     UV* array_a;                /* a's array */
6903     UV* array_b;
6904     UV len_a;   /* length of a's array */
6905     UV len_b;
6906
6907     SV* r;                   /* the resulting intersection */
6908     UV* array_r;
6909     UV len_r;
6910
6911     UV i_a = 0;             /* current index into a's array */
6912     UV i_b = 0;
6913     UV i_r = 0;
6914
6915     /* running count, as explained in the algorithm source book; items are
6916      * stopped accumulating and are output when the count changes to/from 2.
6917      * The count is incremented when we start a range that's in the set, and
6918      * decremented when we start a range that's not in the set.  So its range
6919      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6920      */
6921     UV count = 0;
6922
6923     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
6924     assert(a != b);
6925
6926     /* Special case if either one is empty */
6927     len_a = invlist_len(a);
6928     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
6929
6930         if (len_a != 0 && complement_b) {
6931
6932             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
6933              * be empty.  Here, also we are using 'b's complement, which hence
6934              * must be every possible code point.  Thus the intersection is
6935              * simply 'a'. */
6936             if (*i != a) {
6937                 *i = invlist_clone(a);
6938
6939                 if (*i == b) {
6940                     SvREFCNT_dec(b);
6941                 }
6942             }
6943             /* else *i is already 'a' */
6944             return;
6945         }
6946
6947         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
6948          * intersection must be empty */
6949         if (*i == a) {
6950             SvREFCNT_dec(a);
6951         }
6952         else if (*i == b) {
6953             SvREFCNT_dec(b);
6954         }
6955         *i = _new_invlist(0);
6956         return;
6957     }
6958
6959     /* Here both lists exist and are non-empty */
6960     array_a = invlist_array(a);
6961     array_b = invlist_array(b);
6962
6963     /* If are to take the intersection of 'a' with the complement of b, set it
6964      * up so are looking at b's complement. */
6965     if (complement_b) {
6966
6967         /* To complement, we invert: if the first element is 0, remove it.  To
6968          * do this, we just pretend the array starts one later, and clear the
6969          * flag as we don't have to do anything else later */
6970         if (array_b[0] == 0) {
6971             array_b++;
6972             len_b--;
6973             complement_b = FALSE;
6974         }
6975         else {
6976
6977             /* But if the first element is not zero, we unshift a 0 before the
6978              * array.  The data structure reserves a space for that 0 (which
6979              * should be a '1' right now), so physical shifting is unneeded,
6980              * but temporarily change that element to 0.  Before exiting the
6981              * routine, we must restore the element to '1' */
6982             array_b--;
6983             len_b++;
6984             array_b[0] = 0;
6985         }
6986     }
6987
6988     /* Size the intersection for the worst case: that the intersection ends up
6989      * fragmenting everything to be completely disjoint */
6990     r= _new_invlist(len_a + len_b);
6991
6992     /* Will contain U+0000 iff both components do */
6993     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
6994                                      && len_b > 0 && array_b[0] == 0);
6995
6996     /* Go through each list item by item, stopping when exhausted one of
6997      * them */
6998     while (i_a < len_a && i_b < len_b) {
6999         UV cp;      /* The element to potentially add to the intersection's
7000                        array */
7001         bool cp_in_set; /* Is it in the input list's set or not */
7002
7003         /* We need to take one or the other of the two inputs for the
7004          * intersection.  Since we are merging two sorted lists, we take the
7005          * smaller of the next items.  In case of a tie, we take the one that
7006          * is not in its set first (a difference from the union algorithm).  If
7007          * we took one in the set first, it would increment the count, possibly
7008          * to 2 which would cause it to be output as starting a range in the
7009          * intersection, and the next time through we would take that same
7010          * number, and output it again as ending the set.  By doing it the
7011          * opposite of this, there is no possibility that the count will be
7012          * momentarily incremented to 2.  (In a tie and both are in the set or
7013          * both not in the set, it doesn't matter which we take first.) */
7014         if (array_a[i_a] < array_b[i_b]
7015             || (array_a[i_a] == array_b[i_b]
7016                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7017         {
7018             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7019             cp= array_a[i_a++];
7020         }
7021         else {
7022             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7023             cp= array_b[i_b++];
7024         }
7025
7026         /* Here, have chosen which of the two inputs to look at.  Only output
7027          * if the running count changes to/from 2, which marks the
7028          * beginning/end of a range that's in the intersection */
7029         if (cp_in_set) {
7030             count++;
7031             if (count == 2) {
7032                 array_r[i_r++] = cp;
7033             }
7034         }
7035         else {
7036             if (count == 2) {
7037                 array_r[i_r++] = cp;
7038             }
7039             count--;
7040         }
7041     }
7042
7043     /* Here, we are finished going through at least one of the lists, which
7044      * means there is something remaining in at most one.  We check if the list
7045      * that has been exhausted is positioned such that we are in the middle
7046      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7047      * the ones we care about.)  There are four cases:
7048      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7049      *     nothing left in the intersection.
7050      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7051      *     above 2.  What should be output is exactly that which is in the
7052      *     non-exhausted set, as everything it has is also in the intersection
7053      *     set, and everything it doesn't have can't be in the intersection
7054      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7055      *     gets incremented to 2.  Like the previous case, the intersection is
7056      *     everything that remains in the non-exhausted set.
7057      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7058      *     remains 1.  And the intersection has nothing more. */
7059     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7060         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7061     {
7062         count++;
7063     }
7064
7065     /* The final length is what we've output so far plus what else is in the
7066      * intersection.  At most one of the subexpressions below will be non-zero */
7067     len_r = i_r;
7068     if (count >= 2) {
7069         len_r += (len_a - i_a) + (len_b - i_b);
7070     }
7071
7072     /* Set result to final length, which can change the pointer to array_r, so
7073      * re-find it */
7074     if (len_r != invlist_len(r)) {
7075         invlist_set_len(r, len_r);
7076         invlist_trim(r);
7077         array_r = invlist_array(r);
7078     }
7079
7080     /* Finish outputting any remaining */
7081     if (count >= 2) { /* At most one will have a non-zero copy count */
7082         IV copy_count;
7083         if ((copy_count = len_a - i_a) > 0) {
7084             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7085         }
7086         else if ((copy_count = len_b - i_b) > 0) {
7087             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7088         }
7089     }
7090
7091     /*  We may be removing a reference to one of the inputs */
7092     if (a == *i || b == *i) {
7093         SvREFCNT_dec(*i);
7094     }
7095
7096     /* If we've changed b, restore it */
7097     if (complement_b) {
7098         array_b[0] = 1;
7099     }
7100
7101     *i = r;
7102     return;
7103 }
7104
7105 #endif
7106
7107 STATIC SV*
7108 S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7109 {
7110     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7111      * set.  A pointer to the inversion list is returned.  This may actually be
7112      * a new list, in which case the passed in one has been destroyed.  The
7113      * passed in inversion list can be NULL, in which case a new one is created
7114      * with just the one range in it */
7115
7116     SV* range_invlist;
7117     UV len;
7118
7119     if (invlist == NULL) {
7120         invlist = _new_invlist(2);
7121         len = 0;
7122     }
7123     else {
7124         len = invlist_len(invlist);
7125     }
7126
7127     /* If comes after the final entry, can just append it to the end */
7128     if (len == 0
7129         || start >= invlist_array(invlist)
7130                                     [invlist_len(invlist) - 1])
7131     {
7132         _append_range_to_invlist(invlist, start, end);
7133         return invlist;
7134     }
7135
7136     /* Here, can't just append things, create and return a new inversion list
7137      * which is the union of this range and the existing inversion list */
7138     range_invlist = _new_invlist(2);
7139     _append_range_to_invlist(range_invlist, start, end);
7140
7141     _invlist_union(invlist, range_invlist, &invlist);
7142
7143     /* The temporary can be freed */
7144     SvREFCNT_dec(range_invlist);
7145
7146     return invlist;
7147 }
7148
7149 PERL_STATIC_INLINE SV*
7150 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7151     return add_range_to_invlist(invlist, cp, cp);
7152 }
7153
7154 #ifndef PERL_IN_XSUB_RE
7155 void
7156 Perl__invlist_invert(pTHX_ SV* const invlist)
7157 {
7158     /* Complement the input inversion list.  This adds a 0 if the list didn't
7159      * have a zero; removes it otherwise.  As described above, the data
7160      * structure is set up so that this is very efficient */
7161
7162     UV* len_pos = get_invlist_len_addr(invlist);
7163
7164     PERL_ARGS_ASSERT__INVLIST_INVERT;
7165
7166     /* The inverse of matching nothing is matching everything */
7167     if (*len_pos == 0) {
7168         _append_range_to_invlist(invlist, 0, UV_MAX);
7169         return;
7170     }
7171
7172     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7173      * zero element was a 0, so it is being removed, so the length decrements
7174      * by 1; and vice-versa.  SvCUR is unaffected */
7175     if (*get_invlist_zero_addr(invlist) ^= 1) {
7176         (*len_pos)--;
7177     }
7178     else {
7179         (*len_pos)++;
7180     }
7181 }
7182
7183 void
7184 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7185 {
7186     /* Complement the input inversion list (which must be a Unicode property,
7187      * all of which don't match above the Unicode maximum code point.)  And
7188      * Perl has chosen to not have the inversion match above that either.  This
7189      * adds a 0x110000 if the list didn't end with it, and removes it if it did
7190      */
7191
7192     UV len;
7193     UV* array;
7194
7195     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7196
7197     _invlist_invert(invlist);
7198
7199     len = invlist_len(invlist);
7200
7201     if (len != 0) { /* If empty do nothing */
7202         array = invlist_array(invlist);
7203         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7204             /* Add 0x110000.  First, grow if necessary */
7205             len++;
7206             if (invlist_max(invlist) < len) {
7207                 invlist_extend(invlist, len);
7208                 array = invlist_array(invlist);
7209             }
7210             invlist_set_len(invlist, len);
7211             array[len - 1] = PERL_UNICODE_MAX + 1;
7212         }
7213         else {  /* Remove the 0x110000 */
7214             invlist_set_len(invlist, len - 1);
7215         }
7216     }
7217
7218     return;
7219 }
7220 #endif
7221
7222 PERL_STATIC_INLINE SV*
7223 S_invlist_clone(pTHX_ SV* const invlist)
7224 {
7225
7226     /* Return a new inversion list that is a copy of the input one, which is
7227      * unchanged */
7228
7229     /* Need to allocate extra space to accommodate Perl's addition of a
7230      * trailing NUL to SvPV's, since it thinks they are always strings */
7231     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7232     STRLEN length = SvCUR(invlist);
7233
7234     PERL_ARGS_ASSERT_INVLIST_CLONE;
7235
7236     SvCUR_set(new_invlist, length); /* This isn't done automatically */
7237     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7238
7239     return new_invlist;
7240 }
7241
7242 PERL_STATIC_INLINE UV*
7243 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7244 {
7245     /* Return the address of the UV that contains the current iteration
7246      * position */
7247
7248     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7249
7250     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7251 }
7252
7253 PERL_STATIC_INLINE UV*
7254 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7255 {
7256     /* Return the address of the UV that contains the version id. */
7257
7258     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7259
7260     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7261 }
7262
7263 PERL_STATIC_INLINE void
7264 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
7265 {
7266     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7267
7268     *get_invlist_iter_addr(invlist) = 0;
7269 }
7270
7271 STATIC bool
7272 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7273 {
7274     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7275      * This call sets in <*start> and <*end>, the next range in <invlist>.
7276      * Returns <TRUE> if successful and the next call will return the next
7277      * range; <FALSE> if was already at the end of the list.  If the latter,
7278      * <*start> and <*end> are unchanged, and the next call to this function
7279      * will start over at the beginning of the list */
7280
7281     UV* pos = get_invlist_iter_addr(invlist);
7282     UV len = invlist_len(invlist);
7283     UV *array;
7284
7285     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7286
7287     if (*pos >= len) {
7288         *pos = UV_MAX;  /* Force iternit() to be required next time */
7289         return FALSE;
7290     }
7291
7292     array = invlist_array(invlist);
7293
7294     *start = array[(*pos)++];
7295
7296     if (*pos >= len) {
7297         *end = UV_MAX;
7298     }
7299     else {
7300         *end = array[(*pos)++] - 1;
7301     }
7302
7303     return TRUE;
7304 }
7305
7306 #ifndef PERL_IN_XSUB_RE
7307 SV *
7308 Perl__invlist_contents(pTHX_ SV* const invlist)
7309 {
7310     /* Get the contents of an inversion list into a string SV so that they can
7311      * be printed out.  It uses the format traditionally done for debug tracing
7312      */
7313
7314     UV start, end;
7315     SV* output = newSVpvs("\n");
7316
7317     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7318
7319     invlist_iterinit(invlist);
7320     while (invlist_iternext(invlist, &start, &end)) {
7321         if (end == UV_MAX) {
7322             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7323         }
7324         else if (end != start) {
7325             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7326                     start,       end);
7327         }
7328         else {
7329             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7330         }
7331     }
7332
7333     return output;
7334 }
7335 #endif
7336
7337 #if 0
7338 void
7339 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7340 {
7341     /* Dumps out the ranges in an inversion list.  The string 'header'
7342      * if present is output on a line before the first range */
7343
7344     UV start, end;
7345
7346     if (header && strlen(header)) {
7347         PerlIO_printf(Perl_debug_log, "%s\n", header);
7348     }
7349     invlist_iterinit(invlist);
7350     while (invlist_iternext(invlist, &start, &end)) {
7351         if (end == UV_MAX) {
7352             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7353         }
7354         else {
7355             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7356         }
7357     }
7358 }
7359 #endif
7360
7361 #undef HEADER_LENGTH
7362 #undef INVLIST_INITIAL_LENGTH
7363 #undef TO_INTERNAL_SIZE
7364 #undef FROM_INTERNAL_SIZE
7365 #undef INVLIST_LEN_OFFSET
7366 #undef INVLIST_ZERO_OFFSET
7367 #undef INVLIST_ITER_OFFSET
7368 #undef INVLIST_VERSION_ID
7369
7370 /* End of inversion list object */
7371
7372 /*
7373  - reg - regular expression, i.e. main body or parenthesized thing
7374  *
7375  * Caller must absorb opening parenthesis.
7376  *
7377  * Combining parenthesis handling with the base level of regular expression
7378  * is a trifle forced, but the need to tie the tails of the branches to what
7379  * follows makes it hard to avoid.
7380  */
7381 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7382 #ifdef DEBUGGING
7383 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7384 #else
7385 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7386 #endif
7387
7388 STATIC regnode *
7389 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7390     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7391 {
7392     dVAR;
7393     register regnode *ret;              /* Will be the head of the group. */
7394     register regnode *br;
7395     register regnode *lastbr;
7396     register regnode *ender = NULL;
7397     register I32 parno = 0;
7398     I32 flags;
7399     U32 oregflags = RExC_flags;
7400     bool have_branch = 0;
7401     bool is_open = 0;
7402     I32 freeze_paren = 0;
7403     I32 after_freeze = 0;
7404
7405     /* for (?g), (?gc), and (?o) warnings; warning
7406        about (?c) will warn about (?g) -- japhy    */
7407
7408 #define WASTED_O  0x01
7409 #define WASTED_G  0x02
7410 #define WASTED_C  0x04
7411 #define WASTED_GC (0x02|0x04)
7412     I32 wastedflags = 0x00;
7413
7414     char * parse_start = RExC_parse; /* MJD */
7415     char * const oregcomp_parse = RExC_parse;
7416
7417     GET_RE_DEBUG_FLAGS_DECL;
7418
7419     PERL_ARGS_ASSERT_REG;
7420     DEBUG_PARSE("reg ");
7421
7422     *flagp = 0;                         /* Tentatively. */
7423
7424
7425     /* Make an OPEN node, if parenthesized. */
7426     if (paren) {
7427         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7428             char *start_verb = RExC_parse;
7429             STRLEN verb_len = 0;
7430             char *start_arg = NULL;
7431             unsigned char op = 0;
7432             int argok = 1;
7433             int internal_argval = 0; /* internal_argval is only useful if !argok */
7434             while ( *RExC_parse && *RExC_parse != ')' ) {
7435                 if ( *RExC_parse == ':' ) {
7436                     start_arg = RExC_parse + 1;
7437                     break;
7438                 }
7439                 RExC_parse++;
7440             }
7441             ++start_verb;
7442             verb_len = RExC_parse - start_verb;
7443             if ( start_arg ) {
7444                 RExC_parse++;
7445                 while ( *RExC_parse && *RExC_parse != ')' )
7446                     RExC_parse++;
7447                 if ( *RExC_parse != ')' )
7448                     vFAIL("Unterminated verb pattern argument");
7449                 if ( RExC_parse == start_arg )
7450                     start_arg = NULL;
7451             } else {
7452                 if ( *RExC_parse != ')' )
7453                     vFAIL("Unterminated verb pattern");
7454             }
7455
7456             switch ( *start_verb ) {
7457             case 'A':  /* (*ACCEPT) */
7458                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7459                     op = ACCEPT;
7460                     internal_argval = RExC_nestroot;
7461                 }
7462                 break;
7463             case 'C':  /* (*COMMIT) */
7464                 if ( memEQs(start_verb,verb_len,"COMMIT") )
7465                     op = COMMIT;
7466                 break;
7467             case 'F':  /* (*FAIL) */
7468                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7469                     op = OPFAIL;
7470                     argok = 0;
7471                 }
7472                 break;
7473             case ':':  /* (*:NAME) */
7474             case 'M':  /* (*MARK:NAME) */
7475                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7476                     op = MARKPOINT;
7477                     argok = -1;
7478                 }
7479                 break;
7480             case 'P':  /* (*PRUNE) */
7481                 if ( memEQs(start_verb,verb_len,"PRUNE") )
7482                     op = PRUNE;
7483                 break;
7484             case 'S':   /* (*SKIP) */
7485                 if ( memEQs(start_verb,verb_len,"SKIP") )
7486                     op = SKIP;
7487                 break;
7488             case 'T':  /* (*THEN) */
7489                 /* [19:06] <TimToady> :: is then */
7490                 if ( memEQs(start_verb,verb_len,"THEN") ) {
7491                     op = CUTGROUP;
7492                     RExC_seen |= REG_SEEN_CUTGROUP;
7493                 }
7494                 break;
7495             }
7496             if ( ! op ) {
7497                 RExC_parse++;
7498                 vFAIL3("Unknown verb pattern '%.*s'",
7499                     verb_len, start_verb);
7500             }
7501             if ( argok ) {
7502                 if ( start_arg && internal_argval ) {
7503                     vFAIL3("Verb pattern '%.*s' may not have an argument",
7504                         verb_len, start_verb);
7505                 } else if ( argok < 0 && !start_arg ) {
7506                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7507                         verb_len, start_verb);
7508                 } else {
7509                     ret = reganode(pRExC_state, op, internal_argval);
7510                     if ( ! internal_argval && ! SIZE_ONLY ) {
7511                         if (start_arg) {
7512                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7513                             ARG(ret) = add_data( pRExC_state, 1, "S" );
7514                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7515                             ret->flags = 0;
7516                         } else {
7517                             ret->flags = 1;
7518                         }
7519                     }
7520                 }
7521                 if (!internal_argval)
7522                     RExC_seen |= REG_SEEN_VERBARG;
7523             } else if ( start_arg ) {
7524                 vFAIL3("Verb pattern '%.*s' may not have an argument",
7525                         verb_len, start_verb);
7526             } else {
7527                 ret = reg_node(pRExC_state, op);
7528             }
7529             nextchar(pRExC_state);
7530             return ret;
7531         } else
7532         if (*RExC_parse == '?') { /* (?...) */
7533             bool is_logical = 0;
7534             const char * const seqstart = RExC_parse;
7535             bool has_use_defaults = FALSE;
7536
7537             RExC_parse++;
7538             paren = *RExC_parse++;
7539             ret = NULL;                 /* For look-ahead/behind. */
7540             switch (paren) {
7541
7542             case 'P':   /* (?P...) variants for those used to PCRE/Python */
7543                 paren = *RExC_parse++;
7544                 if ( paren == '<')         /* (?P<...>) named capture */
7545                     goto named_capture;
7546                 else if (paren == '>') {   /* (?P>name) named recursion */
7547                     goto named_recursion;
7548                 }
7549                 else if (paren == '=') {   /* (?P=...)  named backref */
7550                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
7551                        you change this make sure you change that */
7552                     char* name_start = RExC_parse;
7553                     U32 num = 0;
7554                     SV *sv_dat = reg_scan_name(pRExC_state,
7555                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7556                     if (RExC_parse == name_start || *RExC_parse != ')')
7557                         vFAIL2("Sequence %.3s... not terminated",parse_start);
7558
7559                     if (!SIZE_ONLY) {
7560                         num = add_data( pRExC_state, 1, "S" );
7561                         RExC_rxi->data->data[num]=(void*)sv_dat;
7562                         SvREFCNT_inc_simple_void(sv_dat);
7563                     }
7564                     RExC_sawback = 1;
7565                     ret = reganode(pRExC_state,
7566                                    ((! FOLD)
7567                                      ? NREF
7568                                      : (MORE_ASCII_RESTRICTED)
7569                                        ? NREFFA
7570                                        : (AT_LEAST_UNI_SEMANTICS)
7571                                          ? NREFFU
7572                                          : (LOC)
7573                                            ? NREFFL
7574                                            : NREFF),
7575                                     num);
7576                     *flagp |= HASWIDTH;
7577
7578                     Set_Node_Offset(ret, parse_start+1);
7579                     Set_Node_Cur_Length(ret); /* MJD */
7580
7581                     nextchar(pRExC_state);
7582                     return ret;
7583                 }
7584                 RExC_parse++;
7585                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7586                 /*NOTREACHED*/
7587             case '<':           /* (?<...) */
7588                 if (*RExC_parse == '!')
7589                     paren = ',';
7590                 else if (*RExC_parse != '=')
7591               named_capture:
7592                 {               /* (?<...>) */
7593                     char *name_start;
7594                     SV *svname;
7595                     paren= '>';
7596             case '\'':          /* (?'...') */
7597                     name_start= RExC_parse;
7598                     svname = reg_scan_name(pRExC_state,
7599                         SIZE_ONLY ?  /* reverse test from the others */
7600                         REG_RSN_RETURN_NAME :
7601                         REG_RSN_RETURN_NULL);
7602                     if (RExC_parse == name_start) {
7603                         RExC_parse++;
7604                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7605                         /*NOTREACHED*/
7606                     }
7607                     if (*RExC_parse != paren)
7608                         vFAIL2("Sequence (?%c... not terminated",
7609                             paren=='>' ? '<' : paren);
7610                     if (SIZE_ONLY) {
7611                         HE *he_str;
7612                         SV *sv_dat = NULL;
7613                         if (!svname) /* shouldn't happen */
7614                             Perl_croak(aTHX_
7615                                 "panic: reg_scan_name returned NULL");
7616                         if (!RExC_paren_names) {
7617                             RExC_paren_names= newHV();
7618                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
7619 #ifdef DEBUGGING
7620                             RExC_paren_name_list= newAV();
7621                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7622 #endif
7623                         }
7624                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7625                         if ( he_str )
7626                             sv_dat = HeVAL(he_str);
7627                         if ( ! sv_dat ) {
7628                             /* croak baby croak */
7629                             Perl_croak(aTHX_
7630                                 "panic: paren_name hash element allocation failed");
7631                         } else if ( SvPOK(sv_dat) ) {
7632                             /* (?|...) can mean we have dupes so scan to check
7633                                its already been stored. Maybe a flag indicating
7634                                we are inside such a construct would be useful,
7635                                but the arrays are likely to be quite small, so
7636                                for now we punt -- dmq */
7637                             IV count = SvIV(sv_dat);
7638                             I32 *pv = (I32*)SvPVX(sv_dat);
7639                             IV i;
7640                             for ( i = 0 ; i < count ; i++ ) {
7641                                 if ( pv[i] == RExC_npar ) {
7642                                     count = 0;
7643                                     break;
7644                                 }
7645                             }
7646                             if ( count ) {
7647                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7648                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7649                                 pv[count] = RExC_npar;
7650                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7651                             }
7652                         } else {
7653                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
7654                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7655                             SvIOK_on(sv_dat);
7656                             SvIV_set(sv_dat, 1);
7657                         }
7658 #ifdef DEBUGGING
7659                         /* Yes this does cause a memory leak in debugging Perls */
7660                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7661                             SvREFCNT_dec(svname);
7662 #endif
7663
7664                         /*sv_dump(sv_dat);*/
7665                     }
7666                     nextchar(pRExC_state);
7667                     paren = 1;
7668                     goto capturing_parens;
7669                 }
7670                 RExC_seen |= REG_SEEN_LOOKBEHIND;
7671                 RExC_in_lookbehind++;
7672                 RExC_parse++;
7673             case '=':           /* (?=...) */
7674                 RExC_seen_zerolen++;
7675                 break;
7676             case '!':           /* (?!...) */
7677                 RExC_seen_zerolen++;
7678                 if (*RExC_parse == ')') {
7679                     ret=reg_node(pRExC_state, OPFAIL);
7680                     nextchar(pRExC_state);
7681                     return ret;
7682                 }
7683                 break;
7684             case '|':           /* (?|...) */
7685                 /* branch reset, behave like a (?:...) except that
7686                    buffers in alternations share the same numbers */
7687                 paren = ':';
7688                 after_freeze = freeze_paren = RExC_npar;
7689                 break;
7690             case ':':           /* (?:...) */
7691             case '>':           /* (?>...) */
7692                 break;
7693             case '$':           /* (?$...) */
7694             case '@':           /* (?@...) */
7695                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7696                 break;
7697             case '#':           /* (?#...) */
7698                 while (*RExC_parse && *RExC_parse != ')')
7699                     RExC_parse++;
7700                 if (*RExC_parse != ')')
7701                     FAIL("Sequence (?#... not terminated");
7702                 nextchar(pRExC_state);
7703                 *flagp = TRYAGAIN;
7704                 return NULL;
7705             case '0' :           /* (?0) */
7706             case 'R' :           /* (?R) */
7707                 if (*RExC_parse != ')')
7708                     FAIL("Sequence (?R) not terminated");
7709                 ret = reg_node(pRExC_state, GOSTART);
7710                 *flagp |= POSTPONED;
7711                 nextchar(pRExC_state);
7712                 return ret;
7713                 /*notreached*/
7714             { /* named and numeric backreferences */
7715                 I32 num;
7716             case '&':            /* (?&NAME) */
7717                 parse_start = RExC_parse - 1;
7718               named_recursion:
7719                 {
7720                     SV *sv_dat = reg_scan_name(pRExC_state,
7721                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7722                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7723                 }
7724                 goto gen_recurse_regop;
7725                 /* NOT REACHED */
7726             case '+':
7727                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7728                     RExC_parse++;
7729                     vFAIL("Illegal pattern");
7730                 }
7731                 goto parse_recursion;
7732                 /* NOT REACHED*/
7733             case '-': /* (?-1) */
7734                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7735                     RExC_parse--; /* rewind to let it be handled later */
7736                     goto parse_flags;
7737                 }
7738                 /*FALLTHROUGH */
7739             case '1': case '2': case '3': case '4': /* (?1) */
7740             case '5': case '6': case '7': case '8': case '9':
7741                 RExC_parse--;
7742               parse_recursion:
7743                 num = atoi(RExC_parse);
7744                 parse_start = RExC_parse - 1; /* MJD */
7745                 if (*RExC_parse == '-')
7746                     RExC_parse++;
7747                 while (isDIGIT(*RExC_parse))
7748                         RExC_parse++;
7749                 if (*RExC_parse!=')')
7750                     vFAIL("Expecting close bracket");
7751
7752               gen_recurse_regop:
7753                 if ( paren == '-' ) {
7754                     /*
7755                     Diagram of capture buffer numbering.
7756                     Top line is the normal capture buffer numbers
7757                     Bottom line is the negative indexing as from
7758                     the X (the (?-2))
7759
7760                     +   1 2    3 4 5 X          6 7
7761                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7762                     -   5 4    3 2 1 X          x x
7763
7764                     */
7765                     num = RExC_npar + num;
7766                     if (num < 1)  {
7767                         RExC_parse++;
7768                         vFAIL("Reference to nonexistent group");
7769                     }
7770                 } else if ( paren == '+' ) {
7771                     num = RExC_npar + num - 1;
7772                 }
7773
7774                 ret = reganode(pRExC_state, GOSUB, num);
7775                 if (!SIZE_ONLY) {
7776                     if (num > (I32)RExC_rx->nparens) {
7777                         RExC_parse++;
7778                         vFAIL("Reference to nonexistent group");
7779                     }
7780                     ARG2L_SET( ret, RExC_recurse_count++);
7781                     RExC_emit++;
7782                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7783                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7784                 } else {
7785                     RExC_size++;
7786                 }
7787                 RExC_seen |= REG_SEEN_RECURSE;
7788                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7789                 Set_Node_Offset(ret, parse_start); /* MJD */
7790
7791                 *flagp |= POSTPONED;
7792                 nextchar(pRExC_state);
7793                 return ret;
7794             } /* named and numeric backreferences */
7795             /* NOT REACHED */
7796
7797             case '?':           /* (??...) */
7798                 is_logical = 1;
7799                 if (*RExC_parse != '{') {
7800                     RExC_parse++;
7801                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7802                     /*NOTREACHED*/
7803                 }
7804                 *flagp |= POSTPONED;
7805                 paren = *RExC_parse++;
7806                 /* FALL THROUGH */
7807             case '{':           /* (?{...}) */
7808             {
7809                 I32 count = 1;
7810                 U32 n = 0;
7811                 char c;
7812                 char *s = RExC_parse;
7813
7814                 RExC_seen_zerolen++;
7815                 RExC_seen |= REG_SEEN_EVAL;
7816                 while (count && (c = *RExC_parse)) {
7817                     if (c == '\\') {
7818                         if (RExC_parse[1])
7819                             RExC_parse++;
7820                     }
7821                     else if (c == '{')
7822                         count++;
7823                     else if (c == '}')
7824                         count--;
7825                     RExC_parse++;
7826                 }
7827                 if (*RExC_parse != ')') {
7828                     RExC_parse = s;
7829                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7830                 }
7831                 if (!SIZE_ONLY) {
7832                     PAD *pad;
7833                     OP_4tree *sop, *rop;
7834                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7835
7836                     ENTER;
7837                     Perl_save_re_context(aTHX);
7838                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7839                     sop->op_private |= OPpREFCOUNTED;
7840                     /* re_dup will OpREFCNT_inc */
7841                     OpREFCNT_set(sop, 1);
7842                     LEAVE;
7843
7844                     n = add_data(pRExC_state, 3, "nop");
7845                     RExC_rxi->data->data[n] = (void*)rop;
7846                     RExC_rxi->data->data[n+1] = (void*)sop;
7847                     RExC_rxi->data->data[n+2] = (void*)pad;
7848                     SvREFCNT_dec(sv);
7849                 }
7850                 else {                                          /* First pass */
7851                     if (PL_reginterp_cnt < ++RExC_seen_evals
7852                         && IN_PERL_RUNTIME)
7853                         /* No compiled RE interpolated, has runtime
7854                            components ===> unsafe.  */
7855                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
7856                     if (PL_tainting && PL_tainted)
7857                         FAIL("Eval-group in insecure regular expression");
7858 #if PERL_VERSION > 8
7859                     if (IN_PERL_COMPILETIME)
7860                         PL_cv_has_eval = 1;
7861 #endif
7862                 }
7863
7864                 nextchar(pRExC_state);
7865                 if (is_logical) {
7866                     ret = reg_node(pRExC_state, LOGICAL);
7867                     if (!SIZE_ONLY)
7868                         ret->flags = 2;
7869                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7870                     /* deal with the length of this later - MJD */
7871                     return ret;
7872                 }
7873                 ret = reganode(pRExC_state, EVAL, n);
7874                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7875                 Set_Node_Offset(ret, parse_start);
7876                 return ret;
7877             }
7878             case '(':           /* (?(?{...})...) and (?(?=...)...) */
7879             {
7880                 int is_define= 0;
7881                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
7882                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7883                         || RExC_parse[1] == '<'
7884                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
7885                         I32 flag;
7886
7887                         ret = reg_node(pRExC_state, LOGICAL);
7888                         if (!SIZE_ONLY)
7889                             ret->flags = 1;
7890                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7891                         goto insert_if;
7892                     }
7893                 }
7894                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
7895                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7896                 {
7897                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
7898                     char *name_start= RExC_parse++;
7899                     U32 num = 0;
7900                     SV *sv_dat=reg_scan_name(pRExC_state,
7901                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7902                     if (RExC_parse == name_start || *RExC_parse != ch)
7903                         vFAIL2("Sequence (?(%c... not terminated",
7904                             (ch == '>' ? '<' : ch));
7905                     RExC_parse++;
7906                     if (!SIZE_ONLY) {
7907                         num = add_data( pRExC_state, 1, "S" );
7908                         RExC_rxi->data->data[num]=(void*)sv_dat;
7909                         SvREFCNT_inc_simple_void(sv_dat);
7910                     }
7911                     ret = reganode(pRExC_state,NGROUPP,num);
7912                     goto insert_if_check_paren;
7913                 }
7914                 else if (RExC_parse[0] == 'D' &&
7915                          RExC_parse[1] == 'E' &&
7916                          RExC_parse[2] == 'F' &&
7917                          RExC_parse[3] == 'I' &&
7918                          RExC_parse[4] == 'N' &&
7919                          RExC_parse[5] == 'E')
7920                 {
7921                     ret = reganode(pRExC_state,DEFINEP,0);
7922                     RExC_parse +=6 ;
7923                     is_define = 1;
7924                     goto insert_if_check_paren;
7925                 }
7926                 else if (RExC_parse[0] == 'R') {
7927                     RExC_parse++;
7928                     parno = 0;
7929                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7930                         parno = atoi(RExC_parse++);
7931                         while (isDIGIT(*RExC_parse))
7932                             RExC_parse++;
7933                     } else if (RExC_parse[0] == '&') {
7934                         SV *sv_dat;
7935                         RExC_parse++;
7936                         sv_dat = reg_scan_name(pRExC_state,
7937                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7938                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7939                     }
7940                     ret = reganode(pRExC_state,INSUBP,parno);
7941                     goto insert_if_check_paren;
7942                 }
7943                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7944                     /* (?(1)...) */
7945                     char c;
7946                     parno = atoi(RExC_parse++);
7947
7948                     while (isDIGIT(*RExC_parse))
7949                         RExC_parse++;
7950                     ret = reganode(pRExC_state, GROUPP, parno);
7951
7952                  insert_if_check_paren:
7953                     if ((c = *nextchar(pRExC_state)) != ')')
7954                         vFAIL("Switch condition not recognized");
7955                   insert_if:
7956                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
7957                     br = regbranch(pRExC_state, &flags, 1,depth+1);
7958                     if (br == NULL)
7959                         br = reganode(pRExC_state, LONGJMP, 0);
7960                     else
7961                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
7962                     c = *nextchar(pRExC_state);
7963                     if (flags&HASWIDTH)
7964                         *flagp |= HASWIDTH;
7965                     if (c == '|') {
7966                         if (is_define)
7967                             vFAIL("(?(DEFINE)....) does not allow branches");
7968                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7969                         regbranch(pRExC_state, &flags, 1,depth+1);
7970                         REGTAIL(pRExC_state, ret, lastbr);
7971                         if (flags&HASWIDTH)
7972                             *flagp |= HASWIDTH;
7973                         c = *nextchar(pRExC_state);
7974                     }
7975                     else
7976                         lastbr = NULL;
7977                     if (c != ')')
7978                         vFAIL("Switch (?(condition)... contains too many branches");
7979                     ender = reg_node(pRExC_state, TAIL);
7980                     REGTAIL(pRExC_state, br, ender);
7981                     if (lastbr) {
7982                         REGTAIL(pRExC_state, lastbr, ender);
7983                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7984                     }
7985                     else
7986                         REGTAIL(pRExC_state, ret, ender);
7987                     RExC_size++; /* XXX WHY do we need this?!!
7988                                     For large programs it seems to be required
7989                                     but I can't figure out why. -- dmq*/
7990                     return ret;
7991                 }
7992                 else {
7993                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7994                 }
7995             }
7996             case 0:
7997                 RExC_parse--; /* for vFAIL to print correctly */
7998                 vFAIL("Sequence (? incomplete");
7999                 break;
8000             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8001                                        that follow */
8002                 has_use_defaults = TRUE;
8003                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8004                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8005                                                 ? REGEX_UNICODE_CHARSET
8006                                                 : REGEX_DEPENDS_CHARSET);
8007                 goto parse_flags;
8008             default:
8009                 --RExC_parse;
8010                 parse_flags:      /* (?i) */
8011             {
8012                 U32 posflags = 0, negflags = 0;
8013                 U32 *flagsp = &posflags;
8014                 char has_charset_modifier = '\0';
8015                 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
8016                                     ? REGEX_UNICODE_CHARSET
8017                                     : REGEX_DEPENDS_CHARSET;
8018
8019                 while (*RExC_parse) {
8020                     /* && strchr("iogcmsx", *RExC_parse) */
8021                     /* (?g), (?gc) and (?o) are useless here
8022                        and must be globally applied -- japhy */
8023                     switch (*RExC_parse) {
8024                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8025                     case LOCALE_PAT_MOD:
8026                         if (has_charset_modifier) {
8027                             goto excess_modifier;
8028                         }
8029                         else if (flagsp == &negflags) {
8030                             goto neg_modifier;
8031                         }
8032                         cs = REGEX_LOCALE_CHARSET;
8033                         has_charset_modifier = LOCALE_PAT_MOD;
8034                         RExC_contains_locale = 1;
8035                         break;
8036                     case UNICODE_PAT_MOD:
8037                         if (has_charset_modifier) {
8038                             goto excess_modifier;
8039                         }
8040                         else if (flagsp == &negflags) {
8041                             goto neg_modifier;
8042                         }
8043                         cs = REGEX_UNICODE_CHARSET;
8044                         has_charset_modifier = UNICODE_PAT_MOD;
8045                         break;
8046                     case ASCII_RESTRICT_PAT_MOD:
8047                         if (flagsp == &negflags) {
8048                             goto neg_modifier;
8049                         }
8050                         if (has_charset_modifier) {
8051                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8052                                 goto excess_modifier;
8053                             }
8054                             /* Doubled modifier implies more restricted */
8055                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8056                         }
8057                         else {
8058                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8059                         }
8060                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8061                         break;
8062                     case DEPENDS_PAT_MOD:
8063                         if (has_use_defaults) {
8064                             goto fail_modifiers;
8065                         }
8066                         else if (flagsp == &negflags) {
8067                             goto neg_modifier;
8068                         }
8069                         else if (has_charset_modifier) {
8070                             goto excess_modifier;
8071                         }
8072
8073                         /* The dual charset means unicode semantics if the
8074                          * pattern (or target, not known until runtime) are
8075                          * utf8, or something in the pattern indicates unicode
8076                          * semantics */
8077                         cs = (RExC_utf8 || RExC_uni_semantics)
8078                              ? REGEX_UNICODE_CHARSET
8079                              : REGEX_DEPENDS_CHARSET;
8080                         has_charset_modifier = DEPENDS_PAT_MOD;
8081                         break;
8082                     excess_modifier:
8083                         RExC_parse++;
8084                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8085                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8086                         }
8087                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8088                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8089                         }
8090                         else {
8091                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8092                         }
8093                         /*NOTREACHED*/
8094                     neg_modifier:
8095                         RExC_parse++;
8096                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8097                         /*NOTREACHED*/
8098                     case ONCE_PAT_MOD: /* 'o' */
8099                     case GLOBAL_PAT_MOD: /* 'g' */
8100                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8101                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8102                             if (! (wastedflags & wflagbit) ) {
8103                                 wastedflags |= wflagbit;
8104                                 vWARN5(
8105                                     RExC_parse + 1,
8106                                     "Useless (%s%c) - %suse /%c modifier",
8107                                     flagsp == &negflags ? "?-" : "?",
8108                                     *RExC_parse,
8109                                     flagsp == &negflags ? "don't " : "",
8110                                     *RExC_parse
8111                                 );
8112                             }
8113                         }
8114                         break;
8115
8116                     case CONTINUE_PAT_MOD: /* 'c' */
8117                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8118                             if (! (wastedflags & WASTED_C) ) {
8119                                 wastedflags |= WASTED_GC;
8120                                 vWARN3(
8121                                     RExC_parse + 1,
8122                                     "Useless (%sc) - %suse /gc modifier",
8123                                     flagsp == &negflags ? "?-" : "?",
8124                                     flagsp == &negflags ? "don't " : ""
8125                                 );
8126                             }
8127                         }
8128                         break;
8129                     case KEEPCOPY_PAT_MOD: /* 'p' */
8130                         if (flagsp == &negflags) {
8131                             if (SIZE_ONLY)
8132                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8133                         } else {
8134                             *flagsp |= RXf_PMf_KEEPCOPY;
8135                         }
8136                         break;
8137                     case '-':
8138                         /* A flag is a default iff it is following a minus, so
8139                          * if there is a minus, it means will be trying to
8140                          * re-specify a default which is an error */
8141                         if (has_use_defaults || flagsp == &negflags) {
8142             fail_modifiers:
8143                             RExC_parse++;
8144                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8145                             /*NOTREACHED*/
8146                         }
8147                         flagsp = &negflags;
8148                         wastedflags = 0;  /* reset so (?g-c) warns twice */
8149                         break;
8150                     case ':':
8151                         paren = ':';
8152                         /*FALLTHROUGH*/
8153                     case ')':
8154                         RExC_flags |= posflags;
8155                         RExC_flags &= ~negflags;
8156                         set_regex_charset(&RExC_flags, cs);
8157                         if (paren != ':') {
8158                             oregflags |= posflags;
8159                             oregflags &= ~negflags;
8160                             set_regex_charset(&oregflags, cs);
8161                         }
8162                         nextchar(pRExC_state);
8163                         if (paren != ':') {
8164                             *flagp = TRYAGAIN;
8165                             return NULL;
8166                         } else {
8167                             ret = NULL;
8168                             goto parse_rest;
8169                         }
8170                         /*NOTREACHED*/
8171                     default:
8172                         RExC_parse++;
8173                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8174                         /*NOTREACHED*/
8175                     }
8176                     ++RExC_parse;
8177                 }
8178             }} /* one for the default block, one for the switch */
8179         }
8180         else {                  /* (...) */
8181           capturing_parens:
8182             parno = RExC_npar;
8183             RExC_npar++;
8184
8185             ret = reganode(pRExC_state, OPEN, parno);
8186             if (!SIZE_ONLY ){
8187                 if (!RExC_nestroot)
8188                     RExC_nestroot = parno;
8189                 if (RExC_seen & REG_SEEN_RECURSE
8190                     && !RExC_open_parens[parno-1])
8191                 {
8192                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8193                         "Setting open paren #%"IVdf" to %d\n",
8194                         (IV)parno, REG_NODE_NUM(ret)));
8195                     RExC_open_parens[parno-1]= ret;
8196                 }
8197             }
8198             Set_Node_Length(ret, 1); /* MJD */
8199             Set_Node_Offset(ret, RExC_parse); /* MJD */
8200             is_open = 1;
8201         }
8202     }
8203     else                        /* ! paren */
8204         ret = NULL;
8205
8206    parse_rest:
8207     /* Pick up the branches, linking them together. */
8208     parse_start = RExC_parse;   /* MJD */
8209     br = regbranch(pRExC_state, &flags, 1,depth+1);
8210
8211     /*     branch_len = (paren != 0); */
8212
8213     if (br == NULL)
8214         return(NULL);
8215     if (*RExC_parse == '|') {
8216         if (!SIZE_ONLY && RExC_extralen) {
8217             reginsert(pRExC_state, BRANCHJ, br, depth+1);
8218         }
8219         else {                  /* MJD */
8220             reginsert(pRExC_state, BRANCH, br, depth+1);
8221             Set_Node_Length(br, paren != 0);
8222             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8223         }
8224         have_branch = 1;
8225         if (SIZE_ONLY)
8226             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
8227     }
8228     else if (paren == ':') {
8229         *flagp |= flags&SIMPLE;
8230     }
8231     if (is_open) {                              /* Starts with OPEN. */
8232         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
8233     }
8234     else if (paren != '?')              /* Not Conditional */
8235         ret = br;
8236     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8237     lastbr = br;
8238     while (*RExC_parse == '|') {
8239         if (!SIZE_ONLY && RExC_extralen) {
8240             ender = reganode(pRExC_state, LONGJMP,0);
8241             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8242         }
8243         if (SIZE_ONLY)
8244             RExC_extralen += 2;         /* Account for LONGJMP. */
8245         nextchar(pRExC_state);
8246         if (freeze_paren) {
8247             if (RExC_npar > after_freeze)
8248                 after_freeze = RExC_npar;
8249             RExC_npar = freeze_paren;
8250         }
8251         br = regbranch(pRExC_state, &flags, 0, depth+1);
8252
8253         if (br == NULL)
8254             return(NULL);
8255         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
8256         lastbr = br;
8257         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8258     }
8259
8260     if (have_branch || paren != ':') {
8261         /* Make a closing node, and hook it on the end. */
8262         switch (paren) {
8263         case ':':
8264             ender = reg_node(pRExC_state, TAIL);
8265             break;
8266         case 1:
8267             ender = reganode(pRExC_state, CLOSE, parno);
8268             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8269                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8270                         "Setting close paren #%"IVdf" to %d\n",
8271                         (IV)parno, REG_NODE_NUM(ender)));
8272                 RExC_close_parens[parno-1]= ender;
8273                 if (RExC_nestroot == parno)
8274                     RExC_nestroot = 0;
8275             }
8276             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8277             Set_Node_Length(ender,1); /* MJD */
8278             break;
8279         case '<':
8280         case ',':
8281         case '=':
8282         case '!':
8283             *flagp &= ~HASWIDTH;
8284             /* FALL THROUGH */
8285         case '>':
8286             ender = reg_node(pRExC_state, SUCCEED);
8287             break;
8288         case 0:
8289             ender = reg_node(pRExC_state, END);
8290             if (!SIZE_ONLY) {
8291                 assert(!RExC_opend); /* there can only be one! */
8292                 RExC_opend = ender;
8293             }
8294             break;
8295         }
8296         REGTAIL(pRExC_state, lastbr, ender);
8297
8298         if (have_branch && !SIZE_ONLY) {
8299             if (depth==1)
8300                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8301
8302             /* Hook the tails of the branches to the closing node. */
8303             for (br = ret; br; br = regnext(br)) {
8304                 const U8 op = PL_regkind[OP(br)];
8305                 if (op == BRANCH) {
8306                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8307                 }
8308                 else if (op == BRANCHJ) {
8309                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8310                 }
8311             }
8312         }
8313     }
8314
8315     {
8316         const char *p;
8317         static const char parens[] = "=!<,>";
8318
8319         if (paren && (p = strchr(parens, paren))) {
8320             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8321             int flag = (p - parens) > 1;
8322
8323             if (paren == '>')
8324                 node = SUSPEND, flag = 0;
8325             reginsert(pRExC_state, node,ret, depth+1);
8326             Set_Node_Cur_Length(ret);
8327             Set_Node_Offset(ret, parse_start + 1);
8328             ret->flags = flag;
8329             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8330         }
8331     }
8332
8333     /* Check for proper termination. */
8334     if (paren) {
8335         RExC_flags = oregflags;
8336         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8337             RExC_parse = oregcomp_parse;
8338             vFAIL("Unmatched (");
8339         }
8340     }
8341     else if (!paren && RExC_parse < RExC_end) {
8342         if (*RExC_parse == ')') {
8343             RExC_parse++;
8344             vFAIL("Unmatched )");
8345         }
8346         else
8347             FAIL("Junk on end of regexp");      /* "Can't happen". */
8348         /* NOTREACHED */
8349     }
8350
8351     if (RExC_in_lookbehind) {
8352         RExC_in_lookbehind--;
8353     }
8354     if (after_freeze > RExC_npar)
8355         RExC_npar = after_freeze;
8356     return(ret);
8357 }
8358
8359 /*
8360  - regbranch - one alternative of an | operator
8361  *
8362  * Implements the concatenation operator.
8363  */
8364 STATIC regnode *
8365 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8366 {
8367     dVAR;
8368     register regnode *ret;
8369     register regnode *chain = NULL;
8370     register regnode *latest;
8371     I32 flags = 0, c = 0;
8372     GET_RE_DEBUG_FLAGS_DECL;
8373
8374     PERL_ARGS_ASSERT_REGBRANCH;
8375
8376     DEBUG_PARSE("brnc");
8377
8378     if (first)
8379         ret = NULL;
8380     else {
8381         if (!SIZE_ONLY && RExC_extralen)
8382             ret = reganode(pRExC_state, BRANCHJ,0);
8383         else {
8384             ret = reg_node(pRExC_state, BRANCH);
8385             Set_Node_Length(ret, 1);
8386         }
8387     }
8388
8389     if (!first && SIZE_ONLY)
8390         RExC_extralen += 1;                     /* BRANCHJ */
8391
8392     *flagp = WORST;                     /* Tentatively. */
8393
8394     RExC_parse--;
8395     nextchar(pRExC_state);
8396     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8397         flags &= ~TRYAGAIN;
8398         latest = regpiece(pRExC_state, &flags,depth+1);
8399         if (latest == NULL) {
8400             if (flags & TRYAGAIN)
8401                 continue;
8402             return(NULL);
8403         }
8404         else if (ret == NULL)
8405             ret = latest;
8406         *flagp |= flags&(HASWIDTH|POSTPONED);
8407         if (chain == NULL)      /* First piece. */
8408             *flagp |= flags&SPSTART;
8409         else {
8410             RExC_naughty++;
8411             REGTAIL(pRExC_state, chain, latest);
8412         }
8413         chain = latest;
8414         c++;
8415     }
8416     if (chain == NULL) {        /* Loop ran zero times. */
8417         chain = reg_node(pRExC_state, NOTHING);
8418         if (ret == NULL)
8419             ret = chain;
8420     }
8421     if (c == 1) {
8422         *flagp |= flags&SIMPLE;
8423     }
8424
8425     return ret;
8426 }
8427
8428 /*
8429  - regpiece - something followed by possible [*+?]
8430  *
8431  * Note that the branching code sequences used for ? and the general cases
8432  * of * and + are somewhat optimized:  they use the same NOTHING node as
8433  * both the endmarker for their branch list and the body of the last branch.
8434  * It might seem that this node could be dispensed with entirely, but the
8435  * endmarker role is not redundant.
8436  */
8437 STATIC regnode *
8438 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8439 {
8440     dVAR;
8441     register regnode *ret;
8442     register char op;
8443     register char *next;
8444     I32 flags;
8445     const char * const origparse = RExC_parse;
8446     I32 min;
8447     I32 max = REG_INFTY;
8448 #ifdef RE_TRACK_PATTERN_OFFSETS
8449     char *parse_start;
8450 #endif
8451     const char *maxpos = NULL;
8452     GET_RE_DEBUG_FLAGS_DECL;
8453
8454     PERL_ARGS_ASSERT_REGPIECE;
8455
8456     DEBUG_PARSE("piec");
8457
8458     ret = regatom(pRExC_state, &flags,depth+1);
8459     if (ret == NULL) {
8460         if (flags & TRYAGAIN)
8461             *flagp |= TRYAGAIN;
8462         return(NULL);
8463     }
8464
8465     op = *RExC_parse;
8466
8467     if (op == '{' && regcurly(RExC_parse)) {
8468         maxpos = NULL;
8469 #ifdef RE_TRACK_PATTERN_OFFSETS
8470         parse_start = RExC_parse; /* MJD */
8471 #endif
8472         next = RExC_parse + 1;
8473         while (isDIGIT(*next) || *next == ',') {
8474             if (*next == ',') {
8475                 if (maxpos)
8476                     break;
8477                 else
8478                     maxpos = next;
8479             }
8480             next++;
8481         }
8482         if (*next == '}') {             /* got one */
8483             if (!maxpos)
8484                 maxpos = next;
8485             RExC_parse++;
8486             min = atoi(RExC_parse);
8487             if (*maxpos == ',')
8488                 maxpos++;
8489             else
8490                 maxpos = RExC_parse;
8491             max = atoi(maxpos);
8492             if (!max && *maxpos != '0')
8493                 max = REG_INFTY;                /* meaning "infinity" */
8494             else if (max >= REG_INFTY)
8495                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8496             RExC_parse = next;
8497             nextchar(pRExC_state);
8498
8499         do_curly:
8500             if ((flags&SIMPLE)) {
8501                 RExC_naughty += 2 + RExC_naughty / 2;
8502                 reginsert(pRExC_state, CURLY, ret, depth+1);
8503                 Set_Node_Offset(ret, parse_start+1); /* MJD */
8504                 Set_Node_Cur_Length(ret);
8505             }
8506             else {
8507                 regnode * const w = reg_node(pRExC_state, WHILEM);
8508
8509                 w->flags = 0;
8510                 REGTAIL(pRExC_state, ret, w);
8511                 if (!SIZE_ONLY && RExC_extralen) {
8512                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
8513                     reginsert(pRExC_state, NOTHING,ret, depth+1);
8514                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
8515                 }
8516                 reginsert(pRExC_state, CURLYX,ret, depth+1);
8517                                 /* MJD hk */
8518                 Set_Node_Offset(ret, parse_start+1);
8519                 Set_Node_Length(ret,
8520                                 op == '{' ? (RExC_parse - parse_start) : 1);
8521
8522                 if (!SIZE_ONLY && RExC_extralen)
8523                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
8524                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
8525                 if (SIZE_ONLY)
8526                     RExC_whilem_seen++, RExC_extralen += 3;
8527                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
8528             }
8529             ret->flags = 0;
8530
8531             if (min > 0)
8532                 *flagp = WORST;
8533             if (max > 0)
8534                 *flagp |= HASWIDTH;
8535             if (max < min)
8536                 vFAIL("Can't do {n,m} with n > m");
8537             if (!SIZE_ONLY) {
8538                 ARG1_SET(ret, (U16)min);
8539                 ARG2_SET(ret, (U16)max);
8540             }
8541
8542             goto nest_check;
8543         }
8544     }
8545
8546     if (!ISMULT1(op)) {
8547         *flagp = flags;
8548         return(ret);
8549     }
8550
8551 #if 0                           /* Now runtime fix should be reliable. */
8552
8553     /* if this is reinstated, don't forget to put this back into perldiag:
8554
8555             =item Regexp *+ operand could be empty at {#} in regex m/%s/
8556
8557            (F) The part of the regexp subject to either the * or + quantifier
8558            could match an empty string. The {#} shows in the regular
8559            expression about where the problem was discovered.
8560
8561     */
8562
8563     if (!(flags&HASWIDTH) && op != '?')
8564       vFAIL("Regexp *+ operand could be empty");
8565 #endif
8566
8567 #ifdef RE_TRACK_PATTERN_OFFSETS
8568     parse_start = RExC_parse;
8569 #endif
8570     nextchar(pRExC_state);
8571
8572     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8573
8574     if (op == '*' && (flags&SIMPLE)) {
8575         reginsert(pRExC_state, STAR, ret, depth+1);
8576         ret->flags = 0;
8577         RExC_naughty += 4;
8578     }
8579     else if (op == '*') {
8580         min = 0;
8581         goto do_curly;
8582     }
8583     else if (op == '+' && (flags&SIMPLE)) {
8584         reginsert(pRExC_state, PLUS, ret, depth+1);
8585         ret->flags = 0;
8586         RExC_naughty += 3;
8587     }
8588     else if (op == '+') {
8589         min = 1;
8590         goto do_curly;
8591     }
8592     else if (op == '?') {
8593         min = 0; max = 1;
8594         goto do_curly;
8595     }
8596   nest_check:
8597     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8598         ckWARN3reg(RExC_parse,
8599                    "%.*s matches null string many times",
8600                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8601                    origparse);
8602     }
8603
8604     if (RExC_parse < RExC_end && *RExC_parse == '?') {
8605         nextchar(pRExC_state);
8606         reginsert(pRExC_state, MINMOD, ret, depth+1);
8607         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8608     }
8609 #ifndef REG_ALLOW_MINMOD_SUSPEND
8610     else
8611 #endif
8612     if (RExC_parse < RExC_end && *RExC_parse == '+') {
8613         regnode *ender;
8614         nextchar(pRExC_state);
8615         ender = reg_node(pRExC_state, SUCCEED);
8616         REGTAIL(pRExC_state, ret, ender);
8617         reginsert(pRExC_state, SUSPEND, ret, depth+1);
8618         ret->flags = 0;
8619         ender = reg_node(pRExC_state, TAIL);
8620         REGTAIL(pRExC_state, ret, ender);
8621         /*ret= ender;*/
8622     }
8623
8624     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8625         RExC_parse++;
8626         vFAIL("Nested quantifiers");
8627     }
8628
8629     return(ret);
8630 }
8631
8632
8633 /* reg_namedseq(pRExC_state,UVp, UV depth)
8634
8635    This is expected to be called by a parser routine that has
8636    recognized '\N' and needs to handle the rest. RExC_parse is
8637    expected to point at the first char following the N at the time
8638    of the call.
8639
8640    The \N may be inside (indicated by valuep not being NULL) or outside a
8641    character class.
8642
8643    \N may begin either a named sequence, or if outside a character class, mean
8644    to match a non-newline.  For non single-quoted regexes, the tokenizer has
8645    attempted to decide which, and in the case of a named sequence converted it
8646    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8647    where c1... are the characters in the sequence.  For single-quoted regexes,
8648    the tokenizer passes the \N sequence through unchanged; this code will not
8649    attempt to determine this nor expand those.  The net effect is that if the
8650    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8651    signals that this \N occurrence means to match a non-newline.
8652
8653    Only the \N{U+...} form should occur in a character class, for the same
8654    reason that '.' inside a character class means to just match a period: it
8655    just doesn't make sense.
8656
8657    If valuep is non-null then it is assumed that we are parsing inside
8658    of a charclass definition and the first codepoint in the resolved
8659    string is returned via *valuep and the routine will return NULL.
8660    In this mode if a multichar string is returned from the charnames
8661    handler, a warning will be issued, and only the first char in the
8662    sequence will be examined. If the string returned is zero length
8663    then the value of *valuep is undefined and NON-NULL will
8664    be returned to indicate failure. (This will NOT be a valid pointer
8665    to a regnode.)
8666
8667    If valuep is null then it is assumed that we are parsing normal text and a
8668    new EXACT node is inserted into the program containing the resolved string,
8669    and a pointer to the new node is returned.  But if the string is zero length
8670    a NOTHING node is emitted instead.
8671
8672    On success RExC_parse is set to the char following the endbrace.
8673    Parsing failures will generate a fatal error via vFAIL(...)
8674  */
8675 STATIC regnode *
8676 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8677 {
8678     char * endbrace;    /* '}' following the name */
8679     regnode *ret = NULL;
8680     char* p;
8681
8682     GET_RE_DEBUG_FLAGS_DECL;
8683
8684     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8685
8686     GET_RE_DEBUG_FLAGS;
8687
8688     /* The [^\n] meaning of \N ignores spaces and comments under the /x
8689      * modifier.  The other meaning does not */
8690     p = (RExC_flags & RXf_PMf_EXTENDED)
8691         ? regwhite( pRExC_state, RExC_parse )
8692         : RExC_parse;
8693
8694     /* Disambiguate between \N meaning a named character versus \N meaning
8695      * [^\n].  The former is assumed when it can't be the latter. */
8696     if (*p != '{' || regcurly(p)) {
8697         RExC_parse = p;
8698         if (valuep) {
8699             /* no bare \N in a charclass */
8700             vFAIL("\\N in a character class must be a named character: \\N{...}");
8701         }
8702         nextchar(pRExC_state);
8703         ret = reg_node(pRExC_state, REG_ANY);
8704         *flagp |= HASWIDTH|SIMPLE;
8705         RExC_naughty++;
8706         RExC_parse--;
8707         Set_Node_Length(ret, 1); /* MJD */
8708         return ret;
8709     }
8710
8711     /* Here, we have decided it should be a named sequence */
8712
8713     /* The test above made sure that the next real character is a '{', but
8714      * under the /x modifier, it could be separated by space (or a comment and
8715      * \n) and this is not allowed (for consistency with \x{...} and the
8716      * tokenizer handling of \N{NAME}). */
8717     if (*RExC_parse != '{') {
8718         vFAIL("Missing braces on \\N{}");
8719     }
8720
8721     RExC_parse++;       /* Skip past the '{' */
8722
8723     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8724         || ! (endbrace == RExC_parse            /* nothing between the {} */
8725               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
8726                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8727     {
8728         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
8729         vFAIL("\\N{NAME} must be resolved by the lexer");
8730     }
8731
8732     if (endbrace == RExC_parse) {   /* empty: \N{} */
8733         if (! valuep) {
8734             RExC_parse = endbrace + 1;
8735             return reg_node(pRExC_state,NOTHING);
8736         }
8737
8738         if (SIZE_ONLY) {
8739             ckWARNreg(RExC_parse,
8740                     "Ignoring zero length \\N{} in character class"
8741             );
8742             RExC_parse = endbrace + 1;
8743         }
8744         *valuep = 0;
8745         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8746     }
8747
8748     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
8749     RExC_parse += 2;    /* Skip past the 'U+' */
8750
8751     if (valuep) {   /* In a bracketed char class */
8752         /* We only pay attention to the first char of
8753         multichar strings being returned. I kinda wonder
8754         if this makes sense as it does change the behaviour
8755         from earlier versions, OTOH that behaviour was broken
8756         as well. XXX Solution is to recharacterize as
8757         [rest-of-class]|multi1|multi2... */
8758
8759         STRLEN length_of_hex;
8760         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8761             | PERL_SCAN_DISALLOW_PREFIX
8762             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8763
8764         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8765         if (endchar < endbrace) {
8766             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8767         }
8768
8769         length_of_hex = (STRLEN)(endchar - RExC_parse);
8770         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8771
8772         /* The tokenizer should have guaranteed validity, but it's possible to
8773          * bypass it by using single quoting, so check */
8774         if (length_of_hex == 0
8775             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8776         {
8777             RExC_parse += length_of_hex;        /* Includes all the valid */
8778             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
8779                             ? UTF8SKIP(RExC_parse)
8780                             : 1;
8781             /* Guard against malformed utf8 */
8782             if (RExC_parse >= endchar) RExC_parse = endchar;
8783             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8784         }
8785
8786         RExC_parse = endbrace + 1;
8787         if (endchar == endbrace) return NULL;
8788
8789         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
8790     }
8791     else {      /* Not a char class */
8792
8793         /* What is done here is to convert this to a sub-pattern of the form
8794          * (?:\x{char1}\x{char2}...)
8795          * and then call reg recursively.  That way, it retains its atomicness,
8796          * while not having to worry about special handling that some code
8797          * points may have.  toke.c has converted the original Unicode values
8798          * to native, so that we can just pass on the hex values unchanged.  We
8799          * do have to set a flag to keep recoding from happening in the
8800          * recursion */
8801
8802         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8803         STRLEN len;
8804         char *endchar;      /* Points to '.' or '}' ending cur char in the input
8805                                stream */
8806         char *orig_end = RExC_end;
8807
8808         while (RExC_parse < endbrace) {
8809
8810             /* Code points are separated by dots.  If none, there is only one
8811              * code point, and is terminated by the brace */
8812             endchar = RExC_parse + strcspn(RExC_parse, ".}");
8813
8814             /* Convert to notation the rest of the code understands */
8815             sv_catpv(substitute_parse, "\\x{");
8816             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8817             sv_catpv(substitute_parse, "}");
8818
8819             /* Point to the beginning of the next character in the sequence. */
8820             RExC_parse = endchar + 1;
8821         }
8822         sv_catpv(substitute_parse, ")");
8823
8824         RExC_parse = SvPV(substitute_parse, len);
8825
8826         /* Don't allow empty number */
8827         if (len < 8) {
8828             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8829         }
8830         RExC_end = RExC_parse + len;
8831
8832         /* The values are Unicode, and therefore not subject to recoding */
8833         RExC_override_recoding = 1;
8834
8835         ret = reg(pRExC_state, 1, flagp, depth+1);
8836
8837         RExC_parse = endbrace;
8838         RExC_end = orig_end;
8839         RExC_override_recoding = 0;
8840
8841         nextchar(pRExC_state);
8842     }
8843
8844     return ret;
8845 }
8846
8847
8848 /*
8849  * reg_recode
8850  *
8851  * It returns the code point in utf8 for the value in *encp.
8852  *    value: a code value in the source encoding
8853  *    encp:  a pointer to an Encode object
8854  *
8855  * If the result from Encode is not a single character,
8856  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8857  */
8858 STATIC UV
8859 S_reg_recode(pTHX_ const char value, SV **encp)
8860 {
8861     STRLEN numlen = 1;
8862     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
8863     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
8864     const STRLEN newlen = SvCUR(sv);
8865     UV uv = UNICODE_REPLACEMENT;
8866
8867     PERL_ARGS_ASSERT_REG_RECODE;
8868
8869     if (newlen)
8870         uv = SvUTF8(sv)
8871              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8872              : *(U8*)s;
8873
8874     if (!newlen || numlen != newlen) {
8875         uv = UNICODE_REPLACEMENT;
8876         *encp = NULL;
8877     }
8878     return uv;
8879 }
8880
8881
8882 /*
8883  - regatom - the lowest level
8884
8885    Try to identify anything special at the start of the pattern. If there
8886    is, then handle it as required. This may involve generating a single regop,
8887    such as for an assertion; or it may involve recursing, such as to
8888    handle a () structure.
8889
8890    If the string doesn't start with something special then we gobble up
8891    as much literal text as we can.
8892
8893    Once we have been able to handle whatever type of thing started the
8894    sequence, we return.
8895
8896    Note: we have to be careful with escapes, as they can be both literal
8897    and special, and in the case of \10 and friends can either, depending
8898    on context. Specifically there are two separate switches for handling
8899    escape sequences, with the one for handling literal escapes requiring
8900    a dummy entry for all of the special escapes that are actually handled
8901    by the other.
8902 */
8903
8904 STATIC regnode *
8905 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8906 {
8907     dVAR;
8908     register regnode *ret = NULL;
8909     I32 flags;
8910     char *parse_start = RExC_parse;
8911     U8 op;
8912     GET_RE_DEBUG_FLAGS_DECL;
8913     DEBUG_PARSE("atom");
8914     *flagp = WORST;             /* Tentatively. */
8915
8916     PERL_ARGS_ASSERT_REGATOM;
8917
8918 tryagain:
8919     switch ((U8)*RExC_parse) {
8920     case '^':
8921         RExC_seen_zerolen++;
8922         nextchar(pRExC_state);
8923         if (RExC_flags & RXf_PMf_MULTILINE)
8924             ret = reg_node(pRExC_state, MBOL);
8925         else if (RExC_flags & RXf_PMf_SINGLELINE)
8926             ret = reg_node(pRExC_state, SBOL);
8927         else
8928             ret = reg_node(pRExC_state, BOL);
8929         Set_Node_Length(ret, 1); /* MJD */
8930         break;
8931     case '$':
8932         nextchar(pRExC_state);
8933         if (*RExC_parse)
8934             RExC_seen_zerolen++;
8935         if (RExC_flags & RXf_PMf_MULTILINE)
8936             ret = reg_node(pRExC_state, MEOL);
8937         else if (RExC_flags & RXf_PMf_SINGLELINE)
8938             ret = reg_node(pRExC_state, SEOL);
8939         else
8940             ret = reg_node(pRExC_state, EOL);
8941         Set_Node_Length(ret, 1); /* MJD */
8942         break;
8943     case '.':
8944         nextchar(pRExC_state);
8945         if (RExC_flags & RXf_PMf_SINGLELINE)
8946             ret = reg_node(pRExC_state, SANY);
8947         else
8948             ret = reg_node(pRExC_state, REG_ANY);
8949         *flagp |= HASWIDTH|SIMPLE;
8950         RExC_naughty++;
8951         Set_Node_Length(ret, 1); /* MJD */
8952         break;
8953     case '[':
8954     {
8955         char * const oregcomp_parse = ++RExC_parse;
8956         ret = regclass(pRExC_state,depth+1);
8957         if (*RExC_parse != ']') {
8958             RExC_parse = oregcomp_parse;
8959             vFAIL("Unmatched [");
8960         }
8961         nextchar(pRExC_state);
8962         *flagp |= HASWIDTH|SIMPLE;
8963         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8964         break;
8965     }
8966     case '(':
8967         nextchar(pRExC_state);
8968         ret = reg(pRExC_state, 1, &flags,depth+1);
8969         if (ret == NULL) {
8970                 if (flags & TRYAGAIN) {
8971                     if (RExC_parse == RExC_end) {
8972                          /* Make parent create an empty node if needed. */
8973                         *flagp |= TRYAGAIN;
8974                         return(NULL);
8975                     }
8976                     goto tryagain;
8977                 }
8978                 return(NULL);
8979         }
8980         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8981         break;
8982     case '|':
8983     case ')':
8984         if (flags & TRYAGAIN) {
8985             *flagp |= TRYAGAIN;
8986             return NULL;
8987         }
8988         vFAIL("Internal urp");
8989                                 /* Supposed to be caught earlier. */
8990         break;
8991     case '{':
8992         if (!regcurly(RExC_parse)) {
8993             RExC_parse++;
8994             goto defchar;
8995         }
8996         /* FALL THROUGH */
8997     case '?':
8998     case '+':
8999     case '*':
9000         RExC_parse++;
9001         vFAIL("Quantifier follows nothing");
9002         break;
9003     case '\\':
9004         /* Special Escapes
9005
9006            This switch handles escape sequences that resolve to some kind
9007            of special regop and not to literal text. Escape sequnces that
9008            resolve to literal text are handled below in the switch marked
9009            "Literal Escapes".
9010
9011            Every entry in this switch *must* have a corresponding entry
9012            in the literal escape switch. However, the opposite is not
9013            required, as the default for this switch is to jump to the
9014            literal text handling code.
9015         */
9016         switch ((U8)*++RExC_parse) {
9017         /* Special Escapes */
9018         case 'A':
9019             RExC_seen_zerolen++;
9020             ret = reg_node(pRExC_state, SBOL);
9021             *flagp |= SIMPLE;
9022             goto finish_meta_pat;
9023         case 'G':
9024             ret = reg_node(pRExC_state, GPOS);
9025             RExC_seen |= REG_SEEN_GPOS;
9026             *flagp |= SIMPLE;
9027             goto finish_meta_pat;
9028         case 'K':
9029             RExC_seen_zerolen++;
9030             ret = reg_node(pRExC_state, KEEPS);
9031             *flagp |= SIMPLE;
9032             /* XXX:dmq : disabling in-place substitution seems to
9033              * be necessary here to avoid cases of memory corruption, as
9034              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9035              */
9036             RExC_seen |= REG_SEEN_LOOKBEHIND;
9037             goto finish_meta_pat;
9038         case 'Z':
9039             ret = reg_node(pRExC_state, SEOL);
9040             *flagp |= SIMPLE;
9041             RExC_seen_zerolen++;                /* Do not optimize RE away */
9042             goto finish_meta_pat;
9043         case 'z':
9044             ret = reg_node(pRExC_state, EOS);
9045             *flagp |= SIMPLE;
9046             RExC_seen_zerolen++;                /* Do not optimize RE away */
9047             goto finish_meta_pat;
9048         case 'C':
9049             ret = reg_node(pRExC_state, CANY);
9050             RExC_seen |= REG_SEEN_CANY;
9051             *flagp |= HASWIDTH|SIMPLE;
9052             goto finish_meta_pat;
9053         case 'X':
9054             ret = reg_node(pRExC_state, CLUMP);
9055             *flagp |= HASWIDTH;
9056             goto finish_meta_pat;
9057         case 'w':
9058             switch (get_regex_charset(RExC_flags)) {
9059                 case REGEX_LOCALE_CHARSET:
9060                     op = ALNUML;
9061                     break;
9062                 case REGEX_UNICODE_CHARSET:
9063                     op = ALNUMU;
9064                     break;
9065                 case REGEX_ASCII_RESTRICTED_CHARSET:
9066                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9067                     op = ALNUMA;
9068                     break;
9069                 case REGEX_DEPENDS_CHARSET:
9070                     op = ALNUM;
9071                     break;
9072                 default:
9073                     goto bad_charset;
9074             }
9075             ret = reg_node(pRExC_state, op);
9076             *flagp |= HASWIDTH|SIMPLE;
9077             goto finish_meta_pat;
9078         case 'W':
9079             switch (get_regex_charset(RExC_flags)) {
9080                 case REGEX_LOCALE_CHARSET:
9081                     op = NALNUML;
9082                     break;
9083                 case REGEX_UNICODE_CHARSET:
9084                     op = NALNUMU;
9085                     break;
9086                 case REGEX_ASCII_RESTRICTED_CHARSET:
9087                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9088                     op = NALNUMA;
9089                     break;
9090                 case REGEX_DEPENDS_CHARSET:
9091                     op = NALNUM;
9092                     break;
9093                 default:
9094                     goto bad_charset;
9095             }
9096             ret = reg_node(pRExC_state, op);
9097             *flagp |= HASWIDTH|SIMPLE;
9098             goto finish_meta_pat;
9099         case 'b':
9100             RExC_seen_zerolen++;
9101             RExC_seen |= REG_SEEN_LOOKBEHIND;
9102             switch (get_regex_charset(RExC_flags)) {
9103                 case REGEX_LOCALE_CHARSET:
9104                     op = BOUNDL;
9105                     break;
9106                 case REGEX_UNICODE_CHARSET:
9107                     op = BOUNDU;
9108                     break;
9109                 case REGEX_ASCII_RESTRICTED_CHARSET:
9110                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9111                     op = BOUNDA;
9112                     break;
9113                 case REGEX_DEPENDS_CHARSET:
9114                     op = BOUND;
9115                     break;
9116                 default:
9117                     goto bad_charset;
9118             }
9119             ret = reg_node(pRExC_state, op);
9120             FLAGS(ret) = get_regex_charset(RExC_flags);
9121             *flagp |= SIMPLE;
9122             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9123                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
9124             }
9125             goto finish_meta_pat;
9126         case 'B':
9127             RExC_seen_zerolen++;
9128             RExC_seen |= REG_SEEN_LOOKBEHIND;
9129             switch (get_regex_charset(RExC_flags)) {
9130                 case REGEX_LOCALE_CHARSET:
9131                     op = NBOUNDL;
9132                     break;
9133                 case REGEX_UNICODE_CHARSET:
9134                     op = NBOUNDU;
9135                     break;
9136                 case REGEX_ASCII_RESTRICTED_CHARSET:
9137                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9138                     op = NBOUNDA;
9139                     break;
9140                 case REGEX_DEPENDS_CHARSET:
9141                     op = NBOUND;
9142                     break;
9143                 default:
9144                     goto bad_charset;
9145             }
9146             ret = reg_node(pRExC_state, op);
9147             FLAGS(ret) = get_regex_charset(RExC_flags);
9148             *flagp |= SIMPLE;
9149             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9150                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
9151             }
9152             goto finish_meta_pat;
9153         case 's':
9154             switch (get_regex_charset(RExC_flags)) {
9155                 case REGEX_LOCALE_CHARSET:
9156                     op = SPACEL;
9157                     break;
9158                 case REGEX_UNICODE_CHARSET:
9159                     op = SPACEU;
9160                     break;
9161                 case REGEX_ASCII_RESTRICTED_CHARSET:
9162                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9163                     op = SPACEA;
9164                     break;
9165                 case REGEX_DEPENDS_CHARSET:
9166                     op = SPACE;
9167                     break;
9168                 default:
9169                     goto bad_charset;
9170             }
9171             ret = reg_node(pRExC_state, op);
9172             *flagp |= HASWIDTH|SIMPLE;
9173             goto finish_meta_pat;
9174         case 'S':
9175             switch (get_regex_charset(RExC_flags)) {
9176                 case REGEX_LOCALE_CHARSET:
9177                     op = NSPACEL;
9178                     break;
9179                 case REGEX_UNICODE_CHARSET:
9180                     op = NSPACEU;
9181                     break;
9182                 case REGEX_ASCII_RESTRICTED_CHARSET:
9183                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9184                     op = NSPACEA;
9185                     break;
9186                 case REGEX_DEPENDS_CHARSET:
9187                     op = NSPACE;
9188                     break;
9189                 default:
9190                     goto bad_charset;
9191             }
9192             ret = reg_node(pRExC_state, op);
9193             *flagp |= HASWIDTH|SIMPLE;
9194             goto finish_meta_pat;
9195         case 'd':
9196             switch (get_regex_charset(RExC_flags)) {
9197                 case REGEX_LOCALE_CHARSET:
9198                     op = DIGITL;
9199                     break;
9200                 case REGEX_ASCII_RESTRICTED_CHARSET:
9201                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9202                     op = DIGITA;
9203                     break;
9204                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9205                 case REGEX_UNICODE_CHARSET:
9206                     op = DIGIT;
9207                     break;
9208                 default:
9209                     goto bad_charset;
9210             }
9211             ret = reg_node(pRExC_state, op);
9212             *flagp |= HASWIDTH|SIMPLE;
9213             goto finish_meta_pat;
9214         case 'D':
9215             switch (get_regex_charset(RExC_flags)) {
9216                 case REGEX_LOCALE_CHARSET:
9217                     op = NDIGITL;
9218                     break;
9219                 case REGEX_ASCII_RESTRICTED_CHARSET:
9220                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9221                     op = NDIGITA;
9222                     break;
9223                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9224                 case REGEX_UNICODE_CHARSET:
9225                     op = NDIGIT;
9226                     break;
9227                 default:
9228                     goto bad_charset;
9229             }
9230             ret = reg_node(pRExC_state, op);
9231             *flagp |= HASWIDTH|SIMPLE;
9232             goto finish_meta_pat;
9233         case 'R':
9234             ret = reg_node(pRExC_state, LNBREAK);
9235             *flagp |= HASWIDTH|SIMPLE;
9236             goto finish_meta_pat;
9237         case 'h':
9238             ret = reg_node(pRExC_state, HORIZWS);
9239             *flagp |= HASWIDTH|SIMPLE;
9240             goto finish_meta_pat;
9241         case 'H':
9242             ret = reg_node(pRExC_state, NHORIZWS);
9243             *flagp |= HASWIDTH|SIMPLE;
9244             goto finish_meta_pat;
9245         case 'v':
9246             ret = reg_node(pRExC_state, VERTWS);
9247             *flagp |= HASWIDTH|SIMPLE;
9248             goto finish_meta_pat;
9249         case 'V':
9250             ret = reg_node(pRExC_state, NVERTWS);
9251             *flagp |= HASWIDTH|SIMPLE;
9252          finish_meta_pat:
9253             nextchar(pRExC_state);
9254             Set_Node_Length(ret, 2); /* MJD */
9255             break;
9256         case 'p':
9257         case 'P':
9258             {
9259                 char* const oldregxend = RExC_end;
9260 #ifdef DEBUGGING
9261                 char* parse_start = RExC_parse - 2;
9262 #endif
9263
9264                 if (RExC_parse[1] == '{') {
9265                   /* a lovely hack--pretend we saw [\pX] instead */
9266                     RExC_end = strchr(RExC_parse, '}');
9267                     if (!RExC_end) {
9268                         const U8 c = (U8)*RExC_parse;
9269                         RExC_parse += 2;
9270                         RExC_end = oldregxend;
9271                         vFAIL2("Missing right brace on \\%c{}", c);
9272                     }
9273                     RExC_end++;
9274                 }
9275                 else {
9276                     RExC_end = RExC_parse + 2;
9277                     if (RExC_end > oldregxend)
9278                         RExC_end = oldregxend;
9279                 }
9280                 RExC_parse--;
9281
9282                 ret = regclass(pRExC_state,depth+1);
9283
9284                 RExC_end = oldregxend;
9285                 RExC_parse--;
9286
9287                 Set_Node_Offset(ret, parse_start + 2);
9288                 Set_Node_Cur_Length(ret);
9289                 nextchar(pRExC_state);
9290                 *flagp |= HASWIDTH|SIMPLE;
9291             }
9292             break;
9293         case 'N':
9294             /* Handle \N and \N{NAME} here and not below because it can be
9295             multicharacter. join_exact() will join them up later on.
9296             Also this makes sure that things like /\N{BLAH}+/ and
9297             \N{BLAH} being multi char Just Happen. dmq*/
9298             ++RExC_parse;
9299             ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9300             break;
9301         case 'k':    /* Handle \k<NAME> and \k'NAME' */
9302         parse_named_seq:
9303         {
9304             char ch= RExC_parse[1];
9305             if (ch != '<' && ch != '\'' && ch != '{') {
9306                 RExC_parse++;
9307                 vFAIL2("Sequence %.2s... not terminated",parse_start);
9308             } else {
9309                 /* this pretty much dupes the code for (?P=...) in reg(), if
9310                    you change this make sure you change that */
9311                 char* name_start = (RExC_parse += 2);
9312                 U32 num = 0;
9313                 SV *sv_dat = reg_scan_name(pRExC_state,
9314                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9315                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9316                 if (RExC_parse == name_start || *RExC_parse != ch)
9317                     vFAIL2("Sequence %.3s... not terminated",parse_start);
9318
9319                 if (!SIZE_ONLY) {
9320                     num = add_data( pRExC_state, 1, "S" );
9321                     RExC_rxi->data->data[num]=(void*)sv_dat;
9322                     SvREFCNT_inc_simple_void(sv_dat);
9323                 }
9324
9325                 RExC_sawback = 1;
9326                 ret = reganode(pRExC_state,
9327                                ((! FOLD)
9328                                  ? NREF
9329                                  : (MORE_ASCII_RESTRICTED)
9330                                    ? NREFFA
9331                                    : (AT_LEAST_UNI_SEMANTICS)
9332                                      ? NREFFU
9333                                      : (LOC)
9334                                        ? NREFFL
9335                                        : NREFF),
9336                                 num);
9337                 *flagp |= HASWIDTH;
9338
9339                 /* override incorrect value set in reganode MJD */
9340                 Set_Node_Offset(ret, parse_start+1);
9341                 Set_Node_Cur_Length(ret); /* MJD */
9342                 nextchar(pRExC_state);
9343
9344             }
9345             break;
9346         }
9347         case 'g':
9348         case '1': case '2': case '3': case '4':
9349         case '5': case '6': case '7': case '8': case '9':
9350             {
9351                 I32 num;
9352                 bool isg = *RExC_parse == 'g';
9353                 bool isrel = 0;
9354                 bool hasbrace = 0;
9355                 if (isg) {
9356                     RExC_parse++;
9357                     if (*RExC_parse == '{') {
9358                         RExC_parse++;
9359                         hasbrace = 1;
9360                     }
9361                     if (*RExC_parse == '-') {
9362                         RExC_parse++;
9363                         isrel = 1;
9364                     }
9365                     if (hasbrace && !isDIGIT(*RExC_parse)) {
9366                         if (isrel) RExC_parse--;
9367                         RExC_parse -= 2;
9368                         goto parse_named_seq;
9369                 }   }
9370                 num = atoi(RExC_parse);
9371                 if (isg && num == 0)
9372                     vFAIL("Reference to invalid group 0");
9373                 if (isrel) {
9374                     num = RExC_npar - num;
9375                     if (num < 1)
9376                         vFAIL("Reference to nonexistent or unclosed group");
9377                 }
9378                 if (!isg && num > 9 && num >= RExC_npar)
9379                     goto defchar;
9380                 else {
9381                     char * const parse_start = RExC_parse - 1; /* MJD */
9382                     while (isDIGIT(*RExC_parse))
9383                         RExC_parse++;
9384                     if (parse_start == RExC_parse - 1)
9385                         vFAIL("Unterminated \\g... pattern");
9386                     if (hasbrace) {
9387                         if (*RExC_parse != '}')
9388                             vFAIL("Unterminated \\g{...} pattern");
9389                         RExC_parse++;
9390                     }
9391                     if (!SIZE_ONLY) {
9392                         if (num > (I32)RExC_rx->nparens)
9393                             vFAIL("Reference to nonexistent group");
9394                     }
9395                     RExC_sawback = 1;
9396                     ret = reganode(pRExC_state,
9397                                    ((! FOLD)
9398                                      ? REF
9399                                      : (MORE_ASCII_RESTRICTED)
9400                                        ? REFFA
9401                                        : (AT_LEAST_UNI_SEMANTICS)
9402                                          ? REFFU
9403                                          : (LOC)
9404                                            ? REFFL
9405                                            : REFF),
9406                                     num);
9407                     *flagp |= HASWIDTH;
9408
9409                     /* override incorrect value set in reganode MJD */
9410                     Set_Node_Offset(ret, parse_start+1);
9411                     Set_Node_Cur_Length(ret); /* MJD */
9412                     RExC_parse--;
9413                     nextchar(pRExC_state);
9414                 }
9415             }
9416             break;
9417         case '\0':
9418             if (RExC_parse >= RExC_end)
9419                 FAIL("Trailing \\");
9420             /* FALL THROUGH */
9421         default:
9422             /* Do not generate "unrecognized" warnings here, we fall
9423                back into the quick-grab loop below */
9424             parse_start--;
9425             goto defchar;
9426         }
9427         break;
9428
9429     case '#':
9430         if (RExC_flags & RXf_PMf_EXTENDED) {
9431             if ( reg_skipcomment( pRExC_state ) )
9432                 goto tryagain;
9433         }
9434         /* FALL THROUGH */
9435
9436     default:
9437
9438             parse_start = RExC_parse - 1;
9439
9440             RExC_parse++;
9441
9442         defchar: {
9443             register STRLEN len;
9444             register UV ender;
9445             register char *p;
9446             char *s;
9447             STRLEN foldlen;
9448             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9449             U8 node_type;
9450
9451             /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
9452              * it is folded to 'ss' even if not utf8 */
9453             bool is_exactfu_sharp_s;
9454
9455             ender = 0;
9456             node_type = ((! FOLD) ? EXACT
9457                         : (LOC)
9458                           ? EXACTFL
9459                           : (MORE_ASCII_RESTRICTED)
9460                             ? EXACTFA
9461                             : (AT_LEAST_UNI_SEMANTICS)
9462                               ? EXACTFU
9463                               : EXACTF);
9464             ret = reg_node(pRExC_state, node_type);
9465             s = STRING(ret);
9466
9467             /* XXX The node can hold up to 255 bytes, yet this only goes to
9468              * 127.  I (khw) do not know why.  Keeping it somewhat less than
9469              * 255 allows us to not have to worry about overflow due to
9470              * converting to utf8 and fold expansion, but that value is
9471              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
9472              * split up by this limit into a single one using the real max of
9473              * 255.  Even at 127, this breaks under rare circumstances.  If
9474              * folding, we do not want to split a node at a character that is a
9475              * non-final in a multi-char fold, as an input string could just
9476              * happen to want to match across the node boundary.  The join
9477              * would solve that problem if the join actually happens.  But a
9478              * series of more than two nodes in a row each of 127 would cause
9479              * the first join to succeed to get to 254, but then there wouldn't
9480              * be room for the next one, which could at be one of those split
9481              * multi-char folds.  I don't know of any fool-proof solution.  One
9482              * could back off to end with only a code point that isn't such a
9483              * non-final, but it is possible for there not to be any in the
9484              * entire node. */
9485             for (len = 0, p = RExC_parse - 1;
9486                  len < 127 && p < RExC_end;
9487                  len++)
9488             {
9489                 char * const oldp = p;
9490
9491                 if (RExC_flags & RXf_PMf_EXTENDED)
9492                     p = regwhite( pRExC_state, p );
9493                 switch ((U8)*p) {
9494                 case '^':
9495                 case '$':
9496                 case '.':
9497                 case '[':
9498                 case '(':
9499                 case ')':
9500                 case '|':
9501                     goto loopdone;
9502                 case '\\':
9503                     /* Literal Escapes Switch
9504
9505                        This switch is meant to handle escape sequences that
9506                        resolve to a literal character.
9507
9508                        Every escape sequence that represents something
9509                        else, like an assertion or a char class, is handled
9510                        in the switch marked 'Special Escapes' above in this
9511                        routine, but also has an entry here as anything that
9512                        isn't explicitly mentioned here will be treated as
9513                        an unescaped equivalent literal.
9514                     */
9515
9516                     switch ((U8)*++p) {
9517                     /* These are all the special escapes. */
9518                     case 'A':             /* Start assertion */
9519                     case 'b': case 'B':   /* Word-boundary assertion*/
9520                     case 'C':             /* Single char !DANGEROUS! */
9521                     case 'd': case 'D':   /* digit class */
9522                     case 'g': case 'G':   /* generic-backref, pos assertion */
9523                     case 'h': case 'H':   /* HORIZWS */
9524                     case 'k': case 'K':   /* named backref, keep marker */
9525                     case 'N':             /* named char sequence */
9526                     case 'p': case 'P':   /* Unicode property */
9527                               case 'R':   /* LNBREAK */
9528                     case 's': case 'S':   /* space class */
9529                     case 'v': case 'V':   /* VERTWS */
9530                     case 'w': case 'W':   /* word class */
9531                     case 'X':             /* eXtended Unicode "combining character sequence" */
9532                     case 'z': case 'Z':   /* End of line/string assertion */
9533                         --p;
9534                         goto loopdone;
9535
9536                     /* Anything after here is an escape that resolves to a
9537                        literal. (Except digits, which may or may not)
9538                      */
9539                     case 'n':
9540                         ender = '\n';
9541                         p++;
9542                         break;
9543                     case 'r':
9544                         ender = '\r';
9545                         p++;
9546                         break;
9547                     case 't':
9548                         ender = '\t';
9549                         p++;
9550                         break;
9551                     case 'f':
9552                         ender = '\f';
9553                         p++;
9554                         break;
9555                     case 'e':
9556                           ender = ASCII_TO_NATIVE('\033');
9557                         p++;
9558                         break;
9559                     case 'a':
9560                           ender = ASCII_TO_NATIVE('\007');
9561                         p++;
9562                         break;
9563                     case 'o':
9564                         {
9565                             STRLEN brace_len = len;
9566                             UV result;
9567                             const char* error_msg;
9568
9569                             bool valid = grok_bslash_o(p,
9570                                                        &result,
9571                                                        &brace_len,
9572                                                        &error_msg,
9573                                                        1);
9574                             p += brace_len;
9575                             if (! valid) {
9576                                 RExC_parse = p; /* going to die anyway; point
9577                                                    to exact spot of failure */
9578                                 vFAIL(error_msg);
9579                             }
9580                             else
9581                             {
9582                                 ender = result;
9583                             }
9584                             if (PL_encoding && ender < 0x100) {
9585                                 goto recode_encoding;
9586                             }
9587                             if (ender > 0xff) {
9588                                 REQUIRE_UTF8;
9589                             }
9590                             break;
9591                         }
9592                     case 'x':
9593                         if (*++p == '{') {
9594                             char* const e = strchr(p, '}');
9595
9596                             if (!e) {
9597                                 RExC_parse = p + 1;
9598                                 vFAIL("Missing right brace on \\x{}");
9599                             }
9600                             else {
9601                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9602                                     | PERL_SCAN_DISALLOW_PREFIX;
9603                                 STRLEN numlen = e - p - 1;
9604                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9605                                 if (ender > 0xff)
9606                                     REQUIRE_UTF8;
9607                                 p = e + 1;
9608                             }
9609                         }
9610                         else {
9611                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9612                             STRLEN numlen = 2;
9613                             ender = grok_hex(p, &numlen, &flags, NULL);
9614                             p += numlen;
9615                         }
9616                         if (PL_encoding && ender < 0x100)
9617                             goto recode_encoding;
9618                         break;
9619                     case 'c':
9620                         p++;
9621                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9622                         break;
9623                     case '0': case '1': case '2': case '3':case '4':
9624                     case '5': case '6': case '7': case '8':case '9':
9625                         if (*p == '0' ||
9626                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9627                         {
9628                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9629                             STRLEN numlen = 3;
9630                             ender = grok_oct(p, &numlen, &flags, NULL);
9631                             if (ender > 0xff) {
9632                                 REQUIRE_UTF8;
9633                             }
9634                             p += numlen;
9635                         }
9636                         else {
9637                             --p;
9638                             goto loopdone;
9639                         }
9640                         if (PL_encoding && ender < 0x100)
9641                             goto recode_encoding;
9642                         break;
9643                     recode_encoding:
9644                         if (! RExC_override_recoding) {
9645                             SV* enc = PL_encoding;
9646                             ender = reg_recode((const char)(U8)ender, &enc);
9647                             if (!enc && SIZE_ONLY)
9648                                 ckWARNreg(p, "Invalid escape in the specified encoding");
9649                             REQUIRE_UTF8;
9650                         }
9651                         break;
9652                     case '\0':
9653                         if (p >= RExC_end)
9654                             FAIL("Trailing \\");
9655                         /* FALL THROUGH */
9656                     default:
9657                         if (!SIZE_ONLY&& isALPHA(*p)) {
9658                             /* Include any { following the alpha to emphasize
9659                              * that it could be part of an escape at some point
9660                              * in the future */
9661                             int len = (*(p + 1) == '{') ? 2 : 1;
9662                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
9663                         }
9664                         goto normal_default;
9665                     }
9666                     break;
9667                 default:
9668                   normal_default:
9669                     if (UTF8_IS_START(*p) && UTF) {
9670                         STRLEN numlen;
9671                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9672                                                &numlen, UTF8_ALLOW_DEFAULT);
9673                         p += numlen;
9674                     }
9675                     else
9676                         ender = (U8) *p++;
9677                     break;
9678                 } /* End of switch on the literal */
9679
9680                 is_exactfu_sharp_s = (node_type == EXACTFU
9681                                       && ender == LATIN_SMALL_LETTER_SHARP_S);
9682                 if ( RExC_flags & RXf_PMf_EXTENDED)
9683                     p = regwhite( pRExC_state, p );
9684                 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9685                     /* Prime the casefolded buffer.  Locale rules, which apply
9686                      * only to code points < 256, aren't known until execution,
9687                      * so for them, just output the original character using
9688                      * utf8.  If we start to fold non-UTF patterns, be sure to
9689                      * update join_exact() */
9690                     if (LOC && ender < 256) {
9691                         if (UNI_IS_INVARIANT(ender)) {
9692                             *tmpbuf = (U8) ender;
9693                             foldlen = 1;
9694                         } else {
9695                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9696                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9697                             foldlen = 2;
9698                         }
9699                     }
9700                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
9701                                                  */
9702                         ender = toLOWER(ender);
9703                         *tmpbuf = (U8) ender;
9704                         foldlen = 1;
9705                     }
9706                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9707
9708                         /* Locale and /aa require more selectivity about the
9709                          * fold, so are handled below.  Otherwise, here, just
9710                          * use the fold */
9711                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9712                     }
9713                     else {
9714                         /* Under locale rules or /aa we are not to mix,
9715                          * respectively, ords < 256 or ASCII with non-.  So
9716                          * reject folds that mix them, using only the
9717                          * non-folded code point.  So do the fold to a
9718                          * temporary, and inspect each character in it. */
9719                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9720                         U8* s = trialbuf;
9721                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9722                         U8* e = s + foldlen;
9723                         bool fold_ok = TRUE;
9724
9725                         while (s < e) {
9726                             if (isASCII(*s)
9727                                 || (LOC && (UTF8_IS_INVARIANT(*s)
9728                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
9729                             {
9730                                 fold_ok = FALSE;
9731                                 break;
9732                             }
9733                             s += UTF8SKIP(s);
9734                         }
9735                         if (fold_ok) {
9736                             Copy(trialbuf, tmpbuf, foldlen, U8);
9737                             ender = tmpender;
9738                         }
9739                         else {
9740                             uvuni_to_utf8(tmpbuf, ender);
9741                             foldlen = UNISKIP(ender);
9742                         }
9743                     }
9744                 }
9745                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9746                     if (len)
9747                         p = oldp;
9748                     else if (UTF || is_exactfu_sharp_s) {
9749                          if (FOLD) {
9750                               /* Emit all the Unicode characters. */
9751                               STRLEN numlen;
9752                               for (foldbuf = tmpbuf;
9753                                    foldlen;
9754                                    foldlen -= numlen) {
9755                                    ender = utf8_to_uvchr(foldbuf, &numlen);
9756                                    if (numlen > 0) {
9757                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
9758                                         s       += unilen;
9759                                         len     += unilen;
9760                                         /* In EBCDIC the numlen
9761                                          * and unilen can differ. */
9762                                         foldbuf += numlen;
9763                                         if (numlen >= foldlen)
9764                                              break;
9765                                    }
9766                                    else
9767                                         break; /* "Can't happen." */
9768                               }
9769                          }
9770                          else {
9771                               const STRLEN unilen = reguni(pRExC_state, ender, s);
9772                               if (unilen > 0) {
9773                                    s   += unilen;
9774                                    len += unilen;
9775                               }
9776                          }
9777                     }
9778                     else {
9779                         len++;
9780                         REGC((char)ender, s++);
9781                     }
9782                     break;
9783                 }
9784                 if (UTF || is_exactfu_sharp_s) {
9785                      if (FOLD) {
9786                           /* Emit all the Unicode characters. */
9787                           STRLEN numlen;
9788                           for (foldbuf = tmpbuf;
9789                                foldlen;
9790                                foldlen -= numlen) {
9791                                ender = utf8_to_uvchr(foldbuf, &numlen);
9792                                if (numlen > 0) {
9793                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
9794                                     len     += unilen;
9795                                     s       += unilen;
9796                                     /* In EBCDIC the numlen
9797                                      * and unilen can differ. */
9798                                     foldbuf += numlen;
9799                                     if (numlen >= foldlen)
9800                                          break;
9801                                }
9802                                else
9803                                     break;
9804                           }
9805                      }
9806                      else {
9807                           const STRLEN unilen = reguni(pRExC_state, ender, s);
9808                           if (unilen > 0) {
9809                                s   += unilen;
9810                                len += unilen;
9811                           }
9812                      }
9813                      len--;
9814                 }
9815                 else {
9816                     REGC((char)ender, s++);
9817                 }
9818             }
9819         loopdone:   /* Jumped to when encounters something that shouldn't be in
9820                        the node */
9821             RExC_parse = p - 1;
9822             Set_Node_Cur_Length(ret); /* MJD */
9823             nextchar(pRExC_state);
9824             {
9825                 /* len is STRLEN which is unsigned, need to copy to signed */
9826                 IV iv = len;
9827                 if (iv < 0)
9828                     vFAIL("Internal disaster");
9829             }
9830             if (len > 0)
9831                 *flagp |= HASWIDTH;
9832             if (len == 1 && UNI_IS_INVARIANT(ender))
9833                 *flagp |= SIMPLE;
9834
9835             if (SIZE_ONLY)
9836                 RExC_size += STR_SZ(len);
9837             else {
9838                 STR_LEN(ret) = len;
9839                 RExC_emit += STR_SZ(len);
9840             }
9841         }
9842         break;
9843     }
9844
9845     return(ret);
9846
9847 /* Jumped to when an unrecognized character set is encountered */
9848 bad_charset:
9849     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9850     return(NULL);
9851 }
9852
9853 STATIC char *
9854 S_regwhite( RExC_state_t *pRExC_state, char *p )
9855 {
9856     const char *e = RExC_end;
9857
9858     PERL_ARGS_ASSERT_REGWHITE;
9859
9860     while (p < e) {
9861         if (isSPACE(*p))
9862             ++p;
9863         else if (*p == '#') {
9864             bool ended = 0;
9865             do {
9866                 if (*p++ == '\n') {
9867                     ended = 1;
9868                     break;
9869                 }
9870             } while (p < e);
9871             if (!ended)
9872                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9873         }
9874         else
9875             break;
9876     }
9877     return p;
9878 }
9879
9880 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9881    Character classes ([:foo:]) can also be negated ([:^foo:]).
9882    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9883    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9884    but trigger failures because they are currently unimplemented. */
9885
9886 #define POSIXCC_DONE(c)   ((c) == ':')
9887 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9888 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9889
9890 STATIC I32
9891 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9892 {
9893     dVAR;
9894     I32 namedclass = OOB_NAMEDCLASS;
9895
9896     PERL_ARGS_ASSERT_REGPPOSIXCC;
9897
9898     if (value == '[' && RExC_parse + 1 < RExC_end &&
9899         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9900         POSIXCC(UCHARAT(RExC_parse))) {
9901         const char c = UCHARAT(RExC_parse);
9902         char* const s = RExC_parse++;
9903
9904         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9905             RExC_parse++;
9906         if (RExC_parse == RExC_end)
9907             /* Grandfather lone [:, [=, [. */
9908             RExC_parse = s;
9909         else {
9910             const char* const t = RExC_parse++; /* skip over the c */
9911             assert(*t == c);
9912
9913             if (UCHARAT(RExC_parse) == ']') {
9914                 const char *posixcc = s + 1;
9915                 RExC_parse++; /* skip over the ending ] */
9916
9917                 if (*s == ':') {
9918                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9919                     const I32 skip = t - posixcc;
9920
9921                     /* Initially switch on the length of the name.  */
9922                     switch (skip) {
9923                     case 4:
9924                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9925                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9926                         break;
9927                     case 5:
9928                         /* Names all of length 5.  */
9929                         /* alnum alpha ascii blank cntrl digit graph lower
9930                            print punct space upper  */
9931                         /* Offset 4 gives the best switch position.  */
9932                         switch (posixcc[4]) {
9933                         case 'a':
9934                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9935                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9936                             break;
9937                         case 'e':
9938                             if (memEQ(posixcc, "spac", 4)) /* space */
9939                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9940                             break;
9941                         case 'h':
9942                             if (memEQ(posixcc, "grap", 4)) /* graph */
9943                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9944                             break;
9945                         case 'i':
9946                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9947                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9948                             break;
9949                         case 'k':
9950                             if (memEQ(posixcc, "blan", 4)) /* blank */
9951                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9952                             break;
9953                         case 'l':
9954                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9955                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9956                             break;
9957                         case 'm':
9958                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9959                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9960                             break;
9961                         case 'r':
9962                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9963                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9964                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9965                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9966                             break;
9967                         case 't':
9968                             if (memEQ(posixcc, "digi", 4)) /* digit */
9969                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9970                             else if (memEQ(posixcc, "prin", 4)) /* print */
9971                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9972                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9973                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9974                             break;
9975                         }
9976                         break;
9977                     case 6:
9978                         if (memEQ(posixcc, "xdigit", 6))
9979                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9980                         break;
9981                     }
9982
9983                     if (namedclass == OOB_NAMEDCLASS)
9984                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9985                                       t - s - 1, s + 1);
9986                     assert (posixcc[skip] == ':');
9987                     assert (posixcc[skip+1] == ']');
9988                 } else if (!SIZE_ONLY) {
9989                     /* [[=foo=]] and [[.foo.]] are still future. */
9990
9991                     /* adjust RExC_parse so the warning shows after
9992                        the class closes */
9993                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9994                         RExC_parse++;
9995                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9996                 }
9997             } else {
9998                 /* Maternal grandfather:
9999                  * "[:" ending in ":" but not in ":]" */
10000                 RExC_parse = s;
10001             }
10002         }
10003     }
10004
10005     return namedclass;
10006 }
10007
10008 STATIC void
10009 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10010 {
10011     dVAR;
10012
10013     PERL_ARGS_ASSERT_CHECKPOSIXCC;
10014
10015     if (POSIXCC(UCHARAT(RExC_parse))) {
10016         const char *s = RExC_parse;
10017         const char  c = *s++;
10018
10019         while (isALNUM(*s))
10020             s++;
10021         if (*s && c == *s && s[1] == ']') {
10022             ckWARN3reg(s+2,
10023                        "POSIX syntax [%c %c] belongs inside character classes",
10024                        c, c);
10025
10026             /* [[=foo=]] and [[.foo.]] are still future. */
10027             if (POSIXCC_NOTYET(c)) {
10028                 /* adjust RExC_parse so the error shows after
10029                    the class closes */
10030                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10031                     NOOP;
10032                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10033             }
10034         }
10035     }
10036 }
10037
10038 /* Generate the code to add a full posix character <class> to the bracketed
10039  * character class given by <node>.  (<node> is needed only under locale rules)
10040  * destlist     is the inversion list for non-locale rules that this class is
10041  *              to be added to
10042  * sourcelist   is the ASCII-range inversion list to add under /a rules
10043  * Xsourcelist  is the full Unicode range list to use otherwise. */
10044 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
10045     if (LOC) {                                                             \
10046         SV* scratch_list = NULL;                                           \
10047                                                                            \
10048         /* Set this class in the node for runtime matching */              \
10049         ANYOF_CLASS_SET(node, class);                                      \
10050                                                                            \
10051         /* For above Latin1 code points, we use the full Unicode range */  \
10052         _invlist_intersection(PL_AboveLatin1,                              \
10053                               Xsourcelist,                                 \
10054                               &scratch_list);                              \
10055         /* And set the output to it, adding instead if there already is an \
10056          * output.  Checking if <destlist> is NULL first saves an extra    \
10057          * clone.  Its reference count will be decremented at the next     \
10058          * union, etc, or if this is the only instance, at the end of the  \
10059          * routine */                                                      \
10060         if (! destlist) {                                                  \
10061             destlist = scratch_list;                                       \
10062         }                                                                  \
10063         else {                                                             \
10064             _invlist_union(destlist, scratch_list, &destlist);             \
10065             SvREFCNT_dec(scratch_list);                                    \
10066         }                                                                  \
10067     }                                                                      \
10068     else {                                                                 \
10069         /* For non-locale, just add it to any existing list */             \
10070         _invlist_union(destlist,                                           \
10071                        (AT_LEAST_ASCII_RESTRICTED)                         \
10072                            ? sourcelist                                    \
10073                            : Xsourcelist,                                  \
10074                        &destlist);                                         \
10075     }
10076
10077 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10078  */
10079 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
10080     if (LOC) {                                                             \
10081         SV* scratch_list = NULL;                                           \
10082         ANYOF_CLASS_SET(node, class);                                      \
10083         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
10084         if (! destlist) {                                                  \
10085             destlist = scratch_list;                                       \
10086         }                                                                  \
10087         else {                                                             \
10088             _invlist_union(destlist, scratch_list, &destlist);             \
10089             SvREFCNT_dec(scratch_list);                                    \
10090         }                                                                  \
10091     }                                                                      \
10092     else {                                                                 \
10093         _invlist_union_complement_2nd(destlist,                            \
10094                                     (AT_LEAST_ASCII_RESTRICTED)            \
10095                                         ? sourcelist                       \
10096                                         : Xsourcelist,                     \
10097                                     &destlist);                            \
10098         /* Under /d, everything in the upper half of the Latin1 range      \
10099          * matches this complement */                                      \
10100         if (DEPENDS_SEMANTICS) {                                           \
10101             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
10102         }                                                                  \
10103     }
10104
10105 /* Generate the code to add a posix character <class> to the bracketed
10106  * character class given by <node>.  (<node> is needed only under locale rules)
10107  * destlist       is the inversion list for non-locale rules that this class is
10108  *                to be added to
10109  * sourcelist     is the ASCII-range inversion list to add under /a rules
10110  * l1_sourcelist  is the Latin1 range list to use otherwise.
10111  * Xpropertyname  is the name to add to <run_time_list> of the property to
10112  *                specify the code points above Latin1 that will have to be
10113  *                determined at run-time
10114  * run_time_list  is a SV* that contains text names of properties that are to
10115  *                be computed at run time.  This concatenates <Xpropertyname>
10116  *                to it, apppropriately
10117  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10118  * time */
10119 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
10120                               l1_sourcelist, Xpropertyname, run_time_list) \
10121     /* If not /a matching, there are going to be code points we will have  \
10122      * to defer to runtime to look-up */                                   \
10123     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
10124         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10125     }                                                                      \
10126     if (LOC) {                                                             \
10127         ANYOF_CLASS_SET(node, class);                                      \
10128     }                                                                      \
10129     else {                                                                 \
10130         _invlist_union(destlist,                                           \
10131                        (AT_LEAST_ASCII_RESTRICTED)                         \
10132                            ? sourcelist                                    \
10133                            : l1_sourcelist,                                \
10134                        &destlist);                                         \
10135     }
10136
10137 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
10138  * this and DO_N_POSIX */
10139 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
10140                               l1_sourcelist, Xpropertyname, run_time_list) \
10141     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
10142         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
10143     }                                                                      \
10144     else {                                                                 \
10145         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10146         if (LOC) {                                                         \
10147             ANYOF_CLASS_SET(node, namedclass);                             \
10148         }                                                                  \
10149         else {                                                             \
10150             SV* scratch_list = NULL;                                       \
10151             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
10152             if (! destlist) {                                              \
10153                 destlist = scratch_list;                                   \
10154             }                                                              \
10155             else {                                                         \
10156                 _invlist_union(destlist, scratch_list, &destlist);         \
10157                 SvREFCNT_dec(scratch_list);                                \
10158             }                                                              \
10159             if (DEPENDS_SEMANTICS) {                                       \
10160                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
10161             }                                                              \
10162         }                                                                  \
10163     }
10164
10165 STATIC U8
10166 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10167 {
10168
10169     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10170      * Locale folding is done at run-time, so this function should not be
10171      * called for nodes that are for locales.
10172      *
10173      * This function sets the bit corresponding to the fold of the input
10174      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
10175      * 'F' is 'f'.
10176      *
10177      * It also knows about the characters that are in the bitmap that have
10178      * folds that are matchable only outside it, and sets the appropriate lists
10179      * and flags.
10180      *
10181      * It returns the number of bits that actually changed from 0 to 1 */
10182
10183     U8 stored = 0;
10184     U8 fold;
10185
10186     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10187
10188     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10189                                     : PL_fold[value];
10190
10191     /* It assumes the bit for 'value' has already been set */
10192     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10193         ANYOF_BITMAP_SET(node, fold);
10194         stored++;
10195     }
10196     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10197         /* Certain Latin1 characters have matches outside the bitmap.  To get
10198          * here, 'value' is one of those characters.   None of these matches is
10199          * valid for ASCII characters under /aa, which have been excluded by
10200          * the 'if' above.  The matches fall into three categories:
10201          * 1) They are singly folded-to or -from an above 255 character, as
10202          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10203          *    WITH DIAERESIS;
10204          * 2) They are part of a multi-char fold with another character in the
10205          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10206          * 3) They are part of a multi-char fold with a character not in the
10207          *    bitmap, such as various ligatures.
10208          * We aren't dealing fully with multi-char folds, except we do deal
10209          * with the pattern containing a character that has a multi-char fold
10210          * (not so much the inverse).
10211          * For types 1) and 3), the matches only happen when the target string
10212          * is utf8; that's not true for 2), and we set a flag for it.
10213          *
10214          * The code below adds to the passed in inversion list the single fold
10215          * closures for 'value'.  The values are hard-coded here so that an
10216          * innocent-looking character class, like /[ks]/i won't have to go out
10217          * to disk to find the possible matches.  XXX It would be better to
10218          * generate these via regen, in case a new version of the Unicode
10219          * standard adds new mappings, though that is not really likely. */
10220         switch (value) {
10221             case 'k':
10222             case 'K':
10223                 /* KELVIN SIGN */
10224                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10225                 break;
10226             case 's':
10227             case 'S':
10228                 /* LATIN SMALL LETTER LONG S */
10229                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10230                 break;
10231             case MICRO_SIGN:
10232                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10233                                                  GREEK_SMALL_LETTER_MU);
10234                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10235                                                  GREEK_CAPITAL_LETTER_MU);
10236                 break;
10237             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10238             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10239                 /* ANGSTROM SIGN */
10240                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10241                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
10242                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10243                                                      PL_fold_latin1[value]);
10244                 }
10245                 break;
10246             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10247                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10248                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10249                 break;
10250             case LATIN_SMALL_LETTER_SHARP_S:
10251                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10252                                         LATIN_CAPITAL_LETTER_SHARP_S);
10253
10254                 /* Under /a, /d, and /u, this can match the two chars "ss" */
10255                 if (! MORE_ASCII_RESTRICTED) {
10256                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
10257
10258                     /* And under /u or /a, it can match even if the target is
10259                      * not utf8 */
10260                     if (AT_LEAST_UNI_SEMANTICS) {
10261                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10262                     }
10263                 }
10264                 break;
10265             case 'F': case 'f':
10266             case 'I': case 'i':
10267             case 'L': case 'l':
10268             case 'T': case 't':
10269             case 'A': case 'a':
10270             case 'H': case 'h':
10271             case 'J': case 'j':
10272             case 'N': case 'n':
10273             case 'W': case 'w':
10274             case 'Y': case 'y':
10275                 /* These all are targets of multi-character folds from code
10276                  * points that require UTF8 to express, so they can't match
10277                  * unless the target string is in UTF-8, so no action here is
10278                  * necessary, as regexec.c properly handles the general case
10279                  * for UTF-8 matching */
10280                 break;
10281             default:
10282                 /* Use deprecated warning to increase the chances of this
10283                  * being output */
10284                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10285                 break;
10286         }
10287     }
10288     else if (DEPENDS_SEMANTICS
10289             && ! isASCII(value)
10290             && PL_fold_latin1[value] != value)
10291     {
10292            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10293             * folds only when the target string is in UTF-8.  We add the fold
10294             * here to the list of things to match outside the bitmap, which
10295             * won't be looked at unless it is UTF8 (or else if something else
10296             * says to look even if not utf8, but those things better not happen
10297             * under DEPENDS semantics. */
10298         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10299     }
10300
10301     return stored;
10302 }
10303
10304
10305 PERL_STATIC_INLINE U8
10306 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10307 {
10308     /* This inline function sets a bit in the bitmap if not already set, and if
10309      * appropriate, its fold, returning the number of bits that actually
10310      * changed from 0 to 1 */
10311
10312     U8 stored;
10313
10314     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10315
10316     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
10317         return 0;
10318     }
10319
10320     ANYOF_BITMAP_SET(node, value);
10321     stored = 1;
10322
10323     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
10324         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10325     }
10326
10327     return stored;
10328 }
10329
10330 STATIC void
10331 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10332 {
10333     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10334      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
10335      * the multi-character folds of characters in the node */
10336     SV *sv;
10337
10338     PERL_ARGS_ASSERT_ADD_ALTERNATE;
10339
10340     if (! *alternate_ptr) {
10341         *alternate_ptr = newAV();
10342     }
10343     sv = newSVpvn_utf8((char*)string, len, TRUE);
10344     av_push(*alternate_ptr, sv);
10345     return;
10346 }
10347
10348 /*
10349    parse a class specification and produce either an ANYOF node that
10350    matches the pattern or perhaps will be optimized into an EXACTish node
10351    instead. The node contains a bit map for the first 256 characters, with the
10352    corresponding bit set if that character is in the list.  For characters
10353    above 255, a range list is used */
10354
10355 STATIC regnode *
10356 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10357 {
10358     dVAR;
10359     register UV nextvalue;
10360     register IV prevvalue = OOB_UNICODE;
10361     register IV range = 0;
10362     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10363     register regnode *ret;
10364     STRLEN numlen;
10365     IV namedclass;
10366     char *rangebegin = NULL;
10367     bool need_class = 0;
10368     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
10369     SV *listsv = NULL;
10370     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10371                                       than just initialized.  */
10372     SV* properties = NULL;    /* Code points that match \p{} \P{} */
10373     UV element_count = 0;   /* Number of distinct elements in the class.
10374                                Optimizations may be possible if this is tiny */
10375     UV n;
10376
10377     /* Unicode properties are stored in a swash; this holds the current one
10378      * being parsed.  If this swash is the only above-latin1 component of the
10379      * character class, an optimization is to pass it directly on to the
10380      * execution engine.  Otherwise, it is set to NULL to indicate that there
10381      * are other things in the class that have to be dealt with at execution
10382      * time */
10383     SV* swash = NULL;           /* Code points that match \p{} \P{} */
10384
10385     /* Set if a component of this character class is user-defined; just passed
10386      * on to the engine */
10387     UV has_user_defined_property = 0;
10388
10389     /* code points this node matches that can't be stored in the bitmap */
10390     SV* nonbitmap = NULL;
10391
10392     /* The items that are to match that aren't stored in the bitmap, but are a
10393      * result of things that are stored there.  This is the fold closure of
10394      * such a character, either because it has DEPENDS semantics and shouldn't
10395      * be matched unless the target string is utf8, or is a code point that is
10396      * too large for the bit map, as for example, the fold of the MICRO SIGN is
10397      * above 255.  This all is solely for performance reasons.  By having this
10398      * code know the outside-the-bitmap folds that the bitmapped characters are
10399      * involved with, we don't have to go out to disk to find the list of
10400      * matches, unless the character class includes code points that aren't
10401      * storable in the bit map.  That means that a character class with an 's'
10402      * in it, for example, doesn't need to go out to disk to find everything
10403      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
10404      * empty unless there is something whose fold we don't know about, and will
10405      * have to go out to the disk to find. */
10406     SV* l1_fold_invlist = NULL;
10407
10408     /* List of multi-character folds that are matched by this node */
10409     AV* unicode_alternate  = NULL;
10410 #ifdef EBCDIC
10411     UV literal_endpoint = 0;
10412 #endif
10413     UV stored = 0;  /* how many chars stored in the bitmap */
10414
10415     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10416         case we need to change the emitted regop to an EXACT. */
10417     const char * orig_parse = RExC_parse;
10418     GET_RE_DEBUG_FLAGS_DECL;
10419
10420     PERL_ARGS_ASSERT_REGCLASS;
10421 #ifndef DEBUGGING
10422     PERL_UNUSED_ARG(depth);
10423 #endif
10424
10425     DEBUG_PARSE("clas");
10426
10427     /* Assume we are going to generate an ANYOF node. */
10428     ret = reganode(pRExC_state, ANYOF, 0);
10429
10430
10431     if (!SIZE_ONLY) {
10432         ANYOF_FLAGS(ret) = 0;
10433     }
10434
10435     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
10436         RExC_naughty++;
10437         RExC_parse++;
10438         if (!SIZE_ONLY)
10439             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10440
10441         /* We have decided to not allow multi-char folds in inverted character
10442          * classes, due to the confusion that can happen, especially with
10443          * classes that are designed for a non-Unicode world:  You have the
10444          * peculiar case that:
10445             "s s" =~ /^[^\xDF]+$/i => Y
10446             "ss"  =~ /^[^\xDF]+$/i => N
10447          *
10448          * See [perl #89750] */
10449         allow_full_fold = FALSE;
10450     }
10451
10452     if (SIZE_ONLY) {
10453         RExC_size += ANYOF_SKIP;
10454         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10455     }
10456     else {
10457         RExC_emit += ANYOF_SKIP;
10458         if (LOC) {
10459             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10460         }
10461         ANYOF_BITMAP_ZERO(ret);
10462         listsv = newSVpvs("# comment\n");
10463         initial_listsv_len = SvCUR(listsv);
10464     }
10465
10466     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10467
10468     if (!SIZE_ONLY && POSIXCC(nextvalue))
10469         checkposixcc(pRExC_state);
10470
10471     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10472     if (UCHARAT(RExC_parse) == ']')
10473         goto charclassloop;
10474
10475 parseit:
10476     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10477
10478     charclassloop:
10479
10480         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10481
10482         if (!range) {
10483             rangebegin = RExC_parse;
10484             element_count++;
10485         }
10486         if (UTF) {
10487             value = utf8n_to_uvchr((U8*)RExC_parse,
10488                                    RExC_end - RExC_parse,
10489                                    &numlen, UTF8_ALLOW_DEFAULT);
10490             RExC_parse += numlen;
10491         }
10492         else
10493             value = UCHARAT(RExC_parse++);
10494
10495         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10496         if (value == '[' && POSIXCC(nextvalue))
10497             namedclass = regpposixcc(pRExC_state, value);
10498         else if (value == '\\') {
10499             if (UTF) {
10500                 value = utf8n_to_uvchr((U8*)RExC_parse,
10501                                    RExC_end - RExC_parse,
10502                                    &numlen, UTF8_ALLOW_DEFAULT);
10503                 RExC_parse += numlen;
10504             }
10505             else
10506                 value = UCHARAT(RExC_parse++);
10507             /* Some compilers cannot handle switching on 64-bit integer
10508              * values, therefore value cannot be an UV.  Yes, this will
10509              * be a problem later if we want switch on Unicode.
10510              * A similar issue a little bit later when switching on
10511              * namedclass. --jhi */
10512             switch ((I32)value) {
10513             case 'w':   namedclass = ANYOF_ALNUM;       break;
10514             case 'W':   namedclass = ANYOF_NALNUM;      break;
10515             case 's':   namedclass = ANYOF_SPACE;       break;
10516             case 'S':   namedclass = ANYOF_NSPACE;      break;
10517             case 'd':   namedclass = ANYOF_DIGIT;       break;
10518             case 'D':   namedclass = ANYOF_NDIGIT;      break;
10519             case 'v':   namedclass = ANYOF_VERTWS;      break;
10520             case 'V':   namedclass = ANYOF_NVERTWS;     break;
10521             case 'h':   namedclass = ANYOF_HORIZWS;     break;
10522             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
10523             case 'N':  /* Handle \N{NAME} in class */
10524                 {
10525                     /* We only pay attention to the first char of
10526                     multichar strings being returned. I kinda wonder
10527                     if this makes sense as it does change the behaviour
10528                     from earlier versions, OTOH that behaviour was broken
10529                     as well. */
10530                     UV v; /* value is register so we cant & it /grrr */
10531                     if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10532                         goto parseit;
10533                     }
10534                     value= v;
10535                 }
10536                 break;
10537             case 'p':
10538             case 'P':
10539                 {
10540                 char *e;
10541                 if (RExC_parse >= RExC_end)
10542                     vFAIL2("Empty \\%c{}", (U8)value);
10543                 if (*RExC_parse == '{') {
10544                     const U8 c = (U8)value;
10545                     e = strchr(RExC_parse++, '}');
10546                     if (!e)
10547                         vFAIL2("Missing right brace on \\%c{}", c);
10548                     while (isSPACE(UCHARAT(RExC_parse)))
10549                         RExC_parse++;
10550                     if (e == RExC_parse)
10551                         vFAIL2("Empty \\%c{}", c);
10552                     n = e - RExC_parse;
10553                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10554                         n--;
10555                 }
10556                 else {
10557                     e = RExC_parse;
10558                     n = 1;
10559                 }
10560                 if (!SIZE_ONLY) {
10561                     SV** invlistsvp;
10562                     SV* invlist;
10563                     char* name;
10564                     if (UCHARAT(RExC_parse) == '^') {
10565                          RExC_parse++;
10566                          n--;
10567                          value = value == 'p' ? 'P' : 'p'; /* toggle */
10568                          while (isSPACE(UCHARAT(RExC_parse))) {
10569                               RExC_parse++;
10570                               n--;
10571                          }
10572                     }
10573                     /* Try to get the definition of the property into
10574                      * <invlist>.  If /i is in effect, the effective property
10575                      * will have its name be <__NAME_i>.  The design is
10576                      * discussed in commit
10577                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10578                     Newx(name, n + sizeof("_i__\n"), char);
10579
10580                     sprintf(name, "%s%.*s%s\n",
10581                                     (FOLD) ? "__" : "",
10582                                     (int)n,
10583                                     RExC_parse,
10584                                     (FOLD) ? "_i" : ""
10585                     );
10586
10587                     /* Look up the property name, and get its swash and
10588                      * inversion list, if the property is found  */
10589                     if (swash) {
10590                         SvREFCNT_dec(swash);
10591                     }
10592                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
10593                                              1, /* binary */
10594                                              0, /* not tr/// */
10595                                              TRUE, /* this routine will handle
10596                                                       undefined properties */
10597                                              NULL, FALSE /* No inversion list */
10598                                             );
10599                     if (   ! swash
10600                         || ! SvROK(swash)
10601                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10602                         || ! (invlistsvp =
10603                                 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10604                                 "INVLIST", FALSE))
10605                         || ! (invlist = *invlistsvp))
10606                     {
10607                         if (swash) {
10608                             SvREFCNT_dec(swash);
10609                             swash = NULL;
10610                         }
10611
10612                         /* Here didn't find it.  It could be a user-defined
10613                          * property that will be available at run-time.  Add it
10614                          * to the list to look up then */
10615                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10616                                         (value == 'p' ? '+' : '!'),
10617                                         name);
10618                         has_user_defined_property = 1;
10619
10620                         /* We don't know yet, so have to assume that the
10621                          * property could match something in the Latin1 range,
10622                          * hence something that isn't utf8 */
10623                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10624                     }
10625                     else {
10626
10627                         /* Here, did get the swash and its inversion list.  If
10628                          * the swash is from a user-defined property, then this
10629                          * whole character class should be regarded as such */
10630                         SV** user_defined_svp =
10631                                             hv_fetchs(MUTABLE_HV(SvRV(swash)),
10632                                                         "USER_DEFINED", FALSE);
10633                         if (user_defined_svp) {
10634                             has_user_defined_property
10635                                                     |= SvUV(*user_defined_svp);
10636                         }
10637
10638                         /* Invert if asking for the complement */
10639                         if (value == 'P') {
10640                             _invlist_union_complement_2nd(properties, invlist, &properties);
10641
10642                             /* The swash can't be used as-is, because we've
10643                              * inverted things; delay removing it to here after
10644                              * have copied its invlist above */
10645                             SvREFCNT_dec(swash);
10646                             swash = NULL;
10647                         }
10648                         else {
10649                             _invlist_union(properties, invlist, &properties);
10650                         }
10651                     }
10652                     Safefree(name);
10653                 }
10654                 RExC_parse = e + 1;
10655                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
10656
10657                 /* \p means they want Unicode semantics */
10658                 RExC_uni_semantics = 1;
10659                 }
10660                 break;
10661             case 'n':   value = '\n';                   break;
10662             case 'r':   value = '\r';                   break;
10663             case 't':   value = '\t';                   break;
10664             case 'f':   value = '\f';                   break;
10665             case 'b':   value = '\b';                   break;
10666             case 'e':   value = ASCII_TO_NATIVE('\033');break;
10667             case 'a':   value = ASCII_TO_NATIVE('\007');break;
10668             case 'o':
10669                 RExC_parse--;   /* function expects to be pointed at the 'o' */
10670                 {
10671                     const char* error_msg;
10672                     bool valid = grok_bslash_o(RExC_parse,
10673                                                &value,
10674                                                &numlen,
10675                                                &error_msg,
10676                                                SIZE_ONLY);
10677                     RExC_parse += numlen;
10678                     if (! valid) {
10679                         vFAIL(error_msg);
10680                     }
10681                 }
10682                 if (PL_encoding && value < 0x100) {
10683                     goto recode_encoding;
10684                 }
10685                 break;
10686             case 'x':
10687                 if (*RExC_parse == '{') {
10688                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10689                         | PERL_SCAN_DISALLOW_PREFIX;
10690                     char * const e = strchr(RExC_parse++, '}');
10691                     if (!e)
10692                         vFAIL("Missing right brace on \\x{}");
10693
10694                     numlen = e - RExC_parse;
10695                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10696                     RExC_parse = e + 1;
10697                 }
10698                 else {
10699                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10700                     numlen = 2;
10701                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10702                     RExC_parse += numlen;
10703                 }
10704                 if (PL_encoding && value < 0x100)
10705                     goto recode_encoding;
10706                 break;
10707             case 'c':
10708                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10709                 break;
10710             case '0': case '1': case '2': case '3': case '4':
10711             case '5': case '6': case '7':
10712                 {
10713                     /* Take 1-3 octal digits */
10714                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10715                     numlen = 3;
10716                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10717                     RExC_parse += numlen;
10718                     if (PL_encoding && value < 0x100)
10719                         goto recode_encoding;
10720                     break;
10721                 }
10722             recode_encoding:
10723                 if (! RExC_override_recoding) {
10724                     SV* enc = PL_encoding;
10725                     value = reg_recode((const char)(U8)value, &enc);
10726                     if (!enc && SIZE_ONLY)
10727                         ckWARNreg(RExC_parse,
10728                                   "Invalid escape in the specified encoding");
10729                     break;
10730                 }
10731             default:
10732                 /* Allow \_ to not give an error */
10733                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10734                     ckWARN2reg(RExC_parse,
10735                                "Unrecognized escape \\%c in character class passed through",
10736                                (int)value);
10737                 }
10738                 break;
10739             }
10740         } /* end of \blah */
10741 #ifdef EBCDIC
10742         else
10743             literal_endpoint++;
10744 #endif
10745
10746         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10747
10748             /* What matches in a locale is not known until runtime, so need to
10749              * (one time per class) allocate extra space to pass to regexec.
10750              * The space will contain a bit for each named class that is to be
10751              * matched against.  This isn't needed for \p{} and pseudo-classes,
10752              * as they are not affected by locale, and hence are dealt with
10753              * separately */
10754             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10755                 need_class = 1;
10756                 if (SIZE_ONLY) {
10757                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10758                 }
10759                 else {
10760                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10761                     ANYOF_CLASS_ZERO(ret);
10762                 }
10763                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10764             }
10765
10766             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
10767              * literal, as is the character that began the false range, i.e.
10768              * the 'a' in the examples */
10769             if (range) {
10770                 if (!SIZE_ONLY) {
10771                     const int w =
10772                         RExC_parse >= rangebegin ?
10773                         RExC_parse - rangebegin : 0;
10774                     ckWARN4reg(RExC_parse,
10775                                "False [] range \"%*.*s\"",
10776                                w, w, rangebegin);
10777
10778                     stored +=
10779                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10780                     if (prevvalue < 256) {
10781                         stored +=
10782                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10783                     }
10784                     else {
10785                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10786                     }
10787                 }
10788
10789                 range = 0; /* this was not a true range */
10790             }
10791
10792             if (!SIZE_ONLY) {
10793
10794                 /* Possible truncation here but in some 64-bit environments
10795                  * the compiler gets heartburn about switch on 64-bit values.
10796                  * A similar issue a little earlier when switching on value.
10797                  * --jhi */
10798                 switch ((I32)namedclass) {
10799
10800                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10801                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10802                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10803                     break;
10804                 case ANYOF_NALNUMC:
10805                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10806                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10807                     break;
10808                 case ANYOF_ALPHA:
10809                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10810                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10811                     break;
10812                 case ANYOF_NALPHA:
10813                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10814                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10815                     break;
10816                 case ANYOF_ASCII:
10817                     if (LOC) {
10818                         ANYOF_CLASS_SET(ret, namedclass);
10819                     }
10820                     else {
10821                         _invlist_union(properties, PL_ASCII, &properties);
10822                     }
10823                     break;
10824                 case ANYOF_NASCII:
10825                     if (LOC) {
10826                         ANYOF_CLASS_SET(ret, namedclass);
10827                     }
10828                     else {
10829                         _invlist_union_complement_2nd(properties,
10830                                                     PL_ASCII, &properties);
10831                         if (DEPENDS_SEMANTICS) {
10832                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10833                         }
10834                     }
10835                     break;
10836                 case ANYOF_BLANK:
10837                     DO_POSIX(ret, namedclass, properties,
10838                                             PL_PosixBlank, PL_XPosixBlank);
10839                     break;
10840                 case ANYOF_NBLANK:
10841                     DO_N_POSIX(ret, namedclass, properties,
10842                                             PL_PosixBlank, PL_XPosixBlank);
10843                     break;
10844                 case ANYOF_CNTRL:
10845                     DO_POSIX(ret, namedclass, properties,
10846                                             PL_PosixCntrl, PL_XPosixCntrl);
10847                     break;
10848                 case ANYOF_NCNTRL:
10849                     DO_N_POSIX(ret, namedclass, properties,
10850                                             PL_PosixCntrl, PL_XPosixCntrl);
10851                     break;
10852                 case ANYOF_DIGIT:
10853                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10854                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10855                     break;
10856                 case ANYOF_NDIGIT:
10857                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10858                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10859                     break;
10860                 case ANYOF_GRAPH:
10861                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10862                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10863                     break;
10864                 case ANYOF_NGRAPH:
10865                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10866                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10867                     break;
10868                 case ANYOF_HORIZWS:
10869                     /* For these, we use the nonbitmap, as /d doesn't make a
10870                      * difference in what these match.  There would be problems
10871                      * if these characters had folds other than themselves, as
10872                      * nonbitmap is subject to folding.  It turns out that \h
10873                      * is just a synonym for XPosixBlank */
10874                     _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
10875                     break;
10876                 case ANYOF_NHORIZWS:
10877                     _invlist_union_complement_2nd(nonbitmap,
10878                                                  PL_XPosixBlank, &nonbitmap);
10879                     break;
10880                 case ANYOF_LOWER:
10881                 case ANYOF_NLOWER:
10882                 {   /* These require special handling, as they differ under
10883                        folding, matching Cased there (which in the ASCII range
10884                        is the same as Alpha */
10885
10886                     SV* ascii_source;
10887                     SV* l1_source;
10888                     const char *Xname;
10889
10890                     if (FOLD && ! LOC) {
10891                         ascii_source = PL_PosixAlpha;
10892                         l1_source = PL_L1Cased;
10893                         Xname = "Cased";
10894                     }
10895                     else {
10896                         ascii_source = PL_PosixLower;
10897                         l1_source = PL_L1PosixLower;
10898                         Xname = "XPosixLower";
10899                     }
10900                     if (namedclass == ANYOF_LOWER) {
10901                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10902                                     ascii_source, l1_source, Xname, listsv);
10903                     }
10904                     else {
10905                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10906                             properties, ascii_source, l1_source, Xname, listsv);
10907                     }
10908                     break;
10909                 }
10910                 case ANYOF_PRINT:
10911                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10912                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10913                     break;
10914                 case ANYOF_NPRINT:
10915                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10916                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10917                     break;
10918                 case ANYOF_PUNCT:
10919                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10920                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10921                     break;
10922                 case ANYOF_NPUNCT:
10923                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10924                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10925                     break;
10926                 case ANYOF_PSXSPC:
10927                     DO_POSIX(ret, namedclass, properties,
10928                                             PL_PosixSpace, PL_XPosixSpace);
10929                     break;
10930                 case ANYOF_NPSXSPC:
10931                     DO_N_POSIX(ret, namedclass, properties,
10932                                             PL_PosixSpace, PL_XPosixSpace);
10933                     break;
10934                 case ANYOF_SPACE:
10935                     DO_POSIX(ret, namedclass, properties,
10936                                             PL_PerlSpace, PL_XPerlSpace);
10937                     break;
10938                 case ANYOF_NSPACE:
10939                     DO_N_POSIX(ret, namedclass, properties,
10940                                             PL_PerlSpace, PL_XPerlSpace);
10941                     break;
10942                 case ANYOF_UPPER:   /* Same as LOWER, above */
10943                 case ANYOF_NUPPER:
10944                 {
10945                     SV* ascii_source;
10946                     SV* l1_source;
10947                     const char *Xname;
10948
10949                     if (FOLD && ! LOC) {
10950                         ascii_source = PL_PosixAlpha;
10951                         l1_source = PL_L1Cased;
10952                         Xname = "Cased";
10953                     }
10954                     else {
10955                         ascii_source = PL_PosixUpper;
10956                         l1_source = PL_L1PosixUpper;
10957                         Xname = "XPosixUpper";
10958                     }
10959                     if (namedclass == ANYOF_UPPER) {
10960                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10961                                     ascii_source, l1_source, Xname, listsv);
10962                     }
10963                     else {
10964                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10965                         properties, ascii_source, l1_source, Xname, listsv);
10966                     }
10967                     break;
10968                 }
10969                 case ANYOF_ALNUM:   /* Really is 'Word' */
10970                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10971                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10972                     break;
10973                 case ANYOF_NALNUM:
10974                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10975                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10976                     break;
10977                 case ANYOF_VERTWS:
10978                     /* For these, we use the nonbitmap, as /d doesn't make a
10979                      * difference in what these match.  There would be problems
10980                      * if these characters had folds other than themselves, as
10981                      * nonbitmap is subject to folding */
10982                     _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
10983                     break;
10984                 case ANYOF_NVERTWS:
10985                     _invlist_union_complement_2nd(nonbitmap,
10986                                                     PL_VertSpace, &nonbitmap);
10987                     break;
10988                 case ANYOF_XDIGIT:
10989                     DO_POSIX(ret, namedclass, properties,
10990                                             PL_PosixXDigit, PL_XPosixXDigit);
10991                     break;
10992                 case ANYOF_NXDIGIT:
10993                     DO_N_POSIX(ret, namedclass, properties,
10994                                             PL_PosixXDigit, PL_XPosixXDigit);
10995                     break;
10996                 case ANYOF_MAX:
10997                     /* this is to handle \p and \P */
10998                     break;
10999                 default:
11000                     vFAIL("Invalid [::] class");
11001                     break;
11002                 }
11003
11004                 continue;
11005             }
11006         } /* end of namedclass \blah */
11007
11008         if (range) {
11009             if (prevvalue > (IV)value) /* b-a */ {
11010                 const int w = RExC_parse - rangebegin;
11011                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11012                 range = 0; /* not a valid range */
11013             }
11014         }
11015         else {
11016             prevvalue = value; /* save the beginning of the range */
11017             if (RExC_parse+1 < RExC_end
11018                 && *RExC_parse == '-'
11019                 && RExC_parse[1] != ']')
11020             {
11021                 RExC_parse++;
11022
11023                 /* a bad range like \w-, [:word:]- ? */
11024                 if (namedclass > OOB_NAMEDCLASS) {
11025                     if (ckWARN(WARN_REGEXP)) {
11026                         const int w =
11027                             RExC_parse >= rangebegin ?
11028                             RExC_parse - rangebegin : 0;
11029                         vWARN4(RExC_parse,
11030                                "False [] range \"%*.*s\"",
11031                                w, w, rangebegin);
11032                     }
11033                     if (!SIZE_ONLY)
11034                         stored +=
11035                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11036                 } else
11037                     range = 1;  /* yeah, it's a range! */
11038                 continue;       /* but do it the next time */
11039             }
11040         }
11041
11042         /* non-Latin1 code point implies unicode semantics.  Must be set in
11043          * pass1 so is there for the whole of pass 2 */
11044         if (value > 255) {
11045             RExC_uni_semantics = 1;
11046         }
11047
11048         /* now is the next time */
11049         if (!SIZE_ONLY) {
11050             if (prevvalue < 256) {
11051                 const IV ceilvalue = value < 256 ? value : 255;
11052                 IV i;
11053 #ifdef EBCDIC
11054                 /* In EBCDIC [\x89-\x91] should include
11055                  * the \x8e but [i-j] should not. */
11056                 if (literal_endpoint == 2 &&
11057                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11058                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11059                 {
11060                     if (isLOWER(prevvalue)) {
11061                         for (i = prevvalue; i <= ceilvalue; i++)
11062                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11063                                 stored +=
11064                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11065                             }
11066                     } else {
11067                         for (i = prevvalue; i <= ceilvalue; i++)
11068                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11069                                 stored +=
11070                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11071                             }
11072                     }
11073                 }
11074                 else
11075 #endif
11076                       for (i = prevvalue; i <= ceilvalue; i++) {
11077                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11078                       }
11079           }
11080           if (value > 255) {
11081             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
11082             const UV natvalue      = NATIVE_TO_UNI(value);
11083             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11084         }
11085 #ifdef EBCDIC
11086             literal_endpoint = 0;
11087 #endif
11088         }
11089
11090         range = 0; /* this range (if it was one) is done now */
11091     }
11092
11093
11094
11095     if (SIZE_ONLY)
11096         return ret;
11097     /****** !SIZE_ONLY AFTER HERE *********/
11098
11099     /* If folding and there are code points above 255, we calculate all
11100      * characters that could fold to or from the ones already on the list */
11101     if (FOLD && nonbitmap) {
11102         UV start, end;  /* End points of code point ranges */
11103
11104         SV* fold_intersection = NULL;
11105
11106         /* This is a list of all the characters that participate in folds
11107             * (except marks, etc in multi-char folds */
11108         if (! PL_utf8_foldable) {
11109             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11110             PL_utf8_foldable = _swash_to_invlist(swash);
11111             SvREFCNT_dec(swash);
11112         }
11113
11114         /* This is a hash that for a particular fold gives all characters
11115             * that are involved in it */
11116         if (! PL_utf8_foldclosures) {
11117
11118             /* If we were unable to find any folds, then we likely won't be
11119              * able to find the closures.  So just create an empty list.
11120              * Folding will effectively be restricted to the non-Unicode rules
11121              * hard-coded into Perl.  (This case happens legitimately during
11122              * compilation of Perl itself before the Unicode tables are
11123              * generated) */
11124             if (invlist_len(PL_utf8_foldable) == 0) {
11125                 PL_utf8_foldclosures = newHV();
11126             } else {
11127                 /* If the folds haven't been read in, call a fold function
11128                     * to force that */
11129                 if (! PL_utf8_tofold) {
11130                     U8 dummy[UTF8_MAXBYTES+1];
11131                     STRLEN dummy_len;
11132
11133                     /* This particular string is above \xff in both UTF-8 and
11134                      * UTFEBCDIC */
11135                     to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11136                     assert(PL_utf8_tofold); /* Verify that worked */
11137                 }
11138                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11139             }
11140         }
11141
11142         /* Only the characters in this class that participate in folds need be
11143          * checked.  Get the intersection of this class and all the possible
11144          * characters that are foldable.  This can quickly narrow down a large
11145          * class */
11146         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11147
11148         /* Now look at the foldable characters in this class individually */
11149         invlist_iterinit(fold_intersection);
11150         while (invlist_iternext(fold_intersection, &start, &end)) {
11151             UV j;
11152
11153             /* Look at every character in the range */
11154             for (j = start; j <= end; j++) {
11155
11156                 /* Get its fold */
11157                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11158                 STRLEN foldlen;
11159                 const UV f =
11160                     _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
11161
11162                 if (foldlen > (STRLEN)UNISKIP(f)) {
11163
11164                     /* Any multicharacter foldings (disallowed in lookbehind
11165                      * patterns) require the following transform: [ABCDEF] ->
11166                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11167                      * folds into "rst", all other characters fold to single
11168                      * characters.  We save away these multicharacter foldings,
11169                      * to be later saved as part of the additional "s" data. */
11170                     if (! RExC_in_lookbehind) {
11171                         U8* loc = foldbuf;
11172                         U8* e = foldbuf + foldlen;
11173
11174                         /* If any of the folded characters of this are in the
11175                          * Latin1 range, tell the regex engine that this can
11176                          * match a non-utf8 target string.  The only multi-byte
11177                          * fold whose source is in the Latin1 range (U+00DF)
11178                          * applies only when the target string is utf8, or
11179                          * under unicode rules */
11180                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11181                             while (loc < e) {
11182
11183                                 /* Can't mix ascii with non- under /aa */
11184                                 if (MORE_ASCII_RESTRICTED
11185                                     && (isASCII(*loc) != isASCII(j)))
11186                                 {
11187                                     goto end_multi_fold;
11188                                 }
11189                                 if (UTF8_IS_INVARIANT(*loc)
11190                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
11191                                 {
11192                                     /* Can't mix above and below 256 under LOC
11193                                      */
11194                                     if (LOC) {
11195                                         goto end_multi_fold;
11196                                     }
11197                                     ANYOF_FLAGS(ret)
11198                                             |= ANYOF_NONBITMAP_NON_UTF8;
11199                                     break;
11200                                 }
11201                                 loc += UTF8SKIP(loc);
11202                             }
11203                         }
11204
11205                         add_alternate(&unicode_alternate, foldbuf, foldlen);
11206                     end_multi_fold: ;
11207                     }
11208
11209                     /* This is special-cased, as it is the only letter which
11210                      * has both a multi-fold and single-fold in Latin1.  All
11211                      * the other chars that have single and multi-folds are
11212                      * always in utf8, and the utf8 folding algorithm catches
11213                      * them */
11214                     if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11215                         stored += set_regclass_bit(pRExC_state,
11216                                         ret,
11217                                         LATIN_SMALL_LETTER_SHARP_S,
11218                                         &l1_fold_invlist, &unicode_alternate);
11219                     }
11220                 }
11221                 else {
11222                     /* Single character fold.  Add everything in its fold
11223                      * closure to the list that this node should match */
11224                     SV** listp;
11225
11226                     /* The fold closures data structure is a hash with the keys
11227                      * being every character that is folded to, like 'k', and
11228                      * the values each an array of everything that folds to its
11229                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
11230                     if ((listp = hv_fetch(PL_utf8_foldclosures,
11231                                     (char *) foldbuf, foldlen, FALSE)))
11232                     {
11233                         AV* list = (AV*) *listp;
11234                         IV k;
11235                         for (k = 0; k <= av_len(list); k++) {
11236                             SV** c_p = av_fetch(list, k, FALSE);
11237                             UV c;
11238                             if (c_p == NULL) {
11239                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11240                             }
11241                             c = SvUV(*c_p);
11242
11243                             /* /aa doesn't allow folds between ASCII and non-;
11244                              * /l doesn't allow them between above and below
11245                              * 256 */
11246                             if ((MORE_ASCII_RESTRICTED
11247                                  && (isASCII(c) != isASCII(j)))
11248                                     || (LOC && ((c < 256) != (j < 256))))
11249                             {
11250                                 continue;
11251                             }
11252
11253                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11254                                 stored += set_regclass_bit(pRExC_state,
11255                                         ret,
11256                                         (U8) c,
11257                                         &l1_fold_invlist, &unicode_alternate);
11258                             }
11259                                 /* It may be that the code point is already in
11260                                  * this range or already in the bitmap, in
11261                                  * which case we need do nothing */
11262                             else if ((c < start || c > end)
11263                                         && (c > 255
11264                                             || ! ANYOF_BITMAP_TEST(ret, c)))
11265                             {
11266                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11267                             }
11268                         }
11269                     }
11270                 }
11271             }
11272         }
11273         SvREFCNT_dec(fold_intersection);
11274     }
11275
11276     /* Combine the two lists into one. */
11277     if (l1_fold_invlist) {
11278         if (nonbitmap) {
11279             _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11280             SvREFCNT_dec(l1_fold_invlist);
11281         }
11282         else {
11283             nonbitmap = l1_fold_invlist;
11284         }
11285     }
11286
11287     /* And combine the result (if any) with any inversion list from properties.
11288      * The lists are kept separate up to now because we don't want to fold the
11289      * properties */
11290     if (properties) {
11291         if (nonbitmap) {
11292             _invlist_union(nonbitmap, properties, &nonbitmap);
11293             SvREFCNT_dec(properties);
11294         }
11295         else {
11296             nonbitmap = properties;
11297         }
11298     }
11299
11300     /* Here, <nonbitmap> contains all the code points we can determine at
11301      * compile time that we haven't put into the bitmap.  Go through it, and
11302      * for things that belong in the bitmap, put them there, and delete from
11303      * <nonbitmap> */
11304     if (nonbitmap) {
11305
11306         /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11307          * possibly only should match when the target string is UTF-8 */
11308         UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11309
11310         /* This gets set if we actually need to modify things */
11311         bool change_invlist = FALSE;
11312
11313         UV start, end;
11314
11315         /* Start looking through <nonbitmap> */
11316         invlist_iterinit(nonbitmap);
11317         while (invlist_iternext(nonbitmap, &start, &end)) {
11318             UV high;
11319             int i;
11320
11321             /* Quit if are above what we should change */
11322             if (start > max_cp_to_set) {
11323                 break;
11324             }
11325
11326             change_invlist = TRUE;
11327
11328             /* Set all the bits in the range, up to the max that we are doing */
11329             high = (end < max_cp_to_set) ? end : max_cp_to_set;
11330             for (i = start; i <= (int) high; i++) {
11331                 if (! ANYOF_BITMAP_TEST(ret, i)) {
11332                     ANYOF_BITMAP_SET(ret, i);
11333                     stored++;
11334                     prevvalue = value;
11335                     value = i;
11336                 }
11337             }
11338         }
11339
11340         /* Done with loop; remove any code points that are in the bitmap from
11341          * <nonbitmap> */
11342         if (change_invlist) {
11343             _invlist_subtract(nonbitmap,
11344                               (DEPENDS_SEMANTICS)
11345                                 ? PL_ASCII
11346                                 : PL_Latin1,
11347                               &nonbitmap);
11348         }
11349
11350         /* If have completely emptied it, remove it completely */
11351         if (invlist_len(nonbitmap) == 0) {
11352             SvREFCNT_dec(nonbitmap);
11353             nonbitmap = NULL;
11354         }
11355     }
11356
11357     /* Here, we have calculated what code points should be in the character
11358      * class.  <nonbitmap> does not overlap the bitmap except possibly in the
11359      * case of DEPENDS rules.
11360      *
11361      * Now we can see about various optimizations.  Fold calculation (which we
11362      * did above) needs to take place before inversion.  Otherwise /[^k]/i
11363      * would invert to include K, which under /i would match k, which it
11364      * shouldn't. */
11365
11366     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
11367      * set the FOLD flag yet, so this does optimize those.  It doesn't
11368      * optimize locale.  Doing so perhaps could be done as long as there is
11369      * nothing like \w in it; some thought also would have to be given to the
11370      * interaction with above 0x100 chars */
11371     if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11372         && ! LOC
11373         && ! unicode_alternate
11374         /* In case of /d, there are some things that should match only when in
11375          * not in the bitmap, i.e., they require UTF8 to match.  These are
11376          * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11377          * case, they don't require UTF8, so can invert here */
11378         && (! nonbitmap
11379             || ! DEPENDS_SEMANTICS
11380             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11381         && SvCUR(listsv) == initial_listsv_len)
11382     {
11383         int i;
11384         if (! nonbitmap) {
11385             for (i = 0; i < 256; ++i) {
11386                 if (ANYOF_BITMAP_TEST(ret, i)) {
11387                     ANYOF_BITMAP_CLEAR(ret, i);
11388                 }
11389                 else {
11390                     ANYOF_BITMAP_SET(ret, i);
11391                     prevvalue = value;
11392                     value = i;
11393                 }
11394             }
11395             /* The inversion means that everything above 255 is matched */
11396             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11397         }
11398         else {
11399             /* Here, also has things outside the bitmap that may overlap with
11400              * the bitmap.  We have to sync them up, so that they get inverted
11401              * in both places.  Earlier, we removed all overlaps except in the
11402              * case of /d rules, so no syncing is needed except for this case
11403              */
11404             SV *remove_list = NULL;
11405
11406             if (DEPENDS_SEMANTICS) {
11407                 UV start, end;
11408
11409                 /* Set the bits that correspond to the ones that aren't in the
11410                  * bitmap.  Otherwise, when we invert, we'll miss these.
11411                  * Earlier, we removed from the nonbitmap all code points
11412                  * < 128, so there is no extra work here */
11413                 invlist_iterinit(nonbitmap);
11414                 while (invlist_iternext(nonbitmap, &start, &end)) {
11415                     if (start > 255) {  /* The bit map goes to 255 */
11416                         break;
11417                     }
11418                     if (end > 255) {
11419                         end = 255;
11420                     }
11421                     for (i = start; i <= (int) end; ++i) {
11422                         ANYOF_BITMAP_SET(ret, i);
11423                         prevvalue = value;
11424                         value = i;
11425                     }
11426                 }
11427             }
11428
11429             /* Now invert both the bitmap and the nonbitmap.  Anything in the
11430              * bitmap has to also be removed from the non-bitmap, but again,
11431              * there should not be overlap unless is /d rules. */
11432             _invlist_invert(nonbitmap);
11433
11434             /* Any swash can't be used as-is, because we've inverted things */
11435             if (swash) {
11436                 SvREFCNT_dec(swash);
11437                 swash = NULL;
11438             }
11439
11440             for (i = 0; i < 256; ++i) {
11441                 if (ANYOF_BITMAP_TEST(ret, i)) {
11442                     ANYOF_BITMAP_CLEAR(ret, i);
11443                     if (DEPENDS_SEMANTICS) {
11444                         if (! remove_list) {
11445                             remove_list = _new_invlist(2);
11446                         }
11447                         remove_list = add_cp_to_invlist(remove_list, i);
11448                     }
11449                 }
11450                 else {
11451                     ANYOF_BITMAP_SET(ret, i);
11452                     prevvalue = value;
11453                     value = i;
11454                 }
11455             }
11456
11457             /* And do the removal */
11458             if (DEPENDS_SEMANTICS) {
11459                 if (remove_list) {
11460                     _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11461                     SvREFCNT_dec(remove_list);
11462                 }
11463             }
11464             else {
11465                 /* There is no overlap for non-/d, so just delete anything
11466                  * below 256 */
11467                 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
11468             }
11469         }
11470
11471         stored = 256 - stored;
11472
11473         /* Clear the invert flag since have just done it here */
11474         ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11475     }
11476
11477     /* Folding in the bitmap is taken care of above, but not for locale (for
11478      * which we have to wait to see what folding is in effect at runtime), and
11479      * for some things not in the bitmap (only the upper latin folds in this
11480      * case, as all other single-char folding has been set above).  Set
11481      * run-time fold flag for these */
11482     if (FOLD && (LOC
11483                 || (DEPENDS_SEMANTICS
11484                     && nonbitmap
11485                     && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11486                 || unicode_alternate))
11487     {
11488         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11489     }
11490
11491     /* A single character class can be "optimized" into an EXACTish node.
11492      * Note that since we don't currently count how many characters there are
11493      * outside the bitmap, we are XXX missing optimization possibilities for
11494      * them.  This optimization can't happen unless this is a truly single
11495      * character class, which means that it can't be an inversion into a
11496      * many-character class, and there must be no possibility of there being
11497      * things outside the bitmap.  'stored' (only) for locales doesn't include
11498      * \w, etc, so have to make a special test that they aren't present
11499      *
11500      * Similarly A 2-character class of the very special form like [bB] can be
11501      * optimized into an EXACTFish node, but only for non-locales, and for
11502      * characters which only have the two folds; so things like 'fF' and 'Ii'
11503      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11504      * FI'. */
11505     if (! nonbitmap
11506         && ! unicode_alternate
11507         && SvCUR(listsv) == initial_listsv_len
11508         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11509         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11510                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11511             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11512                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11513                                  /* If the latest code point has a fold whose
11514                                   * bit is set, it must be the only other one */
11515                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11516                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11517     {
11518         /* Note that the information needed to decide to do this optimization
11519          * is not currently available until the 2nd pass, and that the actually
11520          * used EXACTish node takes less space than the calculated ANYOF node,
11521          * and hence the amount of space calculated in the first pass is larger
11522          * than actually used, so this optimization doesn't gain us any space.
11523          * But an EXACT node is faster than an ANYOF node, and can be combined
11524          * with any adjacent EXACT nodes later by the optimizer for further
11525          * gains.  The speed of executing an EXACTF is similar to an ANYOF
11526          * node, so the optimization advantage comes from the ability to join
11527          * it to adjacent EXACT nodes */
11528
11529         const char * cur_parse= RExC_parse;
11530         U8 op;
11531         RExC_emit = (regnode *)orig_emit;
11532         RExC_parse = (char *)orig_parse;
11533
11534         if (stored == 1) {
11535
11536             /* A locale node with one point can be folded; all the other cases
11537              * with folding will have two points, since we calculate them above
11538              */
11539             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11540                  op = EXACTFL;
11541             }
11542             else {
11543                 op = EXACT;
11544             }
11545         }
11546         else {   /* else 2 chars in the bit map: the folds of each other */
11547
11548             /* Use the folded value, which for the cases where we get here,
11549              * is just the lower case of the current one (which may resolve to
11550              * itself, or to the other one */
11551             value = toLOWER_LATIN1(value);
11552
11553             /* To join adjacent nodes, they must be the exact EXACTish type.
11554              * Try to use the most likely type, by using EXACTFA if possible,
11555              * then EXACTFU if the regex calls for it, or is required because
11556              * the character is non-ASCII.  (If <value> is ASCII, its fold is
11557              * also ASCII for the cases where we get here.) */
11558             if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11559                 op = EXACTFA;
11560             }
11561             else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11562                 op = EXACTFU;
11563             }
11564             else {    /* Otherwise, more likely to be EXACTF type */
11565                 op = EXACTF;
11566             }
11567         }
11568
11569         ret = reg_node(pRExC_state, op);
11570         RExC_parse = (char *)cur_parse;
11571         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11572             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11573             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11574             STR_LEN(ret)= 2;
11575             RExC_emit += STR_SZ(2);
11576         }
11577         else {
11578             *STRING(ret)= (char)value;
11579             STR_LEN(ret)= 1;
11580             RExC_emit += STR_SZ(1);
11581         }
11582         SvREFCNT_dec(listsv);
11583         return ret;
11584     }
11585
11586     /* If there is a swash and more than one element, we can't use the swash in
11587      * the optimization below. */
11588     if (swash && element_count > 1) {
11589         SvREFCNT_dec(swash);
11590         swash = NULL;
11591     }
11592     if (! nonbitmap
11593         && SvCUR(listsv) == initial_listsv_len
11594         && ! unicode_alternate)
11595     {
11596         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11597         SvREFCNT_dec(listsv);
11598         SvREFCNT_dec(unicode_alternate);
11599     }
11600     else {
11601         /* av[0] stores the character class description in its textual form:
11602          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
11603          *       appropriate swash, and is also useful for dumping the regnode.
11604          * av[1] if NULL, is a placeholder to later contain the swash computed
11605          *       from av[0].  But if no further computation need be done, the
11606          *       swash is stored there now.
11607          * av[2] stores the multicharacter foldings, used later in
11608          *       regexec.c:S_reginclass().
11609          * av[3] stores the nonbitmap inversion list for use in addition or
11610          *       instead of av[0]; not used if av[1] isn't NULL
11611          * av[4] is set if any component of the class is from a user-defined
11612          *       property; not used if av[1] isn't NULL */
11613         AV * const av = newAV();
11614         SV *rv;
11615
11616         av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11617                         ? &PL_sv_undef
11618                         : listsv);
11619         if (swash) {
11620             av_store(av, 1, swash);
11621             SvREFCNT_dec(nonbitmap);
11622         }
11623         else {
11624             av_store(av, 1, NULL);
11625             if (nonbitmap) {
11626                 av_store(av, 3, nonbitmap);
11627                 av_store(av, 4, newSVuv(has_user_defined_property));
11628             }
11629         }
11630
11631         /* Store any computed multi-char folds only if we are allowing
11632          * them */
11633         if (allow_full_fold) {
11634             av_store(av, 2, MUTABLE_SV(unicode_alternate));
11635             if (unicode_alternate) { /* This node is variable length */
11636                 OP(ret) = ANYOFV;
11637             }
11638         }
11639         else {
11640             av_store(av, 2, NULL);
11641         }
11642         rv = newRV_noinc(MUTABLE_SV(av));
11643         n = add_data(pRExC_state, 1, "s");
11644         RExC_rxi->data->data[n] = (void*)rv;
11645         ARG_SET(ret, n);
11646     }
11647     return ret;
11648 }
11649
11650
11651 /* reg_skipcomment()
11652
11653    Absorbs an /x style # comments from the input stream.
11654    Returns true if there is more text remaining in the stream.
11655    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11656    terminates the pattern without including a newline.
11657
11658    Note its the callers responsibility to ensure that we are
11659    actually in /x mode
11660
11661 */
11662
11663 STATIC bool
11664 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11665 {
11666     bool ended = 0;
11667
11668     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11669
11670     while (RExC_parse < RExC_end)
11671         if (*RExC_parse++ == '\n') {
11672             ended = 1;
11673             break;
11674         }
11675     if (!ended) {
11676         /* we ran off the end of the pattern without ending
11677            the comment, so we have to add an \n when wrapping */
11678         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11679         return 0;
11680     } else
11681         return 1;
11682 }
11683
11684 /* nextchar()
11685
11686    Advances the parse position, and optionally absorbs
11687    "whitespace" from the inputstream.
11688
11689    Without /x "whitespace" means (?#...) style comments only,
11690    with /x this means (?#...) and # comments and whitespace proper.
11691
11692    Returns the RExC_parse point from BEFORE the scan occurs.
11693
11694    This is the /x friendly way of saying RExC_parse++.
11695 */
11696
11697 STATIC char*
11698 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11699 {
11700     char* const retval = RExC_parse++;
11701
11702     PERL_ARGS_ASSERT_NEXTCHAR;
11703
11704     for (;;) {
11705         if (RExC_end - RExC_parse >= 3
11706             && *RExC_parse == '('
11707             && RExC_parse[1] == '?'
11708             && RExC_parse[2] == '#')
11709         {
11710             while (*RExC_parse != ')') {
11711                 if (RExC_parse == RExC_end)
11712                     FAIL("Sequence (?#... not terminated");
11713                 RExC_parse++;
11714             }
11715             RExC_parse++;
11716             continue;
11717         }
11718         if (RExC_flags & RXf_PMf_EXTENDED) {
11719             if (isSPACE(*RExC_parse)) {
11720                 RExC_parse++;
11721                 continue;
11722             }
11723             else if (*RExC_parse == '#') {
11724                 if ( reg_skipcomment( pRExC_state ) )
11725                     continue;
11726             }
11727         }
11728         return retval;
11729     }
11730 }
11731
11732 /*
11733 - reg_node - emit a node
11734 */
11735 STATIC regnode *                        /* Location. */
11736 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11737 {
11738     dVAR;
11739     register regnode *ptr;
11740     regnode * const ret = RExC_emit;
11741     GET_RE_DEBUG_FLAGS_DECL;
11742
11743     PERL_ARGS_ASSERT_REG_NODE;
11744
11745     if (SIZE_ONLY) {
11746         SIZE_ALIGN(RExC_size);
11747         RExC_size += 1;
11748         return(ret);
11749     }
11750     if (RExC_emit >= RExC_emit_bound)
11751         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11752                    op, RExC_emit, RExC_emit_bound);
11753
11754     NODE_ALIGN_FILL(ret);
11755     ptr = ret;
11756     FILL_ADVANCE_NODE(ptr, op);
11757     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
11758 #ifdef RE_TRACK_PATTERN_OFFSETS
11759     if (RExC_offsets) {         /* MJD */
11760         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11761               "reg_node", __LINE__,
11762               PL_reg_name[op],
11763               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11764                 ? "Overwriting end of array!\n" : "OK",
11765               (UV)(RExC_emit - RExC_emit_start),
11766               (UV)(RExC_parse - RExC_start),
11767               (UV)RExC_offsets[0]));
11768         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11769     }
11770 #endif
11771     RExC_emit = ptr;
11772     return(ret);
11773 }
11774
11775 /*
11776 - reganode - emit a node with an argument
11777 */
11778 STATIC regnode *                        /* Location. */
11779 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11780 {
11781     dVAR;
11782     register regnode *ptr;
11783     regnode * const ret = RExC_emit;
11784     GET_RE_DEBUG_FLAGS_DECL;
11785
11786     PERL_ARGS_ASSERT_REGANODE;
11787
11788     if (SIZE_ONLY) {
11789         SIZE_ALIGN(RExC_size);
11790         RExC_size += 2;
11791         /*
11792            We can't do this:
11793
11794            assert(2==regarglen[op]+1);
11795
11796            Anything larger than this has to allocate the extra amount.
11797            If we changed this to be:
11798
11799            RExC_size += (1 + regarglen[op]);
11800
11801            then it wouldn't matter. Its not clear what side effect
11802            might come from that so its not done so far.
11803            -- dmq
11804         */
11805         return(ret);
11806     }
11807     if (RExC_emit >= RExC_emit_bound)
11808         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11809                    op, RExC_emit, RExC_emit_bound);
11810
11811     NODE_ALIGN_FILL(ret);
11812     ptr = ret;
11813     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11814     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
11815 #ifdef RE_TRACK_PATTERN_OFFSETS
11816     if (RExC_offsets) {         /* MJD */
11817         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11818               "reganode",
11819               __LINE__,
11820               PL_reg_name[op],
11821               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11822               "Overwriting end of array!\n" : "OK",
11823               (UV)(RExC_emit - RExC_emit_start),
11824               (UV)(RExC_parse - RExC_start),
11825               (UV)RExC_offsets[0]));
11826         Set_Cur_Node_Offset;
11827     }
11828 #endif
11829     RExC_emit = ptr;
11830     return(ret);
11831 }
11832
11833 /*
11834 - reguni - emit (if appropriate) a Unicode character
11835 */
11836 STATIC STRLEN
11837 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11838 {
11839     dVAR;
11840
11841     PERL_ARGS_ASSERT_REGUNI;
11842
11843     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11844 }
11845
11846 /*
11847 - reginsert - insert an operator in front of already-emitted operand
11848 *
11849 * Means relocating the operand.
11850 */
11851 STATIC void
11852 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11853 {
11854     dVAR;
11855     register regnode *src;
11856     register regnode *dst;
11857     register regnode *place;
11858     const int offset = regarglen[(U8)op];
11859     const int size = NODE_STEP_REGNODE + offset;
11860     GET_RE_DEBUG_FLAGS_DECL;
11861
11862     PERL_ARGS_ASSERT_REGINSERT;
11863     PERL_UNUSED_ARG(depth);
11864 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11865     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11866     if (SIZE_ONLY) {
11867         RExC_size += size;
11868         return;
11869     }
11870
11871     src = RExC_emit;
11872     RExC_emit += size;
11873     dst = RExC_emit;
11874     if (RExC_open_parens) {
11875         int paren;
11876         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11877         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11878             if ( RExC_open_parens[paren] >= opnd ) {
11879                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11880                 RExC_open_parens[paren] += size;
11881             } else {
11882                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11883             }
11884             if ( RExC_close_parens[paren] >= opnd ) {
11885                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11886                 RExC_close_parens[paren] += size;
11887             } else {
11888                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11889             }
11890         }
11891     }
11892
11893     while (src > opnd) {
11894         StructCopy(--src, --dst, regnode);
11895 #ifdef RE_TRACK_PATTERN_OFFSETS
11896         if (RExC_offsets) {     /* MJD 20010112 */
11897             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11898                   "reg_insert",
11899                   __LINE__,
11900                   PL_reg_name[op],
11901                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11902                     ? "Overwriting end of array!\n" : "OK",
11903                   (UV)(src - RExC_emit_start),
11904                   (UV)(dst - RExC_emit_start),
11905                   (UV)RExC_offsets[0]));
11906             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
11907             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
11908         }
11909 #endif
11910     }
11911
11912
11913     place = opnd;               /* Op node, where operand used to be. */
11914 #ifdef RE_TRACK_PATTERN_OFFSETS
11915     if (RExC_offsets) {         /* MJD */
11916         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11917               "reginsert",
11918               __LINE__,
11919               PL_reg_name[op],
11920               (UV)(place - RExC_emit_start) > RExC_offsets[0]
11921               ? "Overwriting end of array!\n" : "OK",
11922               (UV)(place - RExC_emit_start),
11923               (UV)(RExC_parse - RExC_start),
11924               (UV)RExC_offsets[0]));
11925         Set_Node_Offset(place, RExC_parse);
11926         Set_Node_Length(place, 1);
11927     }
11928 #endif
11929     src = NEXTOPER(place);
11930     FILL_ADVANCE_NODE(place, op);
11931     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
11932     Zero(src, offset, regnode);
11933 }
11934
11935 /*
11936 - regtail - set the next-pointer at the end of a node chain of p to val.
11937 - SEE ALSO: regtail_study
11938 */
11939 /* TODO: All three parms should be const */
11940 STATIC void
11941 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11942 {
11943     dVAR;
11944     register regnode *scan;
11945     GET_RE_DEBUG_FLAGS_DECL;
11946
11947     PERL_ARGS_ASSERT_REGTAIL;
11948 #ifndef DEBUGGING
11949     PERL_UNUSED_ARG(depth);
11950 #endif
11951
11952     if (SIZE_ONLY)
11953         return;
11954
11955     /* Find last node. */
11956     scan = p;
11957     for (;;) {
11958         regnode * const temp = regnext(scan);
11959         DEBUG_PARSE_r({
11960             SV * const mysv=sv_newmortal();
11961             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
11962             regprop(RExC_rx, mysv, scan);
11963             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
11964                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
11965                     (temp == NULL ? "->" : ""),
11966                     (temp == NULL ? PL_reg_name[OP(val)] : "")
11967             );
11968         });
11969         if (temp == NULL)
11970             break;
11971         scan = temp;
11972     }
11973
11974     if (reg_off_by_arg[OP(scan)]) {
11975         ARG_SET(scan, val - scan);
11976     }
11977     else {
11978         NEXT_OFF(scan) = val - scan;
11979     }
11980 }
11981
11982 #ifdef DEBUGGING
11983 /*
11984 - regtail_study - set the next-pointer at the end of a node chain of p to val.
11985 - Look for optimizable sequences at the same time.
11986 - currently only looks for EXACT chains.
11987
11988 This is experimental code. The idea is to use this routine to perform
11989 in place optimizations on branches and groups as they are constructed,
11990 with the long term intention of removing optimization from study_chunk so
11991 that it is purely analytical.
11992
11993 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
11994 to control which is which.
11995
11996 */
11997 /* TODO: All four parms should be const */
11998
11999 STATIC U8
12000 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12001 {
12002     dVAR;
12003     register regnode *scan;
12004     U8 exact = PSEUDO;
12005 #ifdef EXPERIMENTAL_INPLACESCAN
12006     I32 min = 0;
12007 #endif
12008     GET_RE_DEBUG_FLAGS_DECL;
12009
12010     PERL_ARGS_ASSERT_REGTAIL_STUDY;
12011
12012
12013     if (SIZE_ONLY)
12014         return exact;
12015
12016     /* Find last node. */
12017
12018     scan = p;
12019     for (;;) {
12020         regnode * const temp = regnext(scan);
12021 #ifdef EXPERIMENTAL_INPLACESCAN
12022         if (PL_regkind[OP(scan)] == EXACT) {
12023             bool has_exactf_sharp_s;    /* Unexamined in this routine */
12024             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12025                 return EXACT;
12026         }
12027 #endif
12028         if ( exact ) {
12029             switch (OP(scan)) {
12030                 case EXACT:
12031                 case EXACTF:
12032                 case EXACTFA:
12033                 case EXACTFU:
12034                 case EXACTFU_SS:
12035                 case EXACTFU_NO_TRIE:
12036                 case EXACTFL:
12037                         if( exact == PSEUDO )
12038                             exact= OP(scan);
12039                         else if ( exact != OP(scan) )
12040                             exact= 0;
12041                 case NOTHING:
12042                     break;
12043                 default:
12044                     exact= 0;
12045             }
12046         }
12047         DEBUG_PARSE_r({
12048             SV * const mysv=sv_newmortal();
12049             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12050             regprop(RExC_rx, mysv, scan);
12051             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12052                 SvPV_nolen_const(mysv),
12053                 REG_NODE_NUM(scan),
12054                 PL_reg_name[exact]);
12055         });
12056         if (temp == NULL)
12057             break;
12058         scan = temp;
12059     }
12060     DEBUG_PARSE_r({
12061         SV * const mysv_val=sv_newmortal();
12062         DEBUG_PARSE_MSG("");
12063         regprop(RExC_rx, mysv_val, val);
12064         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12065                       SvPV_nolen_const(mysv_val),
12066                       (IV)REG_NODE_NUM(val),
12067                       (IV)(val - scan)
12068         );
12069     });
12070     if (reg_off_by_arg[OP(scan)]) {
12071         ARG_SET(scan, val - scan);
12072     }
12073     else {
12074         NEXT_OFF(scan) = val - scan;
12075     }
12076
12077     return exact;
12078 }
12079 #endif
12080
12081 /*
12082  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12083  */
12084 #ifdef DEBUGGING
12085 static void
12086 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12087 {
12088     int bit;
12089     int set=0;
12090     regex_charset cs;
12091
12092     for (bit=0; bit<32; bit++) {
12093         if (flags & (1<<bit)) {
12094             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
12095                 continue;
12096             }
12097             if (!set++ && lead)
12098                 PerlIO_printf(Perl_debug_log, "%s",lead);
12099             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12100         }
12101     }
12102     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12103             if (!set++ && lead) {
12104                 PerlIO_printf(Perl_debug_log, "%s",lead);
12105             }
12106             switch (cs) {
12107                 case REGEX_UNICODE_CHARSET:
12108                     PerlIO_printf(Perl_debug_log, "UNICODE");
12109                     break;
12110                 case REGEX_LOCALE_CHARSET:
12111                     PerlIO_printf(Perl_debug_log, "LOCALE");
12112                     break;
12113                 case REGEX_ASCII_RESTRICTED_CHARSET:
12114                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12115                     break;
12116                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12117                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12118                     break;
12119                 default:
12120                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12121                     break;
12122             }
12123     }
12124     if (lead)  {
12125         if (set)
12126             PerlIO_printf(Perl_debug_log, "\n");
12127         else
12128             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12129     }
12130 }
12131 #endif
12132
12133 void
12134 Perl_regdump(pTHX_ const regexp *r)
12135 {
12136 #ifdef DEBUGGING
12137     dVAR;
12138     SV * const sv = sv_newmortal();
12139     SV *dsv= sv_newmortal();
12140     RXi_GET_DECL(r,ri);
12141     GET_RE_DEBUG_FLAGS_DECL;
12142
12143     PERL_ARGS_ASSERT_REGDUMP;
12144
12145     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12146
12147     /* Header fields of interest. */
12148     if (r->anchored_substr) {
12149         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12150             RE_SV_DUMPLEN(r->anchored_substr), 30);
12151         PerlIO_printf(Perl_debug_log,
12152                       "anchored %s%s at %"IVdf" ",
12153                       s, RE_SV_TAIL(r->anchored_substr),
12154                       (IV)r->anchored_offset);
12155     } else if (r->anchored_utf8) {
12156         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12157             RE_SV_DUMPLEN(r->anchored_utf8), 30);
12158         PerlIO_printf(Perl_debug_log,
12159                       "anchored utf8 %s%s at %"IVdf" ",
12160                       s, RE_SV_TAIL(r->anchored_utf8),
12161                       (IV)r->anchored_offset);
12162     }
12163     if (r->float_substr) {
12164         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12165             RE_SV_DUMPLEN(r->float_substr), 30);
12166         PerlIO_printf(Perl_debug_log,
12167                       "floating %s%s at %"IVdf"..%"UVuf" ",
12168                       s, RE_SV_TAIL(r->float_substr),
12169                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12170     } else if (r->float_utf8) {
12171         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12172             RE_SV_DUMPLEN(r->float_utf8), 30);
12173         PerlIO_printf(Perl_debug_log,
12174                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12175                       s, RE_SV_TAIL(r->float_utf8),
12176                       (IV)r->float_min_offset, (UV)r->float_max_offset);
12177     }
12178     if (r->check_substr || r->check_utf8)
12179         PerlIO_printf(Perl_debug_log,
12180                       (const char *)
12181                       (r->check_substr == r->float_substr
12182                        && r->check_utf8 == r->float_utf8
12183                        ? "(checking floating" : "(checking anchored"));
12184     if (r->extflags & RXf_NOSCAN)
12185         PerlIO_printf(Perl_debug_log, " noscan");
12186     if (r->extflags & RXf_CHECK_ALL)
12187         PerlIO_printf(Perl_debug_log, " isall");
12188     if (r->check_substr || r->check_utf8)
12189         PerlIO_printf(Perl_debug_log, ") ");
12190
12191     if (ri->regstclass) {
12192         regprop(r, sv, ri->regstclass);
12193         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12194     }
12195     if (r->extflags & RXf_ANCH) {
12196         PerlIO_printf(Perl_debug_log, "anchored");
12197         if (r->extflags & RXf_ANCH_BOL)
12198             PerlIO_printf(Perl_debug_log, "(BOL)");
12199         if (r->extflags & RXf_ANCH_MBOL)
12200             PerlIO_printf(Perl_debug_log, "(MBOL)");
12201         if (r->extflags & RXf_ANCH_SBOL)
12202             PerlIO_printf(Perl_debug_log, "(SBOL)");
12203         if (r->extflags & RXf_ANCH_GPOS)
12204             PerlIO_printf(Perl_debug_log, "(GPOS)");
12205         PerlIO_putc(Perl_debug_log, ' ');
12206     }
12207     if (r->extflags & RXf_GPOS_SEEN)
12208         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12209     if (r->intflags & PREGf_SKIP)
12210         PerlIO_printf(Perl_debug_log, "plus ");
12211     if (r->intflags & PREGf_IMPLICIT)
12212         PerlIO_printf(Perl_debug_log, "implicit ");
12213     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12214     if (r->extflags & RXf_EVAL_SEEN)
12215         PerlIO_printf(Perl_debug_log, "with eval ");
12216     PerlIO_printf(Perl_debug_log, "\n");
12217     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12218 #else
12219     PERL_ARGS_ASSERT_REGDUMP;
12220     PERL_UNUSED_CONTEXT;
12221     PERL_UNUSED_ARG(r);
12222 #endif  /* DEBUGGING */
12223 }
12224
12225 /*
12226 - regprop - printable representation of opcode
12227 */
12228 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12229 STMT_START { \
12230         if (do_sep) {                           \
12231             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12232             if (flags & ANYOF_INVERT)           \
12233                 /*make sure the invert info is in each */ \
12234                 sv_catpvs(sv, "^");             \
12235             do_sep = 0;                         \
12236         }                                       \
12237 } STMT_END
12238
12239 void
12240 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12241 {
12242 #ifdef DEBUGGING
12243     dVAR;
12244     register int k;
12245     RXi_GET_DECL(prog,progi);
12246     GET_RE_DEBUG_FLAGS_DECL;
12247
12248     PERL_ARGS_ASSERT_REGPROP;
12249
12250     sv_setpvs(sv, "");
12251
12252     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
12253         /* It would be nice to FAIL() here, but this may be called from
12254            regexec.c, and it would be hard to supply pRExC_state. */
12255         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12256     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12257
12258     k = PL_regkind[OP(o)];
12259
12260     if (k == EXACT) {
12261         sv_catpvs(sv, " ");
12262         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12263          * is a crude hack but it may be the best for now since
12264          * we have no flag "this EXACTish node was UTF-8"
12265          * --jhi */
12266         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12267                   PERL_PV_ESCAPE_UNI_DETECT |
12268                   PERL_PV_ESCAPE_NONASCII   |
12269                   PERL_PV_PRETTY_ELLIPSES   |
12270                   PERL_PV_PRETTY_LTGT       |
12271                   PERL_PV_PRETTY_NOCLEAR
12272                   );
12273     } else if (k == TRIE) {
12274         /* print the details of the trie in dumpuntil instead, as
12275          * progi->data isn't available here */
12276         const char op = OP(o);
12277         const U32 n = ARG(o);
12278         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12279                (reg_ac_data *)progi->data->data[n] :
12280                NULL;
12281         const reg_trie_data * const trie
12282             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12283
12284         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12285         DEBUG_TRIE_COMPILE_r(
12286             Perl_sv_catpvf(aTHX_ sv,
12287                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12288                 (UV)trie->startstate,
12289                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12290                 (UV)trie->wordcount,
12291                 (UV)trie->minlen,
12292                 (UV)trie->maxlen,
12293                 (UV)TRIE_CHARCOUNT(trie),
12294                 (UV)trie->uniquecharcount
12295             )
12296         );
12297         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12298             int i;
12299             int rangestart = -1;
12300             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12301             sv_catpvs(sv, "[");
12302             for (i = 0; i <= 256; i++) {
12303                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12304                     if (rangestart == -1)
12305                         rangestart = i;
12306                 } else if (rangestart != -1) {
12307                     if (i <= rangestart + 3)
12308                         for (; rangestart < i; rangestart++)
12309                             put_byte(sv, rangestart);
12310                     else {
12311                         put_byte(sv, rangestart);
12312                         sv_catpvs(sv, "-");
12313                         put_byte(sv, i - 1);
12314                     }
12315                     rangestart = -1;
12316                 }
12317             }
12318             sv_catpvs(sv, "]");
12319         }
12320
12321     } else if (k == CURLY) {
12322         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12323             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12324         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12325     }
12326     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
12327         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12328     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12329         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
12330         if ( RXp_PAREN_NAMES(prog) ) {
12331             if ( k != REF || (OP(o) < NREF)) {
12332                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12333                 SV **name= av_fetch(list, ARG(o), 0 );
12334                 if (name)
12335                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12336             }
12337             else {
12338                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12339                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12340                 I32 *nums=(I32*)SvPVX(sv_dat);
12341                 SV **name= av_fetch(list, nums[0], 0 );
12342                 I32 n;
12343                 if (name) {
12344                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
12345                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12346                                     (n ? "," : ""), (IV)nums[n]);
12347                     }
12348                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12349                 }
12350             }
12351         }
12352     } else if (k == GOSUB)
12353         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12354     else if (k == VERB) {
12355         if (!o->flags)
12356             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12357                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12358     } else if (k == LOGICAL)
12359         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
12360     else if (k == ANYOF) {
12361         int i, rangestart = -1;
12362         const U8 flags = ANYOF_FLAGS(o);
12363         int do_sep = 0;
12364
12365         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12366         static const char * const anyofs[] = {
12367             "\\w",
12368             "\\W",
12369             "\\s",
12370             "\\S",
12371             "\\d",
12372             "\\D",
12373             "[:alnum:]",
12374             "[:^alnum:]",
12375             "[:alpha:]",
12376             "[:^alpha:]",
12377             "[:ascii:]",
12378             "[:^ascii:]",
12379             "[:cntrl:]",
12380             "[:^cntrl:]",
12381             "[:graph:]",
12382             "[:^graph:]",
12383             "[:lower:]",
12384             "[:^lower:]",
12385             "[:print:]",
12386             "[:^print:]",
12387             "[:punct:]",
12388             "[:^punct:]",
12389             "[:upper:]",
12390             "[:^upper:]",
12391             "[:xdigit:]",
12392             "[:^xdigit:]",
12393             "[:space:]",
12394             "[:^space:]",
12395             "[:blank:]",
12396             "[:^blank:]"
12397         };
12398
12399         if (flags & ANYOF_LOCALE)
12400             sv_catpvs(sv, "{loc}");
12401         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12402             sv_catpvs(sv, "{i}");
12403         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12404         if (flags & ANYOF_INVERT)
12405             sv_catpvs(sv, "^");
12406
12407         /* output what the standard cp 0-255 bitmap matches */
12408         for (i = 0; i <= 256; i++) {
12409             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12410                 if (rangestart == -1)
12411                     rangestart = i;
12412             } else if (rangestart != -1) {
12413                 if (i <= rangestart + 3)
12414                     for (; rangestart < i; rangestart++)
12415                         put_byte(sv, rangestart);
12416                 else {
12417                     put_byte(sv, rangestart);
12418                     sv_catpvs(sv, "-");
12419                     put_byte(sv, i - 1);
12420                 }
12421                 do_sep = 1;
12422                 rangestart = -1;
12423             }
12424         }
12425
12426         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12427         /* output any special charclass tests (used entirely under use locale) */
12428         if (ANYOF_CLASS_TEST_ANY_SET(o))
12429             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12430                 if (ANYOF_CLASS_TEST(o,i)) {
12431                     sv_catpv(sv, anyofs[i]);
12432                     do_sep = 1;
12433                 }
12434
12435         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12436
12437         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12438             sv_catpvs(sv, "{non-utf8-latin1-all}");
12439         }
12440
12441         /* output information about the unicode matching */
12442         if (flags & ANYOF_UNICODE_ALL)
12443             sv_catpvs(sv, "{unicode_all}");
12444         else if (ANYOF_NONBITMAP(o))
12445             sv_catpvs(sv, "{unicode}");
12446         if (flags & ANYOF_NONBITMAP_NON_UTF8)
12447             sv_catpvs(sv, "{outside bitmap}");
12448
12449         if (ANYOF_NONBITMAP(o)) {
12450             SV *lv; /* Set if there is something outside the bit map */
12451             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12452             bool byte_output = FALSE;   /* If something in the bitmap has been
12453                                            output */
12454
12455             if (lv && lv != &PL_sv_undef) {
12456                 if (sw) {
12457                     U8 s[UTF8_MAXBYTES_CASE+1];
12458
12459                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12460                         uvchr_to_utf8(s, i);
12461
12462                         if (i < 256
12463                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
12464                                                                things already
12465                                                                output as part
12466                                                                of the bitmap */
12467                             && swash_fetch(sw, s, TRUE))
12468                         {
12469                             if (rangestart == -1)
12470                                 rangestart = i;
12471                         } else if (rangestart != -1) {
12472                             byte_output = TRUE;
12473                             if (i <= rangestart + 3)
12474                                 for (; rangestart < i; rangestart++) {
12475                                     put_byte(sv, rangestart);
12476                                 }
12477                             else {
12478                                 put_byte(sv, rangestart);
12479                                 sv_catpvs(sv, "-");
12480                                 put_byte(sv, i-1);
12481                             }
12482                             rangestart = -1;
12483                         }
12484                     }
12485                 }
12486
12487                 {
12488                     char *s = savesvpv(lv);
12489                     char * const origs = s;
12490
12491                     while (*s && *s != '\n')
12492                         s++;
12493
12494                     if (*s == '\n') {
12495                         const char * const t = ++s;
12496
12497                         if (byte_output) {
12498                             sv_catpvs(sv, " ");
12499                         }
12500
12501                         while (*s) {
12502                             if (*s == '\n') {
12503
12504                                 /* Truncate very long output */
12505                                 if (s - origs > 256) {
12506                                     Perl_sv_catpvf(aTHX_ sv,
12507                                                    "%.*s...",
12508                                                    (int) (s - origs - 1),
12509                                                    t);
12510                                     goto out_dump;
12511                                 }
12512                                 *s = ' ';
12513                             }
12514                             else if (*s == '\t') {
12515                                 *s = '-';
12516                             }
12517                             s++;
12518                         }
12519                         if (s[-1] == ' ')
12520                             s[-1] = 0;
12521
12522                         sv_catpv(sv, t);
12523                     }
12524
12525                 out_dump:
12526
12527                     Safefree(origs);
12528                 }
12529                 SvREFCNT_dec(lv);
12530             }
12531         }
12532
12533         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12534     }
12535     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12536         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12537 #else
12538     PERL_UNUSED_CONTEXT;
12539     PERL_UNUSED_ARG(sv);
12540     PERL_UNUSED_ARG(o);
12541     PERL_UNUSED_ARG(prog);
12542 #endif  /* DEBUGGING */
12543 }
12544
12545 SV *
12546 Perl_re_intuit_string(pTHX_ REGEXP * const r)
12547 {                               /* Assume that RE_INTUIT is set */
12548     dVAR;
12549     struct regexp *const prog = (struct regexp *)SvANY(r);
12550     GET_RE_DEBUG_FLAGS_DECL;
12551
12552     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12553     PERL_UNUSED_CONTEXT;
12554
12555     DEBUG_COMPILE_r(
12556         {
12557             const char * const s = SvPV_nolen_const(prog->check_substr
12558                       ? prog->check_substr : prog->check_utf8);
12559
12560             if (!PL_colorset) reginitcolors();
12561             PerlIO_printf(Perl_debug_log,
12562                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12563                       PL_colors[4],
12564                       prog->check_substr ? "" : "utf8 ",
12565                       PL_colors[5],PL_colors[0],
12566                       s,
12567                       PL_colors[1],
12568                       (strlen(s) > 60 ? "..." : ""));
12569         } );
12570
12571     return prog->check_substr ? prog->check_substr : prog->check_utf8;
12572 }
12573
12574 /*
12575    pregfree()
12576
12577    handles refcounting and freeing the perl core regexp structure. When
12578    it is necessary to actually free the structure the first thing it
12579    does is call the 'free' method of the regexp_engine associated to
12580    the regexp, allowing the handling of the void *pprivate; member
12581    first. (This routine is not overridable by extensions, which is why
12582    the extensions free is called first.)
12583
12584    See regdupe and regdupe_internal if you change anything here.
12585 */
12586 #ifndef PERL_IN_XSUB_RE
12587 void
12588 Perl_pregfree(pTHX_ REGEXP *r)
12589 {
12590     SvREFCNT_dec(r);
12591 }
12592
12593 void
12594 Perl_pregfree2(pTHX_ REGEXP *rx)
12595 {
12596     dVAR;
12597     struct regexp *const r = (struct regexp *)SvANY(rx);
12598     GET_RE_DEBUG_FLAGS_DECL;
12599
12600     PERL_ARGS_ASSERT_PREGFREE2;
12601
12602     if (r->mother_re) {
12603         ReREFCNT_dec(r->mother_re);
12604     } else {
12605         CALLREGFREE_PVT(rx); /* free the private data */
12606         SvREFCNT_dec(RXp_PAREN_NAMES(r));
12607     }
12608     if (r->substrs) {
12609         SvREFCNT_dec(r->anchored_substr);
12610         SvREFCNT_dec(r->anchored_utf8);
12611         SvREFCNT_dec(r->float_substr);
12612         SvREFCNT_dec(r->float_utf8);
12613         Safefree(r->substrs);
12614     }
12615     RX_MATCH_COPY_FREE(rx);
12616 #ifdef PERL_OLD_COPY_ON_WRITE
12617     SvREFCNT_dec(r->saved_copy);
12618 #endif
12619     Safefree(r->offs);
12620 }
12621
12622 /*  reg_temp_copy()
12623
12624     This is a hacky workaround to the structural issue of match results
12625     being stored in the regexp structure which is in turn stored in
12626     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12627     could be PL_curpm in multiple contexts, and could require multiple
12628     result sets being associated with the pattern simultaneously, such
12629     as when doing a recursive match with (??{$qr})
12630
12631     The solution is to make a lightweight copy of the regexp structure
12632     when a qr// is returned from the code executed by (??{$qr}) this
12633     lightweight copy doesn't actually own any of its data except for
12634     the starp/end and the actual regexp structure itself.
12635
12636 */
12637
12638
12639 REGEXP *
12640 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12641 {
12642     struct regexp *ret;
12643     struct regexp *const r = (struct regexp *)SvANY(rx);
12644     register const I32 npar = r->nparens+1;
12645
12646     PERL_ARGS_ASSERT_REG_TEMP_COPY;
12647
12648     if (!ret_x)
12649         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12650     ret = (struct regexp *)SvANY(ret_x);
12651
12652     (void)ReREFCNT_inc(rx);
12653     /* We can take advantage of the existing "copied buffer" mechanism in SVs
12654        by pointing directly at the buffer, but flagging that the allocated
12655        space in the copy is zero. As we've just done a struct copy, it's now
12656        a case of zero-ing that, rather than copying the current length.  */
12657     SvPV_set(ret_x, RX_WRAPPED(rx));
12658     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12659     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12660            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12661     SvLEN_set(ret_x, 0);
12662     SvSTASH_set(ret_x, NULL);
12663     SvMAGIC_set(ret_x, NULL);
12664     Newx(ret->offs, npar, regexp_paren_pair);
12665     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12666     if (r->substrs) {
12667         Newx(ret->substrs, 1, struct reg_substr_data);
12668         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12669
12670         SvREFCNT_inc_void(ret->anchored_substr);
12671         SvREFCNT_inc_void(ret->anchored_utf8);
12672         SvREFCNT_inc_void(ret->float_substr);
12673         SvREFCNT_inc_void(ret->float_utf8);
12674
12675         /* check_substr and check_utf8, if non-NULL, point to either their
12676            anchored or float namesakes, and don't hold a second reference.  */
12677     }
12678     RX_MATCH_COPIED_off(ret_x);
12679 #ifdef PERL_OLD_COPY_ON_WRITE
12680     ret->saved_copy = NULL;
12681 #endif
12682     ret->mother_re = rx;
12683
12684     return ret_x;
12685 }
12686 #endif
12687
12688 /* regfree_internal()
12689
12690    Free the private data in a regexp. This is overloadable by
12691    extensions. Perl takes care of the regexp structure in pregfree(),
12692    this covers the *pprivate pointer which technically perl doesn't
12693    know about, however of course we have to handle the
12694    regexp_internal structure when no extension is in use.
12695
12696    Note this is called before freeing anything in the regexp
12697    structure.
12698  */
12699
12700 void
12701 Perl_regfree_internal(pTHX_ REGEXP * const rx)
12702 {
12703     dVAR;
12704     struct regexp *const r = (struct regexp *)SvANY(rx);
12705     RXi_GET_DECL(r,ri);
12706     GET_RE_DEBUG_FLAGS_DECL;
12707
12708     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12709
12710     DEBUG_COMPILE_r({
12711         if (!PL_colorset)
12712             reginitcolors();
12713         {
12714             SV *dsv= sv_newmortal();
12715             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12716                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12717             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12718                 PL_colors[4],PL_colors[5],s);
12719         }
12720     });
12721 #ifdef RE_TRACK_PATTERN_OFFSETS
12722     if (ri->u.offsets)
12723         Safefree(ri->u.offsets);             /* 20010421 MJD */
12724 #endif
12725     if (ri->data) {
12726         int n = ri->data->count;
12727         PAD* new_comppad = NULL;
12728         PAD* old_comppad;
12729         PADOFFSET refcnt;
12730
12731         while (--n >= 0) {
12732           /* If you add a ->what type here, update the comment in regcomp.h */
12733             switch (ri->data->what[n]) {
12734             case 'a':
12735             case 's':
12736             case 'S':
12737             case 'u':
12738                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12739                 break;
12740             case 'f':
12741                 Safefree(ri->data->data[n]);
12742                 break;
12743             case 'p':
12744                 new_comppad = MUTABLE_AV(ri->data->data[n]);
12745                 break;
12746             case 'o':
12747                 if (new_comppad == NULL)
12748                     Perl_croak(aTHX_ "panic: pregfree comppad");
12749                 PAD_SAVE_LOCAL(old_comppad,
12750                     /* Watch out for global destruction's random ordering. */
12751                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12752                 );
12753                 OP_REFCNT_LOCK;
12754                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12755                 OP_REFCNT_UNLOCK;
12756                 if (!refcnt)
12757                     op_free((OP_4tree*)ri->data->data[n]);
12758
12759                 PAD_RESTORE_LOCAL(old_comppad);
12760                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12761                 new_comppad = NULL;
12762                 break;
12763             case 'n':
12764                 break;
12765             case 'T':
12766                 { /* Aho Corasick add-on structure for a trie node.
12767                      Used in stclass optimization only */
12768                     U32 refcount;
12769                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12770                     OP_REFCNT_LOCK;
12771                     refcount = --aho->refcount;
12772                     OP_REFCNT_UNLOCK;
12773                     if ( !refcount ) {
12774                         PerlMemShared_free(aho->states);
12775                         PerlMemShared_free(aho->fail);
12776                          /* do this last!!!! */
12777                         PerlMemShared_free(ri->data->data[n]);
12778                         PerlMemShared_free(ri->regstclass);
12779                     }
12780                 }
12781                 break;
12782             case 't':
12783                 {
12784                     /* trie structure. */
12785                     U32 refcount;
12786                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12787                     OP_REFCNT_LOCK;
12788                     refcount = --trie->refcount;
12789                     OP_REFCNT_UNLOCK;
12790                     if ( !refcount ) {
12791                         PerlMemShared_free(trie->charmap);
12792                         PerlMemShared_free(trie->states);
12793                         PerlMemShared_free(trie->trans);
12794                         if (trie->bitmap)
12795                             PerlMemShared_free(trie->bitmap);
12796                         if (trie->jump)
12797                             PerlMemShared_free(trie->jump);
12798                         PerlMemShared_free(trie->wordinfo);
12799                         /* do this last!!!! */
12800                         PerlMemShared_free(ri->data->data[n]);
12801                     }
12802                 }
12803                 break;
12804             default:
12805                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12806             }
12807         }
12808         Safefree(ri->data->what);
12809         Safefree(ri->data);
12810     }
12811
12812     Safefree(ri);
12813 }
12814
12815 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12816 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
12817 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
12818
12819 /*
12820    re_dup - duplicate a regexp.
12821
12822    This routine is expected to clone a given regexp structure. It is only
12823    compiled under USE_ITHREADS.
12824
12825    After all of the core data stored in struct regexp is duplicated
12826    the regexp_engine.dupe method is used to copy any private data
12827    stored in the *pprivate pointer. This allows extensions to handle
12828    any duplication it needs to do.
12829
12830    See pregfree() and regfree_internal() if you change anything here.
12831 */
12832 #if defined(USE_ITHREADS)
12833 #ifndef PERL_IN_XSUB_RE
12834 void
12835 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12836 {
12837     dVAR;
12838     I32 npar;
12839     const struct regexp *r = (const struct regexp *)SvANY(sstr);
12840     struct regexp *ret = (struct regexp *)SvANY(dstr);
12841
12842     PERL_ARGS_ASSERT_RE_DUP_GUTS;
12843
12844     npar = r->nparens+1;
12845     Newx(ret->offs, npar, regexp_paren_pair);
12846     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12847     if(ret->swap) {
12848         /* no need to copy these */
12849         Newx(ret->swap, npar, regexp_paren_pair);
12850     }
12851
12852     if (ret->substrs) {
12853         /* Do it this way to avoid reading from *r after the StructCopy().
12854            That way, if any of the sv_dup_inc()s dislodge *r from the L1
12855            cache, it doesn't matter.  */
12856         const bool anchored = r->check_substr
12857             ? r->check_substr == r->anchored_substr
12858             : r->check_utf8 == r->anchored_utf8;
12859         Newx(ret->substrs, 1, struct reg_substr_data);
12860         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12861
12862         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12863         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12864         ret->float_substr = sv_dup_inc(ret->float_substr, param);
12865         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12866
12867         /* check_substr and check_utf8, if non-NULL, point to either their
12868            anchored or float namesakes, and don't hold a second reference.  */
12869
12870         if (ret->check_substr) {
12871             if (anchored) {
12872                 assert(r->check_utf8 == r->anchored_utf8);
12873                 ret->check_substr = ret->anchored_substr;
12874                 ret->check_utf8 = ret->anchored_utf8;
12875             } else {
12876                 assert(r->check_substr == r->float_substr);
12877                 assert(r->check_utf8 == r->float_utf8);
12878                 ret->check_substr = ret->float_substr;
12879                 ret->check_utf8 = ret->float_utf8;
12880             }
12881         } else if (ret->check_utf8) {
12882             if (anchored) {
12883                 ret->check_utf8 = ret->anchored_utf8;
12884             } else {
12885                 ret->check_utf8 = ret->float_utf8;
12886             }
12887         }
12888     }
12889
12890     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12891
12892     if (ret->pprivate)
12893         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12894
12895     if (RX_MATCH_COPIED(dstr))
12896         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
12897     else
12898         ret->subbeg = NULL;
12899 #ifdef PERL_OLD_COPY_ON_WRITE
12900     ret->saved_copy = NULL;
12901 #endif
12902
12903     if (ret->mother_re) {
12904         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12905             /* Our storage points directly to our mother regexp, but that's
12906                1: a buffer in a different thread
12907                2: something we no longer hold a reference on
12908                so we need to copy it locally.  */
12909             /* Note we need to use SvCUR(), rather than
12910                SvLEN(), on our mother_re, because it, in
12911                turn, may well be pointing to its own mother_re.  */
12912             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
12913                                    SvCUR(ret->mother_re)+1));
12914             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
12915         }
12916         ret->mother_re      = NULL;
12917     }
12918     ret->gofs = 0;
12919 }
12920 #endif /* PERL_IN_XSUB_RE */
12921
12922 /*
12923    regdupe_internal()
12924
12925    This is the internal complement to regdupe() which is used to copy
12926    the structure pointed to by the *pprivate pointer in the regexp.
12927    This is the core version of the extension overridable cloning hook.
12928    The regexp structure being duplicated will be copied by perl prior
12929    to this and will be provided as the regexp *r argument, however
12930    with the /old/ structures pprivate pointer value. Thus this routine
12931    may override any copying normally done by perl.
12932
12933    It returns a pointer to the new regexp_internal structure.
12934 */
12935
12936 void *
12937 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
12938 {
12939     dVAR;
12940     struct regexp *const r = (struct regexp *)SvANY(rx);
12941     regexp_internal *reti;
12942     int len;
12943     RXi_GET_DECL(r,ri);
12944
12945     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
12946
12947     len = ProgLen(ri);
12948
12949     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
12950     Copy(ri->program, reti->program, len+1, regnode);
12951
12952
12953     reti->regstclass = NULL;
12954
12955     if (ri->data) {
12956         struct reg_data *d;
12957         const int count = ri->data->count;
12958         int i;
12959
12960         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
12961                 char, struct reg_data);
12962         Newx(d->what, count, U8);
12963
12964         d->count = count;
12965         for (i = 0; i < count; i++) {
12966             d->what[i] = ri->data->what[i];
12967             switch (d->what[i]) {
12968                 /* legal options are one of: sSfpontTua
12969                    see also regcomp.h and pregfree() */
12970             case 'a': /* actually an AV, but the dup function is identical.  */
12971             case 's':
12972             case 'S':
12973             case 'p': /* actually an AV, but the dup function is identical.  */
12974             case 'u': /* actually an HV, but the dup function is identical.  */
12975                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
12976                 break;
12977             case 'f':
12978                 /* This is cheating. */
12979                 Newx(d->data[i], 1, struct regnode_charclass_class);
12980                 StructCopy(ri->data->data[i], d->data[i],
12981                             struct regnode_charclass_class);
12982                 reti->regstclass = (regnode*)d->data[i];
12983                 break;
12984             case 'o':
12985                 /* Compiled op trees are readonly and in shared memory,
12986                    and can thus be shared without duplication. */
12987                 OP_REFCNT_LOCK;
12988                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
12989                 OP_REFCNT_UNLOCK;
12990                 break;
12991             case 'T':
12992                 /* Trie stclasses are readonly and can thus be shared
12993                  * without duplication. We free the stclass in pregfree
12994                  * when the corresponding reg_ac_data struct is freed.
12995                  */
12996                 reti->regstclass= ri->regstclass;
12997                 /* Fall through */
12998             case 't':
12999                 OP_REFCNT_LOCK;
13000                 ((reg_trie_data*)ri->data->data[i])->refcount++;
13001                 OP_REFCNT_UNLOCK;
13002                 /* Fall through */
13003             case 'n':
13004                 d->data[i] = ri->data->data[i];
13005                 break;
13006             default:
13007                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13008             }
13009         }
13010
13011         reti->data = d;
13012     }
13013     else
13014         reti->data = NULL;
13015
13016     reti->name_list_idx = ri->name_list_idx;
13017
13018 #ifdef RE_TRACK_PATTERN_OFFSETS
13019     if (ri->u.offsets) {
13020         Newx(reti->u.offsets, 2*len+1, U32);
13021         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13022     }
13023 #else
13024     SetProgLen(reti,len);
13025 #endif
13026
13027     return (void*)reti;
13028 }
13029
13030 #endif    /* USE_ITHREADS */
13031
13032 #ifndef PERL_IN_XSUB_RE
13033
13034 /*
13035  - regnext - dig the "next" pointer out of a node
13036  */
13037 regnode *
13038 Perl_regnext(pTHX_ register regnode *p)
13039 {
13040     dVAR;
13041     register I32 offset;
13042
13043     if (!p)
13044         return(NULL);
13045
13046     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
13047         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13048     }
13049
13050     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13051     if (offset == 0)
13052         return(NULL);
13053
13054     return(p+offset);
13055 }
13056 #endif
13057
13058 STATIC void
13059 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13060 {
13061     va_list args;
13062     STRLEN l1 = strlen(pat1);
13063     STRLEN l2 = strlen(pat2);
13064     char buf[512];
13065     SV *msv;
13066     const char *message;
13067
13068     PERL_ARGS_ASSERT_RE_CROAK2;
13069
13070     if (l1 > 510)
13071         l1 = 510;
13072     if (l1 + l2 > 510)
13073         l2 = 510 - l1;
13074     Copy(pat1, buf, l1 , char);
13075     Copy(pat2, buf + l1, l2 , char);
13076     buf[l1 + l2] = '\n';
13077     buf[l1 + l2 + 1] = '\0';
13078 #ifdef I_STDARG
13079     /* ANSI variant takes additional second argument */
13080     va_start(args, pat2);
13081 #else
13082     va_start(args);
13083 #endif
13084     msv = vmess(buf, &args);
13085     va_end(args);
13086     message = SvPV_const(msv,l1);
13087     if (l1 > 512)
13088         l1 = 512;
13089     Copy(message, buf, l1 , char);
13090     buf[l1-1] = '\0';                   /* Overwrite \n */
13091     Perl_croak(aTHX_ "%s", buf);
13092 }
13093
13094 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13095
13096 #ifndef PERL_IN_XSUB_RE
13097 void
13098 Perl_save_re_context(pTHX)
13099 {
13100     dVAR;
13101
13102     struct re_save_state *state;
13103
13104     SAVEVPTR(PL_curcop);
13105     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13106
13107     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13108     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13109     SSPUSHUV(SAVEt_RE_STATE);
13110
13111     Copy(&PL_reg_state, state, 1, struct re_save_state);
13112
13113     PL_reg_start_tmp = 0;
13114     PL_reg_start_tmpl = 0;
13115     PL_reg_oldsaved = NULL;
13116     PL_reg_oldsavedlen = 0;
13117     PL_reg_maxiter = 0;
13118     PL_reg_leftiter = 0;
13119     PL_reg_poscache = NULL;
13120     PL_reg_poscache_size = 0;
13121 #ifdef PERL_OLD_COPY_ON_WRITE
13122     PL_nrs = NULL;
13123 #endif
13124
13125     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13126     if (PL_curpm) {
13127         const REGEXP * const rx = PM_GETRE(PL_curpm);
13128         if (rx) {
13129             U32 i;
13130             for (i = 1; i <= RX_NPARENS(rx); i++) {
13131                 char digits[TYPE_CHARS(long)];
13132                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13133                 GV *const *const gvp
13134                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13135
13136                 if (gvp) {
13137                     GV * const gv = *gvp;
13138                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13139                         save_scalar(gv);
13140                 }
13141             }
13142         }
13143     }
13144 }
13145 #endif
13146
13147 static void
13148 clear_re(pTHX_ void *r)
13149 {
13150     dVAR;
13151     ReREFCNT_dec((REGEXP *)r);
13152 }
13153
13154 #ifdef DEBUGGING
13155
13156 STATIC void
13157 S_put_byte(pTHX_ SV *sv, int c)
13158 {
13159     PERL_ARGS_ASSERT_PUT_BYTE;
13160
13161     /* Our definition of isPRINT() ignores locales, so only bytes that are
13162        not part of UTF-8 are considered printable. I assume that the same
13163        holds for UTF-EBCDIC.
13164        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13165        which Wikipedia says:
13166
13167        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13168        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13169        identical, to the ASCII delete (DEL) or rubout control character.
13170        ) So the old condition can be simplified to !isPRINT(c)  */
13171     if (!isPRINT(c)) {
13172         if (c < 256) {
13173             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13174         }
13175         else {
13176             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13177         }
13178     }
13179     else {
13180         const char string = c;
13181         if (c == '-' || c == ']' || c == '\\' || c == '^')
13182             sv_catpvs(sv, "\\");
13183         sv_catpvn(sv, &string, 1);
13184     }
13185 }
13186
13187
13188 #define CLEAR_OPTSTART \
13189     if (optstart) STMT_START { \
13190             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13191             optstart=NULL; \
13192     } STMT_END
13193
13194 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13195
13196 STATIC const regnode *
13197 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13198             const regnode *last, const regnode *plast,
13199             SV* sv, I32 indent, U32 depth)
13200 {
13201     dVAR;
13202     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
13203     register const regnode *next;
13204     const regnode *optstart= NULL;
13205
13206     RXi_GET_DECL(r,ri);
13207     GET_RE_DEBUG_FLAGS_DECL;
13208
13209     PERL_ARGS_ASSERT_DUMPUNTIL;
13210
13211 #ifdef DEBUG_DUMPUNTIL
13212     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13213         last ? last-start : 0,plast ? plast-start : 0);
13214 #endif
13215
13216     if (plast && plast < last)
13217         last= plast;
13218
13219     while (PL_regkind[op] != END && (!last || node < last)) {
13220         /* While that wasn't END last time... */
13221         NODE_ALIGN(node);
13222         op = OP(node);
13223         if (op == CLOSE || op == WHILEM)
13224             indent--;
13225         next = regnext((regnode *)node);
13226
13227         /* Where, what. */
13228         if (OP(node) == OPTIMIZED) {
13229             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13230                 optstart = node;
13231             else
13232                 goto after_print;
13233         } else
13234             CLEAR_OPTSTART;
13235
13236         regprop(r, sv, node);
13237         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13238                       (int)(2*indent + 1), "", SvPVX_const(sv));
13239
13240         if (OP(node) != OPTIMIZED) {
13241             if (next == NULL)           /* Next ptr. */
13242                 PerlIO_printf(Perl_debug_log, " (0)");
13243             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13244                 PerlIO_printf(Perl_debug_log, " (FAIL)");
13245             else
13246                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13247             (void)PerlIO_putc(Perl_debug_log, '\n');
13248         }
13249
13250       after_print:
13251         if (PL_regkind[(U8)op] == BRANCHJ) {
13252             assert(next);
13253             {
13254                 register const regnode *nnode = (OP(next) == LONGJMP
13255                                              ? regnext((regnode *)next)
13256                                              : next);
13257                 if (last && nnode > last)
13258                     nnode = last;
13259                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13260             }
13261         }
13262         else if (PL_regkind[(U8)op] == BRANCH) {
13263             assert(next);
13264             DUMPUNTIL(NEXTOPER(node), next);
13265         }
13266         else if ( PL_regkind[(U8)op]  == TRIE ) {
13267             const regnode *this_trie = node;
13268             const char op = OP(node);
13269             const U32 n = ARG(node);
13270             const reg_ac_data * const ac = op>=AHOCORASICK ?
13271                (reg_ac_data *)ri->data->data[n] :
13272                NULL;
13273             const reg_trie_data * const trie =
13274                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13275 #ifdef DEBUGGING
13276             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13277 #endif
13278             const regnode *nextbranch= NULL;
13279             I32 word_idx;
13280             sv_setpvs(sv, "");
13281             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13282                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13283
13284                 PerlIO_printf(Perl_debug_log, "%*s%s ",
13285                    (int)(2*(indent+3)), "",
13286                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13287                             PL_colors[0], PL_colors[1],
13288                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13289                             PERL_PV_PRETTY_ELLIPSES    |
13290                             PERL_PV_PRETTY_LTGT
13291                             )
13292                             : "???"
13293                 );
13294                 if (trie->jump) {
13295                     U16 dist= trie->jump[word_idx+1];
13296                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13297                                   (UV)((dist ? this_trie + dist : next) - start));
13298                     if (dist) {
13299                         if (!nextbranch)
13300                             nextbranch= this_trie + trie->jump[0];
13301                         DUMPUNTIL(this_trie + dist, nextbranch);
13302                     }
13303                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13304                         nextbranch= regnext((regnode *)nextbranch);
13305                 } else {
13306                     PerlIO_printf(Perl_debug_log, "\n");
13307                 }
13308             }
13309             if (last && next > last)
13310                 node= last;
13311             else
13312                 node= next;
13313         }
13314         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
13315             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13316                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13317         }
13318         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13319             assert(next);
13320             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13321         }
13322         else if ( op == PLUS || op == STAR) {
13323             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13324         }
13325         else if (PL_regkind[(U8)op] == ANYOF) {
13326             /* arglen 1 + class block */
13327             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13328                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13329             node = NEXTOPER(node);
13330         }
13331         else if (PL_regkind[(U8)op] == EXACT) {
13332             /* Literal string, where present. */
13333             node += NODE_SZ_STR(node) - 1;
13334             node = NEXTOPER(node);
13335         }
13336         else {
13337             node = NEXTOPER(node);
13338             node += regarglen[(U8)op];
13339         }
13340         if (op == CURLYX || op == OPEN)
13341             indent++;
13342     }
13343     CLEAR_OPTSTART;
13344 #ifdef DEBUG_DUMPUNTIL
13345     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13346 #endif
13347     return node;
13348 }
13349
13350 #endif  /* DEBUGGING */
13351
13352 /*
13353  * Local variables:
13354  * c-indentation-style: bsd
13355  * c-basic-offset: 4
13356  * indent-tabs-mode: t
13357  * End:
13358  *
13359  * ex: set ts=8 sts=4 sw=4 noet:
13360  */