src/5015007/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     I32         in_lookbehind;
 145     I32         contains_locale;
 146     I32         override_recoding;
 147 #if ADD_TO_REGEXEC
 148     char        *starttry;              /* -Dr: where regtry was called. */
 149 #define RExC_starttry   (pRExC_state->starttry)
 150 #endif
 151 #ifdef DEBUGGING
 152     const char  *lastparse;
 153     I32         lastnum;
 154     AV          *paren_name_list;       /* idx -> name */
 155 #define RExC_lastparse  (pRExC_state->lastparse)
 156 #define RExC_lastnum    (pRExC_state->lastnum)
 157 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 158 #endif
 159 } RExC_state_t;
 160
 161 #define RExC_flags      (pRExC_state->flags)
 162 #define RExC_precomp    (pRExC_state->precomp)
 163 #define RExC_rx_sv      (pRExC_state->rx_sv)
 164 #define RExC_rx         (pRExC_state->rx)
 165 #define RExC_rxi        (pRExC_state->rxi)
 166 #define RExC_start      (pRExC_state->start)
 167 #define RExC_end        (pRExC_state->end)
 168 #define RExC_parse      (pRExC_state->parse)
 169 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 170 #ifdef RE_TRACK_PATTERN_OFFSETS
 171 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 172 #endif
 173 #define RExC_emit       (pRExC_state->emit)
 174 #define RExC_emit_start (pRExC_state->emit_start)
 175 #define RExC_emit_bound (pRExC_state->emit_bound)
 176 #define RExC_naughty    (pRExC_state->naughty)
 177 #define RExC_sawback    (pRExC_state->sawback)
 178 #define RExC_seen       (pRExC_state->seen)
 179 #define RExC_size       (pRExC_state->size)
 180 #define RExC_npar       (pRExC_state->npar)
 181 #define RExC_nestroot   (pRExC_state->nestroot)
 182 #define RExC_extralen   (pRExC_state->extralen)
 183 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 184 #define RExC_seen_evals (pRExC_state->seen_evals)
 185 #define RExC_utf8       (pRExC_state->utf8)
 186 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 187 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 188 #define RExC_open_parens        (pRExC_state->open_parens)
 189 #define RExC_close_parens       (pRExC_state->close_parens)
 190 #define RExC_opend      (pRExC_state->opend)
 191 #define RExC_paren_names        (pRExC_state->paren_names)
 192 #define RExC_recurse    (pRExC_state->recurse)
 193 #define RExC_recurse_count      (pRExC_state->recurse_count)
 194 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 195 #define RExC_contains_locale    (pRExC_state->contains_locale)
 196 #define RExC_override_recoding  (pRExC_state->override_recoding)
 197
 198
 199 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 200 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 201         ((*s) == '{' && regcurly(s)))
 202
 203 #ifdef SPSTART
 204 #undef SPSTART          /* dratted cpp namespace... */
 205 #endif
 206 /*
 207  * Flags to be passed up and down.
 208  */
 209 #define WORST           0       /* Worst case. */
 210 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 211
 212 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 213  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 214 #define SIMPLE          0x02
 215 #define SPSTART         0x04    /* Starts with * or +. */
 216 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 217 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 218
 219 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 220
 221 /* whether trie related optimizations are enabled */
 222 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 223 #define TRIE_STUDY_OPT
 224 #define FULL_TRIE_STUDY
 225 #define TRIE_STCLASS
 226 #endif
 227
 228
 229
 230 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 231 #define PBITVAL(paren) (1 << ((paren) & 7))
 232 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 233 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 234 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 235
 236 /* If not already in utf8, do a longjmp back to the beginning */
 237 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 238 #define REQUIRE_UTF8    STMT_START {                                       \
 239                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 240                         } STMT_END
 241
 242 /* About scan_data_t.
 243
 244   During optimisation we recurse through the regexp program performing
 245   various inplace (keyhole style) optimisations. In addition study_chunk
 246   and scan_commit populate this data structure with information about
 247   what strings MUST appear in the pattern. We look for the longest
 248   string that must appear at a fixed location, and we look for the
 249   longest string that may appear at a floating location. So for instance
 250   in the pattern:
 251
 252     /FOO[xX]A.*B[xX]BAR/
 253
 254   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 255   strings (because they follow a .* construct). study_chunk will identify
 256   both FOO and BAR as being the longest fixed and floating strings respectively.
 257
 258   The strings can be composites, for instance
 259
 260      /(f)(o)(o)/
 261
 262   will result in a composite fixed substring 'foo'.
 263
 264   For each string some basic information is maintained:
 265
 266   - offset or min_offset
 267     This is the position the string must appear at, or not before.
 268     It also implicitly (when combined with minlenp) tells us how many
 269     characters must match before the string we are searching for.
 270     Likewise when combined with minlenp and the length of the string it
 271     tells us how many characters must appear after the string we have
 272     found.
 273
 274   - max_offset
 275     Only used for floating strings. This is the rightmost point that
 276     the string can appear at. If set to I32 max it indicates that the
 277     string can occur infinitely far to the right.
 278
 279   - minlenp
 280     A pointer to the minimum length of the pattern that the string
 281     was found inside. This is important as in the case of positive
 282     lookahead or positive lookbehind we can have multiple patterns
 283     involved. Consider
 284
 285     /(?=FOO).*F/
 286
 287     The minimum length of the pattern overall is 3, the minimum length
 288     of the lookahead part is 3, but the minimum length of the part that
 289     will actually match is 1. So 'FOO's minimum length is 3, but the
 290     minimum length for the F is 1. This is important as the minimum length
 291     is used to determine offsets in front of and behind the string being
 292     looked for.  Since strings can be composites this is the length of the
 293     pattern at the time it was committed with a scan_commit. Note that
 294     the length is calculated by study_chunk, so that the minimum lengths
 295     are not known until the full pattern has been compiled, thus the
 296     pointer to the value.
 297
 298   - lookbehind
 299
 300     In the case of lookbehind the string being searched for can be
 301     offset past the start point of the final matching string.
 302     If this value was just blithely removed from the min_offset it would
 303     invalidate some of the calculations for how many chars must match
 304     before or after (as they are derived from min_offset and minlen and
 305     the length of the string being searched for).
 306     When the final pattern is compiled and the data is moved from the
 307     scan_data_t structure into the regexp structure the information
 308     about lookbehind is factored in, with the information that would
 309     have been lost precalculated in the end_shift field for the
 310     associated string.
 311
 312   The fields pos_min and pos_delta are used to store the minimum offset
 313   and the delta to the maximum offset at the current point in the pattern.
 314
 315 */
 316
 317 typedef struct scan_data_t {
 318     /*I32 len_min;      unused */
 319     /*I32 len_delta;    unused */
 320     I32 pos_min;
 321     I32 pos_delta;
 322     SV *last_found;
 323     I32 last_end;           /* min value, <0 unless valid. */
 324     I32 last_start_min;
 325     I32 last_start_max;
 326     SV **longest;           /* Either &l_fixed, or &l_float. */
 327     SV *longest_fixed;      /* longest fixed string found in pattern */
 328     I32 offset_fixed;       /* offset where it starts */
 329     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 330     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 331     SV *longest_float;      /* longest floating string found in pattern */
 332     I32 offset_float_min;   /* earliest point in string it can appear */
 333     I32 offset_float_max;   /* latest point in string it can appear */
 334     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 335     I32 lookbehind_float;   /* is the position of the string modified by LB */
 336     I32 flags;
 337     I32 whilem_c;
 338     I32 *last_closep;
 339     struct regnode_charclass_class *start_class;
 340 } scan_data_t;
 341
 342 /*
 343  * Forward declarations for pregcomp()'s friends.
 344  */
 345
 346 static const scan_data_t zero_scan_data =
 347   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 348
 349 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 350 #define SF_BEFORE_SEOL          0x0001
 351 #define SF_BEFORE_MEOL          0x0002
 352 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 353 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 354
 355 #ifdef NO_UNARY_PLUS
 356 #  define SF_FIX_SHIFT_EOL      (0+2)
 357 #  define SF_FL_SHIFT_EOL               (0+4)
 358 #else
 359 #  define SF_FIX_SHIFT_EOL      (+2)
 360 #  define SF_FL_SHIFT_EOL               (+4)
 361 #endif
 362
 363 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 364 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 365
 366 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 367 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 368 #define SF_IS_INF               0x0040
 369 #define SF_HAS_PAR              0x0080
 370 #define SF_IN_PAR               0x0100
 371 #define SF_HAS_EVAL             0x0200
 372 #define SCF_DO_SUBSTR           0x0400
 373 #define SCF_DO_STCLASS_AND      0x0800
 374 #define SCF_DO_STCLASS_OR       0x1000
 375 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 376 #define SCF_WHILEM_VISITED_POS  0x2000
 377
 378 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 379 #define SCF_SEEN_ACCEPT         0x8000
 380
 381 #define UTF cBOOL(RExC_utf8)
 382 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 383 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 384 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 385 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 386 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 387 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 388 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 389
 390 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 391
 392 #define OOB_UNICODE             12345678
 393 #define OOB_NAMEDCLASS          -1
 394
 395 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 396 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 397
 398
 399 /* length of regex to show in messages that don't mark a position within */
 400 #define RegexLengthToShowInErrorMessages 127
 401
 402 /*
 403  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 404  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 405  * op/pragma/warn/regcomp.
 406  */
 407 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 408 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 409
 410 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 411
 412 /*
 413  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 414  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 415  * "...".
 416  */
 417 #define _FAIL(code) STMT_START {                                        \
 418     const char *ellipses = "";                                          \
 419     IV len = RExC_end - RExC_precomp;                                   \
 420                                                                         \
 421     if (!SIZE_ONLY)                                                     \
 422         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 423     if (len > RegexLengthToShowInErrorMessages) {                       \
 424         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 425         len = RegexLengthToShowInErrorMessages - 10;                    \
 426         ellipses = "...";                                               \
 427     }                                                                   \
 428     code;                                                               \
 429 } STMT_END
 430
 431 #define FAIL(msg) _FAIL(                            \
 432     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 433             msg, (int)len, RExC_precomp, ellipses))
 434
 435 #define FAIL2(msg,arg) _FAIL(                       \
 436     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 437             arg, (int)len, RExC_precomp, ellipses))
 438
 439 /*
 440  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 441  */
 442 #define Simple_vFAIL(m) STMT_START {                                    \
 443     const IV offset = RExC_parse - RExC_precomp;                        \
 444     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 445             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 446 } STMT_END
 447
 448 /*
 449  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 450  */
 451 #define vFAIL(m) STMT_START {                           \
 452     if (!SIZE_ONLY)                                     \
 453         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 454     Simple_vFAIL(m);                                    \
 455 } STMT_END
 456
 457 /*
 458  * Like Simple_vFAIL(), but accepts two arguments.
 459  */
 460 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 461     const IV offset = RExC_parse - RExC_precomp;                        \
 462     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 463             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 464 } STMT_END
 465
 466 /*
 467  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 468  */
 469 #define vFAIL2(m,a1) STMT_START {                       \
 470     if (!SIZE_ONLY)                                     \
 471         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 472     Simple_vFAIL2(m, a1);                               \
 473 } STMT_END
 474
 475
 476 /*
 477  * Like Simple_vFAIL(), but accepts three arguments.
 478  */
 479 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 480     const IV offset = RExC_parse - RExC_precomp;                \
 481     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 482             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 483 } STMT_END
 484
 485 /*
 486  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 487  */
 488 #define vFAIL3(m,a1,a2) STMT_START {                    \
 489     if (!SIZE_ONLY)                                     \
 490         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 491     Simple_vFAIL3(m, a1, a2);                           \
 492 } STMT_END
 493
 494 /*
 495  * Like Simple_vFAIL(), but accepts four arguments.
 496  */
 497 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 498     const IV offset = RExC_parse - RExC_precomp;                \
 499     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 500             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 501 } STMT_END
 502
 503 #define ckWARNreg(loc,m) STMT_START {                                   \
 504     const IV offset = loc - RExC_precomp;                               \
 505     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 506             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 507 } STMT_END
 508
 509 #define ckWARNregdep(loc,m) STMT_START {                                \
 510     const IV offset = loc - RExC_precomp;                               \
 511     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 512             m REPORT_LOCATION,                                          \
 513             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 514 } STMT_END
 515
 516 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 517     const IV offset = loc - RExC_precomp;                               \
 518     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 519             m REPORT_LOCATION,                                          \
 520             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 521 } STMT_END
 522
 523 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 524     const IV offset = loc - RExC_precomp;                               \
 525     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 526             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 527 } STMT_END
 528
 529 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 530     const IV offset = loc - RExC_precomp;                               \
 531     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 532             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 533 } STMT_END
 534
 535 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 536     const IV offset = loc - RExC_precomp;                               \
 537     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 538             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 539 } STMT_END
 540
 541 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 542     const IV offset = loc - RExC_precomp;                               \
 543     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 544             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 545 } STMT_END
 546
 547 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 548     const IV offset = loc - RExC_precomp;                               \
 549     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 550             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 551 } STMT_END
 552
 553 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 554     const IV offset = loc - RExC_precomp;                               \
 555     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 556             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 557 } STMT_END
 558
 559
 560 /* Allow for side effects in s */
 561 #define REGC(c,s) STMT_START {                  \
 562     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 563 } STMT_END
 564
 565 /* Macros for recording node offsets.   20001227 mjd@plover.com
 566  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 567  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 568  * Element 0 holds the number n.
 569  * Position is 1 indexed.
 570  */
 571 #ifndef RE_TRACK_PATTERN_OFFSETS
 572 #define Set_Node_Offset_To_R(node,byte)
 573 #define Set_Node_Offset(node,byte)
 574 #define Set_Cur_Node_Offset
 575 #define Set_Node_Length_To_R(node,len)
 576 #define Set_Node_Length(node,len)
 577 #define Set_Node_Cur_Length(node)
 578 #define Node_Offset(n)
 579 #define Node_Length(n)
 580 #define Set_Node_Offset_Length(node,offset,len)
 581 #define ProgLen(ri) ri->u.proglen
 582 #define SetProgLen(ri,x) ri->u.proglen = x
 583 #else
 584 #define ProgLen(ri) ri->u.offsets[0]
 585 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 586 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 587     if (! SIZE_ONLY) {                                                  \
 588         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 589                     __LINE__, (int)(node), (int)(byte)));               \
 590         if((node) < 0) {                                                \
 591             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 592         } else {                                                        \
 593             RExC_offsets[2*(node)-1] = (byte);                          \
 594         }                                                               \
 595     }                                                                   \
 596 } STMT_END
 597
 598 #define Set_Node_Offset(node,byte) \
 599     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 600 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 601
 602 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 603     if (! SIZE_ONLY) {                                                  \
 604         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 605                 __LINE__, (int)(node), (int)(len)));                    \
 606         if((node) < 0) {                                                \
 607             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 608         } else {                                                        \
 609             RExC_offsets[2*(node)] = (len);                             \
 610         }                                                               \
 611     }                                                                   \
 612 } STMT_END
 613
 614 #define Set_Node_Length(node,len) \
 615     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 616 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 617 #define Set_Node_Cur_Length(node) \
 618     Set_Node_Length(node, RExC_parse - parse_start)
 619
 620 /* Get offsets and lengths */
 621 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 622 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 623
 624 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 625     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 626     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 627 } STMT_END
 628 #endif
 629
 630 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 631 #define EXPERIMENTAL_INPLACESCAN
 632 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 633
 634 #define DEBUG_STUDYDATA(str,data,depth)                              \
 635 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 636     PerlIO_printf(Perl_debug_log,                                    \
 637         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 638         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 639         (int)(depth)*2, "",                                          \
 640         (IV)((data)->pos_min),                                       \
 641         (IV)((data)->pos_delta),                                     \
 642         (UV)((data)->flags),                                         \
 643         (IV)((data)->whilem_c),                                      \
 644         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 645         is_inf ? "INF " : ""                                         \
 646     );                                                               \
 647     if ((data)->last_found)                                          \
 648         PerlIO_printf(Perl_debug_log,                                \
 649             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 650             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 651             SvPVX_const((data)->last_found),                         \
 652             (IV)((data)->last_end),                                  \
 653             (IV)((data)->last_start_min),                            \
 654             (IV)((data)->last_start_max),                            \
 655             ((data)->longest &&                                      \
 656              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 657             SvPVX_const((data)->longest_fixed),                      \
 658             (IV)((data)->offset_fixed),                              \
 659             ((data)->longest &&                                      \
 660              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 661             SvPVX_const((data)->longest_float),                      \
 662             (IV)((data)->offset_float_min),                          \
 663             (IV)((data)->offset_float_max)                           \
 664         );                                                           \
 665     PerlIO_printf(Perl_debug_log,"\n");                              \
 666 });
 667
 668 static void clear_re(pTHX_ void *r);
 669
 670 /* Mark that we cannot extend a found fixed substring at this point.
 671    Update the longest found anchored substring and the longest found
 672    floating substrings if needed. */
 673
 674 STATIC void
 675 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 676 {
 677     const STRLEN l = CHR_SVLEN(data->last_found);
 678     const STRLEN old_l = CHR_SVLEN(*data->longest);
 679     GET_RE_DEBUG_FLAGS_DECL;
 680
 681     PERL_ARGS_ASSERT_SCAN_COMMIT;
 682
 683     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 684         SvSetMagicSV(*data->longest, data->last_found);
 685         if (*data->longest == data->longest_fixed) {
 686             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 687             if (data->flags & SF_BEFORE_EOL)
 688                 data->flags
 689                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 690             else
 691                 data->flags &= ~SF_FIX_BEFORE_EOL;
 692             data->minlen_fixed=minlenp;
 693             data->lookbehind_fixed=0;
 694         }
 695         else { /* *data->longest == data->longest_float */
 696             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 697             data->offset_float_max = (l
 698                                       ? data->last_start_max
 699                                       : data->pos_min + data->pos_delta);
 700             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 701                 data->offset_float_max = I32_MAX;
 702             if (data->flags & SF_BEFORE_EOL)
 703                 data->flags
 704                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 705             else
 706                 data->flags &= ~SF_FL_BEFORE_EOL;
 707             data->minlen_float=minlenp;
 708             data->lookbehind_float=0;
 709         }
 710     }
 711     SvCUR_set(data->last_found, 0);
 712     {
 713         SV * const sv = data->last_found;
 714         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 715             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 716             if (mg)
 717                 mg->mg_len = 0;
 718         }
 719     }
 720     data->last_end = -1;
 721     data->flags &= ~SF_BEFORE_EOL;
 722     DEBUG_STUDYDATA("commit: ",data,0);
 723 }
 724
 725 /* Can match anything (initialization) */
 726 STATIC void
 727 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 728 {
 729     PERL_ARGS_ASSERT_CL_ANYTHING;
 730
 731     ANYOF_BITMAP_SETALL(cl);
 732     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 733                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 734
 735     /* If any portion of the regex is to operate under locale rules,
 736      * initialization includes it.  The reason this isn't done for all regexes
 737      * is that the optimizer was written under the assumption that locale was
 738      * all-or-nothing.  Given the complexity and lack of documentation in the
 739      * optimizer, and that there are inadequate test cases for locale, so many
 740      * parts of it may not work properly, it is safest to avoid locale unless
 741      * necessary. */
 742     if (RExC_contains_locale) {
 743         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 744         cl->flags |= ANYOF_LOCALE;
 745     }
 746     else {
 747         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 748     }
 749 }
 750
 751 /* Can match anything (initialization) */
 752 STATIC int
 753 S_cl_is_anything(const struct regnode_charclass_class *cl)
 754 {
 755     int value;
 756
 757     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 758
 759     for (value = 0; value <= ANYOF_MAX; value += 2)
 760         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 761             return 1;
 762     if (!(cl->flags & ANYOF_UNICODE_ALL))
 763         return 0;
 764     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 765         return 0;
 766     return 1;
 767 }
 768
 769 /* Can match anything (initialization) */
 770 STATIC void
 771 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 772 {
 773     PERL_ARGS_ASSERT_CL_INIT;
 774
 775     Zero(cl, 1, struct regnode_charclass_class);
 776     cl->type = ANYOF;
 777     cl_anything(pRExC_state, cl);
 778     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 779 }
 780
 781 /* These two functions currently do the exact same thing */
 782 #define cl_init_zero            S_cl_init
 783
 784 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 785  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 786  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 787 STATIC void
 788 S_cl_and(struct regnode_charclass_class *cl,
 789         const struct regnode_charclass_class *and_with)
 790 {
 791     PERL_ARGS_ASSERT_CL_AND;
 792
 793     assert(and_with->type == ANYOF);
 794
 795     /* I (khw) am not sure all these restrictions are necessary XXX */
 796     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 797         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 798         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 799         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 800         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 801         int i;
 802
 803         if (and_with->flags & ANYOF_INVERT)
 804             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 805                 cl->bitmap[i] &= ~and_with->bitmap[i];
 806         else
 807             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 808                 cl->bitmap[i] &= and_with->bitmap[i];
 809     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 810
 811     if (and_with->flags & ANYOF_INVERT) {
 812
 813         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 814          * aren't affected by the inversion.  Those that are affected are
 815          * handled individually below */
 816         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 817         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 818         cl->flags |= affected_flags;
 819
 820         /* We currently don't know how to deal with things that aren't in the
 821          * bitmap, but we know that the intersection is no greater than what
 822          * is already in cl, so let there be false positives that get sorted
 823          * out after the synthetic start class succeeds, and the node is
 824          * matched for real. */
 825
 826         /* The inversion of these two flags indicate that the resulting
 827          * intersection doesn't have them */
 828         if (and_with->flags & ANYOF_UNICODE_ALL) {
 829             cl->flags &= ~ANYOF_UNICODE_ALL;
 830         }
 831         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 832             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 833         }
 834     }
 835     else {   /* and'd node is not inverted */
 836         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 837
 838         if (! ANYOF_NONBITMAP(and_with)) {
 839
 840             /* Here 'and_with' doesn't match anything outside the bitmap
 841              * (except possibly ANYOF_UNICODE_ALL), which means the
 842              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 843              * which case we don't know what the intersection is, but it's no
 844              * greater than what cl already has, so can just leave it alone,
 845              * with possible false positives */
 846             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 847                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 848                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 849             }
 850         }
 851         else if (! ANYOF_NONBITMAP(cl)) {
 852
 853             /* Here, 'and_with' does match something outside the bitmap, and cl
 854              * doesn't have a list of things to match outside the bitmap.  If
 855              * cl can match all code points above 255, the intersection will
 856              * be those above-255 code points that 'and_with' matches.  If cl
 857              * can't match all Unicode code points, it means that it can't
 858              * match anything outside the bitmap (since the 'if' that got us
 859              * into this block tested for that), so we leave the bitmap empty.
 860              */
 861             if (cl->flags & ANYOF_UNICODE_ALL) {
 862                 ARG_SET(cl, ARG(and_with));
 863
 864                 /* and_with's ARG may match things that don't require UTF8.
 865                  * And now cl's will too, in spite of this being an 'and'.  See
 866                  * the comments below about the kludge */
 867                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 868             }
 869         }
 870         else {
 871             /* Here, both 'and_with' and cl match something outside the
 872              * bitmap.  Currently we do not do the intersection, so just match
 873              * whatever cl had at the beginning.  */
 874         }
 875
 876
 877         /* Take the intersection of the two sets of flags.  However, the
 878          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 879          * kludge around the fact that this flag is not treated like the others
 880          * which are initialized in cl_anything().  The way the optimizer works
 881          * is that the synthetic start class (SSC) is initialized to match
 882          * anything, and then the first time a real node is encountered, its
 883          * values are AND'd with the SSC's with the result being the values of
 884          * the real node.  However, there are paths through the optimizer where
 885          * the AND never gets called, so those initialized bits are set
 886          * inappropriately, which is not usually a big deal, as they just cause
 887          * false positives in the SSC, which will just mean a probably
 888          * imperceptible slow down in execution.  However this bit has a
 889          * higher false positive consequence in that it can cause utf8.pm,
 890          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 891          * bigger slowdown and also causes significant extra memory to be used.
 892          * In order to prevent this, the code now takes a different tack.  The
 893          * bit isn't set unless some part of the regular expression needs it,
 894          * but once set it won't get cleared.  This means that these extra
 895          * modules won't get loaded unless there was some path through the
 896          * pattern that would have required them anyway, and  so any false
 897          * positives that occur by not ANDing them out when they could be
 898          * aren't as severe as they would be if we treated this bit like all
 899          * the others */
 900         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 901                                       & ANYOF_NONBITMAP_NON_UTF8;
 902         cl->flags &= and_with->flags;
 903         cl->flags |= outside_bitmap_but_not_utf8;
 904     }
 905 }
 906
 907 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 908  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 909  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 910 STATIC void
 911 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 912 {
 913     PERL_ARGS_ASSERT_CL_OR;
 914
 915     if (or_with->flags & ANYOF_INVERT) {
 916
 917         /* Here, the or'd node is to be inverted.  This means we take the
 918          * complement of everything not in the bitmap, but currently we don't
 919          * know what that is, so give up and match anything */
 920         if (ANYOF_NONBITMAP(or_with)) {
 921             cl_anything(pRExC_state, cl);
 922         }
 923         /* We do not use
 924          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 925          *   <= (B1 | !B2) | (CL1 | !CL2)
 926          * which is wasteful if CL2 is small, but we ignore CL2:
 927          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 928          * XXXX Can we handle case-fold?  Unclear:
 929          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 930          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 931          */
 932         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 933              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 934              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 935             int i;
 936
 937             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 938                 cl->bitmap[i] |= ~or_with->bitmap[i];
 939         } /* XXXX: logic is complicated otherwise */
 940         else {
 941             cl_anything(pRExC_state, cl);
 942         }
 943
 944         /* And, we can just take the union of the flags that aren't affected
 945          * by the inversion */
 946         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 947
 948         /* For the remaining flags:
 949             ANYOF_UNICODE_ALL and inverted means to not match anything above
 950                     255, which means that the union with cl should just be
 951                     what cl has in it, so can ignore this flag
 952             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 953                     is 127-255 to match them, but then invert that, so the
 954                     union with cl should just be what cl has in it, so can
 955                     ignore this flag
 956          */
 957     } else {    /* 'or_with' is not inverted */
 958         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 959         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 960              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 961                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 962             int i;
 963
 964             /* OR char bitmap and class bitmap separately */
 965             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 966                 cl->bitmap[i] |= or_with->bitmap[i];
 967             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 968                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 969                     cl->classflags[i] |= or_with->classflags[i];
 970                 cl->flags |= ANYOF_CLASS;
 971             }
 972         }
 973         else { /* XXXX: logic is complicated, leave it along for a moment. */
 974             cl_anything(pRExC_state, cl);
 975         }
 976
 977         if (ANYOF_NONBITMAP(or_with)) {
 978
 979             /* Use the added node's outside-the-bit-map match if there isn't a
 980              * conflict.  If there is a conflict (both nodes match something
 981              * outside the bitmap, but what they match outside is not the same
 982              * pointer, and hence not easily compared until XXX we extend
 983              * inversion lists this far), give up and allow the start class to
 984              * match everything outside the bitmap.  If that stuff is all above
 985              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 986             if (! ANYOF_NONBITMAP(cl)) {
 987                 ARG_SET(cl, ARG(or_with));
 988             }
 989             else if (ARG(cl) != ARG(or_with)) {
 990
 991                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
 992                     cl_anything(pRExC_state, cl);
 993                 }
 994                 else {
 995                     cl->flags |= ANYOF_UNICODE_ALL;
 996                 }
 997             }
 998         }
 999
1000         /* Take the union */
1001         cl->flags |= or_with->flags;
1002     }
1003 }
1004
1005 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1006 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1007 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1008 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1009
1010
1011 #ifdef DEBUGGING
1012 /*
1013    dump_trie(trie,widecharmap,revcharmap)
1014    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1015    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1016
1017    These routines dump out a trie in a somewhat readable format.
1018    The _interim_ variants are used for debugging the interim
1019    tables that are used to generate the final compressed
1020    representation which is what dump_trie expects.
1021
1022    Part of the reason for their existence is to provide a form
1023    of documentation as to how the different representations function.
1024
1025 */
1026
1027 /*
1028   Dumps the final compressed table form of the trie to Perl_debug_log.
1029   Used for debugging make_trie().
1030 */
1031
1032 STATIC void
1033 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1034             AV *revcharmap, U32 depth)
1035 {
1036     U32 state;
1037     SV *sv=sv_newmortal();
1038     int colwidth= widecharmap ? 6 : 4;
1039     U16 word;
1040     GET_RE_DEBUG_FLAGS_DECL;
1041
1042     PERL_ARGS_ASSERT_DUMP_TRIE;
1043
1044     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1045         (int)depth * 2 + 2,"",
1046         "Match","Base","Ofs" );
1047
1048     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1049         SV ** const tmp = av_fetch( revcharmap, state, 0);
1050         if ( tmp ) {
1051             PerlIO_printf( Perl_debug_log, "%*s",
1052                 colwidth,
1053                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1054                             PL_colors[0], PL_colors[1],
1055                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1056                             PERL_PV_ESCAPE_FIRSTCHAR
1057                 )
1058             );
1059         }
1060     }
1061     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1062         (int)depth * 2 + 2,"");
1063
1064     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1065         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1066     PerlIO_printf( Perl_debug_log, "\n");
1067
1068     for( state = 1 ; state < trie->statecount ; state++ ) {
1069         const U32 base = trie->states[ state ].trans.base;
1070
1071         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1072
1073         if ( trie->states[ state ].wordnum ) {
1074             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1075         } else {
1076             PerlIO_printf( Perl_debug_log, "%6s", "" );
1077         }
1078
1079         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1080
1081         if ( base ) {
1082             U32 ofs = 0;
1083
1084             while( ( base + ofs  < trie->uniquecharcount ) ||
1085                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1086                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1087                     ofs++;
1088
1089             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1090
1091             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1092                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1093                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1094                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1095                 {
1096                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1097                     colwidth,
1098                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1099                 } else {
1100                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1101                 }
1102             }
1103
1104             PerlIO_printf( Perl_debug_log, "]");
1105
1106         }
1107         PerlIO_printf( Perl_debug_log, "\n" );
1108     }
1109     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1110     for (word=1; word <= trie->wordcount; word++) {
1111         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1112             (int)word, (int)(trie->wordinfo[word].prev),
1113             (int)(trie->wordinfo[word].len));
1114     }
1115     PerlIO_printf(Perl_debug_log, "\n" );
1116 }
1117 /*
1118   Dumps a fully constructed but uncompressed trie in list form.
1119   List tries normally only are used for construction when the number of
1120   possible chars (trie->uniquecharcount) is very high.
1121   Used for debugging make_trie().
1122 */
1123 STATIC void
1124 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1125                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1126                          U32 depth)
1127 {
1128     U32 state;
1129     SV *sv=sv_newmortal();
1130     int colwidth= widecharmap ? 6 : 4;
1131     GET_RE_DEBUG_FLAGS_DECL;
1132
1133     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1134
1135     /* print out the table precompression.  */
1136     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1137         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1138         "------:-----+-----------------\n" );
1139
1140     for( state=1 ; state < next_alloc ; state ++ ) {
1141         U16 charid;
1142
1143         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1144             (int)depth * 2 + 2,"", (UV)state  );
1145         if ( ! trie->states[ state ].wordnum ) {
1146             PerlIO_printf( Perl_debug_log, "%5s| ","");
1147         } else {
1148             PerlIO_printf( Perl_debug_log, "W%4x| ",
1149                 trie->states[ state ].wordnum
1150             );
1151         }
1152         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1153             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1154             if ( tmp ) {
1155                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1156                     colwidth,
1157                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1158                             PL_colors[0], PL_colors[1],
1159                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1160                             PERL_PV_ESCAPE_FIRSTCHAR
1161                     ) ,
1162                     TRIE_LIST_ITEM(state,charid).forid,
1163                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1164                 );
1165                 if (!(charid % 10))
1166                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1167                         (int)((depth * 2) + 14), "");
1168             }
1169         }
1170         PerlIO_printf( Perl_debug_log, "\n");
1171     }
1172 }
1173
1174 /*
1175   Dumps a fully constructed but uncompressed trie in table form.
1176   This is the normal DFA style state transition table, with a few
1177   twists to facilitate compression later.
1178   Used for debugging make_trie().
1179 */
1180 STATIC void
1181 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1182                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1183                           U32 depth)
1184 {
1185     U32 state;
1186     U16 charid;
1187     SV *sv=sv_newmortal();
1188     int colwidth= widecharmap ? 6 : 4;
1189     GET_RE_DEBUG_FLAGS_DECL;
1190
1191     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1192
1193     /*
1194        print out the table precompression so that we can do a visual check
1195        that they are identical.
1196      */
1197
1198     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1199
1200     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1201         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1202         if ( tmp ) {
1203             PerlIO_printf( Perl_debug_log, "%*s",
1204                 colwidth,
1205                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1206                             PL_colors[0], PL_colors[1],
1207                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1208                             PERL_PV_ESCAPE_FIRSTCHAR
1209                 )
1210             );
1211         }
1212     }
1213
1214     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1215
1216     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1217         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1218     }
1219
1220     PerlIO_printf( Perl_debug_log, "\n" );
1221
1222     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1223
1224         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1225             (int)depth * 2 + 2,"",
1226             (UV)TRIE_NODENUM( state ) );
1227
1228         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1229             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1230             if (v)
1231                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1232             else
1233                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1234         }
1235         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1236             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1237         } else {
1238             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1239             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1240         }
1241     }
1242 }
1243
1244 #endif
1245
1246
1247 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1248   startbranch: the first branch in the whole branch sequence
1249   first      : start branch of sequence of branch-exact nodes.
1250                May be the same as startbranch
1251   last       : Thing following the last branch.
1252                May be the same as tail.
1253   tail       : item following the branch sequence
1254   count      : words in the sequence
1255   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1256   depth      : indent depth
1257
1258 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1259
1260 A trie is an N'ary tree where the branches are determined by digital
1261 decomposition of the key. IE, at the root node you look up the 1st character and
1262 follow that branch repeat until you find the end of the branches. Nodes can be
1263 marked as "accepting" meaning they represent a complete word. Eg:
1264
1265   /he|she|his|hers/
1266
1267 would convert into the following structure. Numbers represent states, letters
1268 following numbers represent valid transitions on the letter from that state, if
1269 the number is in square brackets it represents an accepting state, otherwise it
1270 will be in parenthesis.
1271
1272       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1273       |    |
1274       |   (2)
1275       |    |
1276      (1)   +-i->(6)-+-s->[7]
1277       |
1278       +-s->(3)-+-h->(4)-+-e->[5]
1279
1280       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1281
1282 This shows that when matching against the string 'hers' we will begin at state 1
1283 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1284 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1285 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1286 single traverse. We store a mapping from accepting to state to which word was
1287 matched, and then when we have multiple possibilities we try to complete the
1288 rest of the regex in the order in which they occured in the alternation.
1289
1290 The only prior NFA like behaviour that would be changed by the TRIE support is
1291 the silent ignoring of duplicate alternations which are of the form:
1292
1293  / (DUPE|DUPE) X? (?{ ... }) Y /x
1294
1295 Thus EVAL blocks following a trie may be called a different number of times with
1296 and without the optimisation. With the optimisations dupes will be silently
1297 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1298 the following demonstrates:
1299
1300  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1301
1302 which prints out 'word' three times, but
1303
1304  'words'=~/(word|word|word)(?{ print $1 })S/
1305
1306 which doesnt print it out at all. This is due to other optimisations kicking in.
1307
1308 Example of what happens on a structural level:
1309
1310 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1311
1312    1: CURLYM[1] {1,32767}(18)
1313    5:   BRANCH(8)
1314    6:     EXACT <ac>(16)
1315    8:   BRANCH(11)
1316    9:     EXACT <ad>(16)
1317   11:   BRANCH(14)
1318   12:     EXACT <ab>(16)
1319   16:   SUCCEED(0)
1320   17:   NOTHING(18)
1321   18: END(0)
1322
1323 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1324 and should turn into:
1325
1326    1: CURLYM[1] {1,32767}(18)
1327    5:   TRIE(16)
1328         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1329           <ac>
1330           <ad>
1331           <ab>
1332   16:   SUCCEED(0)
1333   17:   NOTHING(18)
1334   18: END(0)
1335
1336 Cases where tail != last would be like /(?foo|bar)baz/:
1337
1338    1: BRANCH(4)
1339    2:   EXACT <foo>(8)
1340    4: BRANCH(7)
1341    5:   EXACT <bar>(8)
1342    7: TAIL(8)
1343    8: EXACT <baz>(10)
1344   10: END(0)
1345
1346 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1347 and would end up looking like:
1348
1349     1: TRIE(8)
1350       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1351         <foo>
1352         <bar>
1353    7: TAIL(8)
1354    8: EXACT <baz>(10)
1355   10: END(0)
1356
1357     d = uvuni_to_utf8_flags(d, uv, 0);
1358
1359 is the recommended Unicode-aware way of saying
1360
1361     *(d++) = uv;
1362 */
1363
1364 #define TRIE_STORE_REVCHAR                                                 \
1365     STMT_START {                                                           \
1366         if (UTF) {                                                         \
1367             SV *zlopp = newSV(2);                                          \
1368             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1369             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1370             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1371             SvPOK_on(zlopp);                                               \
1372             SvUTF8_on(zlopp);                                              \
1373             av_push(revcharmap, zlopp);                                    \
1374         } else {                                                           \
1375             char ooooff = (char)uvc;                                               \
1376             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1377         }                                                                  \
1378         } STMT_END
1379
1380 #define TRIE_READ_CHAR STMT_START {                                           \
1381     wordlen++;                                                                \
1382     if ( UTF ) {                                                              \
1383         if ( folder ) {                                                       \
1384             if ( foldlen > 0 ) {                                              \
1385                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1386                foldlen -= len;                                                \
1387                scan += len;                                                   \
1388                len = 0;                                                       \
1389             } else {                                                          \
1390                 len = UTF8SKIP(uc);\
1391                 uvc = to_utf8_fold( uc, foldbuf, &foldlen);                   \
1392                 foldlen -= UNISKIP( uvc );                                    \
1393                 scan = foldbuf + UNISKIP( uvc );                              \
1394             }                                                                 \
1395         } else {                                                              \
1396             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1397         }                                                                     \
1398     } else {                                                                  \
1399         uvc = (U32)*uc;                                                       \
1400         len = 1;                                                              \
1401     }                                                                         \
1402 } STMT_END
1403
1404
1405
1406 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1407     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1408         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1409         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1410     }                                                           \
1411     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1412     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1413     TRIE_LIST_CUR( state )++;                                   \
1414 } STMT_END
1415
1416 #define TRIE_LIST_NEW(state) STMT_START {                       \
1417     Newxz( trie->states[ state ].trans.list,               \
1418         4, reg_trie_trans_le );                                 \
1419      TRIE_LIST_CUR( state ) = 1;                                \
1420      TRIE_LIST_LEN( state ) = 4;                                \
1421 } STMT_END
1422
1423 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1424     U16 dupe= trie->states[ state ].wordnum;                    \
1425     regnode * const noper_next = regnext( noper );              \
1426                                                                 \
1427     DEBUG_r({                                                   \
1428         /* store the word for dumping */                        \
1429         SV* tmp;                                                \
1430         if (OP(noper) != NOTHING)                               \
1431             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1432         else                                                    \
1433             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1434         av_push( trie_words, tmp );                             \
1435     });                                                         \
1436                                                                 \
1437     curword++;                                                  \
1438     trie->wordinfo[curword].prev   = 0;                         \
1439     trie->wordinfo[curword].len    = wordlen;                   \
1440     trie->wordinfo[curword].accept = state;                     \
1441                                                                 \
1442     if ( noper_next < tail ) {                                  \
1443         if (!trie->jump)                                        \
1444             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1445         trie->jump[curword] = (U16)(noper_next - convert);      \
1446         if (!jumper)                                            \
1447             jumper = noper_next;                                \
1448         if (!nextbranch)                                        \
1449             nextbranch= regnext(cur);                           \
1450     }                                                           \
1451                                                                 \
1452     if ( dupe ) {                                               \
1453         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1454         /* chain, so that when the bits of chain are later    */\
1455         /* linked together, the dups appear in the chain      */\
1456         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1457         trie->wordinfo[dupe].prev = curword;                    \
1458     } else {                                                    \
1459         /* we haven't inserted this word yet.                */ \
1460         trie->states[ state ].wordnum = curword;                \
1461     }                                                           \
1462 } STMT_END
1463
1464
1465 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1466      ( ( base + charid >=  ucharcount                                   \
1467          && base + charid < ubound                                      \
1468          && state == trie->trans[ base - ucharcount + charid ].check    \
1469          && trie->trans[ base - ucharcount + charid ].next )            \
1470            ? trie->trans[ base - ucharcount + charid ].next             \
1471            : ( state==1 ? special : 0 )                                 \
1472       )
1473
1474 #define MADE_TRIE       1
1475 #define MADE_JUMP_TRIE  2
1476 #define MADE_EXACT_TRIE 4
1477
1478 STATIC I32
1479 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1480 {
1481     dVAR;
1482     /* first pass, loop through and scan words */
1483     reg_trie_data *trie;
1484     HV *widecharmap = NULL;
1485     AV *revcharmap = newAV();
1486     regnode *cur;
1487     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1488     STRLEN len = 0;
1489     UV uvc = 0;
1490     U16 curword = 0;
1491     U32 next_alloc = 0;
1492     regnode *jumper = NULL;
1493     regnode *nextbranch = NULL;
1494     regnode *convert = NULL;
1495     U32 *prev_states; /* temp array mapping each state to previous one */
1496     /* we just use folder as a flag in utf8 */
1497     const U8 * folder = NULL;
1498
1499 #ifdef DEBUGGING
1500     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1501     AV *trie_words = NULL;
1502     /* along with revcharmap, this only used during construction but both are
1503      * useful during debugging so we store them in the struct when debugging.
1504      */
1505 #else
1506     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1507     STRLEN trie_charcount=0;
1508 #endif
1509     SV *re_trie_maxbuff;
1510     GET_RE_DEBUG_FLAGS_DECL;
1511
1512     PERL_ARGS_ASSERT_MAKE_TRIE;
1513 #ifndef DEBUGGING
1514     PERL_UNUSED_ARG(depth);
1515 #endif
1516
1517     switch (flags) {
1518         case EXACT: break;
1519         case EXACTFA:
1520         case EXACTFU: folder = PL_fold_latin1; break;
1521         case EXACTF:  folder = PL_fold; break;
1522         case EXACTFL: folder = PL_fold_locale; break;
1523         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
1524     }
1525
1526     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1527     trie->refcount = 1;
1528     trie->startstate = 1;
1529     trie->wordcount = word_count;
1530     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1531     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1532     if (!(UTF && folder))
1533         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1534     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1535                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1536
1537     DEBUG_r({
1538         trie_words = newAV();
1539     });
1540
1541     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1542     if (!SvIOK(re_trie_maxbuff)) {
1543         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1544     }
1545     DEBUG_OPTIMISE_r({
1546                 PerlIO_printf( Perl_debug_log,
1547                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1548                   (int)depth * 2 + 2, "",
1549                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1550                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1551                   (int)depth);
1552     });
1553
1554    /* Find the node we are going to overwrite */
1555     if ( first == startbranch && OP( last ) != BRANCH ) {
1556         /* whole branch chain */
1557         convert = first;
1558     } else {
1559         /* branch sub-chain */
1560         convert = NEXTOPER( first );
1561     }
1562
1563     /*  -- First loop and Setup --
1564
1565        We first traverse the branches and scan each word to determine if it
1566        contains widechars, and how many unique chars there are, this is
1567        important as we have to build a table with at least as many columns as we
1568        have unique chars.
1569
1570        We use an array of integers to represent the character codes 0..255
1571        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1572        native representation of the character value as the key and IV's for the
1573        coded index.
1574
1575        *TODO* If we keep track of how many times each character is used we can
1576        remap the columns so that the table compression later on is more
1577        efficient in terms of memory by ensuring the most common value is in the
1578        middle and the least common are on the outside.  IMO this would be better
1579        than a most to least common mapping as theres a decent chance the most
1580        common letter will share a node with the least common, meaning the node
1581        will not be compressible. With a middle is most common approach the worst
1582        case is when we have the least common nodes twice.
1583
1584      */
1585
1586     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1587         regnode * const noper = NEXTOPER( cur );
1588         const U8 *uc = (U8*)STRING( noper );
1589         const U8 * const e  = uc + STR_LEN( noper );
1590         STRLEN foldlen = 0;
1591         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1592         const U8 *scan = (U8*)NULL;
1593         U32 wordlen      = 0;         /* required init */
1594         STRLEN chars = 0;
1595         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1596
1597         if (OP(noper) == NOTHING) {
1598             trie->minlen= 0;
1599             continue;
1600         }
1601         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1602             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1603                                           regardless of encoding */
1604
1605         for ( ; uc < e ; uc += len ) {
1606             TRIE_CHARCOUNT(trie)++;
1607             TRIE_READ_CHAR;
1608             chars++;
1609             if ( uvc < 256 ) {
1610                 if ( !trie->charmap[ uvc ] ) {
1611                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1612                     if ( folder )
1613                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1614                     TRIE_STORE_REVCHAR;
1615                 }
1616                 if ( set_bit ) {
1617                     /* store the codepoint in the bitmap, and its folded
1618                      * equivalent. */
1619                     TRIE_BITMAP_SET(trie,uvc);
1620
1621                     /* store the folded codepoint */
1622                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1623
1624                     if ( !UTF ) {
1625                         /* store first byte of utf8 representation of
1626                            variant codepoints */
1627                         if (! UNI_IS_INVARIANT(uvc)) {
1628                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1629                         }
1630                     }
1631                     set_bit = 0; /* We've done our bit :-) */
1632                 }
1633             } else {
1634                 SV** svpp;
1635                 if ( !widecharmap )
1636                     widecharmap = newHV();
1637
1638                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1639
1640                 if ( !svpp )
1641                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1642
1643                 if ( !SvTRUE( *svpp ) ) {
1644                     sv_setiv( *svpp, ++trie->uniquecharcount );
1645                     TRIE_STORE_REVCHAR;
1646                 }
1647             }
1648         }
1649         if( cur == first ) {
1650             trie->minlen=chars;
1651             trie->maxlen=chars;
1652         } else if (chars < trie->minlen) {
1653             trie->minlen=chars;
1654         } else if (chars > trie->maxlen) {
1655             trie->maxlen=chars;
1656         }
1657
1658     } /* end first pass */
1659     DEBUG_TRIE_COMPILE_r(
1660         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1661                 (int)depth * 2 + 2,"",
1662                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1663                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1664                 (int)trie->minlen, (int)trie->maxlen )
1665     );
1666
1667     /*
1668         We now know what we are dealing with in terms of unique chars and
1669         string sizes so we can calculate how much memory a naive
1670         representation using a flat table  will take. If it's over a reasonable
1671         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1672         conservative but potentially much slower representation using an array
1673         of lists.
1674
1675         At the end we convert both representations into the same compressed
1676         form that will be used in regexec.c for matching with. The latter
1677         is a form that cannot be used to construct with but has memory
1678         properties similar to the list form and access properties similar
1679         to the table form making it both suitable for fast searches and
1680         small enough that its feasable to store for the duration of a program.
1681
1682         See the comment in the code where the compressed table is produced
1683         inplace from the flat tabe representation for an explanation of how
1684         the compression works.
1685
1686     */
1687
1688
1689     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1690     prev_states[1] = 0;
1691
1692     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1693         /*
1694             Second Pass -- Array Of Lists Representation
1695
1696             Each state will be represented by a list of charid:state records
1697             (reg_trie_trans_le) the first such element holds the CUR and LEN
1698             points of the allocated array. (See defines above).
1699
1700             We build the initial structure using the lists, and then convert
1701             it into the compressed table form which allows faster lookups
1702             (but cant be modified once converted).
1703         */
1704
1705         STRLEN transcount = 1;
1706
1707         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1708             "%*sCompiling trie using list compiler\n",
1709             (int)depth * 2 + 2, ""));
1710
1711         trie->states = (reg_trie_state *)
1712             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1713                                   sizeof(reg_trie_state) );
1714         TRIE_LIST_NEW(1);
1715         next_alloc = 2;
1716
1717         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1718
1719             regnode * const noper = NEXTOPER( cur );
1720             U8 *uc           = (U8*)STRING( noper );
1721             const U8 * const e = uc + STR_LEN( noper );
1722             U32 state        = 1;         /* required init */
1723             U16 charid       = 0;         /* sanity init */
1724             U8 *scan         = (U8*)NULL; /* sanity init */
1725             STRLEN foldlen   = 0;         /* required init */
1726             U32 wordlen      = 0;         /* required init */
1727             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1728
1729             if (OP(noper) != NOTHING) {
1730                 for ( ; uc < e ; uc += len ) {
1731
1732                     TRIE_READ_CHAR;
1733
1734                     if ( uvc < 256 ) {
1735                         charid = trie->charmap[ uvc ];
1736                     } else {
1737                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1738                         if ( !svpp ) {
1739                             charid = 0;
1740                         } else {
1741                             charid=(U16)SvIV( *svpp );
1742                         }
1743                     }
1744                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1745                     if ( charid ) {
1746
1747                         U16 check;
1748                         U32 newstate = 0;
1749
1750                         charid--;
1751                         if ( !trie->states[ state ].trans.list ) {
1752                             TRIE_LIST_NEW( state );
1753                         }
1754                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1755                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1756                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1757                                 break;
1758                             }
1759                         }
1760                         if ( ! newstate ) {
1761                             newstate = next_alloc++;
1762                             prev_states[newstate] = state;
1763                             TRIE_LIST_PUSH( state, charid, newstate );
1764                             transcount++;
1765                         }
1766                         state = newstate;
1767                     } else {
1768                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1769                     }
1770                 }
1771             }
1772             TRIE_HANDLE_WORD(state);
1773
1774         } /* end second pass */
1775
1776         /* next alloc is the NEXT state to be allocated */
1777         trie->statecount = next_alloc;
1778         trie->states = (reg_trie_state *)
1779             PerlMemShared_realloc( trie->states,
1780                                    next_alloc
1781                                    * sizeof(reg_trie_state) );
1782
1783         /* and now dump it out before we compress it */
1784         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1785                                                          revcharmap, next_alloc,
1786                                                          depth+1)
1787         );
1788
1789         trie->trans = (reg_trie_trans *)
1790             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1791         {
1792             U32 state;
1793             U32 tp = 0;
1794             U32 zp = 0;
1795
1796
1797             for( state=1 ; state < next_alloc ; state ++ ) {
1798                 U32 base=0;
1799
1800                 /*
1801                 DEBUG_TRIE_COMPILE_MORE_r(
1802                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1803                 );
1804                 */
1805
1806                 if (trie->states[state].trans.list) {
1807                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1808                     U16 maxid=minid;
1809                     U16 idx;
1810
1811                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1812                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1813                         if ( forid < minid ) {
1814                             minid=forid;
1815                         } else if ( forid > maxid ) {
1816                             maxid=forid;
1817                         }
1818                     }
1819                     if ( transcount < tp + maxid - minid + 1) {
1820                         transcount *= 2;
1821                         trie->trans = (reg_trie_trans *)
1822                             PerlMemShared_realloc( trie->trans,
1823                                                      transcount
1824                                                      * sizeof(reg_trie_trans) );
1825                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1826                     }
1827                     base = trie->uniquecharcount + tp - minid;
1828                     if ( maxid == minid ) {
1829                         U32 set = 0;
1830                         for ( ; zp < tp ; zp++ ) {
1831                             if ( ! trie->trans[ zp ].next ) {
1832                                 base = trie->uniquecharcount + zp - minid;
1833                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1834                                 trie->trans[ zp ].check = state;
1835                                 set = 1;
1836                                 break;
1837                             }
1838                         }
1839                         if ( !set ) {
1840                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1841                             trie->trans[ tp ].check = state;
1842                             tp++;
1843                             zp = tp;
1844                         }
1845                     } else {
1846                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1847                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1848                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1849                             trie->trans[ tid ].check = state;
1850                         }
1851                         tp += ( maxid - minid + 1 );
1852                     }
1853                     Safefree(trie->states[ state ].trans.list);
1854                 }
1855                 /*
1856                 DEBUG_TRIE_COMPILE_MORE_r(
1857                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1858                 );
1859                 */
1860                 trie->states[ state ].trans.base=base;
1861             }
1862             trie->lasttrans = tp + 1;
1863         }
1864     } else {
1865         /*
1866            Second Pass -- Flat Table Representation.
1867
1868            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1869            We know that we will need Charcount+1 trans at most to store the data
1870            (one row per char at worst case) So we preallocate both structures
1871            assuming worst case.
1872
1873            We then construct the trie using only the .next slots of the entry
1874            structs.
1875
1876            We use the .check field of the first entry of the node temporarily to
1877            make compression both faster and easier by keeping track of how many non
1878            zero fields are in the node.
1879
1880            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1881            transition.
1882
1883            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1884            number representing the first entry of the node, and state as a
1885            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1886            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1887            are 2 entrys per node. eg:
1888
1889              A B       A B
1890           1. 2 4    1. 3 7
1891           2. 0 3    3. 0 5
1892           3. 0 0    5. 0 0
1893           4. 0 0    7. 0 0
1894
1895            The table is internally in the right hand, idx form. However as we also
1896            have to deal with the states array which is indexed by nodenum we have to
1897            use TRIE_NODENUM() to convert.
1898
1899         */
1900         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1901             "%*sCompiling trie using table compiler\n",
1902             (int)depth * 2 + 2, ""));
1903
1904         trie->trans = (reg_trie_trans *)
1905             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1906                                   * trie->uniquecharcount + 1,
1907                                   sizeof(reg_trie_trans) );
1908         trie->states = (reg_trie_state *)
1909             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1910                                   sizeof(reg_trie_state) );
1911         next_alloc = trie->uniquecharcount + 1;
1912
1913
1914         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1915
1916             regnode * const noper   = NEXTOPER( cur );
1917             const U8 *uc     = (U8*)STRING( noper );
1918             const U8 * const e = uc + STR_LEN( noper );
1919
1920             U32 state        = 1;         /* required init */
1921
1922             U16 charid       = 0;         /* sanity init */
1923             U32 accept_state = 0;         /* sanity init */
1924             U8 *scan         = (U8*)NULL; /* sanity init */
1925
1926             STRLEN foldlen   = 0;         /* required init */
1927             U32 wordlen      = 0;         /* required init */
1928             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1929
1930             if ( OP(noper) != NOTHING ) {
1931                 for ( ; uc < e ; uc += len ) {
1932
1933                     TRIE_READ_CHAR;
1934
1935                     if ( uvc < 256 ) {
1936                         charid = trie->charmap[ uvc ];
1937                     } else {
1938                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1939                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1940                     }
1941                     if ( charid ) {
1942                         charid--;
1943                         if ( !trie->trans[ state + charid ].next ) {
1944                             trie->trans[ state + charid ].next = next_alloc;
1945                             trie->trans[ state ].check++;
1946                             prev_states[TRIE_NODENUM(next_alloc)]
1947                                     = TRIE_NODENUM(state);
1948                             next_alloc += trie->uniquecharcount;
1949                         }
1950                         state = trie->trans[ state + charid ].next;
1951                     } else {
1952                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1953                     }
1954                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1955                 }
1956             }
1957             accept_state = TRIE_NODENUM( state );
1958             TRIE_HANDLE_WORD(accept_state);
1959
1960         } /* end second pass */
1961
1962         /* and now dump it out before we compress it */
1963         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1964                                                           revcharmap,
1965                                                           next_alloc, depth+1));
1966
1967         {
1968         /*
1969            * Inplace compress the table.*
1970
1971            For sparse data sets the table constructed by the trie algorithm will
1972            be mostly 0/FAIL transitions or to put it another way mostly empty.
1973            (Note that leaf nodes will not contain any transitions.)
1974
1975            This algorithm compresses the tables by eliminating most such
1976            transitions, at the cost of a modest bit of extra work during lookup:
1977
1978            - Each states[] entry contains a .base field which indicates the
1979            index in the state[] array wheres its transition data is stored.
1980
1981            - If .base is 0 there are no valid transitions from that node.
1982
1983            - If .base is nonzero then charid is added to it to find an entry in
1984            the trans array.
1985
1986            -If trans[states[state].base+charid].check!=state then the
1987            transition is taken to be a 0/Fail transition. Thus if there are fail
1988            transitions at the front of the node then the .base offset will point
1989            somewhere inside the previous nodes data (or maybe even into a node
1990            even earlier), but the .check field determines if the transition is
1991            valid.
1992
1993            XXX - wrong maybe?
1994            The following process inplace converts the table to the compressed
1995            table: We first do not compress the root node 1,and mark all its
1996            .check pointers as 1 and set its .base pointer as 1 as well. This
1997            allows us to do a DFA construction from the compressed table later,
1998            and ensures that any .base pointers we calculate later are greater
1999            than 0.
2000
2001            - We set 'pos' to indicate the first entry of the second node.
2002
2003            - We then iterate over the columns of the node, finding the first and
2004            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2005            and set the .check pointers accordingly, and advance pos
2006            appropriately and repreat for the next node. Note that when we copy
2007            the next pointers we have to convert them from the original
2008            NODEIDX form to NODENUM form as the former is not valid post
2009            compression.
2010
2011            - If a node has no transitions used we mark its base as 0 and do not
2012            advance the pos pointer.
2013
2014            - If a node only has one transition we use a second pointer into the
2015            structure to fill in allocated fail transitions from other states.
2016            This pointer is independent of the main pointer and scans forward
2017            looking for null transitions that are allocated to a state. When it
2018            finds one it writes the single transition into the "hole".  If the
2019            pointer doesnt find one the single transition is appended as normal.
2020
2021            - Once compressed we can Renew/realloc the structures to release the
2022            excess space.
2023
2024            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2025            specifically Fig 3.47 and the associated pseudocode.
2026
2027            demq
2028         */
2029         const U32 laststate = TRIE_NODENUM( next_alloc );
2030         U32 state, charid;
2031         U32 pos = 0, zp=0;
2032         trie->statecount = laststate;
2033
2034         for ( state = 1 ; state < laststate ; state++ ) {
2035             U8 flag = 0;
2036             const U32 stateidx = TRIE_NODEIDX( state );
2037             const U32 o_used = trie->trans[ stateidx ].check;
2038             U32 used = trie->trans[ stateidx ].check;
2039             trie->trans[ stateidx ].check = 0;
2040
2041             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2042                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2043                     if ( trie->trans[ stateidx + charid ].next ) {
2044                         if (o_used == 1) {
2045                             for ( ; zp < pos ; zp++ ) {
2046                                 if ( ! trie->trans[ zp ].next ) {
2047                                     break;
2048                                 }
2049                             }
2050                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2051                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2052                             trie->trans[ zp ].check = state;
2053                             if ( ++zp > pos ) pos = zp;
2054                             break;
2055                         }
2056                         used--;
2057                     }
2058                     if ( !flag ) {
2059                         flag = 1;
2060                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2061                     }
2062                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2063                     trie->trans[ pos ].check = state;
2064                     pos++;
2065                 }
2066             }
2067         }
2068         trie->lasttrans = pos + 1;
2069         trie->states = (reg_trie_state *)
2070             PerlMemShared_realloc( trie->states, laststate
2071                                    * sizeof(reg_trie_state) );
2072         DEBUG_TRIE_COMPILE_MORE_r(
2073                 PerlIO_printf( Perl_debug_log,
2074                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2075                     (int)depth * 2 + 2,"",
2076                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2077                     (IV)next_alloc,
2078                     (IV)pos,
2079                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2080             );
2081
2082         } /* end table compress */
2083     }
2084     DEBUG_TRIE_COMPILE_MORE_r(
2085             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2086                 (int)depth * 2 + 2, "",
2087                 (UV)trie->statecount,
2088                 (UV)trie->lasttrans)
2089     );
2090     /* resize the trans array to remove unused space */
2091     trie->trans = (reg_trie_trans *)
2092         PerlMemShared_realloc( trie->trans, trie->lasttrans
2093                                * sizeof(reg_trie_trans) );
2094
2095     {   /* Modify the program and insert the new TRIE node */
2096         U8 nodetype =(U8)(flags & 0xFF);
2097         char *str=NULL;
2098
2099 #ifdef DEBUGGING
2100         regnode *optimize = NULL;
2101 #ifdef RE_TRACK_PATTERN_OFFSETS
2102
2103         U32 mjd_offset = 0;
2104         U32 mjd_nodelen = 0;
2105 #endif /* RE_TRACK_PATTERN_OFFSETS */
2106 #endif /* DEBUGGING */
2107         /*
2108            This means we convert either the first branch or the first Exact,
2109            depending on whether the thing following (in 'last') is a branch
2110            or not and whther first is the startbranch (ie is it a sub part of
2111            the alternation or is it the whole thing.)
2112            Assuming its a sub part we convert the EXACT otherwise we convert
2113            the whole branch sequence, including the first.
2114          */
2115         /* Find the node we are going to overwrite */
2116         if ( first != startbranch || OP( last ) == BRANCH ) {
2117             /* branch sub-chain */
2118             NEXT_OFF( first ) = (U16)(last - first);
2119 #ifdef RE_TRACK_PATTERN_OFFSETS
2120             DEBUG_r({
2121                 mjd_offset= Node_Offset((convert));
2122                 mjd_nodelen= Node_Length((convert));
2123             });
2124 #endif
2125             /* whole branch chain */
2126         }
2127 #ifdef RE_TRACK_PATTERN_OFFSETS
2128         else {
2129             DEBUG_r({
2130                 const  regnode *nop = NEXTOPER( convert );
2131                 mjd_offset= Node_Offset((nop));
2132                 mjd_nodelen= Node_Length((nop));
2133             });
2134         }
2135         DEBUG_OPTIMISE_r(
2136             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2137                 (int)depth * 2 + 2, "",
2138                 (UV)mjd_offset, (UV)mjd_nodelen)
2139         );
2140 #endif
2141         /* But first we check to see if there is a common prefix we can
2142            split out as an EXACT and put in front of the TRIE node.  */
2143         trie->startstate= 1;
2144         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2145             U32 state;
2146             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2147                 U32 ofs = 0;
2148                 I32 idx = -1;
2149                 U32 count = 0;
2150                 const U32 base = trie->states[ state ].trans.base;
2151
2152                 if ( trie->states[state].wordnum )
2153                         count = 1;
2154
2155                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2156                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2157                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2158                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2159                     {
2160                         if ( ++count > 1 ) {
2161                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2162                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2163                             if ( state == 1 ) break;
2164                             if ( count == 2 ) {
2165                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2166                                 DEBUG_OPTIMISE_r(
2167                                     PerlIO_printf(Perl_debug_log,
2168                                         "%*sNew Start State=%"UVuf" Class: [",
2169                                         (int)depth * 2 + 2, "",
2170                                         (UV)state));
2171                                 if (idx >= 0) {
2172                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2173                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2174
2175                                     TRIE_BITMAP_SET(trie,*ch);
2176                                     if ( folder )
2177                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2178                                     DEBUG_OPTIMISE_r(
2179                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2180                                     );
2181                                 }
2182                             }
2183                             TRIE_BITMAP_SET(trie,*ch);
2184                             if ( folder )
2185                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2186                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2187                         }
2188                         idx = ofs;
2189                     }
2190                 }
2191                 if ( count == 1 ) {
2192                     SV **tmp = av_fetch( revcharmap, idx, 0);
2193                     STRLEN len;
2194                     char *ch = SvPV( *tmp, len );
2195                     DEBUG_OPTIMISE_r({
2196                         SV *sv=sv_newmortal();
2197                         PerlIO_printf( Perl_debug_log,
2198                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2199                             (int)depth * 2 + 2, "",
2200                             (UV)state, (UV)idx,
2201                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2202                                 PL_colors[0], PL_colors[1],
2203                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2204                                 PERL_PV_ESCAPE_FIRSTCHAR
2205                             )
2206                         );
2207                     });
2208                     if ( state==1 ) {
2209                         OP( convert ) = nodetype;
2210                         str=STRING(convert);
2211                         STR_LEN(convert)=0;
2212                     }
2213                     STR_LEN(convert) += len;
2214                     while (len--)
2215                         *str++ = *ch++;
2216                 } else {
2217 #ifdef DEBUGGING
2218                     if (state>1)
2219                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2220 #endif
2221                     break;
2222                 }
2223             }
2224             trie->prefixlen = (state-1);
2225             if (str) {
2226                 regnode *n = convert+NODE_SZ_STR(convert);
2227                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2228                 trie->startstate = state;
2229                 trie->minlen -= (state - 1);
2230                 trie->maxlen -= (state - 1);
2231 #ifdef DEBUGGING
2232                /* At least the UNICOS C compiler choked on this
2233                 * being argument to DEBUG_r(), so let's just have
2234                 * it right here. */
2235                if (
2236 #ifdef PERL_EXT_RE_BUILD
2237                    1
2238 #else
2239                    DEBUG_r_TEST
2240 #endif
2241                    ) {
2242                    regnode *fix = convert;
2243                    U32 word = trie->wordcount;
2244                    mjd_nodelen++;
2245                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2246                    while( ++fix < n ) {
2247                        Set_Node_Offset_Length(fix, 0, 0);
2248                    }
2249                    while (word--) {
2250                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2251                        if (tmp) {
2252                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2253                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2254                            else
2255                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2256                        }
2257                    }
2258                }
2259 #endif
2260                 if (trie->maxlen) {
2261                     convert = n;
2262                 } else {
2263                     NEXT_OFF(convert) = (U16)(tail - convert);
2264                     DEBUG_r(optimize= n);
2265                 }
2266             }
2267         }
2268         if (!jumper)
2269             jumper = last;
2270         if ( trie->maxlen ) {
2271             NEXT_OFF( convert ) = (U16)(tail - convert);
2272             ARG_SET( convert, data_slot );
2273             /* Store the offset to the first unabsorbed branch in
2274                jump[0], which is otherwise unused by the jump logic.
2275                We use this when dumping a trie and during optimisation. */
2276             if (trie->jump)
2277                 trie->jump[0] = (U16)(nextbranch - convert);
2278
2279             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2280              *   and there is a bitmap
2281              *   and the first "jump target" node we found leaves enough room
2282              * then convert the TRIE node into a TRIEC node, with the bitmap
2283              * embedded inline in the opcode - this is hypothetically faster.
2284              */
2285             if ( !trie->states[trie->startstate].wordnum
2286                  && trie->bitmap
2287                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2288             {
2289                 OP( convert ) = TRIEC;
2290                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2291                 PerlMemShared_free(trie->bitmap);
2292                 trie->bitmap= NULL;
2293             } else
2294                 OP( convert ) = TRIE;
2295
2296             /* store the type in the flags */
2297             convert->flags = nodetype;
2298             DEBUG_r({
2299             optimize = convert
2300                       + NODE_STEP_REGNODE
2301                       + regarglen[ OP( convert ) ];
2302             });
2303             /* XXX We really should free up the resource in trie now,
2304                    as we won't use them - (which resources?) dmq */
2305         }
2306         /* needed for dumping*/
2307         DEBUG_r(if (optimize) {
2308             regnode *opt = convert;
2309
2310             while ( ++opt < optimize) {
2311                 Set_Node_Offset_Length(opt,0,0);
2312             }
2313             /*
2314                 Try to clean up some of the debris left after the
2315                 optimisation.
2316              */
2317             while( optimize < jumper ) {
2318                 mjd_nodelen += Node_Length((optimize));
2319                 OP( optimize ) = OPTIMIZED;
2320                 Set_Node_Offset_Length(optimize,0,0);
2321                 optimize++;
2322             }
2323             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2324         });
2325     } /* end node insert */
2326
2327     /*  Finish populating the prev field of the wordinfo array.  Walk back
2328      *  from each accept state until we find another accept state, and if
2329      *  so, point the first word's .prev field at the second word. If the
2330      *  second already has a .prev field set, stop now. This will be the
2331      *  case either if we've already processed that word's accept state,
2332      *  or that state had multiple words, and the overspill words were
2333      *  already linked up earlier.
2334      */
2335     {
2336         U16 word;
2337         U32 state;
2338         U16 prev;
2339
2340         for (word=1; word <= trie->wordcount; word++) {
2341             prev = 0;
2342             if (trie->wordinfo[word].prev)
2343                 continue;
2344             state = trie->wordinfo[word].accept;
2345             while (state) {
2346                 state = prev_states[state];
2347                 if (!state)
2348                     break;
2349                 prev = trie->states[state].wordnum;
2350                 if (prev)
2351                     break;
2352             }
2353             trie->wordinfo[word].prev = prev;
2354         }
2355         Safefree(prev_states);
2356     }
2357
2358
2359     /* and now dump out the compressed format */
2360     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2361
2362     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2363 #ifdef DEBUGGING
2364     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2365     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2366 #else
2367     SvREFCNT_dec(revcharmap);
2368 #endif
2369     return trie->jump
2370            ? MADE_JUMP_TRIE
2371            : trie->startstate>1
2372              ? MADE_EXACT_TRIE
2373              : MADE_TRIE;
2374 }
2375
2376 STATIC void
2377 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2378 {
2379 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2380
2381    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2382    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2383    ISBN 0-201-10088-6
2384
2385    We find the fail state for each state in the trie, this state is the longest proper
2386    suffix of the current state's 'word' that is also a proper prefix of another word in our
2387    trie. State 1 represents the word '' and is thus the default fail state. This allows
2388    the DFA not to have to restart after its tried and failed a word at a given point, it
2389    simply continues as though it had been matching the other word in the first place.
2390    Consider
2391       'abcdgu'=~/abcdefg|cdgu/
2392    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2393    fail, which would bring us to the state representing 'd' in the second word where we would
2394    try 'g' and succeed, proceeding to match 'cdgu'.
2395  */
2396  /* add a fail transition */
2397     const U32 trie_offset = ARG(source);
2398     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2399     U32 *q;
2400     const U32 ucharcount = trie->uniquecharcount;
2401     const U32 numstates = trie->statecount;
2402     const U32 ubound = trie->lasttrans + ucharcount;
2403     U32 q_read = 0;
2404     U32 q_write = 0;
2405     U32 charid;
2406     U32 base = trie->states[ 1 ].trans.base;
2407     U32 *fail;
2408     reg_ac_data *aho;
2409     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2410     GET_RE_DEBUG_FLAGS_DECL;
2411
2412     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2413 #ifndef DEBUGGING
2414     PERL_UNUSED_ARG(depth);
2415 #endif
2416
2417
2418     ARG_SET( stclass, data_slot );
2419     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2420     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2421     aho->trie=trie_offset;
2422     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2423     Copy( trie->states, aho->states, numstates, reg_trie_state );
2424     Newxz( q, numstates, U32);
2425     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2426     aho->refcount = 1;
2427     fail = aho->fail;
2428     /* initialize fail[0..1] to be 1 so that we always have
2429        a valid final fail state */
2430     fail[ 0 ] = fail[ 1 ] = 1;
2431
2432     for ( charid = 0; charid < ucharcount ; charid++ ) {
2433         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2434         if ( newstate ) {
2435             q[ q_write ] = newstate;
2436             /* set to point at the root */
2437             fail[ q[ q_write++ ] ]=1;
2438         }
2439     }
2440     while ( q_read < q_write) {
2441         const U32 cur = q[ q_read++ % numstates ];
2442         base = trie->states[ cur ].trans.base;
2443
2444         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2445             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2446             if (ch_state) {
2447                 U32 fail_state = cur;
2448                 U32 fail_base;
2449                 do {
2450                     fail_state = fail[ fail_state ];
2451                     fail_base = aho->states[ fail_state ].trans.base;
2452                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2453
2454                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2455                 fail[ ch_state ] = fail_state;
2456                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2457                 {
2458                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2459                 }
2460                 q[ q_write++ % numstates] = ch_state;
2461             }
2462         }
2463     }
2464     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2465        when we fail in state 1, this allows us to use the
2466        charclass scan to find a valid start char. This is based on the principle
2467        that theres a good chance the string being searched contains lots of stuff
2468        that cant be a start char.
2469      */
2470     fail[ 0 ] = fail[ 1 ] = 0;
2471     DEBUG_TRIE_COMPILE_r({
2472         PerlIO_printf(Perl_debug_log,
2473                       "%*sStclass Failtable (%"UVuf" states): 0",
2474                       (int)(depth * 2), "", (UV)numstates
2475         );
2476         for( q_read=1; q_read<numstates; q_read++ ) {
2477             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2478         }
2479         PerlIO_printf(Perl_debug_log, "\n");
2480     });
2481     Safefree(q);
2482     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2483 }
2484
2485
2486 /*
2487  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2488  * These need to be revisited when a newer toolchain becomes available.
2489  */
2490 #if defined(__sparc64__) && defined(__GNUC__)
2491 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2492 #       undef  SPARC64_GCC_WORKAROUND
2493 #       define SPARC64_GCC_WORKAROUND 1
2494 #   endif
2495 #endif
2496
2497 #define DEBUG_PEEP(str,scan,depth) \
2498     DEBUG_OPTIMISE_r({if (scan){ \
2499        SV * const mysv=sv_newmortal(); \
2500        regnode *Next = regnext(scan); \
2501        regprop(RExC_rx, mysv, scan); \
2502        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2503        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2504        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2505    }});
2506
2507
2508 /* The below joins as many adjacent EXACTish nodes as possible into a single
2509  * one, and looks for problematic sequences of characters whose folds vs.
2510  * non-folds have sufficiently different lengths, that the optimizer would be
2511  * fooled into rejecting legitimate matches of them, and the trie construction
2512  * code can't cope with them.  The joining is only done if:
2513  * 1) there is room in the current conglomerated node to entirely contain the
2514  *    next one.
2515  * 2) they are the exact same node type
2516  *
2517  * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2518  * these get optimized out
2519  *
2520  * If there are problematic code sequences, *min_subtract is set to the delta
2521  * that the minimum size of the node can be less than its actual size.  And,
2522  * the node type of the result is changed to reflect that it contains these
2523  * sequences.
2524  *
2525  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2526  * and contains LATIN SMALL LETTER SHARP S
2527  *
2528  * This is as good a place as any to discuss the design of handling these
2529  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2530  * are three code points in Unicode whose folded lengths differ so much from
2531  * the un-folded lengths that it causes problems for the optimizer and trie
2532  * construction.  Why only these are problematic, and not others where lengths
2533  * also differ is something I (khw) do not understand.  New versions of Unicode
2534  * might add more such code points.  Hopefully the logic in fold_grind.t that
2535  * figures out what to test (in part by verifying that each size-combination
2536  * gets tested) will catch any that do come along, so they can be added to the
2537  * special handling below.  The chances of new ones are actually rather small,
2538  * as most, if not all, of the world's scripts that have casefolding have
2539  * already been encoded by Unicode.  Also, a number of Unicode's decisions were
2540  * made to allow compatibility with pre-existing standards, and almost all of
2541  * those have already been dealt with.  These would otherwise be the most
2542  * likely candidates for generating further tricky sequences.  In other words,
2543  * Unicode by itself is unlikely to add new ones unless it is for compatibility
2544  * with pre-existing standards, and there aren't many of those left.
2545  *
2546  * The previous designs for dealing with these involved assigning a special
2547  * node for them.  This approach doesn't work, as evidenced by this example:
2548  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2549  * Both these fold to "sss", but if the pattern is parsed to create a node of
2550  * that would match just the \xDF, it won't be able to handle the case where a
2551  * successful match would have to cross the node's boundary.  The new approach
2552  * that hopefully generally solves the problem generates an EXACTFU_SS node
2553  * that is "sss".
2554  *
2555  * There are a number of components to the approach (a lot of work for just
2556  * three code points!):
2557  * 1)   This routine examines each EXACTFish node that could contain the
2558  *      problematic sequences.  It returns in *min_subtract how much to
2559  *      subtract from the the actual length of the string to get a real minimum
2560  *      for one that could match it.  This number is usually 0 except for the
2561  *      problematic sequences.  This delta is used by the caller to adjust the
2562  *      min length of the match, and the delta between min and max, so that the
2563  *      optimizer doesn't reject these possibilities based on size constraints.
2564  * 2)   These sequences are not currently correctly handled by the trie code
2565  *      either, so it changes the joined node type to ops that are not handled
2566  *      by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
2567  * 3)   This is sufficient for the two Greek sequences (described below), but
2568  *      the one involving the Sharp s (\xDF) needs more.  The node type
2569  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2570  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2571  *      case where there is a possible fold length change.  That means that a
2572  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2573  *      itself with length changes, and so can be processed faster.  regexec.c
2574  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2575  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2576  *      However, probably mostly for historical reasons, the pre-folding isn't
2577  *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2578  *      nodes, as what they fold to isn't known until runtime.)  The fold
2579  *      possibilities for the non-UTF8 patterns are quite simple, except for
2580  *      the sharp s.  All the ones that don't involve a UTF-8 target string
2581  *      are members of a fold-pair, and arrays are set up for all of them
2582  *      that quickly find the other member of the pair.  It might actually
2583  *      be faster to pre-fold these, but it isn't currently done, except for
2584  *      the sharp s.  Code elsewhere in this file makes sure that it gets
2585  *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
2586  *      issues described in the next item.
2587  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2588  *      'ss' or not is not knowable at compile time.  It will match iff the
2589  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2590  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2591  *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
2592  *      described in item 3).  An assumption that the optimizer part of
2593  *      regexec.c (probably unwittingly) makes is that a character in the
2594  *      pattern corresponds to at most a single character in the target string.
2595  *      (And I do mean character, and not byte here, unlike other parts of the
2596  *      documentation that have never been updated to account for multibyte
2597  *      Unicode.)  This assumption is wrong only in this case, as all other
2598  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2599  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2600  *      reluctant to try to change this assumption, so instead the code punts.
2601  *      This routine examines EXACTF nodes for the sharp s, and returns a
2602  *      boolean indicating whether or not the node is an EXACTF node that
2603  *      contains a sharp s.  When it is true, the caller sets a flag that later
2604  *      causes the optimizer in this file to not set values for the floating
2605  *      and fixed string lengths, and thus avoids the optimizer code in
2606  *      regexec.c that makes the invalid assumption.  Thus, there is no
2607  *      optimization based on string lengths for EXACTF nodes that contain the
2608  *      sharp s.  This only happens for /id rules (which means the pattern
2609  *      isn't in UTF-8).
2610  */
2611
2612 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2613     if (PL_regkind[OP(scan)] == EXACT) \
2614         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2615
2616 STATIC U32
2617 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2618     /* Merge several consecutive EXACTish nodes into one. */
2619     regnode *n = regnext(scan);
2620     U32 stringok = 1;
2621     regnode *next = scan + NODE_SZ_STR(scan);
2622     U32 merged = 0;
2623     U32 stopnow = 0;
2624 #ifdef DEBUGGING
2625     regnode *stop = scan;
2626     GET_RE_DEBUG_FLAGS_DECL;
2627 #else
2628     PERL_UNUSED_ARG(depth);
2629 #endif
2630
2631     PERL_ARGS_ASSERT_JOIN_EXACT;
2632 #ifndef EXPERIMENTAL_INPLACESCAN
2633     PERL_UNUSED_ARG(flags);
2634     PERL_UNUSED_ARG(val);
2635 #endif
2636     DEBUG_PEEP("join",scan,depth);
2637
2638     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2639      * EXACT ones that are mergeable to the current one. */
2640     while (n
2641            && (PL_regkind[OP(n)] == NOTHING
2642                || (stringok && OP(n) == OP(scan)))
2643            && NEXT_OFF(n)
2644            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2645     {
2646
2647         if (OP(n) == TAIL || n > next)
2648             stringok = 0;
2649         if (PL_regkind[OP(n)] == NOTHING) {
2650             DEBUG_PEEP("skip:",n,depth);
2651             NEXT_OFF(scan) += NEXT_OFF(n);
2652             next = n + NODE_STEP_REGNODE;
2653 #ifdef DEBUGGING
2654             if (stringok)
2655                 stop = n;
2656 #endif
2657             n = regnext(n);
2658         }
2659         else if (stringok) {
2660             const unsigned int oldl = STR_LEN(scan);
2661             regnode * const nnext = regnext(n);
2662
2663             if (oldl + STR_LEN(n) > U8_MAX)
2664                 break;
2665
2666             DEBUG_PEEP("merg",n,depth);
2667             merged++;
2668
2669             NEXT_OFF(scan) += NEXT_OFF(n);
2670             STR_LEN(scan) += STR_LEN(n);
2671             next = n + NODE_SZ_STR(n);
2672             /* Now we can overwrite *n : */
2673             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2674 #ifdef DEBUGGING
2675             stop = next - 1;
2676 #endif
2677             n = nnext;
2678             if (stopnow) break;
2679         }
2680
2681 #ifdef EXPERIMENTAL_INPLACESCAN
2682         if (flags && !NEXT_OFF(n)) {
2683             DEBUG_PEEP("atch", val, depth);
2684             if (reg_off_by_arg[OP(n)]) {
2685                 ARG_SET(n, val - n);
2686             }
2687             else {
2688                 NEXT_OFF(n) = val - n;
2689             }
2690             stopnow = 1;
2691         }
2692 #endif
2693     }
2694
2695     *min_subtract = 0;
2696     *has_exactf_sharp_s = FALSE;
2697
2698     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2699      * can now analyze for sequences of problematic code points.  (Prior to
2700      * this final joining, sequences could have been split over boundaries, and
2701      * hence missed).  The sequences only happen in folding, hence for any
2702      * non-EXACT EXACTish node */
2703     if (OP(scan) != EXACT) {
2704         U8 *s;
2705         U8 * s0 = (U8*) STRING(scan);
2706         U8 * const s_end = s0 + STR_LEN(scan);
2707
2708         /* The below is perhaps overboard, but this allows us to save a test
2709          * each time through the loop at the expense of a mask.  This is
2710          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2711          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2712          * This uses an exclusive 'or' to find that bit and then inverts it to
2713          * form a mask, with just a single 0, in the bit position where 'S' and
2714          * 's' differ. */
2715         const U8 S_or_s_mask = ~ ('S' ^ 's');
2716         const U8 s_masked = 's' & S_or_s_mask;
2717
2718         /* One pass is made over the node's string looking for all the
2719          * possibilities.  to avoid some tests in the loop, there are two main
2720          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2721          * non-UTF-8 */
2722         if (UTF) {
2723
2724             /* There are two problematic Greek code points in Unicode
2725              * casefolding
2726              *
2727              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2728              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2729              *
2730              * which casefold to
2731              *
2732              * Unicode                      UTF-8
2733              *
2734              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2735              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2736              *
2737              * This means that in case-insensitive matching (or "loose
2738              * matching", as Unicode calls it), an EXACTF of length six (the
2739              * UTF-8 encoded byte length of the above casefolded versions) can
2740              * match a target string of length two (the byte length of UTF-8
2741              * encoded U+0390 or U+03B0).  This would rather mess up the
2742              * minimum length computation.  (there are other code points that
2743              * also fold to these two sequences, but the delta is smaller)
2744              *
2745              * If these sequences are found, the minimum length is decreased by
2746              * four (six minus two).
2747              *
2748              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2749              * LETTER SHARP S.  We decrease the min length by 1 for each
2750              * occurrence of 'ss' found */
2751
2752 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2753 #           define U390_first_byte 0xb4
2754             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2755 #           define U3B0_first_byte 0xb5
2756             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2757 #else
2758 #           define U390_first_byte 0xce
2759             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2760 #           define U3B0_first_byte 0xcf
2761             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2762 #endif
2763             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2764                                                  yields a net of 0 */
2765             /* Examine the string for one of the problematic sequences */
2766             for (s = s0;
2767                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2768                                  * sequence we are looking for is 2 */
2769                  s += UTF8SKIP(s))
2770             {
2771
2772                 /* Look for the first byte in each problematic sequence */
2773                 switch (*s) {
2774                     /* We don't have to worry about other things that fold to
2775                      * 's' (such as the long s, U+017F), as all above-latin1
2776                      * code points have been pre-folded */
2777                     case 's':
2778                     case 'S':
2779
2780                         /* Current character is an 's' or 'S'.  If next one is
2781                          * as well, we have the dreaded sequence */
2782                         if (((*(s+1) & S_or_s_mask) == s_masked)
2783                             /* These two node types don't have special handling
2784                              * for 'ss' */
2785                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2786                         {
2787                             *min_subtract += 1;
2788                             OP(scan) = EXACTFU_SS;
2789                             s++;    /* No need to look at this character again */
2790                         }
2791                         break;
2792
2793                     case U390_first_byte:
2794                         if (s_end - s >= len
2795
2796                             /* The 1's are because are skipping comparing the
2797                              * first byte */
2798                             && memEQ(s + 1, U390_tail, len - 1))
2799                         {
2800                             goto greek_sequence;
2801                         }
2802                         break;
2803
2804                     case U3B0_first_byte:
2805                         if (! (s_end - s >= len
2806                                && memEQ(s + 1, U3B0_tail, len - 1)))
2807                         {
2808                             break;
2809                         }
2810                       greek_sequence:
2811                         *min_subtract += 4;
2812
2813                         /* This can't currently be handled by trie's, so change
2814                          * the node type to indicate this.  If EXACTFA and
2815                          * EXACTFL were ever to be handled by trie's, this
2816                          * would have to be changed.  If this node has already
2817                          * been changed to EXACTFU_SS in this loop, leave it as
2818                          * is.  (I (khw) think it doesn't matter in regexec.c
2819                          * for UTF patterns, but no need to change it */
2820                         if (OP(scan) == EXACTFU) {
2821                             OP(scan) = EXACTFU_NO_TRIE;
2822                         }
2823                         s += 6; /* We already know what this sequence is.  Skip
2824                                    the rest of it */
2825                         break;
2826                 }
2827             }
2828         }
2829         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2830
2831             /* Here, the pattern is not UTF-8.  We need to look only for the
2832              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2833              * in the final position.  Otherwise we can stop looking 1 byte
2834              * earlier because have to find both the first and second 's' */
2835             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2836
2837             for (s = s0; s < upper; s++) {
2838                 switch (*s) {
2839                     case 'S':
2840                     case 's':
2841                         if (s_end - s > 1
2842                             && ((*(s+1) & S_or_s_mask) == s_masked))
2843                         {
2844                             *min_subtract += 1;
2845
2846                             /* EXACTF nodes need to know that the minimum
2847                              * length changed so that a sharp s in the string
2848                              * can match this ss in the pattern, but they
2849                              * remain EXACTF nodes, as they are not trie'able,
2850                              * so don't have to invent a new node type to
2851                              * exclude them from the trie code */
2852                             if (OP(scan) != EXACTF) {
2853                                 OP(scan) = EXACTFU_SS;
2854                             }
2855                             s++;
2856                         }
2857                         break;
2858                     case LATIN_SMALL_LETTER_SHARP_S:
2859                         if (OP(scan) == EXACTF) {
2860                             *has_exactf_sharp_s = TRUE;
2861                         }
2862                         break;
2863                 }
2864             }
2865         }
2866     }
2867
2868 #ifdef DEBUGGING
2869     /* Allow dumping but overwriting the collection of skipped
2870      * ops and/or strings with fake optimized ops */
2871     n = scan + NODE_SZ_STR(scan);
2872     while (n <= stop) {
2873         OP(n) = OPTIMIZED;
2874         FLAGS(n) = 0;
2875         NEXT_OFF(n) = 0;
2876         n++;
2877     }
2878 #endif
2879     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2880     return stopnow;
2881 }
2882
2883 /* REx optimizer.  Converts nodes into quicker variants "in place".
2884    Finds fixed substrings.  */
2885
2886 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2887    to the position after last scanned or to NULL. */
2888
2889 #define INIT_AND_WITHP \
2890     assert(!and_withp); \
2891     Newx(and_withp,1,struct regnode_charclass_class); \
2892     SAVEFREEPV(and_withp)
2893
2894 /* this is a chain of data about sub patterns we are processing that
2895    need to be handled separately/specially in study_chunk. Its so
2896    we can simulate recursion without losing state.  */
2897 struct scan_frame;
2898 typedef struct scan_frame {
2899     regnode *last;  /* last node to process in this frame */
2900     regnode *next;  /* next node to process when last is reached */
2901     struct scan_frame *prev; /*previous frame*/
2902     I32 stop; /* what stopparen do we use */
2903 } scan_frame;
2904
2905
2906 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2907
2908 #define CASE_SYNST_FNC(nAmE)                                       \
2909 case nAmE:                                                         \
2910     if (flags & SCF_DO_STCLASS_AND) {                              \
2911             for (value = 0; value < 256; value++)                  \
2912                 if (!is_ ## nAmE ## _cp(value))                       \
2913                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2914     }                                                              \
2915     else {                                                         \
2916             for (value = 0; value < 256; value++)                  \
2917                 if (is_ ## nAmE ## _cp(value))                        \
2918                     ANYOF_BITMAP_SET(data->start_class, value);    \
2919     }                                                              \
2920     break;                                                         \
2921 case N ## nAmE:                                                    \
2922     if (flags & SCF_DO_STCLASS_AND) {                              \
2923             for (value = 0; value < 256; value++)                   \
2924                 if (is_ ## nAmE ## _cp(value))                         \
2925                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2926     }                                                               \
2927     else {                                                          \
2928             for (value = 0; value < 256; value++)                   \
2929                 if (!is_ ## nAmE ## _cp(value))                        \
2930                     ANYOF_BITMAP_SET(data->start_class, value);     \
2931     }                                                               \
2932     break
2933
2934
2935
2936 STATIC I32
2937 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2938                         I32 *minlenp, I32 *deltap,
2939                         regnode *last,
2940                         scan_data_t *data,
2941                         I32 stopparen,
2942                         U8* recursed,
2943                         struct regnode_charclass_class *and_withp,
2944                         U32 flags, U32 depth)
2945                         /* scanp: Start here (read-write). */
2946                         /* deltap: Write maxlen-minlen here. */
2947                         /* last: Stop before this one. */
2948                         /* data: string data about the pattern */
2949                         /* stopparen: treat close N as END */
2950                         /* recursed: which subroutines have we recursed into */
2951                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2952 {
2953     dVAR;
2954     I32 min = 0, pars = 0, code;
2955     regnode *scan = *scanp, *next;
2956     I32 delta = 0;
2957     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2958     int is_inf_internal = 0;            /* The studied chunk is infinite */
2959     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2960     scan_data_t data_fake;
2961     SV *re_trie_maxbuff = NULL;
2962     regnode *first_non_open = scan;
2963     I32 stopmin = I32_MAX;
2964     scan_frame *frame = NULL;
2965     GET_RE_DEBUG_FLAGS_DECL;
2966
2967     PERL_ARGS_ASSERT_STUDY_CHUNK;
2968
2969 #ifdef DEBUGGING
2970     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2971 #endif
2972
2973     if ( depth == 0 ) {
2974         while (first_non_open && OP(first_non_open) == OPEN)
2975             first_non_open=regnext(first_non_open);
2976     }
2977
2978
2979   fake_study_recurse:
2980     while ( scan && OP(scan) != END && scan < last ){
2981         UV min_subtract = 0;    /* How much to subtract from the minimum node
2982                                    length to get a real minimum (because the
2983                                    folded version may be shorter) */
2984         bool has_exactf_sharp_s = FALSE;
2985         /* Peephole optimizer: */
2986         DEBUG_STUDYDATA("Peep:", data,depth);
2987         DEBUG_PEEP("Peep",scan,depth);
2988
2989         /* Its not clear to khw or hv why this is done here, and not in the
2990          * clauses that deal with EXACT nodes.  khw's guess is that it's
2991          * because of a previous design */
2992         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
2993
2994         /* Follow the next-chain of the current node and optimize
2995            away all the NOTHINGs from it.  */
2996         if (OP(scan) != CURLYX) {
2997             const int max = (reg_off_by_arg[OP(scan)]
2998                        ? I32_MAX
2999                        /* I32 may be smaller than U16 on CRAYs! */
3000                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3001             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3002             int noff;
3003             regnode *n = scan;
3004
3005             /* Skip NOTHING and LONGJMP. */
3006             while ((n = regnext(n))
3007                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3008                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3009                    && off + noff < max)
3010                 off += noff;
3011             if (reg_off_by_arg[OP(scan)])
3012                 ARG(scan) = off;
3013             else
3014                 NEXT_OFF(scan) = off;
3015         }
3016
3017
3018
3019         /* The principal pseudo-switch.  Cannot be a switch, since we
3020            look into several different things.  */
3021         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3022                    || OP(scan) == IFTHEN) {
3023             next = regnext(scan);
3024             code = OP(scan);
3025             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3026
3027             if (OP(next) == code || code == IFTHEN) {
3028                 /* NOTE - There is similar code to this block below for handling
3029                    TRIE nodes on a re-study.  If you change stuff here check there
3030                    too. */
3031                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3032                 struct regnode_charclass_class accum;
3033                 regnode * const startbranch=scan;
3034
3035                 if (flags & SCF_DO_SUBSTR)
3036                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3037                 if (flags & SCF_DO_STCLASS)
3038                     cl_init_zero(pRExC_state, &accum);
3039
3040                 while (OP(scan) == code) {
3041                     I32 deltanext, minnext, f = 0, fake;
3042                     struct regnode_charclass_class this_class;
3043
3044                     num++;
3045                     data_fake.flags = 0;
3046                     if (data) {
3047                         data_fake.whilem_c = data->whilem_c;
3048                         data_fake.last_closep = data->last_closep;
3049                     }
3050                     else
3051                         data_fake.last_closep = &fake;
3052
3053                     data_fake.pos_delta = delta;
3054                     next = regnext(scan);
3055                     scan = NEXTOPER(scan);
3056                     if (code != BRANCH)
3057                         scan = NEXTOPER(scan);
3058                     if (flags & SCF_DO_STCLASS) {
3059                         cl_init(pRExC_state, &this_class);
3060                         data_fake.start_class = &this_class;
3061                         f = SCF_DO_STCLASS_AND;
3062                     }
3063                     if (flags & SCF_WHILEM_VISITED_POS)
3064                         f |= SCF_WHILEM_VISITED_POS;
3065
3066                     /* we suppose the run is continuous, last=next...*/
3067                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3068                                           next, &data_fake,
3069                                           stopparen, recursed, NULL, f,depth+1);
3070                     if (min1 > minnext)
3071                         min1 = minnext;
3072                     if (max1 < minnext + deltanext)
3073                         max1 = minnext + deltanext;
3074                     if (deltanext == I32_MAX)
3075                         is_inf = is_inf_internal = 1;
3076                     scan = next;
3077                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3078                         pars++;
3079                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3080                         if ( stopmin > minnext)
3081                             stopmin = min + min1;
3082                         flags &= ~SCF_DO_SUBSTR;
3083                         if (data)
3084                             data->flags |= SCF_SEEN_ACCEPT;
3085                     }
3086                     if (data) {
3087                         if (data_fake.flags & SF_HAS_EVAL)
3088                             data->flags |= SF_HAS_EVAL;
3089                         data->whilem_c = data_fake.whilem_c;
3090                     }
3091                     if (flags & SCF_DO_STCLASS)
3092                         cl_or(pRExC_state, &accum, &this_class);
3093                 }
3094                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3095                     min1 = 0;
3096                 if (flags & SCF_DO_SUBSTR) {
3097                     data->pos_min += min1;
3098                     data->pos_delta += max1 - min1;
3099                     if (max1 != min1 || is_inf)
3100                         data->longest = &(data->longest_float);
3101                 }
3102                 min += min1;
3103                 delta += max1 - min1;
3104                 if (flags & SCF_DO_STCLASS_OR) {
3105                     cl_or(pRExC_state, data->start_class, &accum);
3106                     if (min1) {
3107                         cl_and(data->start_class, and_withp);
3108                         flags &= ~SCF_DO_STCLASS;
3109                     }
3110                 }
3111                 else if (flags & SCF_DO_STCLASS_AND) {
3112                     if (min1) {
3113                         cl_and(data->start_class, &accum);
3114                         flags &= ~SCF_DO_STCLASS;
3115                     }
3116                     else {
3117                         /* Switch to OR mode: cache the old value of
3118                          * data->start_class */
3119                         INIT_AND_WITHP;
3120                         StructCopy(data->start_class, and_withp,
3121                                    struct regnode_charclass_class);
3122                         flags &= ~SCF_DO_STCLASS_AND;
3123                         StructCopy(&accum, data->start_class,
3124                                    struct regnode_charclass_class);
3125                         flags |= SCF_DO_STCLASS_OR;
3126                         data->start_class->flags |= ANYOF_EOS;
3127                     }
3128                 }
3129
3130                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3131                 /* demq.
3132
3133                    Assuming this was/is a branch we are dealing with: 'scan' now
3134                    points at the item that follows the branch sequence, whatever
3135                    it is. We now start at the beginning of the sequence and look
3136                    for subsequences of
3137
3138                    BRANCH->EXACT=>x1
3139                    BRANCH->EXACT=>x2
3140                    tail
3141
3142                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3143
3144                    If we can find such a subsequence we need to turn the first
3145                    element into a trie and then add the subsequent branch exact
3146                    strings to the trie.
3147
3148                    We have two cases
3149
3150                      1. patterns where the whole set of branches can be converted.
3151
3152                      2. patterns where only a subset can be converted.
3153
3154                    In case 1 we can replace the whole set with a single regop
3155                    for the trie. In case 2 we need to keep the start and end
3156                    branches so
3157
3158                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3159                      becomes BRANCH TRIE; BRANCH X;
3160
3161                   There is an additional case, that being where there is a
3162                   common prefix, which gets split out into an EXACT like node
3163                   preceding the TRIE node.
3164
3165                   If x(1..n)==tail then we can do a simple trie, if not we make
3166                   a "jump" trie, such that when we match the appropriate word
3167                   we "jump" to the appropriate tail node. Essentially we turn
3168                   a nested if into a case structure of sorts.
3169
3170                 */
3171
3172                     int made=0;
3173                     if (!re_trie_maxbuff) {
3174                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3175                         if (!SvIOK(re_trie_maxbuff))
3176                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3177                     }
3178                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3179                         regnode *cur;
3180                         regnode *first = (regnode *)NULL;
3181                         regnode *last = (regnode *)NULL;
3182                         regnode *tail = scan;
3183                         U8 optype = 0;
3184                         U32 count=0;
3185
3186 #ifdef DEBUGGING
3187                         SV * const mysv = sv_newmortal();       /* for dumping */
3188 #endif
3189                         /* var tail is used because there may be a TAIL
3190                            regop in the way. Ie, the exacts will point to the
3191                            thing following the TAIL, but the last branch will
3192                            point at the TAIL. So we advance tail. If we
3193                            have nested (?:) we may have to move through several
3194                            tails.
3195                          */
3196
3197                         while ( OP( tail ) == TAIL ) {
3198                             /* this is the TAIL generated by (?:) */
3199                             tail = regnext( tail );
3200                         }
3201
3202
3203                         DEBUG_OPTIMISE_r({
3204                             regprop(RExC_rx, mysv, tail );
3205                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3206                                 (int)depth * 2 + 2, "",
3207                                 "Looking for TRIE'able sequences. Tail node is: ",
3208                                 SvPV_nolen_const( mysv )
3209                             );
3210                         });
3211
3212                         /*
3213
3214                            step through the branches, cur represents each
3215                            branch, noper is the first thing to be matched
3216                            as part of that branch and noper_next is the
3217                            regnext() of that node. if noper is an EXACT
3218                            and noper_next is the same as scan (our current
3219                            position in the regex) then the EXACT branch is
3220                            a possible optimization target. Once we have
3221                            two or more consecutive such branches we can
3222                            create a trie of the EXACT's contents and stich
3223                            it in place. If the sequence represents all of
3224                            the branches we eliminate the whole thing and
3225                            replace it with a single TRIE. If it is a
3226                            subsequence then we need to stitch it in. This
3227                            means the first branch has to remain, and needs
3228                            to be repointed at the item on the branch chain
3229                            following the last branch optimized. This could
3230                            be either a BRANCH, in which case the
3231                            subsequence is internal, or it could be the
3232                            item following the branch sequence in which
3233                            case the subsequence is at the end.
3234
3235                         */
3236
3237                         /* dont use tail as the end marker for this traverse */
3238                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3239                             regnode * const noper = NEXTOPER( cur );
3240 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3241                             regnode * const noper_next = regnext( noper );
3242 #endif
3243
3244                             DEBUG_OPTIMISE_r({
3245                                 regprop(RExC_rx, mysv, cur);
3246                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3247                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3248
3249                                 regprop(RExC_rx, mysv, noper);
3250                                 PerlIO_printf( Perl_debug_log, " -> %s",
3251                                     SvPV_nolen_const(mysv));
3252
3253                                 if ( noper_next ) {
3254                                   regprop(RExC_rx, mysv, noper_next );
3255                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3256                                     SvPV_nolen_const(mysv));
3257                                 }
3258                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3259                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3260                             });
3261                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3262                                          : PL_regkind[ OP( noper ) ] == EXACT )
3263                                   || OP(noper) == NOTHING )
3264 #ifdef NOJUMPTRIE
3265                                   && noper_next == tail
3266 #endif
3267                                   && count < U16_MAX)
3268                             {
3269                                 count++;
3270                                 if ( !first || optype == NOTHING ) {
3271                                     if (!first) first = cur;
3272                                     optype = OP( noper );
3273                                 } else {
3274                                     last = cur;
3275                                 }
3276                             } else {
3277 /*
3278     Currently the trie logic handles case insensitive matching properly only
3279     when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
3280     semantics).
3281
3282     If/when this is fixed the following define can be swapped
3283     in below to fully enable trie logic.
3284
3285 #define TRIE_TYPE_IS_SAFE 1
3286
3287 Note that join_exact() assumes that the other types of EXACTFish nodes are not
3288 used in tries, so that would have to be updated if this changed
3289
3290 */
3291 #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
3292
3293                                 if ( last && TRIE_TYPE_IS_SAFE ) {
3294                                     make_trie( pRExC_state,
3295                                             startbranch, first, cur, tail, count,
3296                                             optype, depth+1 );
3297                                 }
3298                                 if ( PL_regkind[ OP( noper ) ] == EXACT
3299 #ifdef NOJUMPTRIE
3300                                      && noper_next == tail
3301 #endif
3302                                 ){
3303                                     count = 1;
3304                                     first = cur;
3305                                     optype = OP( noper );
3306                                 } else {
3307                                     count = 0;
3308                                     first = NULL;
3309                                     optype = 0;
3310                                 }
3311                                 last = NULL;
3312                             }
3313                         }
3314                         DEBUG_OPTIMISE_r({
3315                             regprop(RExC_rx, mysv, cur);
3316                             PerlIO_printf( Perl_debug_log,
3317                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3318                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3319
3320                         });
3321
3322                         if ( last && TRIE_TYPE_IS_SAFE ) {
3323                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3324 #ifdef TRIE_STUDY_OPT
3325                             if ( ((made == MADE_EXACT_TRIE &&
3326                                  startbranch == first)
3327                                  || ( first_non_open == first )) &&
3328                                  depth==0 ) {
3329                                 flags |= SCF_TRIE_RESTUDY;
3330                                 if ( startbranch == first
3331                                      && scan == tail )
3332                                 {
3333                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3334                                 }
3335                             }
3336 #endif
3337                         }
3338                     }
3339
3340                 } /* do trie */
3341
3342             }
3343             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3344                 scan = NEXTOPER(NEXTOPER(scan));
3345             } else                      /* single branch is optimized. */
3346                 scan = NEXTOPER(scan);
3347             continue;
3348         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3349             scan_frame *newframe = NULL;
3350             I32 paren;
3351             regnode *start;
3352             regnode *end;
3353
3354             if (OP(scan) != SUSPEND) {
3355             /* set the pointer */
3356                 if (OP(scan) == GOSUB) {
3357                     paren = ARG(scan);
3358                     RExC_recurse[ARG2L(scan)] = scan;
3359                     start = RExC_open_parens[paren-1];
3360                     end   = RExC_close_parens[paren-1];
3361                 } else {
3362                     paren = 0;
3363                     start = RExC_rxi->program + 1;
3364                     end   = RExC_opend;
3365                 }
3366                 if (!recursed) {
3367                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3368                     SAVEFREEPV(recursed);
3369                 }
3370                 if (!PAREN_TEST(recursed,paren+1)) {
3371                     PAREN_SET(recursed,paren+1);
3372                     Newx(newframe,1,scan_frame);
3373                 } else {
3374                     if (flags & SCF_DO_SUBSTR) {
3375                         SCAN_COMMIT(pRExC_state,data,minlenp);
3376                         data->longest = &(data->longest_float);
3377                     }
3378                     is_inf = is_inf_internal = 1;
3379                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3380                         cl_anything(pRExC_state, data->start_class);
3381                     flags &= ~SCF_DO_STCLASS;
3382                 }
3383             } else {
3384                 Newx(newframe,1,scan_frame);
3385                 paren = stopparen;
3386                 start = scan+2;
3387                 end = regnext(scan);
3388             }
3389             if (newframe) {
3390                 assert(start);
3391                 assert(end);
3392                 SAVEFREEPV(newframe);
3393                 newframe->next = regnext(scan);
3394                 newframe->last = last;
3395                 newframe->stop = stopparen;
3396                 newframe->prev = frame;
3397
3398                 frame = newframe;
3399                 scan =  start;
3400                 stopparen = paren;
3401                 last = end;
3402
3403                 continue;
3404             }
3405         }
3406         else if (OP(scan) == EXACT) {
3407             I32 l = STR_LEN(scan);
3408             UV uc;
3409             if (UTF) {
3410                 const U8 * const s = (U8*)STRING(scan);
3411                 l = utf8_length(s, s + l);
3412                 uc = utf8_to_uvchr(s, NULL);
3413             } else {
3414                 uc = *((U8*)STRING(scan));
3415             }
3416             min += l;
3417             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3418                 /* The code below prefers earlier match for fixed
3419                    offset, later match for variable offset.  */
3420                 if (data->last_end == -1) { /* Update the start info. */
3421                     data->last_start_min = data->pos_min;
3422                     data->last_start_max = is_inf
3423                         ? I32_MAX : data->pos_min + data->pos_delta;
3424                 }
3425                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3426                 if (UTF)
3427                     SvUTF8_on(data->last_found);
3428                 {
3429                     SV * const sv = data->last_found;
3430                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3431                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3432                     if (mg && mg->mg_len >= 0)
3433                         mg->mg_len += utf8_length((U8*)STRING(scan),
3434                                                   (U8*)STRING(scan)+STR_LEN(scan));
3435                 }
3436                 data->last_end = data->pos_min + l;
3437                 data->pos_min += l; /* As in the first entry. */
3438                 data->flags &= ~SF_BEFORE_EOL;
3439             }
3440             if (flags & SCF_DO_STCLASS_AND) {
3441                 /* Check whether it is compatible with what we know already! */
3442                 int compat = 1;
3443
3444
3445                 /* If compatible, we or it in below.  It is compatible if is
3446                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3447                  * it's for a locale.  Even if there isn't unicode semantics
3448                  * here, at runtime there may be because of matching against a
3449                  * utf8 string, so accept a possible false positive for
3450                  * latin1-range folds */
3451                 if (uc >= 0x100 ||
3452                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3453                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3454                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3455                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3456                     )
3457                 {
3458                     compat = 0;
3459                 }
3460                 ANYOF_CLASS_ZERO(data->start_class);
3461                 ANYOF_BITMAP_ZERO(data->start_class);
3462                 if (compat)
3463                     ANYOF_BITMAP_SET(data->start_class, uc);
3464                 else if (uc >= 0x100) {
3465                     int i;
3466
3467                     /* Some Unicode code points fold to the Latin1 range; as
3468                      * XXX temporary code, instead of figuring out if this is
3469                      * one, just assume it is and set all the start class bits
3470                      * that could be some such above 255 code point's fold
3471                      * which will generate fals positives.  As the code
3472                      * elsewhere that does compute the fold settles down, it
3473                      * can be extracted out and re-used here */
3474                     for (i = 0; i < 256; i++){
3475                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3476                             ANYOF_BITMAP_SET(data->start_class, i);
3477                         }
3478                     }
3479                 }
3480                 data->start_class->flags &= ~ANYOF_EOS;
3481                 if (uc < 0x100)
3482                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3483             }
3484             else if (flags & SCF_DO_STCLASS_OR) {
3485                 /* false positive possible if the class is case-folded */
3486                 if (uc < 0x100)
3487                     ANYOF_BITMAP_SET(data->start_class, uc);
3488                 else
3489                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3490                 data->start_class->flags &= ~ANYOF_EOS;
3491                 cl_and(data->start_class, and_withp);
3492             }
3493             flags &= ~SCF_DO_STCLASS;
3494         }
3495         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3496             I32 l = STR_LEN(scan);
3497             UV uc = *((U8*)STRING(scan));
3498
3499             /* Search for fixed substrings supports EXACT only. */
3500             if (flags & SCF_DO_SUBSTR) {
3501                 assert(data);
3502                 SCAN_COMMIT(pRExC_state, data, minlenp);
3503             }
3504             if (UTF) {
3505                 const U8 * const s = (U8 *)STRING(scan);
3506                 l = utf8_length(s, s + l);
3507                 uc = utf8_to_uvchr(s, NULL);
3508             }
3509             else if (has_exactf_sharp_s) {
3510                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3511             }
3512             min += l - min_subtract;
3513             if (min < 0) {
3514                 min = 0;
3515             }
3516             delta += min_subtract;
3517             if (flags & SCF_DO_SUBSTR) {
3518                 data->pos_min += l - min_subtract;
3519                 if (data->pos_min < 0) {
3520                     data->pos_min = 0;
3521                 }
3522                 data->pos_delta += min_subtract;
3523                 if (min_subtract) {
3524                     data->longest = &(data->longest_float);
3525                 }
3526             }
3527             if (flags & SCF_DO_STCLASS_AND) {
3528                 /* Check whether it is compatible with what we know already! */
3529                 int compat = 1;
3530                 if (uc >= 0x100 ||
3531                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3532                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3533                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3534                 {
3535                     compat = 0;
3536                 }
3537                 ANYOF_CLASS_ZERO(data->start_class);
3538                 ANYOF_BITMAP_ZERO(data->start_class);
3539                 if (compat) {
3540                     ANYOF_BITMAP_SET(data->start_class, uc);
3541                     data->start_class->flags &= ~ANYOF_EOS;
3542                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3543                     if (OP(scan) == EXACTFL) {
3544                         /* XXX This set is probably no longer necessary, and
3545                          * probably wrong as LOCALE now is on in the initial
3546                          * state */
3547                         data->start_class->flags |= ANYOF_LOCALE;
3548                     }
3549                     else {
3550
3551                         /* Also set the other member of the fold pair.  In case
3552                          * that unicode semantics is called for at runtime, use
3553                          * the full latin1 fold.  (Can't do this for locale,
3554                          * because not known until runtime) */
3555                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3556
3557                         /* All other (EXACTFL handled above) folds except under
3558                          * /iaa that include s, S, and sharp_s also may include
3559                          * the others */
3560                         if (OP(scan) != EXACTFA) {
3561                             if (uc == 's' || uc == 'S') {
3562                                 ANYOF_BITMAP_SET(data->start_class,
3563                                                  LATIN_SMALL_LETTER_SHARP_S);
3564                             }
3565                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3566                                 ANYOF_BITMAP_SET(data->start_class, 's');
3567                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3568                             }
3569                         }
3570                     }
3571                 }
3572                 else if (uc >= 0x100) {
3573                     int i;
3574                     for (i = 0; i < 256; i++){
3575                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3576                             ANYOF_BITMAP_SET(data->start_class, i);
3577                         }
3578                     }
3579                 }
3580             }
3581             else if (flags & SCF_DO_STCLASS_OR) {
3582                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3583                     /* false positive possible if the class is case-folded.
3584                        Assume that the locale settings are the same... */
3585                     if (uc < 0x100) {
3586                         ANYOF_BITMAP_SET(data->start_class, uc);
3587                         if (OP(scan) != EXACTFL) {
3588
3589                             /* And set the other member of the fold pair, but
3590                              * can't do that in locale because not known until
3591                              * run-time */
3592                             ANYOF_BITMAP_SET(data->start_class,
3593                                              PL_fold_latin1[uc]);
3594
3595                             /* All folds except under /iaa that include s, S,
3596                              * and sharp_s also may include the others */
3597                             if (OP(scan) != EXACTFA) {
3598                                 if (uc == 's' || uc == 'S') {
3599                                     ANYOF_BITMAP_SET(data->start_class,
3600                                                    LATIN_SMALL_LETTER_SHARP_S);
3601                                 }
3602                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3603                                     ANYOF_BITMAP_SET(data->start_class, 's');
3604                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3605                                 }
3606                             }
3607                         }
3608                     }
3609                     data->start_class->flags &= ~ANYOF_EOS;
3610                 }
3611                 cl_and(data->start_class, and_withp);
3612             }
3613             flags &= ~SCF_DO_STCLASS;
3614         }
3615         else if (REGNODE_VARIES(OP(scan))) {
3616             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3617             I32 f = flags, pos_before = 0;
3618             regnode * const oscan = scan;
3619             struct regnode_charclass_class this_class;
3620             struct regnode_charclass_class *oclass = NULL;
3621             I32 next_is_eval = 0;
3622
3623             switch (PL_regkind[OP(scan)]) {
3624             case WHILEM:                /* End of (?:...)* . */
3625                 scan = NEXTOPER(scan);
3626                 goto finish;
3627             case PLUS:
3628                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3629                     next = NEXTOPER(scan);
3630                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3631                         mincount = 1;
3632                         maxcount = REG_INFTY;
3633                         next = regnext(scan);
3634                         scan = NEXTOPER(scan);
3635                         goto do_curly;
3636                     }
3637                 }
3638                 if (flags & SCF_DO_SUBSTR)
3639                     data->pos_min++;
3640                 min++;
3641                 /* Fall through. */
3642             case STAR:
3643                 if (flags & SCF_DO_STCLASS) {
3644                     mincount = 0;
3645                     maxcount = REG_INFTY;
3646                     next = regnext(scan);
3647                     scan = NEXTOPER(scan);
3648                     goto do_curly;
3649                 }
3650                 is_inf = is_inf_internal = 1;
3651                 scan = regnext(scan);
3652                 if (flags & SCF_DO_SUBSTR) {
3653                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3654                     data->longest = &(data->longest_float);
3655                 }
3656                 goto optimize_curly_tail;
3657             case CURLY:
3658                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3659                     && (scan->flags == stopparen))
3660                 {
3661                     mincount = 1;
3662                     maxcount = 1;
3663                 } else {
3664                     mincount = ARG1(scan);
3665                     maxcount = ARG2(scan);
3666                 }
3667                 next = regnext(scan);
3668                 if (OP(scan) == CURLYX) {
3669                     I32 lp = (data ? *(data->last_closep) : 0);
3670                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3671                 }
3672                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3673                 next_is_eval = (OP(scan) == EVAL);
3674               do_curly:
3675                 if (flags & SCF_DO_SUBSTR) {
3676                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3677                     pos_before = data->pos_min;
3678                 }
3679                 if (data) {
3680                     fl = data->flags;
3681                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3682                     if (is_inf)
3683                         data->flags |= SF_IS_INF;
3684                 }
3685                 if (flags & SCF_DO_STCLASS) {
3686                     cl_init(pRExC_state, &this_class);
3687                     oclass = data->start_class;
3688                     data->start_class = &this_class;
3689                     f |= SCF_DO_STCLASS_AND;
3690                     f &= ~SCF_DO_STCLASS_OR;
3691                 }
3692                 /* Exclude from super-linear cache processing any {n,m}
3693                    regops for which the combination of input pos and regex
3694                    pos is not enough information to determine if a match
3695                    will be possible.
3696
3697                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3698                    regex pos at the \s*, the prospects for a match depend not
3699                    only on the input position but also on how many (bar\s*)
3700                    repeats into the {4,8} we are. */
3701                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3702                     f &= ~SCF_WHILEM_VISITED_POS;
3703
3704                 /* This will finish on WHILEM, setting scan, or on NULL: */
3705                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3706                                       last, data, stopparen, recursed, NULL,
3707                                       (mincount == 0
3708                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3709
3710                 if (flags & SCF_DO_STCLASS)
3711                     data->start_class = oclass;
3712                 if (mincount == 0 || minnext == 0) {
3713                     if (flags & SCF_DO_STCLASS_OR) {
3714                         cl_or(pRExC_state, data->start_class, &this_class);
3715                     }
3716                     else if (flags & SCF_DO_STCLASS_AND) {
3717                         /* Switch to OR mode: cache the old value of
3718                          * data->start_class */
3719                         INIT_AND_WITHP;
3720                         StructCopy(data->start_class, and_withp,
3721                                    struct regnode_charclass_class);
3722                         flags &= ~SCF_DO_STCLASS_AND;
3723                         StructCopy(&this_class, data->start_class,
3724                                    struct regnode_charclass_class);
3725                         flags |= SCF_DO_STCLASS_OR;
3726                         data->start_class->flags |= ANYOF_EOS;
3727                     }
3728                 } else {                /* Non-zero len */
3729                     if (flags & SCF_DO_STCLASS_OR) {
3730                         cl_or(pRExC_state, data->start_class, &this_class);
3731                         cl_and(data->start_class, and_withp);
3732                     }
3733                     else if (flags & SCF_DO_STCLASS_AND)
3734                         cl_and(data->start_class, &this_class);
3735                     flags &= ~SCF_DO_STCLASS;
3736                 }
3737                 if (!scan)              /* It was not CURLYX, but CURLY. */
3738                     scan = next;
3739                 if ( /* ? quantifier ok, except for (?{ ... }) */
3740                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3741                     && (minnext == 0) && (deltanext == 0)
3742                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3743                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3744                 {
3745                     ckWARNreg(RExC_parse,
3746                               "Quantifier unexpected on zero-length expression");
3747                 }
3748
3749                 min += minnext * mincount;
3750                 is_inf_internal |= ((maxcount == REG_INFTY
3751                                      && (minnext + deltanext) > 0)
3752                                     || deltanext == I32_MAX);
3753                 is_inf |= is_inf_internal;
3754                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3755
3756                 /* Try powerful optimization CURLYX => CURLYN. */
3757                 if (  OP(oscan) == CURLYX && data
3758                       && data->flags & SF_IN_PAR
3759                       && !(data->flags & SF_HAS_EVAL)
3760                       && !deltanext && minnext == 1 ) {
3761                     /* Try to optimize to CURLYN.  */
3762                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3763                     regnode * const nxt1 = nxt;
3764 #ifdef DEBUGGING
3765                     regnode *nxt2;
3766 #endif
3767
3768                     /* Skip open. */
3769                     nxt = regnext(nxt);
3770                     if (!REGNODE_SIMPLE(OP(nxt))
3771                         && !(PL_regkind[OP(nxt)] == EXACT
3772                              && STR_LEN(nxt) == 1))
3773                         goto nogo;
3774 #ifdef DEBUGGING
3775                     nxt2 = nxt;
3776 #endif
3777                     nxt = regnext(nxt);
3778                     if (OP(nxt) != CLOSE)
3779                         goto nogo;
3780                     if (RExC_open_parens) {
3781                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3782                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3783                     }
3784                     /* Now we know that nxt2 is the only contents: */
3785                     oscan->flags = (U8)ARG(nxt);
3786                     OP(oscan) = CURLYN;
3787                     OP(nxt1) = NOTHING; /* was OPEN. */
3788
3789 #ifdef DEBUGGING
3790                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3791                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3792                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3793                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3794                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3795                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3796 #endif
3797                 }
3798               nogo:
3799
3800                 /* Try optimization CURLYX => CURLYM. */
3801                 if (  OP(oscan) == CURLYX && data
3802                       && !(data->flags & SF_HAS_PAR)
3803                       && !(data->flags & SF_HAS_EVAL)
3804                       && !deltanext     /* atom is fixed width */
3805                       && minnext != 0   /* CURLYM can't handle zero width */
3806                 ) {
3807                     /* XXXX How to optimize if data == 0? */
3808                     /* Optimize to a simpler form.  */
3809                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3810                     regnode *nxt2;
3811
3812                     OP(oscan) = CURLYM;
3813                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3814                             && (OP(nxt2) != WHILEM))
3815                         nxt = nxt2;
3816                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3817                     /* Need to optimize away parenths. */
3818                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3819                         /* Set the parenth number.  */
3820                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3821
3822                         oscan->flags = (U8)ARG(nxt);
3823                         if (RExC_open_parens) {
3824                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3825                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3826                         }
3827                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3828                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3829
3830 #ifdef DEBUGGING
3831                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3832                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3833                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3834                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3835 #endif
3836 #if 0
3837                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3838                             regnode *nnxt = regnext(nxt1);
3839                             if (nnxt == nxt) {
3840                                 if (reg_off_by_arg[OP(nxt1)])
3841                                     ARG_SET(nxt1, nxt2 - nxt1);
3842                                 else if (nxt2 - nxt1 < U16_MAX)
3843                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3844                                 else
3845                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3846                             }
3847                             nxt1 = nnxt;
3848                         }
3849 #endif
3850                         /* Optimize again: */
3851                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3852                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3853                     }
3854                     else
3855                         oscan->flags = 0;
3856                 }
3857                 else if ((OP(oscan) == CURLYX)
3858                          && (flags & SCF_WHILEM_VISITED_POS)
3859                          /* See the comment on a similar expression above.
3860                             However, this time it's not a subexpression
3861                             we care about, but the expression itself. */
3862                          && (maxcount == REG_INFTY)
3863                          && data && ++data->whilem_c < 16) {
3864                     /* This stays as CURLYX, we can put the count/of pair. */
3865                     /* Find WHILEM (as in regexec.c) */
3866                     regnode *nxt = oscan + NEXT_OFF(oscan);
3867
3868                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3869                         nxt += ARG(nxt);
3870                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3871                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3872                 }
3873                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3874                     pars++;
3875                 if (flags & SCF_DO_SUBSTR) {
3876                     SV *last_str = NULL;
3877                     int counted = mincount != 0;
3878
3879                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3880 #if defined(SPARC64_GCC_WORKAROUND)
3881                         I32 b = 0;
3882                         STRLEN l = 0;
3883                         const char *s = NULL;
3884                         I32 old = 0;
3885
3886                         if (pos_before >= data->last_start_min)
3887                             b = pos_before;
3888                         else
3889                             b = data->last_start_min;
3890
3891                         l = 0;
3892                         s = SvPV_const(data->last_found, l);
3893                         old = b - data->last_start_min;
3894
3895 #else
3896                         I32 b = pos_before >= data->last_start_min
3897                             ? pos_before : data->last_start_min;
3898                         STRLEN l;
3899                         const char * const s = SvPV_const(data->last_found, l);
3900                         I32 old = b - data->last_start_min;
3901 #endif
3902
3903                         if (UTF)
3904                             old = utf8_hop((U8*)s, old) - (U8*)s;
3905                         l -= old;
3906                         /* Get the added string: */
3907                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3908                         if (deltanext == 0 && pos_before == b) {
3909                             /* What was added is a constant string */
3910                             if (mincount > 1) {
3911                                 SvGROW(last_str, (mincount * l) + 1);
3912                                 repeatcpy(SvPVX(last_str) + l,
3913                                           SvPVX_const(last_str), l, mincount - 1);
3914                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3915                                 /* Add additional parts. */
3916                                 SvCUR_set(data->last_found,
3917                                           SvCUR(data->last_found) - l);
3918                                 sv_catsv(data->last_found, last_str);
3919                                 {
3920                                     SV * sv = data->last_found;
3921                                     MAGIC *mg =
3922                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3923                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3924                                     if (mg && mg->mg_len >= 0)
3925                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3926                                 }
3927                                 data->last_end += l * (mincount - 1);
3928                             }
3929                         } else {
3930                             /* start offset must point into the last copy */
3931                             data->last_start_min += minnext * (mincount - 1);
3932                             data->last_start_max += is_inf ? I32_MAX
3933                                 : (maxcount - 1) * (minnext + data->pos_delta);
3934                         }
3935                     }
3936                     /* It is counted once already... */
3937                     data->pos_min += minnext * (mincount - counted);
3938                     data->pos_delta += - counted * deltanext +
3939                         (minnext + deltanext) * maxcount - minnext * mincount;
3940                     if (mincount != maxcount) {
3941                          /* Cannot extend fixed substrings found inside
3942                             the group.  */
3943                         SCAN_COMMIT(pRExC_state,data,minlenp);
3944                         if (mincount && last_str) {
3945                             SV * const sv = data->last_found;
3946                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3947                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3948
3949                             if (mg)
3950                                 mg->mg_len = -1;
3951                             sv_setsv(sv, last_str);
3952                             data->last_end = data->pos_min;
3953                             data->last_start_min =
3954                                 data->pos_min - CHR_SVLEN(last_str);
3955                             data->last_start_max = is_inf
3956                                 ? I32_MAX
3957                                 : data->pos_min + data->pos_delta
3958                                 - CHR_SVLEN(last_str);
3959                         }
3960                         data->longest = &(data->longest_float);
3961                     }
3962                     SvREFCNT_dec(last_str);
3963                 }
3964                 if (data && (fl & SF_HAS_EVAL))
3965                     data->flags |= SF_HAS_EVAL;
3966               optimize_curly_tail:
3967                 if (OP(oscan) != CURLYX) {
3968                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3969                            && NEXT_OFF(next))
3970                         NEXT_OFF(oscan) += NEXT_OFF(next);
3971                 }
3972                 continue;
3973             default:                    /* REF, ANYOFV, and CLUMP only? */
3974                 if (flags & SCF_DO_SUBSTR) {
3975                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3976                     data->longest = &(data->longest_float);
3977                 }
3978                 is_inf = is_inf_internal = 1;
3979                 if (flags & SCF_DO_STCLASS_OR)
3980                     cl_anything(pRExC_state, data->start_class);
3981                 flags &= ~SCF_DO_STCLASS;
3982                 break;
3983             }
3984         }
3985         else if (OP(scan) == LNBREAK) {
3986             if (flags & SCF_DO_STCLASS) {
3987                 int value = 0;
3988                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3989                 if (flags & SCF_DO_STCLASS_AND) {
3990                     for (value = 0; value < 256; value++)
3991                         if (!is_VERTWS_cp(value))
3992                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3993                 }
3994                 else {
3995                     for (value = 0; value < 256; value++)
3996                         if (is_VERTWS_cp(value))
3997                             ANYOF_BITMAP_SET(data->start_class, value);
3998                 }
3999                 if (flags & SCF_DO_STCLASS_OR)
4000                     cl_and(data->start_class, and_withp);
4001                 flags &= ~SCF_DO_STCLASS;
4002             }
4003             min += 1;
4004             delta += 1;
4005             if (flags & SCF_DO_SUBSTR) {
4006                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4007                 data->pos_min += 1;
4008                 data->pos_delta += 1;
4009                 data->longest = &(data->longest_float);
4010             }
4011         }
4012         else if (REGNODE_SIMPLE(OP(scan))) {
4013             int value = 0;
4014
4015             if (flags & SCF_DO_SUBSTR) {
4016                 SCAN_COMMIT(pRExC_state,data,minlenp);
4017                 data->pos_min++;
4018             }
4019             min++;
4020             if (flags & SCF_DO_STCLASS) {
4021                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4022
4023                 /* Some of the logic below assumes that switching
4024                    locale on will only add false positives. */
4025                 switch (PL_regkind[OP(scan)]) {
4026                 case SANY:
4027                 default:
4028                   do_default:
4029                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4030                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4031                         cl_anything(pRExC_state, data->start_class);
4032                     break;
4033                 case REG_ANY:
4034                     if (OP(scan) == SANY)
4035                         goto do_default;
4036                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4037                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4038                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4039                         cl_anything(pRExC_state, data->start_class);
4040                     }
4041                     if (flags & SCF_DO_STCLASS_AND || !value)
4042                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4043                     break;
4044                 case ANYOF:
4045                     if (flags & SCF_DO_STCLASS_AND)
4046                         cl_and(data->start_class,
4047                                (struct regnode_charclass_class*)scan);
4048                     else
4049                         cl_or(pRExC_state, data->start_class,
4050                               (struct regnode_charclass_class*)scan);
4051                     break;
4052                 case ALNUM:
4053                     if (flags & SCF_DO_STCLASS_AND) {
4054                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4055                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4056                             if (OP(scan) == ALNUMU) {
4057                                 for (value = 0; value < 256; value++) {
4058                                     if (!isWORDCHAR_L1(value)) {
4059                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4060                                     }
4061                                 }
4062                             } else {
4063                                 for (value = 0; value < 256; value++) {
4064                                     if (!isALNUM(value)) {
4065                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4066                                     }
4067                                 }
4068                             }
4069                         }
4070                     }
4071                     else {
4072                         if (data->start_class->flags & ANYOF_LOCALE)
4073                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4074
4075                         /* Even if under locale, set the bits for non-locale
4076                          * in case it isn't a true locale-node.  This will
4077                          * create false positives if it truly is locale */
4078                         if (OP(scan) == ALNUMU) {
4079                             for (value = 0; value < 256; value++) {
4080                                 if (isWORDCHAR_L1(value)) {
4081                                     ANYOF_BITMAP_SET(data->start_class, value);
4082                                 }
4083                             }
4084                         } else {
4085                             for (value = 0; value < 256; value++) {
4086                                 if (isALNUM(value)) {
4087                                     ANYOF_BITMAP_SET(data->start_class, value);
4088                                 }
4089                             }
4090                         }
4091                     }
4092                     break;
4093                 case NALNUM:
4094                     if (flags & SCF_DO_STCLASS_AND) {
4095                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4096                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4097                             if (OP(scan) == NALNUMU) {
4098                                 for (value = 0; value < 256; value++) {
4099                                     if (isWORDCHAR_L1(value)) {
4100                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4101                                     }
4102                                 }
4103                             } else {
4104                                 for (value = 0; value < 256; value++) {
4105                                     if (isALNUM(value)) {
4106                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4107                                     }
4108                                 }
4109                             }
4110                         }
4111                     }
4112                     else {
4113                         if (data->start_class->flags & ANYOF_LOCALE)
4114                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4115
4116                         /* Even if under locale, set the bits for non-locale in
4117                          * case it isn't a true locale-node.  This will create
4118                          * false positives if it truly is locale */
4119                         if (OP(scan) == NALNUMU) {
4120                             for (value = 0; value < 256; value++) {
4121                                 if (! isWORDCHAR_L1(value)) {
4122                                     ANYOF_BITMAP_SET(data->start_class, value);
4123                                 }
4124                             }
4125                         } else {
4126                             for (value = 0; value < 256; value++) {
4127                                 if (! isALNUM(value)) {
4128                                     ANYOF_BITMAP_SET(data->start_class, value);
4129                                 }
4130                             }
4131                         }
4132                     }
4133                     break;
4134                 case SPACE:
4135                     if (flags & SCF_DO_STCLASS_AND) {
4136                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4137                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4138                             if (OP(scan) == SPACEU) {
4139                                 for (value = 0; value < 256; value++) {
4140                                     if (!isSPACE_L1(value)) {
4141                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4142                                     }
4143                                 }
4144                             } else {
4145                                 for (value = 0; value < 256; value++) {
4146                                     if (!isSPACE(value)) {
4147                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4148                                     }
4149                                 }
4150                             }
4151                         }
4152                     }
4153                     else {
4154                         if (data->start_class->flags & ANYOF_LOCALE) {
4155                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4156                         }
4157                         if (OP(scan) == SPACEU) {
4158                             for (value = 0; value < 256; value++) {
4159                                 if (isSPACE_L1(value)) {
4160                                     ANYOF_BITMAP_SET(data->start_class, value);
4161                                 }
4162                             }
4163                         } else {
4164                             for (value = 0; value < 256; value++) {
4165                                 if (isSPACE(value)) {
4166                                     ANYOF_BITMAP_SET(data->start_class, value);
4167                                 }
4168                             }
4169                         }
4170                     }
4171                     break;
4172                 case NSPACE:
4173                     if (flags & SCF_DO_STCLASS_AND) {
4174                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4175                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4176                             if (OP(scan) == NSPACEU) {
4177                                 for (value = 0; value < 256; value++) {
4178                                     if (isSPACE_L1(value)) {
4179                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4180                                     }
4181                                 }
4182                             } else {
4183                                 for (value = 0; value < 256; value++) {
4184                                     if (isSPACE(value)) {
4185                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4186                                     }
4187                                 }
4188                             }
4189                         }
4190                     }
4191                     else {
4192                         if (data->start_class->flags & ANYOF_LOCALE)
4193                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4194                         if (OP(scan) == NSPACEU) {
4195                             for (value = 0; value < 256; value++) {
4196                                 if (!isSPACE_L1(value)) {
4197                                     ANYOF_BITMAP_SET(data->start_class, value);
4198                                 }
4199                             }
4200                         }
4201                         else {
4202                             for (value = 0; value < 256; value++) {
4203                                 if (!isSPACE(value)) {
4204                                     ANYOF_BITMAP_SET(data->start_class, value);
4205                                 }
4206                             }
4207                         }
4208                     }
4209                     break;
4210                 case DIGIT:
4211                     if (flags & SCF_DO_STCLASS_AND) {
4212                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4213                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4214                             for (value = 0; value < 256; value++)
4215                                 if (!isDIGIT(value))
4216                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4217                         }
4218                     }
4219                     else {
4220                         if (data->start_class->flags & ANYOF_LOCALE)
4221                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4222                         for (value = 0; value < 256; value++)
4223                             if (isDIGIT(value))
4224                                 ANYOF_BITMAP_SET(data->start_class, value);
4225                     }
4226                     break;
4227                 case NDIGIT:
4228                     if (flags & SCF_DO_STCLASS_AND) {
4229                         if (!(data->start_class->flags & ANYOF_LOCALE))
4230                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4231                         for (value = 0; value < 256; value++)
4232                             if (isDIGIT(value))
4233                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4234                     }
4235                     else {
4236                         if (data->start_class->flags & ANYOF_LOCALE)
4237                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4238                         for (value = 0; value < 256; value++)
4239                             if (!isDIGIT(value))
4240                                 ANYOF_BITMAP_SET(data->start_class, value);
4241                     }
4242                     break;
4243                 CASE_SYNST_FNC(VERTWS);
4244                 CASE_SYNST_FNC(HORIZWS);
4245
4246                 }
4247                 if (flags & SCF_DO_STCLASS_OR)
4248                     cl_and(data->start_class, and_withp);
4249                 flags &= ~SCF_DO_STCLASS;
4250             }
4251         }
4252         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4253             data->flags |= (OP(scan) == MEOL
4254                             ? SF_BEFORE_MEOL
4255                             : SF_BEFORE_SEOL);
4256         }
4257         else if (  PL_regkind[OP(scan)] == BRANCHJ
4258                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4259                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4260                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4261             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4262                 || OP(scan) == UNLESSM )
4263             {
4264                 /* Negative Lookahead/lookbehind
4265                    In this case we can't do fixed string optimisation.
4266                 */
4267
4268                 I32 deltanext, minnext, fake = 0;
4269                 regnode *nscan;
4270                 struct regnode_charclass_class intrnl;
4271                 int f = 0;
4272
4273                 data_fake.flags = 0;
4274                 if (data) {
4275                     data_fake.whilem_c = data->whilem_c;
4276                     data_fake.last_closep = data->last_closep;
4277                 }
4278                 else
4279                     data_fake.last_closep = &fake;
4280                 data_fake.pos_delta = delta;
4281                 if ( flags & SCF_DO_STCLASS && !scan->flags
4282                      && OP(scan) == IFMATCH ) { /* Lookahead */
4283                     cl_init(pRExC_state, &intrnl);
4284                     data_fake.start_class = &intrnl;
4285                     f |= SCF_DO_STCLASS_AND;
4286                 }
4287                 if (flags & SCF_WHILEM_VISITED_POS)
4288                     f |= SCF_WHILEM_VISITED_POS;
4289                 next = regnext(scan);
4290                 nscan = NEXTOPER(NEXTOPER(scan));
4291                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4292                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4293                 if (scan->flags) {
4294                     if (deltanext) {
4295                         FAIL("Variable length lookbehind not implemented");
4296                     }
4297                     else if (minnext > (I32)U8_MAX) {
4298                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4299                     }
4300                     scan->flags = (U8)minnext;
4301                 }
4302                 if (data) {
4303                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4304                         pars++;
4305                     if (data_fake.flags & SF_HAS_EVAL)
4306                         data->flags |= SF_HAS_EVAL;
4307                     data->whilem_c = data_fake.whilem_c;
4308                 }
4309                 if (f & SCF_DO_STCLASS_AND) {
4310                     if (flags & SCF_DO_STCLASS_OR) {
4311                         /* OR before, AND after: ideally we would recurse with
4312                          * data_fake to get the AND applied by study of the
4313                          * remainder of the pattern, and then derecurse;
4314                          * *** HACK *** for now just treat as "no information".
4315                          * See [perl #56690].
4316                          */
4317                         cl_init(pRExC_state, data->start_class);
4318                     }  else {
4319                         /* AND before and after: combine and continue */
4320                         const int was = (data->start_class->flags & ANYOF_EOS);
4321
4322                         cl_and(data->start_class, &intrnl);
4323                         if (was)
4324                             data->start_class->flags |= ANYOF_EOS;
4325                     }
4326                 }
4327             }
4328 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4329             else {
4330                 /* Positive Lookahead/lookbehind
4331                    In this case we can do fixed string optimisation,
4332                    but we must be careful about it. Note in the case of
4333                    lookbehind the positions will be offset by the minimum
4334                    length of the pattern, something we won't know about
4335                    until after the recurse.
4336                 */
4337                 I32 deltanext, fake = 0;
4338                 regnode *nscan;
4339                 struct regnode_charclass_class intrnl;
4340                 int f = 0;
4341                 /* We use SAVEFREEPV so that when the full compile
4342                     is finished perl will clean up the allocated
4343                     minlens when it's all done. This way we don't
4344                     have to worry about freeing them when we know
4345                     they wont be used, which would be a pain.
4346                  */
4347                 I32 *minnextp;
4348                 Newx( minnextp, 1, I32 );
4349                 SAVEFREEPV(minnextp);
4350
4351                 if (data) {
4352                     StructCopy(data, &data_fake, scan_data_t);
4353                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4354                         f |= SCF_DO_SUBSTR;
4355                         if (scan->flags)
4356                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4357                         data_fake.last_found=newSVsv(data->last_found);
4358                     }
4359                 }
4360                 else
4361                     data_fake.last_closep = &fake;
4362                 data_fake.flags = 0;
4363                 data_fake.pos_delta = delta;
4364                 if (is_inf)
4365                     data_fake.flags |= SF_IS_INF;
4366                 if ( flags & SCF_DO_STCLASS && !scan->flags
4367                      && OP(scan) == IFMATCH ) { /* Lookahead */
4368                     cl_init(pRExC_state, &intrnl);
4369                     data_fake.start_class = &intrnl;
4370                     f |= SCF_DO_STCLASS_AND;
4371                 }
4372                 if (flags & SCF_WHILEM_VISITED_POS)
4373                     f |= SCF_WHILEM_VISITED_POS;
4374                 next = regnext(scan);
4375                 nscan = NEXTOPER(NEXTOPER(scan));
4376
4377                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4378                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4379                 if (scan->flags) {
4380                     if (deltanext) {
4381                         FAIL("Variable length lookbehind not implemented");
4382                     }
4383                     else if (*minnextp > (I32)U8_MAX) {
4384                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4385                     }
4386                     scan->flags = (U8)*minnextp;
4387                 }
4388
4389                 *minnextp += min;
4390
4391                 if (f & SCF_DO_STCLASS_AND) {
4392                     const int was = (data->start_class->flags & ANYOF_EOS);
4393
4394                     cl_and(data->start_class, &intrnl);
4395                     if (was)
4396                         data->start_class->flags |= ANYOF_EOS;
4397                 }
4398                 if (data) {
4399                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4400                         pars++;
4401                     if (data_fake.flags & SF_HAS_EVAL)
4402                         data->flags |= SF_HAS_EVAL;
4403                     data->whilem_c = data_fake.whilem_c;
4404                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4405                         if (RExC_rx->minlen<*minnextp)
4406                             RExC_rx->minlen=*minnextp;
4407                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4408                         SvREFCNT_dec(data_fake.last_found);
4409
4410                         if ( data_fake.minlen_fixed != minlenp )
4411                         {
4412                             data->offset_fixed= data_fake.offset_fixed;
4413                             data->minlen_fixed= data_fake.minlen_fixed;
4414                             data->lookbehind_fixed+= scan->flags;
4415                         }
4416                         if ( data_fake.minlen_float != minlenp )
4417                         {
4418                             data->minlen_float= data_fake.minlen_float;
4419                             data->offset_float_min=data_fake.offset_float_min;
4420                             data->offset_float_max=data_fake.offset_float_max;
4421                             data->lookbehind_float+= scan->flags;
4422                         }
4423                     }
4424                 }
4425
4426
4427             }
4428 #endif
4429         }
4430         else if (OP(scan) == OPEN) {
4431             if (stopparen != (I32)ARG(scan))
4432                 pars++;
4433         }
4434         else if (OP(scan) == CLOSE) {
4435             if (stopparen == (I32)ARG(scan)) {
4436                 break;
4437             }
4438             if ((I32)ARG(scan) == is_par) {
4439                 next = regnext(scan);
4440
4441                 if ( next && (OP(next) != WHILEM) && next < last)
4442                     is_par = 0;         /* Disable optimization */
4443             }
4444             if (data)
4445                 *(data->last_closep) = ARG(scan);
4446         }
4447         else if (OP(scan) == EVAL) {
4448                 if (data)
4449                     data->flags |= SF_HAS_EVAL;
4450         }
4451         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4452             if (flags & SCF_DO_SUBSTR) {
4453                 SCAN_COMMIT(pRExC_state,data,minlenp);
4454                 flags &= ~SCF_DO_SUBSTR;
4455             }
4456             if (data && OP(scan)==ACCEPT) {
4457                 data->flags |= SCF_SEEN_ACCEPT;
4458                 if (stopmin > min)
4459                     stopmin = min;
4460             }
4461         }
4462         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4463         {
4464                 if (flags & SCF_DO_SUBSTR) {
4465                     SCAN_COMMIT(pRExC_state,data,minlenp);
4466                     data->longest = &(data->longest_float);
4467                 }
4468                 is_inf = is_inf_internal = 1;
4469                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4470                     cl_anything(pRExC_state, data->start_class);
4471                 flags &= ~SCF_DO_STCLASS;
4472         }
4473         else if (OP(scan) == GPOS) {
4474             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4475                 !(delta || is_inf || (data && data->pos_delta)))
4476             {
4477                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4478                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4479                 if (RExC_rx->gofs < (U32)min)
4480                     RExC_rx->gofs = min;
4481             } else {
4482                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4483                 RExC_rx->gofs = 0;
4484             }
4485         }
4486 #ifdef TRIE_STUDY_OPT
4487 #ifdef FULL_TRIE_STUDY
4488         else if (PL_regkind[OP(scan)] == TRIE) {
4489             /* NOTE - There is similar code to this block above for handling
4490                BRANCH nodes on the initial study.  If you change stuff here
4491                check there too. */
4492             regnode *trie_node= scan;
4493             regnode *tail= regnext(scan);
4494             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4495             I32 max1 = 0, min1 = I32_MAX;
4496             struct regnode_charclass_class accum;
4497
4498             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4499                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4500             if (flags & SCF_DO_STCLASS)
4501                 cl_init_zero(pRExC_state, &accum);
4502
4503             if (!trie->jump) {
4504                 min1= trie->minlen;
4505                 max1= trie->maxlen;
4506             } else {
4507                 const regnode *nextbranch= NULL;
4508                 U32 word;
4509
4510                 for ( word=1 ; word <= trie->wordcount ; word++)
4511                 {
4512                     I32 deltanext=0, minnext=0, f = 0, fake;
4513                     struct regnode_charclass_class this_class;
4514
4515                     data_fake.flags = 0;
4516                     if (data) {
4517                         data_fake.whilem_c = data->whilem_c;
4518                         data_fake.last_closep = data->last_closep;
4519                     }
4520                     else
4521                         data_fake.last_closep = &fake;
4522                     data_fake.pos_delta = delta;
4523                     if (flags & SCF_DO_STCLASS) {
4524                         cl_init(pRExC_state, &this_class);
4525                         data_fake.start_class = &this_class;
4526                         f = SCF_DO_STCLASS_AND;
4527                     }
4528                     if (flags & SCF_WHILEM_VISITED_POS)
4529                         f |= SCF_WHILEM_VISITED_POS;
4530
4531                     if (trie->jump[word]) {
4532                         if (!nextbranch)
4533                             nextbranch = trie_node + trie->jump[0];
4534                         scan= trie_node + trie->jump[word];
4535                         /* We go from the jump point to the branch that follows
4536                            it. Note this means we need the vestigal unused branches
4537                            even though they arent otherwise used.
4538                          */
4539                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4540                             &deltanext, (regnode *)nextbranch, &data_fake,
4541                             stopparen, recursed, NULL, f,depth+1);
4542                     }
4543                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4544                         nextbranch= regnext((regnode*)nextbranch);
4545
4546                     if (min1 > (I32)(minnext + trie->minlen))
4547                         min1 = minnext + trie->minlen;
4548                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4549                         max1 = minnext + deltanext + trie->maxlen;
4550                     if (deltanext == I32_MAX)
4551                         is_inf = is_inf_internal = 1;
4552
4553                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4554                         pars++;
4555                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4556                         if ( stopmin > min + min1)
4557                             stopmin = min + min1;
4558                         flags &= ~SCF_DO_SUBSTR;
4559                         if (data)
4560                             data->flags |= SCF_SEEN_ACCEPT;
4561                     }
4562                     if (data) {
4563                         if (data_fake.flags & SF_HAS_EVAL)
4564                             data->flags |= SF_HAS_EVAL;
4565                         data->whilem_c = data_fake.whilem_c;
4566                     }
4567                     if (flags & SCF_DO_STCLASS)
4568                         cl_or(pRExC_state, &accum, &this_class);
4569                 }
4570             }
4571             if (flags & SCF_DO_SUBSTR) {
4572                 data->pos_min += min1;
4573                 data->pos_delta += max1 - min1;
4574                 if (max1 != min1 || is_inf)
4575                     data->longest = &(data->longest_float);
4576             }
4577             min += min1;
4578             delta += max1 - min1;
4579             if (flags & SCF_DO_STCLASS_OR) {
4580                 cl_or(pRExC_state, data->start_class, &accum);
4581                 if (min1) {
4582                     cl_and(data->start_class, and_withp);
4583                     flags &= ~SCF_DO_STCLASS;
4584                 }
4585             }
4586             else if (flags & SCF_DO_STCLASS_AND) {
4587                 if (min1) {
4588                     cl_and(data->start_class, &accum);
4589                     flags &= ~SCF_DO_STCLASS;
4590                 }
4591                 else {
4592                     /* Switch to OR mode: cache the old value of
4593                      * data->start_class */
4594                     INIT_AND_WITHP;
4595                     StructCopy(data->start_class, and_withp,
4596                                struct regnode_charclass_class);
4597                     flags &= ~SCF_DO_STCLASS_AND;
4598                     StructCopy(&accum, data->start_class,
4599                                struct regnode_charclass_class);
4600                     flags |= SCF_DO_STCLASS_OR;
4601                     data->start_class->flags |= ANYOF_EOS;
4602                 }
4603             }
4604             scan= tail;
4605             continue;
4606         }
4607 #else
4608         else if (PL_regkind[OP(scan)] == TRIE) {
4609             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4610             U8*bang=NULL;
4611
4612             min += trie->minlen;
4613             delta += (trie->maxlen - trie->minlen);
4614             flags &= ~SCF_DO_STCLASS; /* xxx */
4615             if (flags & SCF_DO_SUBSTR) {
4616                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4617                 data->pos_min += trie->minlen;
4618                 data->pos_delta += (trie->maxlen - trie->minlen);
4619                 if (trie->maxlen != trie->minlen)
4620                     data->longest = &(data->longest_float);
4621             }
4622             if (trie->jump) /* no more substrings -- for now /grr*/
4623                 flags &= ~SCF_DO_SUBSTR;
4624         }
4625 #endif /* old or new */
4626 #endif /* TRIE_STUDY_OPT */
4627
4628         /* Else: zero-length, ignore. */
4629         scan = regnext(scan);
4630     }
4631     if (frame) {
4632         last = frame->last;
4633         scan = frame->next;
4634         stopparen = frame->stop;
4635         frame = frame->prev;
4636         goto fake_study_recurse;
4637     }
4638
4639   finish:
4640     assert(!frame);
4641     DEBUG_STUDYDATA("pre-fin:",data,depth);
4642
4643     *scanp = scan;
4644     *deltap = is_inf_internal ? I32_MAX : delta;
4645     if (flags & SCF_DO_SUBSTR && is_inf)
4646         data->pos_delta = I32_MAX - data->pos_min;
4647     if (is_par > (I32)U8_MAX)
4648         is_par = 0;
4649     if (is_par && pars==1 && data) {
4650         data->flags |= SF_IN_PAR;
4651         data->flags &= ~SF_HAS_PAR;
4652     }
4653     else if (pars && data) {
4654         data->flags |= SF_HAS_PAR;
4655         data->flags &= ~SF_IN_PAR;
4656     }
4657     if (flags & SCF_DO_STCLASS_OR)
4658         cl_and(data->start_class, and_withp);
4659     if (flags & SCF_TRIE_RESTUDY)
4660         data->flags |=  SCF_TRIE_RESTUDY;
4661
4662     DEBUG_STUDYDATA("post-fin:",data,depth);
4663
4664     return min < stopmin ? min : stopmin;
4665 }
4666
4667 STATIC U32
4668 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4669 {
4670     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4671
4672     PERL_ARGS_ASSERT_ADD_DATA;
4673
4674     Renewc(RExC_rxi->data,
4675            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4676            char, struct reg_data);
4677     if(count)
4678         Renew(RExC_rxi->data->what, count + n, U8);
4679     else
4680         Newx(RExC_rxi->data->what, n, U8);
4681     RExC_rxi->data->count = count + n;
4682     Copy(s, RExC_rxi->data->what + count, n, U8);
4683     return count;
4684 }
4685
4686 /*XXX: todo make this not included in a non debugging perl */
4687 #ifndef PERL_IN_XSUB_RE
4688 void
4689 Perl_reginitcolors(pTHX)
4690 {
4691     dVAR;
4692     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4693     if (s) {
4694         char *t = savepv(s);
4695         int i = 0;
4696         PL_colors[0] = t;
4697         while (++i < 6) {
4698             t = strchr(t, '\t');
4699             if (t) {
4700                 *t = '\0';
4701                 PL_colors[i] = ++t;
4702             }
4703             else
4704                 PL_colors[i] = t = (char *)"";
4705         }
4706     } else {
4707         int i = 0;
4708         while (i < 6)
4709             PL_colors[i++] = (char *)"";
4710     }
4711     PL_colorset = 1;
4712 }
4713 #endif
4714
4715
4716 #ifdef TRIE_STUDY_OPT
4717 #define CHECK_RESTUDY_GOTO                                  \
4718         if (                                                \
4719               (data.flags & SCF_TRIE_RESTUDY)               \
4720               && ! restudied++                              \
4721         )     goto reStudy
4722 #else
4723 #define CHECK_RESTUDY_GOTO
4724 #endif
4725
4726 /*
4727  - pregcomp - compile a regular expression into internal code
4728  *
4729  * We can't allocate space until we know how big the compiled form will be,
4730  * but we can't compile it (and thus know how big it is) until we've got a
4731  * place to put the code.  So we cheat:  we compile it twice, once with code
4732  * generation turned off and size counting turned on, and once "for real".
4733  * This also means that we don't allocate space until we are sure that the
4734  * thing really will compile successfully, and we never have to move the
4735  * code and thus invalidate pointers into it.  (Note that it has to be in
4736  * one piece because free() must be able to free it all.) [NB: not true in perl]
4737  *
4738  * Beware that the optimization-preparation code in here knows about some
4739  * of the structure of the compiled regexp.  [I'll say.]
4740  */
4741
4742
4743
4744 #ifndef PERL_IN_XSUB_RE
4745 #define RE_ENGINE_PTR &PL_core_reg_engine
4746 #else
4747 extern const struct regexp_engine my_reg_engine;
4748 #define RE_ENGINE_PTR &my_reg_engine
4749 #endif
4750
4751 #ifndef PERL_IN_XSUB_RE
4752 REGEXP *
4753 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4754 {
4755     dVAR;
4756     HV * const table = GvHV(PL_hintgv);
4757
4758     PERL_ARGS_ASSERT_PREGCOMP;
4759
4760     /* Dispatch a request to compile a regexp to correct
4761        regexp engine. */
4762     if (table) {
4763         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4764         GET_RE_DEBUG_FLAGS_DECL;
4765         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4766             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4767             DEBUG_COMPILE_r({
4768                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4769                     SvIV(*ptr));
4770             });
4771             return CALLREGCOMP_ENG(eng, pattern, flags);
4772         }
4773     }
4774     return Perl_re_compile(aTHX_ pattern, flags);
4775 }
4776 #endif
4777
4778 REGEXP *
4779 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4780 {
4781     dVAR;
4782     REGEXP *rx;
4783     struct regexp *r;
4784     register regexp_internal *ri;
4785     STRLEN plen;
4786     char* VOL exp;
4787     char* xend;
4788     regnode *scan;
4789     I32 flags;
4790     I32 minlen = 0;
4791     U32 pm_flags;
4792
4793     /* these are all flags - maybe they should be turned
4794      * into a single int with different bit masks */
4795     I32 sawlookahead = 0;
4796     I32 sawplus = 0;
4797     I32 sawopen = 0;
4798     bool used_setjump = FALSE;
4799     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4800
4801     U8 jump_ret = 0;
4802     dJMPENV;
4803     scan_data_t data;
4804     RExC_state_t RExC_state;
4805     RExC_state_t * const pRExC_state = &RExC_state;
4806 #ifdef TRIE_STUDY_OPT
4807     int restudied;
4808     RExC_state_t copyRExC_state;
4809 #endif
4810     GET_RE_DEBUG_FLAGS_DECL;
4811
4812     PERL_ARGS_ASSERT_RE_COMPILE;
4813
4814     DEBUG_r(if (!PL_colorset) reginitcolors());
4815
4816     exp = SvPV(pattern, plen);
4817
4818     if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4819         RExC_utf8 = RExC_orig_utf8 = 0;
4820     }
4821     else {
4822         RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4823     }
4824     RExC_uni_semantics = 0;
4825     RExC_contains_locale = 0;
4826
4827     /****************** LONG JUMP TARGET HERE***********************/
4828     /* Longjmp back to here if have to switch in midstream to utf8 */
4829     if (! RExC_orig_utf8) {
4830         JMPENV_PUSH(jump_ret);
4831         used_setjump = TRUE;
4832     }
4833
4834     if (jump_ret == 0) {    /* First time through */
4835         xend = exp + plen;
4836
4837         DEBUG_COMPILE_r({
4838             SV *dsv= sv_newmortal();
4839             RE_PV_QUOTED_DECL(s, RExC_utf8,
4840                 dsv, exp, plen, 60);
4841             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4842                            PL_colors[4],PL_colors[5],s);
4843         });
4844     }
4845     else {  /* longjumped back */
4846         STRLEN len = plen;
4847
4848         /* If the cause for the longjmp was other than changing to utf8, pop
4849          * our own setjmp, and longjmp to the correct handler */
4850         if (jump_ret != UTF8_LONGJMP) {
4851             JMPENV_POP;
4852             JMPENV_JUMP(jump_ret);
4853         }
4854
4855         GET_RE_DEBUG_FLAGS;
4856
4857         /* It's possible to write a regexp in ascii that represents Unicode
4858         codepoints outside of the byte range, such as via \x{100}. If we
4859         detect such a sequence we have to convert the entire pattern to utf8
4860         and then recompile, as our sizing calculation will have been based
4861         on 1 byte == 1 character, but we will need to use utf8 to encode
4862         at least some part of the pattern, and therefore must convert the whole
4863         thing.
4864         -- dmq */
4865         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4866             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4867         exp = (char*)Perl_bytes_to_utf8(aTHX_
4868                                         (U8*)SvPV_nomg(pattern, plen),
4869                                         &len);
4870         xend = exp + len;
4871         RExC_orig_utf8 = RExC_utf8 = 1;
4872         SAVEFREEPV(exp);
4873     }
4874
4875 #ifdef TRIE_STUDY_OPT
4876     restudied = 0;
4877 #endif
4878
4879     pm_flags = orig_pm_flags;
4880
4881     if (initial_charset == REGEX_LOCALE_CHARSET) {
4882         RExC_contains_locale = 1;
4883     }
4884     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4885
4886         /* Set to use unicode semantics if the pattern is in utf8 and has the
4887          * 'depends' charset specified, as it means unicode when utf8  */
4888         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4889     }
4890
4891     RExC_precomp = exp;
4892     RExC_flags = pm_flags;
4893     RExC_sawback = 0;
4894
4895     RExC_seen = 0;
4896     RExC_in_lookbehind = 0;
4897     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4898     RExC_seen_evals = 0;
4899     RExC_extralen = 0;
4900     RExC_override_recoding = 0;
4901
4902     /* First pass: determine size, legality. */
4903     RExC_parse = exp;
4904     RExC_start = exp;
4905     RExC_end = xend;
4906     RExC_naughty = 0;
4907     RExC_npar = 1;
4908     RExC_nestroot = 0;
4909     RExC_size = 0L;
4910     RExC_emit = &PL_regdummy;
4911     RExC_whilem_seen = 0;
4912     RExC_open_parens = NULL;
4913     RExC_close_parens = NULL;
4914     RExC_opend = NULL;
4915     RExC_paren_names = NULL;
4916 #ifdef DEBUGGING
4917     RExC_paren_name_list = NULL;
4918 #endif
4919     RExC_recurse = NULL;
4920     RExC_recurse_count = 0;
4921
4922 #if 0 /* REGC() is (currently) a NOP at the first pass.
4923        * Clever compilers notice this and complain. --jhi */
4924     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4925 #endif
4926     DEBUG_PARSE_r(
4927         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
4928         RExC_lastnum=0;
4929         RExC_lastparse=NULL;
4930     );
4931     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4932         RExC_precomp = NULL;
4933         return(NULL);
4934     }
4935
4936     /* Here, finished first pass.  Get rid of any added setjmp */
4937     if (used_setjump) {
4938         JMPENV_POP;
4939     }
4940
4941     DEBUG_PARSE_r({
4942         PerlIO_printf(Perl_debug_log,
4943             "Required size %"IVdf" nodes\n"
4944             "Starting second pass (creation)\n",
4945             (IV)RExC_size);
4946         RExC_lastnum=0;
4947         RExC_lastparse=NULL;
4948     });
4949
4950     /* The first pass could have found things that force Unicode semantics */
4951     if ((RExC_utf8 || RExC_uni_semantics)
4952          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4953     {
4954         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4955     }
4956
4957     /* Small enough for pointer-storage convention?
4958        If extralen==0, this means that we will not need long jumps. */
4959     if (RExC_size >= 0x10000L && RExC_extralen)
4960         RExC_size += RExC_extralen;
4961     else
4962         RExC_extralen = 0;
4963     if (RExC_whilem_seen > 15)
4964         RExC_whilem_seen = 15;
4965
4966     /* Allocate space and zero-initialize. Note, the two step process
4967        of zeroing when in debug mode, thus anything assigned has to
4968        happen after that */
4969     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4970     r = (struct regexp*)SvANY(rx);
4971     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4972          char, regexp_internal);
4973     if ( r == NULL || ri == NULL )
4974         FAIL("Regexp out of space");
4975 #ifdef DEBUGGING
4976     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4977     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4978 #else
4979     /* bulk initialize base fields with 0. */
4980     Zero(ri, sizeof(regexp_internal), char);
4981 #endif
4982
4983     /* non-zero initialization begins here */
4984     RXi_SET( r, ri );
4985     r->engine= RE_ENGINE_PTR;
4986     r->extflags = pm_flags;
4987     {
4988         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4989         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4990
4991         /* The caret is output if there are any defaults: if not all the STD
4992          * flags are set, or if no character set specifier is needed */
4993         bool has_default =
4994                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4995                     || ! has_charset);
4996         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4997         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4998                             >> RXf_PMf_STD_PMMOD_SHIFT);
4999         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5000         char *p;
5001         /* Allocate for the worst case, which is all the std flags are turned
5002          * on.  If more precision is desired, we could do a population count of
5003          * the flags set.  This could be done with a small lookup table, or by
5004          * shifting, masking and adding, or even, when available, assembly
5005          * language for a machine-language population count.
5006          * We never output a minus, as all those are defaults, so are
5007          * covered by the caret */
5008         const STRLEN wraplen = plen + has_p + has_runon
5009             + has_default       /* If needs a caret */
5010
5011                 /* If needs a character set specifier */
5012             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5013             + (sizeof(STD_PAT_MODS) - 1)
5014             + (sizeof("(?:)") - 1);
5015
5016         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5017         SvPOK_on(rx);
5018         SvFLAGS(rx) |= SvUTF8(pattern);
5019         *p++='('; *p++='?';
5020
5021         /* If a default, cover it using the caret */
5022         if (has_default) {
5023             *p++= DEFAULT_PAT_MOD;
5024         }
5025         if (has_charset) {
5026             STRLEN len;
5027             const char* const name = get_regex_charset_name(r->extflags, &len);
5028             Copy(name, p, len, char);
5029             p += len;
5030         }
5031         if (has_p)
5032             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5033         {
5034             char ch;
5035             while((ch = *fptr++)) {
5036                 if(reganch & 1)
5037                     *p++ = ch;
5038                 reganch >>= 1;
5039             }
5040         }
5041
5042         *p++ = ':';
5043         Copy(RExC_precomp, p, plen, char);
5044         assert ((RX_WRAPPED(rx) - p) < 16);
5045         r->pre_prefix = p - RX_WRAPPED(rx);
5046         p += plen;
5047         if (has_runon)
5048             *p++ = '\n';
5049         *p++ = ')';
5050         *p = 0;
5051         SvCUR_set(rx, p - SvPVX_const(rx));
5052     }
5053
5054     r->intflags = 0;
5055     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5056
5057     if (RExC_seen & REG_SEEN_RECURSE) {
5058         Newxz(RExC_open_parens, RExC_npar,regnode *);
5059         SAVEFREEPV(RExC_open_parens);
5060         Newxz(RExC_close_parens,RExC_npar,regnode *);
5061         SAVEFREEPV(RExC_close_parens);
5062     }
5063
5064     /* Useful during FAIL. */
5065 #ifdef RE_TRACK_PATTERN_OFFSETS
5066     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5067     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5068                           "%s %"UVuf" bytes for offset annotations.\n",
5069                           ri->u.offsets ? "Got" : "Couldn't get",
5070                           (UV)((2*RExC_size+1) * sizeof(U32))));
5071 #endif
5072     SetProgLen(ri,RExC_size);
5073     RExC_rx_sv = rx;
5074     RExC_rx = r;
5075     RExC_rxi = ri;
5076     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5077
5078     /* Second pass: emit code. */
5079     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
5080     RExC_parse = exp;
5081     RExC_end = xend;
5082     RExC_naughty = 0;
5083     RExC_npar = 1;
5084     RExC_emit_start = ri->program;
5085     RExC_emit = ri->program;
5086     RExC_emit_bound = ri->program + RExC_size + 1;
5087
5088     /* Store the count of eval-groups for security checks: */
5089     RExC_rx->seen_evals = RExC_seen_evals;
5090     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5091     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5092         ReREFCNT_dec(rx);
5093         return(NULL);
5094     }
5095     /* XXXX To minimize changes to RE engine we always allocate
5096        3-units-long substrs field. */
5097     Newx(r->substrs, 1, struct reg_substr_data);
5098     if (RExC_recurse_count) {
5099         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5100         SAVEFREEPV(RExC_recurse);
5101     }
5102
5103 reStudy:
5104     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5105     Zero(r->substrs, 1, struct reg_substr_data);
5106
5107 #ifdef TRIE_STUDY_OPT
5108     if (!restudied) {
5109         StructCopy(&zero_scan_data, &data, scan_data_t);
5110         copyRExC_state = RExC_state;
5111     } else {
5112         U32 seen=RExC_seen;
5113         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5114
5115         RExC_state = copyRExC_state;
5116         if (seen & REG_TOP_LEVEL_BRANCHES)
5117             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5118         else
5119             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5120         if (data.last_found) {
5121             SvREFCNT_dec(data.longest_fixed);
5122             SvREFCNT_dec(data.longest_float);
5123             SvREFCNT_dec(data.last_found);
5124         }
5125         StructCopy(&zero_scan_data, &data, scan_data_t);
5126     }
5127 #else
5128     StructCopy(&zero_scan_data, &data, scan_data_t);
5129 #endif
5130
5131     /* Dig out information for optimizations. */
5132     r->extflags = RExC_flags; /* was pm_op */
5133     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5134
5135     if (UTF)
5136         SvUTF8_on(rx);  /* Unicode in it? */
5137     ri->regstclass = NULL;
5138     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5139         r->intflags |= PREGf_NAUGHTY;
5140     scan = ri->program + 1;             /* First BRANCH. */
5141
5142     /* testing for BRANCH here tells us whether there is "must appear"
5143        data in the pattern. If there is then we can use it for optimisations */
5144     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5145         I32 fake;
5146         STRLEN longest_float_length, longest_fixed_length;
5147         struct regnode_charclass_class ch_class; /* pointed to by data */
5148         int stclass_flag;
5149         I32 last_close = 0; /* pointed to by data */
5150         regnode *first= scan;
5151         regnode *first_next= regnext(first);
5152         /*
5153          * Skip introductions and multiplicators >= 1
5154          * so that we can extract the 'meat' of the pattern that must
5155          * match in the large if() sequence following.
5156          * NOTE that EXACT is NOT covered here, as it is normally
5157          * picked up by the optimiser separately.
5158          *
5159          * This is unfortunate as the optimiser isnt handling lookahead
5160          * properly currently.
5161          *
5162          */
5163         while ((OP(first) == OPEN && (sawopen = 1)) ||
5164                /* An OR of *one* alternative - should not happen now. */
5165             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5166             /* for now we can't handle lookbehind IFMATCH*/
5167             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5168             (OP(first) == PLUS) ||
5169             (OP(first) == MINMOD) ||
5170                /* An {n,m} with n>0 */
5171             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5172             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5173         {
5174                 /*
5175                  * the only op that could be a regnode is PLUS, all the rest
5176                  * will be regnode_1 or regnode_2.
5177                  *
5178                  */
5179                 if (OP(first) == PLUS)
5180                     sawplus = 1;
5181                 else
5182                     first += regarglen[OP(first)];
5183
5184                 first = NEXTOPER(first);
5185                 first_next= regnext(first);
5186         }
5187
5188         /* Starting-point info. */
5189       again:
5190         DEBUG_PEEP("first:",first,0);
5191         /* Ignore EXACT as we deal with it later. */
5192         if (PL_regkind[OP(first)] == EXACT) {
5193             if (OP(first) == EXACT)
5194                 NOOP;   /* Empty, get anchored substr later. */
5195             else
5196                 ri->regstclass = first;
5197         }
5198 #ifdef TRIE_STCLASS
5199         else if (PL_regkind[OP(first)] == TRIE &&
5200                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5201         {
5202             regnode *trie_op;
5203             /* this can happen only on restudy */
5204             if ( OP(first) == TRIE ) {
5205                 struct regnode_1 *trieop = (struct regnode_1 *)
5206                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
5207                 StructCopy(first,trieop,struct regnode_1);
5208                 trie_op=(regnode *)trieop;
5209             } else {
5210                 struct regnode_charclass *trieop = (struct regnode_charclass *)
5211                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5212                 StructCopy(first,trieop,struct regnode_charclass);
5213                 trie_op=(regnode *)trieop;
5214             }
5215             OP(trie_op)+=2;
5216             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5217             ri->regstclass = trie_op;
5218         }
5219 #endif
5220         else if (REGNODE_SIMPLE(OP(first)))
5221             ri->regstclass = first;
5222         else if (PL_regkind[OP(first)] == BOUND ||
5223                  PL_regkind[OP(first)] == NBOUND)
5224             ri->regstclass = first;
5225         else if (PL_regkind[OP(first)] == BOL) {
5226             r->extflags |= (OP(first) == MBOL
5227                            ? RXf_ANCH_MBOL
5228                            : (OP(first) == SBOL
5229                               ? RXf_ANCH_SBOL
5230                               : RXf_ANCH_BOL));
5231             first = NEXTOPER(first);
5232             goto again;
5233         }
5234         else if (OP(first) == GPOS) {
5235             r->extflags |= RXf_ANCH_GPOS;
5236             first = NEXTOPER(first);
5237             goto again;
5238         }
5239         else if ((!sawopen || !RExC_sawback) &&
5240             (OP(first) == STAR &&
5241             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5242             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5243         {
5244             /* turn .* into ^.* with an implied $*=1 */
5245             const int type =
5246                 (OP(NEXTOPER(first)) == REG_ANY)
5247                     ? RXf_ANCH_MBOL
5248                     : RXf_ANCH_SBOL;
5249             r->extflags |= type;
5250             r->intflags |= PREGf_IMPLICIT;
5251             first = NEXTOPER(first);
5252             goto again;
5253         }
5254         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5255             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5256             /* x+ must match at the 1st pos of run of x's */
5257             r->intflags |= PREGf_SKIP;
5258
5259         /* Scan is after the zeroth branch, first is atomic matcher. */
5260 #ifdef TRIE_STUDY_OPT
5261         DEBUG_PARSE_r(
5262             if (!restudied)
5263                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5264                               (IV)(first - scan + 1))
5265         );
5266 #else
5267         DEBUG_PARSE_r(
5268             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5269                 (IV)(first - scan + 1))
5270         );
5271 #endif
5272
5273
5274         /*
5275         * If there's something expensive in the r.e., find the
5276         * longest literal string that must appear and make it the
5277         * regmust.  Resolve ties in favor of later strings, since
5278         * the regstart check works with the beginning of the r.e.
5279         * and avoiding duplication strengthens checking.  Not a
5280         * strong reason, but sufficient in the absence of others.
5281         * [Now we resolve ties in favor of the earlier string if
5282         * it happens that c_offset_min has been invalidated, since the
5283         * earlier string may buy us something the later one won't.]
5284         */
5285
5286         data.longest_fixed = newSVpvs("");
5287         data.longest_float = newSVpvs("");
5288         data.last_found = newSVpvs("");
5289         data.longest = &(data.longest_fixed);
5290         first = scan;
5291         if (!ri->regstclass) {
5292             cl_init(pRExC_state, &ch_class);
5293             data.start_class = &ch_class;
5294             stclass_flag = SCF_DO_STCLASS_AND;
5295         } else                          /* XXXX Check for BOUND? */
5296             stclass_flag = 0;
5297         data.last_closep = &last_close;
5298
5299         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5300             &data, -1, NULL, NULL,
5301             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5302
5303
5304         CHECK_RESTUDY_GOTO;
5305
5306
5307         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5308              && data.last_start_min == 0 && data.last_end > 0
5309              && !RExC_seen_zerolen
5310              && !(RExC_seen & REG_SEEN_VERBARG)
5311              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5312             r->extflags |= RXf_CHECK_ALL;
5313         scan_commit(pRExC_state, &data,&minlen,0);
5314         SvREFCNT_dec(data.last_found);
5315
5316         /* Note that code very similar to this but for anchored string
5317            follows immediately below, changes may need to be made to both.
5318            Be careful.
5319          */
5320         longest_float_length = CHR_SVLEN(data.longest_float);
5321         if (longest_float_length
5322             || (data.flags & SF_FL_BEFORE_EOL
5323                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5324                     || (RExC_flags & RXf_PMf_MULTILINE))))
5325         {
5326             I32 t,ml;
5327
5328             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5329             if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5330                 || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5331                     && data.offset_fixed == data.offset_float_min
5332                     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5333                     goto remove_float;          /* As in (a)+. */
5334
5335             /* copy the information about the longest float from the reg_scan_data
5336                over to the program. */
5337             if (SvUTF8(data.longest_float)) {
5338                 r->float_utf8 = data.longest_float;
5339                 r->float_substr = NULL;
5340             } else {
5341                 r->float_substr = data.longest_float;
5342                 r->float_utf8 = NULL;
5343             }
5344             /* float_end_shift is how many chars that must be matched that
5345                follow this item. We calculate it ahead of time as once the
5346                lookbehind offset is added in we lose the ability to correctly
5347                calculate it.*/
5348             ml = data.minlen_float ? *(data.minlen_float)
5349                                    : (I32)longest_float_length;
5350             r->float_end_shift = ml - data.offset_float_min
5351                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5352                 + data.lookbehind_float;
5353             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5354             r->float_max_offset = data.offset_float_max;
5355             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5356                 r->float_max_offset -= data.lookbehind_float;
5357
5358             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5359                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5360                            || (RExC_flags & RXf_PMf_MULTILINE)));
5361             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5362         }
5363         else {
5364           remove_float:
5365             r->float_substr = r->float_utf8 = NULL;
5366             SvREFCNT_dec(data.longest_float);
5367             longest_float_length = 0;
5368         }
5369
5370         /* Note that code very similar to this but for floating string
5371            is immediately above, changes may need to be made to both.
5372            Be careful.
5373          */
5374         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5375
5376         /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5377         if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5378             && (longest_fixed_length
5379                 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5380                     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5381                         || (RExC_flags & RXf_PMf_MULTILINE)))) )
5382         {
5383             I32 t,ml;
5384
5385             /* copy the information about the longest fixed
5386                from the reg_scan_data over to the program. */
5387             if (SvUTF8(data.longest_fixed)) {
5388                 r->anchored_utf8 = data.longest_fixed;
5389                 r->anchored_substr = NULL;
5390             } else {
5391                 r->anchored_substr = data.longest_fixed;
5392                 r->anchored_utf8 = NULL;
5393             }
5394             /* fixed_end_shift is how many chars that must be matched that
5395                follow this item. We calculate it ahead of time as once the
5396                lookbehind offset is added in we lose the ability to correctly
5397                calculate it.*/
5398             ml = data.minlen_fixed ? *(data.minlen_fixed)
5399                                    : (I32)longest_fixed_length;
5400             r->anchored_end_shift = ml - data.offset_fixed
5401                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5402                 + data.lookbehind_fixed;
5403             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5404
5405             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5406                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5407                      || (RExC_flags & RXf_PMf_MULTILINE)));
5408             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5409         }
5410         else {
5411             r->anchored_substr = r->anchored_utf8 = NULL;
5412             SvREFCNT_dec(data.longest_fixed);
5413             longest_fixed_length = 0;
5414         }
5415         if (ri->regstclass
5416             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5417             ri->regstclass = NULL;
5418
5419         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5420             && stclass_flag
5421             && !(data.start_class->flags & ANYOF_EOS)
5422             && !cl_is_anything(data.start_class))
5423         {
5424             const U32 n = add_data(pRExC_state, 1, "f");
5425             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5426
5427             Newx(RExC_rxi->data->data[n], 1,
5428                 struct regnode_charclass_class);
5429             StructCopy(data.start_class,
5430                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5431                        struct regnode_charclass_class);
5432             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5433             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5434             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5435                       regprop(r, sv, (regnode*)data.start_class);
5436                       PerlIO_printf(Perl_debug_log,
5437                                     "synthetic stclass \"%s\".\n",
5438                                     SvPVX_const(sv));});
5439         }
5440
5441         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5442         if (longest_fixed_length > longest_float_length) {
5443             r->check_end_shift = r->anchored_end_shift;
5444             r->check_substr = r->anchored_substr;
5445             r->check_utf8 = r->anchored_utf8;
5446             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5447             if (r->extflags & RXf_ANCH_SINGLE)
5448                 r->extflags |= RXf_NOSCAN;
5449         }
5450         else {
5451             r->check_end_shift = r->float_end_shift;
5452             r->check_substr = r->float_substr;
5453             r->check_utf8 = r->float_utf8;
5454             r->check_offset_min = r->float_min_offset;
5455             r->check_offset_max = r->float_max_offset;
5456         }
5457         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5458            This should be changed ASAP!  */
5459         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5460             r->extflags |= RXf_USE_INTUIT;
5461             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5462                 r->extflags |= RXf_INTUIT_TAIL;
5463         }
5464         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5465         if ( (STRLEN)minlen < longest_float_length )
5466             minlen= longest_float_length;
5467         if ( (STRLEN)minlen < longest_fixed_length )
5468             minlen= longest_fixed_length;
5469         */
5470     }
5471     else {
5472         /* Several toplevels. Best we can is to set minlen. */
5473         I32 fake;
5474         struct regnode_charclass_class ch_class;
5475         I32 last_close = 0;
5476
5477         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5478
5479         scan = ri->program + 1;
5480         cl_init(pRExC_state, &ch_class);
5481         data.start_class = &ch_class;
5482         data.last_closep = &last_close;
5483
5484
5485         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5486             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5487
5488         CHECK_RESTUDY_GOTO;
5489
5490         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5491                 = r->float_substr = r->float_utf8 = NULL;
5492
5493         if (!(data.start_class->flags & ANYOF_EOS)
5494             && !cl_is_anything(data.start_class))
5495         {
5496             const U32 n = add_data(pRExC_state, 1, "f");
5497             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5498
5499             Newx(RExC_rxi->data->data[n], 1,
5500                 struct regnode_charclass_class);
5501             StructCopy(data.start_class,
5502                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5503                        struct regnode_charclass_class);
5504             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5505             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5506             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5507                       regprop(r, sv, (regnode*)data.start_class);
5508                       PerlIO_printf(Perl_debug_log,
5509                                     "synthetic stclass \"%s\".\n",
5510                                     SvPVX_const(sv));});
5511         }
5512     }
5513
5514     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5515        the "real" pattern. */
5516     DEBUG_OPTIMISE_r({
5517         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5518                       (IV)minlen, (IV)r->minlen);
5519     });
5520     r->minlenret = minlen;
5521     if (r->minlen < minlen)
5522         r->minlen = minlen;
5523
5524     if (RExC_seen & REG_SEEN_GPOS)
5525         r->extflags |= RXf_GPOS_SEEN;
5526     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5527         r->extflags |= RXf_LOOKBEHIND_SEEN;
5528     if (RExC_seen & REG_SEEN_EVAL)
5529         r->extflags |= RXf_EVAL_SEEN;
5530     if (RExC_seen & REG_SEEN_CANY)
5531         r->extflags |= RXf_CANY_SEEN;
5532     if (RExC_seen & REG_SEEN_VERBARG)
5533         r->intflags |= PREGf_VERBARG_SEEN;
5534     if (RExC_seen & REG_SEEN_CUTGROUP)
5535         r->intflags |= PREGf_CUTGROUP_SEEN;
5536     if (RExC_paren_names)
5537         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5538     else
5539         RXp_PAREN_NAMES(r) = NULL;
5540
5541 #ifdef STUPID_PATTERN_CHECKS
5542     if (RX_PRELEN(rx) == 0)
5543         r->extflags |= RXf_NULL;
5544     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5545         /* XXX: this should happen BEFORE we compile */
5546         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5547     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5548         r->extflags |= RXf_WHITE;
5549     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5550         r->extflags |= RXf_START_ONLY;
5551 #else
5552     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5553             /* XXX: this should happen BEFORE we compile */
5554             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5555     else {
5556         regnode *first = ri->program + 1;
5557         U8 fop = OP(first);
5558
5559         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5560             r->extflags |= RXf_NULL;
5561         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5562             r->extflags |= RXf_START_ONLY;
5563         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5564                              && OP(regnext(first)) == END)
5565             r->extflags |= RXf_WHITE;
5566     }
5567 #endif
5568 #ifdef DEBUGGING
5569     if (RExC_paren_names) {
5570         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5571         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5572     } else
5573 #endif
5574         ri->name_list_idx = 0;
5575
5576     if (RExC_recurse_count) {
5577         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5578             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5579             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5580         }
5581     }
5582     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5583     /* assume we don't need to swap parens around before we match */
5584
5585     DEBUG_DUMP_r({
5586         PerlIO_printf(Perl_debug_log,"Final program:\n");
5587         regdump(r);
5588     });
5589 #ifdef RE_TRACK_PATTERN_OFFSETS
5590     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5591         const U32 len = ri->u.offsets[0];
5592         U32 i;
5593         GET_RE_DEBUG_FLAGS_DECL;
5594         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5595         for (i = 1; i <= len; i++) {
5596             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5597                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5598                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5599             }
5600         PerlIO_printf(Perl_debug_log, "\n");
5601     });
5602 #endif
5603     return rx;
5604 }
5605
5606 #undef RE_ENGINE_PTR
5607
5608
5609 SV*
5610 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5611                     const U32 flags)
5612 {
5613     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5614
5615     PERL_UNUSED_ARG(value);
5616
5617     if (flags & RXapif_FETCH) {
5618         return reg_named_buff_fetch(rx, key, flags);
5619     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5620         Perl_croak_no_modify(aTHX);
5621         return NULL;
5622     } else if (flags & RXapif_EXISTS) {
5623         return reg_named_buff_exists(rx, key, flags)
5624             ? &PL_sv_yes
5625             : &PL_sv_no;
5626     } else if (flags & RXapif_REGNAMES) {
5627         return reg_named_buff_all(rx, flags);
5628     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5629         return reg_named_buff_scalar(rx, flags);
5630     } else {
5631         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5632         return NULL;
5633     }
5634 }
5635
5636 SV*
5637 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5638                          const U32 flags)
5639 {
5640     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5641     PERL_UNUSED_ARG(lastkey);
5642
5643     if (flags & RXapif_FIRSTKEY)
5644         return reg_named_buff_firstkey(rx, flags);
5645     else if (flags & RXapif_NEXTKEY)
5646         return reg_named_buff_nextkey(rx, flags);
5647     else {
5648         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5649         return NULL;
5650     }
5651 }
5652
5653 SV*
5654 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5655                           const U32 flags)
5656 {
5657     AV *retarray = NULL;
5658     SV *ret;
5659     struct regexp *const rx = (struct regexp *)SvANY(r);
5660
5661     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5662
5663     if (flags & RXapif_ALL)
5664         retarray=newAV();
5665
5666     if (rx && RXp_PAREN_NAMES(rx)) {
5667         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5668         if (he_str) {
5669             IV i;
5670             SV* sv_dat=HeVAL(he_str);
5671             I32 *nums=(I32*)SvPVX(sv_dat);
5672             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5673                 if ((I32)(rx->nparens) >= nums[i]
5674                     && rx->offs[nums[i]].start != -1
5675                     && rx->offs[nums[i]].end != -1)
5676                 {
5677                     ret = newSVpvs("");
5678                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5679                     if (!retarray)
5680                         return ret;
5681                 } else {
5682                     if (retarray)
5683                         ret = newSVsv(&PL_sv_undef);
5684                 }
5685                 if (retarray)
5686                     av_push(retarray, ret);
5687             }
5688             if (retarray)
5689                 return newRV_noinc(MUTABLE_SV(retarray));
5690         }
5691     }
5692     return NULL;
5693 }
5694
5695 bool
5696 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5697                            const U32 flags)
5698 {
5699     struct regexp *const rx = (struct regexp *)SvANY(r);
5700
5701     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5702
5703     if (rx && RXp_PAREN_NAMES(rx)) {
5704         if (flags & RXapif_ALL) {
5705             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5706         } else {
5707             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5708             if (sv) {
5709                 SvREFCNT_dec(sv);
5710                 return TRUE;
5711             } else {
5712                 return FALSE;
5713             }
5714         }
5715     } else {
5716         return FALSE;
5717     }
5718 }
5719
5720 SV*
5721 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5722 {
5723     struct regexp *const rx = (struct regexp *)SvANY(r);
5724
5725     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5726
5727     if ( rx && RXp_PAREN_NAMES(rx) ) {
5728         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5729
5730         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5731     } else {
5732         return FALSE;
5733     }
5734 }
5735
5736 SV*
5737 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5738 {
5739     struct regexp *const rx = (struct regexp *)SvANY(r);
5740     GET_RE_DEBUG_FLAGS_DECL;
5741
5742     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5743
5744     if (rx && RXp_PAREN_NAMES(rx)) {
5745         HV *hv = RXp_PAREN_NAMES(rx);
5746         HE *temphe;
5747         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5748             IV i;
5749             IV parno = 0;
5750             SV* sv_dat = HeVAL(temphe);
5751             I32 *nums = (I32*)SvPVX(sv_dat);
5752             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5753                 if ((I32)(rx->lastparen) >= nums[i] &&
5754                     rx->offs[nums[i]].start != -1 &&
5755                     rx->offs[nums[i]].end != -1)
5756                 {
5757                     parno = nums[i];
5758                     break;
5759                 }
5760             }
5761             if (parno || flags & RXapif_ALL) {
5762                 return newSVhek(HeKEY_hek(temphe));
5763             }
5764         }
5765     }
5766     return NULL;
5767 }
5768
5769 SV*
5770 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5771 {
5772     SV *ret;
5773     AV *av;
5774     I32 length;
5775     struct regexp *const rx = (struct regexp *)SvANY(r);
5776
5777     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5778
5779     if (rx && RXp_PAREN_NAMES(rx)) {
5780         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5781             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5782         } else if (flags & RXapif_ONE) {
5783             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5784             av = MUTABLE_AV(SvRV(ret));
5785             length = av_len(av);
5786             SvREFCNT_dec(ret);
5787             return newSViv(length + 1);
5788         } else {
5789             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5790             return NULL;
5791         }
5792     }
5793     return &PL_sv_undef;
5794 }
5795
5796 SV*
5797 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5798 {
5799     struct regexp *const rx = (struct regexp *)SvANY(r);
5800     AV *av = newAV();
5801
5802     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5803
5804     if (rx && RXp_PAREN_NAMES(rx)) {
5805         HV *hv= RXp_PAREN_NAMES(rx);
5806         HE *temphe;
5807         (void)hv_iterinit(hv);
5808         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5809             IV i;
5810             IV parno = 0;
5811             SV* sv_dat = HeVAL(temphe);
5812             I32 *nums = (I32*)SvPVX(sv_dat);
5813             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5814                 if ((I32)(rx->lastparen) >= nums[i] &&
5815                     rx->offs[nums[i]].start != -1 &&
5816                     rx->offs[nums[i]].end != -1)
5817                 {
5818                     parno = nums[i];
5819                     break;
5820                 }
5821             }
5822             if (parno || flags & RXapif_ALL) {
5823                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5824             }
5825         }
5826     }
5827
5828     return newRV_noinc(MUTABLE_SV(av));
5829 }
5830
5831 void
5832 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5833                              SV * const sv)
5834 {
5835     struct regexp *const rx = (struct regexp *)SvANY(r);
5836     char *s = NULL;
5837     I32 i = 0;
5838     I32 s1, t1;
5839
5840     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5841
5842     if (!rx->subbeg) {
5843         sv_setsv(sv,&PL_sv_undef);
5844         return;
5845     }
5846     else
5847     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5848         /* $` */
5849         i = rx->offs[0].start;
5850         s = rx->subbeg;
5851     }
5852     else
5853     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5854         /* $' */
5855         s = rx->subbeg + rx->offs[0].end;
5856         i = rx->sublen - rx->offs[0].end;
5857     }
5858     else
5859     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5860         (s1 = rx->offs[paren].start) != -1 &&
5861         (t1 = rx->offs[paren].end) != -1)
5862     {
5863         /* $& $1 ... */
5864         i = t1 - s1;
5865         s = rx->subbeg + s1;
5866     } else {
5867         sv_setsv(sv,&PL_sv_undef);
5868         return;
5869     }
5870     assert(rx->sublen >= (s - rx->subbeg) + i );
5871     if (i >= 0) {
5872         const int oldtainted = PL_tainted;
5873         TAINT_NOT;
5874         sv_setpvn(sv, s, i);
5875         PL_tainted = oldtainted;
5876         if ( (rx->extflags & RXf_CANY_SEEN)
5877             ? (RXp_MATCH_UTF8(rx)
5878                         && (!i || is_utf8_string((U8*)s, i)))
5879             : (RXp_MATCH_UTF8(rx)) )
5880         {
5881             SvUTF8_on(sv);
5882         }
5883         else
5884             SvUTF8_off(sv);
5885         if (PL_tainting) {
5886             if (RXp_MATCH_TAINTED(rx)) {
5887                 if (SvTYPE(sv) >= SVt_PVMG) {
5888                     MAGIC* const mg = SvMAGIC(sv);
5889                     MAGIC* mgt;
5890                     PL_tainted = 1;
5891                     SvMAGIC_set(sv, mg->mg_moremagic);
5892                     SvTAINT(sv);
5893                     if ((mgt = SvMAGIC(sv))) {
5894                         mg->mg_moremagic = mgt;
5895                         SvMAGIC_set(sv, mg);
5896                     }
5897                 } else {
5898                     PL_tainted = 1;
5899                     SvTAINT(sv);
5900                 }
5901             } else
5902                 SvTAINTED_off(sv);
5903         }
5904     } else {
5905         sv_setsv(sv,&PL_sv_undef);
5906         return;
5907     }
5908 }
5909
5910 void
5911 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5912                                                          SV const * const value)
5913 {
5914     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5915
5916     PERL_UNUSED_ARG(rx);
5917     PERL_UNUSED_ARG(paren);
5918     PERL_UNUSED_ARG(value);
5919
5920     if (!PL_localizing)
5921         Perl_croak_no_modify(aTHX);
5922 }
5923
5924 I32
5925 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5926                               const I32 paren)
5927 {
5928     struct regexp *const rx = (struct regexp *)SvANY(r);
5929     I32 i;
5930     I32 s1, t1;
5931
5932     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5933
5934     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5935         switch (paren) {
5936       /* $` / ${^PREMATCH} */
5937       case RX_BUFF_IDX_PREMATCH:
5938         if (rx->offs[0].start != -1) {
5939                         i = rx->offs[0].start;
5940                         if (i > 0) {
5941                                 s1 = 0;
5942                                 t1 = i;
5943                                 goto getlen;
5944                         }
5945             }
5946         return 0;
5947       /* $' / ${^POSTMATCH} */
5948       case RX_BUFF_IDX_POSTMATCH:
5949             if (rx->offs[0].end != -1) {
5950                         i = rx->sublen - rx->offs[0].end;
5951                         if (i > 0) {
5952                                 s1 = rx->offs[0].end;
5953                                 t1 = rx->sublen;
5954                                 goto getlen;
5955                         }
5956             }
5957         return 0;
5958       /* $& / ${^MATCH}, $1, $2, ... */
5959       default:
5960             if (paren <= (I32)rx->nparens &&
5961             (s1 = rx->offs[paren].start) != -1 &&
5962             (t1 = rx->offs[paren].end) != -1)
5963             {
5964             i = t1 - s1;
5965             goto getlen;
5966         } else {
5967             if (ckWARN(WARN_UNINITIALIZED))
5968                 report_uninit((const SV *)sv);
5969             return 0;
5970         }
5971     }
5972   getlen:
5973     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5974         const char * const s = rx->subbeg + s1;
5975         const U8 *ep;
5976         STRLEN el;
5977
5978         i = t1 - s1;
5979         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5980                         i = el;
5981     }
5982     return i;
5983 }
5984
5985 SV*
5986 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5987 {
5988     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5989         PERL_UNUSED_ARG(rx);
5990         if (0)
5991             return NULL;
5992         else
5993             return newSVpvs("Regexp");
5994 }
5995
5996 /* Scans the name of a named buffer from the pattern.
5997  * If flags is REG_RSN_RETURN_NULL returns null.
5998  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5999  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6000  * to the parsed name as looked up in the RExC_paren_names hash.
6001  * If there is an error throws a vFAIL().. type exception.
6002  */
6003
6004 #define REG_RSN_RETURN_NULL    0
6005 #define REG_RSN_RETURN_NAME    1
6006 #define REG_RSN_RETURN_DATA    2
6007
6008 STATIC SV*
6009 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6010 {
6011     char *name_start = RExC_parse;
6012
6013     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6014
6015     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6016          /* skip IDFIRST by using do...while */
6017         if (UTF)
6018             do {
6019                 RExC_parse += UTF8SKIP(RExC_parse);
6020             } while (isALNUM_utf8((U8*)RExC_parse));
6021         else
6022             do {
6023                 RExC_parse++;
6024             } while (isALNUM(*RExC_parse));
6025     }
6026
6027     if ( flags ) {
6028         SV* sv_name
6029             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6030                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6031         if ( flags == REG_RSN_RETURN_NAME)
6032             return sv_name;
6033         else if (flags==REG_RSN_RETURN_DATA) {
6034             HE *he_str = NULL;
6035             SV *sv_dat = NULL;
6036             if ( ! sv_name )      /* should not happen*/
6037                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6038             if (RExC_paren_names)
6039                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6040             if ( he_str )
6041                 sv_dat = HeVAL(he_str);
6042             if ( ! sv_dat )
6043                 vFAIL("Reference to nonexistent named group");
6044             return sv_dat;
6045         }
6046         else {
6047             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6048                        (unsigned long) flags);
6049         }
6050         /* NOT REACHED */
6051     }
6052     return NULL;
6053 }
6054
6055 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6056     int rem=(int)(RExC_end - RExC_parse);                       \
6057     int cut;                                                    \
6058     int num;                                                    \
6059     int iscut=0;                                                \
6060     if (rem>10) {                                               \
6061         rem=10;                                                 \
6062         iscut=1;                                                \
6063     }                                                           \
6064     cut=10-rem;                                                 \
6065     if (RExC_lastparse!=RExC_parse)                             \
6066         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6067             rem, RExC_parse,                                    \
6068             cut + 4,                                            \
6069             iscut ? "..." : "<"                                 \
6070         );                                                      \
6071     else                                                        \
6072         PerlIO_printf(Perl_debug_log,"%16s","");                \
6073                                                                 \
6074     if (SIZE_ONLY)                                              \
6075        num = RExC_size + 1;                                     \
6076     else                                                        \
6077        num=REG_NODE_NUM(RExC_emit);                             \
6078     if (RExC_lastnum!=num)                                      \
6079        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6080     else                                                        \
6081        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6082     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6083         (int)((depth*2)), "",                                   \
6084         (funcname)                                              \
6085     );                                                          \
6086     RExC_lastnum=num;                                           \
6087     RExC_lastparse=RExC_parse;                                  \
6088 })
6089
6090
6091
6092 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6093     DEBUG_PARSE_MSG((funcname));                            \
6094     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6095 })
6096 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6097     DEBUG_PARSE_MSG((funcname));                            \
6098     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6099 })
6100
6101 /* This section of code defines the inversion list object and its methods.  The
6102  * interfaces are highly subject to change, so as much as possible is static to
6103  * this file.  An inversion list is here implemented as a malloc'd C UV array
6104  * with some added info that is placed as UVs at the beginning in a header
6105  * portion.  An inversion list for Unicode is an array of code points, sorted
6106  * by ordinal number.  The zeroth element is the first code point in the list.
6107  * The 1th element is the first element beyond that not in the list.  In other
6108  * words, the first range is
6109  *  invlist[0]..(invlist[1]-1)
6110  * The other ranges follow.  Thus every element whose index is divisible by two
6111  * marks the beginning of a range that is in the list, and every element not
6112  * divisible by two marks the beginning of a range not in the list.  A single
6113  * element inversion list that contains the single code point N generally
6114  * consists of two elements
6115  *  invlist[0] == N
6116  *  invlist[1] == N+1
6117  * (The exception is when N is the highest representable value on the
6118  * machine, in which case the list containing just it would be a single
6119  * element, itself.  By extension, if the last range in the list extends to
6120  * infinity, then the first element of that range will be in the inversion list
6121  * at a position that is divisible by two, and is the final element in the
6122  * list.)
6123  * Taking the complement (inverting) an inversion list is quite simple, if the
6124  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6125  * This implementation reserves an element at the beginning of each inversion list
6126  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6127  * beginning of the list is either that element if 0, or the next one if 1.
6128  *
6129  * More about inversion lists can be found in "Unicode Demystified"
6130  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6131  * More will be coming when functionality is added later.
6132  *
6133  * The inversion list data structure is currently implemented as an SV pointing
6134  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6135  * array of UV whose memory management is automatically handled by the existing
6136  * facilities for SV's.
6137  *
6138  * Some of the methods should always be private to the implementation, and some
6139  * should eventually be made public */
6140
6141 #define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
6142 #define INVLIST_ITER_OFFSET 1   /* Current iteration position */
6143
6144 #define INVLIST_ZERO_OFFSET 2   /* 0 or 1; must be last element in header */
6145 /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
6146  * contains the code point U+00000, and begins here.  If 1, the inversion list
6147  * doesn't contain U+0000, and it begins at the next UV in the array.
6148  * Inverting an inversion list consists of adding or removing the 0 at the
6149  * beginning of it.  By reserving a space for that 0, inversion can be made
6150  * very fast */
6151
6152 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6153
6154 /* Internally things are UVs */
6155 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6156 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6157
6158 #define INVLIST_INITIAL_LEN 10
6159
6160 PERL_STATIC_INLINE UV*
6161 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6162 {
6163     /* Returns a pointer to the first element in the inversion list's array.
6164      * This is called upon initialization of an inversion list.  Where the
6165      * array begins depends on whether the list has the code point U+0000
6166      * in it or not.  The other parameter tells it whether the code that
6167      * follows this call is about to put a 0 in the inversion list or not.
6168      * The first element is either the element with 0, if 0, or the next one,
6169      * if 1 */
6170
6171     UV* zero = get_invlist_zero_addr(invlist);
6172
6173     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6174
6175     /* Must be empty */
6176     assert(! *get_invlist_len_addr(invlist));
6177
6178     /* 1^1 = 0; 1^0 = 1 */
6179     *zero = 1 ^ will_have_0;
6180     return zero + *zero;
6181 }
6182
6183 PERL_STATIC_INLINE UV*
6184 S_invlist_array(pTHX_ SV* const invlist)
6185 {
6186     /* Returns the pointer to the inversion list's array.  Every time the
6187      * length changes, this needs to be called in case malloc or realloc moved
6188      * it */
6189
6190     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6191
6192     /* Must not be empty.  If these fail, you probably didn't check for <len>
6193      * being non-zero before trying to get the array */
6194     assert(*get_invlist_len_addr(invlist));
6195     assert(*get_invlist_zero_addr(invlist) == 0
6196            || *get_invlist_zero_addr(invlist) == 1);
6197
6198     /* The array begins either at the element reserved for zero if the
6199      * list contains 0 (that element will be set to 0), or otherwise the next
6200      * element (in which case the reserved element will be set to 1). */
6201     return (UV *) (get_invlist_zero_addr(invlist)
6202                    + *get_invlist_zero_addr(invlist));
6203 }
6204
6205 PERL_STATIC_INLINE UV*
6206 S_get_invlist_len_addr(pTHX_ SV* invlist)
6207 {
6208     /* Return the address of the UV that contains the current number
6209      * of used elements in the inversion list */
6210
6211     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6212
6213     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6214 }
6215
6216 PERL_STATIC_INLINE UV
6217 S_invlist_len(pTHX_ SV* const invlist)
6218 {
6219     /* Returns the current number of elements stored in the inversion list's
6220      * array */
6221
6222     PERL_ARGS_ASSERT_INVLIST_LEN;
6223
6224     return *get_invlist_len_addr(invlist);
6225 }
6226
6227 PERL_STATIC_INLINE void
6228 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6229 {
6230     /* Sets the current number of elements stored in the inversion list */
6231
6232     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6233
6234     *get_invlist_len_addr(invlist) = len;
6235
6236     assert(len <= SvLEN(invlist));
6237
6238     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6239     /* If the list contains U+0000, that element is part of the header,
6240      * and should not be counted as part of the array.  It will contain
6241      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
6242      * subtract:
6243      *  SvCUR_set(invlist,
6244      *            TO_INTERNAL_SIZE(len
6245      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
6246      * But, this is only valid if len is not 0.  The consequences of not doing
6247      * this is that the memory allocation code may think that 1 more UV is
6248      * being used than actually is, and so might do an unnecessary grow.  That
6249      * seems worth not bothering to make this the precise amount.
6250      *
6251      * Note that when inverting, SvCUR shouldn't change */
6252 }
6253
6254 PERL_STATIC_INLINE UV
6255 S_invlist_max(pTHX_ SV* const invlist)
6256 {
6257     /* Returns the maximum number of elements storable in the inversion list's
6258      * array, without having to realloc() */
6259
6260     PERL_ARGS_ASSERT_INVLIST_MAX;
6261
6262     return FROM_INTERNAL_SIZE(SvLEN(invlist));
6263 }
6264
6265 PERL_STATIC_INLINE UV*
6266 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6267 {
6268     /* Return the address of the UV that is reserved to hold 0 if the inversion
6269      * list contains 0.  This has to be the last element of the heading, as the
6270      * list proper starts with either it if 0, or the next element if not.
6271      * (But we force it to contain either 0 or 1) */
6272
6273     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6274
6275     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6276 }
6277
6278 #ifndef PERL_IN_XSUB_RE
6279 SV*
6280 Perl__new_invlist(pTHX_ IV initial_size)
6281 {
6282
6283     /* Return a pointer to a newly constructed inversion list, with enough
6284      * space to store 'initial_size' elements.  If that number is negative, a
6285      * system default is used instead */
6286
6287     SV* new_list;
6288
6289     if (initial_size < 0) {
6290         initial_size = INVLIST_INITIAL_LEN;
6291     }
6292
6293     /* Allocate the initial space */
6294     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6295     invlist_set_len(new_list, 0);
6296
6297     /* Force iterinit() to be used to get iteration to work */
6298     *get_invlist_iter_addr(new_list) = UV_MAX;
6299
6300     /* This should force a segfault if a method doesn't initialize this
6301      * properly */
6302     *get_invlist_zero_addr(new_list) = UV_MAX;
6303
6304     return new_list;
6305 }
6306 #endif
6307
6308 STATIC void
6309 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6310 {
6311     /* Grow the maximum size of an inversion list */
6312
6313     PERL_ARGS_ASSERT_INVLIST_EXTEND;
6314
6315     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6316 }
6317
6318 PERL_STATIC_INLINE void
6319 S_invlist_trim(pTHX_ SV* const invlist)
6320 {
6321     PERL_ARGS_ASSERT_INVLIST_TRIM;
6322
6323     /* Change the length of the inversion list to how many entries it currently
6324      * has */
6325
6326     SvPV_shrink_to_cur((SV *) invlist);
6327 }
6328
6329 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6330  * etc */
6331 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6332 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6333
6334 #ifndef PERL_IN_XSUB_RE
6335 void
6336 Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6337 {
6338    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6339     * the end of the inversion list.  The range must be above any existing
6340     * ones. */
6341
6342     UV* array;
6343     UV max = invlist_max(invlist);
6344     UV len = invlist_len(invlist);
6345
6346     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6347
6348     if (len == 0) { /* Empty lists must be initialized */
6349         array = _invlist_array_init(invlist, start == 0);
6350     }
6351     else {
6352         /* Here, the existing list is non-empty. The current max entry in the
6353          * list is generally the first value not in the set, except when the
6354          * set extends to the end of permissible values, in which case it is
6355          * the first entry in that final set, and so this call is an attempt to
6356          * append out-of-order */
6357
6358         UV final_element = len - 1;
6359         array = invlist_array(invlist);
6360         if (array[final_element] > start
6361             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6362         {
6363             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6364                        array[final_element], start,
6365                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6366         }
6367
6368         /* Here, it is a legal append.  If the new range begins with the first
6369          * value not in the set, it is extending the set, so the new first
6370          * value not in the set is one greater than the newly extended range.
6371          * */
6372         if (array[final_element] == start) {
6373             if (end != UV_MAX) {
6374                 array[final_element] = end + 1;
6375             }
6376             else {
6377                 /* But if the end is the maximum representable on the machine,
6378                  * just let the range that this would extend to have no end */
6379                 invlist_set_len(invlist, len - 1);
6380             }
6381             return;
6382         }
6383     }
6384
6385     /* Here the new range doesn't extend any existing set.  Add it */
6386
6387     len += 2;   /* Includes an element each for the start and end of range */
6388
6389     /* If overflows the existing space, extend, which may cause the array to be
6390      * moved */
6391     if (max < len) {
6392         invlist_extend(invlist, len);
6393         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
6394                                            failure in invlist_array() */
6395         array = invlist_array(invlist);
6396     }
6397     else {
6398         invlist_set_len(invlist, len);
6399     }
6400
6401     /* The next item on the list starts the range, the one after that is
6402      * one past the new range.  */
6403     array[len - 2] = start;
6404     if (end != UV_MAX) {
6405         array[len - 1] = end + 1;
6406     }
6407     else {
6408         /* But if the end is the maximum representable on the machine, just let
6409          * the range have no end */
6410         invlist_set_len(invlist, len - 1);
6411     }
6412 }
6413
6414 STATIC IV
6415 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6416 {
6417     /* Searches the inversion list for the entry that contains the input code
6418      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
6419      * return value is the index into the list's array of the range that
6420      * contains <cp> */
6421
6422     IV low = 0;
6423     IV high = invlist_len(invlist);
6424     const UV * const array = invlist_array(invlist);
6425
6426     PERL_ARGS_ASSERT_INVLIST_SEARCH;
6427
6428     /* If list is empty or the code point is before the first element, return
6429      * failure. */
6430     if (high == 0 || cp < array[0]) {
6431         return -1;
6432     }
6433
6434     /* Binary search.  What we are looking for is <i> such that
6435      *  array[i] <= cp < array[i+1]
6436      * The loop below converges on the i+1. */
6437     while (low < high) {
6438         IV mid = (low + high) / 2;
6439         if (array[mid] <= cp) {
6440             low = mid + 1;
6441
6442             /* We could do this extra test to exit the loop early.
6443             if (cp < array[low]) {
6444                 return mid;
6445             }
6446             */
6447         }
6448         else { /* cp < array[mid] */
6449             high = mid;
6450         }
6451     }
6452
6453     return high - 1;
6454 }
6455
6456 void
6457 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6458 {
6459     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6460      * but is used when the swash has an inversion list.  This makes this much
6461      * faster, as it uses a binary search instead of a linear one.  This is
6462      * intimately tied to that function, and perhaps should be in utf8.c,
6463      * except it is intimately tied to inversion lists as well.  It assumes
6464      * that <swatch> is all 0's on input */
6465
6466     UV current = start;
6467     const IV len = invlist_len(invlist);
6468     IV i;
6469     const UV * array;
6470
6471     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6472
6473     if (len == 0) { /* Empty inversion list */
6474         return;
6475     }
6476
6477     array = invlist_array(invlist);
6478
6479     /* Find which element it is */
6480     i = invlist_search(invlist, start);
6481
6482     /* We populate from <start> to <end> */
6483     while (current < end) {
6484         UV upper;
6485
6486         /* The inversion list gives the results for every possible code point
6487          * after the first one in the list.  Only those ranges whose index is
6488          * even are ones that the inversion list matches.  For the odd ones,
6489          * and if the initial code point is not in the list, we have to skip
6490          * forward to the next element */
6491         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6492             i++;
6493             if (i >= len) { /* Finished if beyond the end of the array */
6494                 return;
6495             }
6496             current = array[i];
6497             if (current >= end) {   /* Finished if beyond the end of what we
6498                                        are populating */
6499                 return;
6500             }
6501         }
6502         assert(current >= start);
6503
6504         /* The current range ends one below the next one, except don't go past
6505          * <end> */
6506         i++;
6507         upper = (i < len && array[i] < end) ? array[i] : end;
6508
6509         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
6510          * for each code point in it */
6511         for (; current < upper; current++) {
6512             const STRLEN offset = (STRLEN)(current - start);
6513             swatch[offset >> 3] |= 1 << (offset & 7);
6514         }
6515
6516         /* Quit if at the end of the list */
6517         if (i >= len) {
6518
6519             /* But first, have to deal with the highest possible code point on
6520              * the platform.  The previous code assumes that <end> is one
6521              * beyond where we want to populate, but that is impossible at the
6522              * platform's infinity, so have to handle it specially */
6523             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6524             {
6525                 const STRLEN offset = (STRLEN)(end - start);
6526                 swatch[offset >> 3] |= 1 << (offset & 7);
6527             }
6528             return;
6529         }
6530
6531         /* Advance to the next range, which will be for code points not in the
6532          * inversion list */
6533         current = array[i];
6534     }
6535
6536     return;
6537 }
6538
6539 void
6540 Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
6541 {
6542     /* Take the union of two inversion lists and point <output> to it.  *output
6543      * should be defined upon input, and if it points to one of the two lists,
6544      * the reference count to that list will be decremented.
6545      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6546      * Richard Gillam, published by Addison-Wesley, and explained at some
6547      * length there.  The preface says to incorporate its examples into your
6548      * code at your own risk.
6549      *
6550      * The algorithm is like a merge sort.
6551      *
6552      * XXX A potential performance improvement is to keep track as we go along
6553      * if only one of the inputs contributes to the result, meaning the other
6554      * is a subset of that one.  In that case, we can skip the final copy and
6555      * return the larger of the input lists, but then outside code might need
6556      * to keep track of whether to free the input list or not */
6557
6558     UV* array_a;    /* a's array */
6559     UV* array_b;
6560     UV len_a;       /* length of a's array */
6561     UV len_b;
6562
6563     SV* u;                      /* the resulting union */
6564     UV* array_u;
6565     UV len_u;
6566
6567     UV i_a = 0;             /* current index into a's array */
6568     UV i_b = 0;
6569     UV i_u = 0;
6570
6571     /* running count, as explained in the algorithm source book; items are
6572      * stopped accumulating and are output when the count changes to/from 0.
6573      * The count is incremented when we start a range that's in the set, and
6574      * decremented when we start a range that's not in the set.  So its range
6575      * is 0 to 2.  Only when the count is zero is something not in the set.
6576      */
6577     UV count = 0;
6578
6579     PERL_ARGS_ASSERT__INVLIST_UNION;
6580     assert(a != b);
6581
6582     /* If either one is empty, the union is the other one */
6583     len_a = invlist_len(a);
6584     if (len_a == 0) {
6585         if (*output == a) {
6586             SvREFCNT_dec(a);
6587         }
6588         if (*output != b) {
6589             *output = invlist_clone(b);
6590         } /* else *output already = b; */
6591         return;
6592     }
6593     else if ((len_b = invlist_len(b)) == 0) {
6594         if (*output == b) {
6595             SvREFCNT_dec(b);
6596         }
6597         if (*output != a) {
6598             *output = invlist_clone(a);
6599         }
6600         /* else *output already = a; */
6601         return;
6602     }
6603
6604     /* Here both lists exist and are non-empty */
6605     array_a = invlist_array(a);
6606     array_b = invlist_array(b);
6607
6608     /* Size the union for the worst case: that the sets are completely
6609      * disjoint */
6610     u = _new_invlist(len_a + len_b);
6611
6612     /* Will contain U+0000 if either component does */
6613     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6614                                       || (len_b > 0 && array_b[0] == 0));
6615
6616     /* Go through each list item by item, stopping when exhausted one of
6617      * them */
6618     while (i_a < len_a && i_b < len_b) {
6619         UV cp;      /* The element to potentially add to the union's array */
6620         bool cp_in_set;   /* is it in the the input list's set or not */
6621
6622         /* We need to take one or the other of the two inputs for the union.
6623          * Since we are merging two sorted lists, we take the smaller of the
6624          * next items.  In case of a tie, we take the one that is in its set
6625          * first.  If we took one not in the set first, it would decrement the
6626          * count, possibly to 0 which would cause it to be output as ending the
6627          * range, and the next time through we would take the same number, and
6628          * output it again as beginning the next range.  By doing it the
6629          * opposite way, there is no possibility that the count will be
6630          * momentarily decremented to 0, and thus the two adjoining ranges will
6631          * be seamlessly merged.  (In a tie and both are in the set or both not
6632          * in the set, it doesn't matter which we take first.) */
6633         if (array_a[i_a] < array_b[i_b]
6634             || (array_a[i_a] == array_b[i_b]
6635                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6636         {
6637             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6638             cp= array_a[i_a++];
6639         }
6640         else {
6641             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6642             cp= array_b[i_b++];
6643         }
6644
6645         /* Here, have chosen which of the two inputs to look at.  Only output
6646          * if the running count changes to/from 0, which marks the
6647          * beginning/end of a range in that's in the set */
6648         if (cp_in_set) {
6649             if (count == 0) {
6650                 array_u[i_u++] = cp;
6651             }
6652             count++;
6653         }
6654         else {
6655             count--;
6656             if (count == 0) {
6657                 array_u[i_u++] = cp;
6658             }
6659         }
6660     }
6661
6662     /* Here, we are finished going through at least one of the lists, which
6663      * means there is something remaining in at most one.  We check if the list
6664      * that hasn't been exhausted is positioned such that we are in the middle
6665      * of a range in its set or not.  (i_a and i_b point to the element beyond
6666      * the one we care about.) If in the set, we decrement 'count'; if 0, there
6667      * is potentially more to output.
6668      * There are four cases:
6669      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6670      *     in the union is entirely from the non-exhausted set.
6671      *  2) Both were in their sets, count is 2.  Nothing further should
6672      *     be output, as everything that remains will be in the exhausted
6673      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6674      *     that
6675      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6676      *     Nothing further should be output because the union includes
6677      *     everything from the exhausted set.  Not decrementing ensures that.
6678      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6679      *     decrementing to 0 insures that we look at the remainder of the
6680      *     non-exhausted set */
6681     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6682         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6683     {
6684         count--;
6685     }
6686
6687     /* The final length is what we've output so far, plus what else is about to
6688      * be output.  (If 'count' is non-zero, then the input list we exhausted
6689      * has everything remaining up to the machine's limit in its set, and hence
6690      * in the union, so there will be no further output. */
6691     len_u = i_u;
6692     if (count == 0) {
6693         /* At most one of the subexpressions will be non-zero */
6694         len_u += (len_a - i_a) + (len_b - i_b);
6695     }
6696
6697     /* Set result to final length, which can change the pointer to array_u, so
6698      * re-find it */
6699     if (len_u != invlist_len(u)) {
6700         invlist_set_len(u, len_u);
6701         invlist_trim(u);
6702         array_u = invlist_array(u);
6703     }
6704
6705     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6706      * the other) ended with everything above it not in its set.  That means
6707      * that the remaining part of the union is precisely the same as the
6708      * non-exhausted list, so can just copy it unchanged.  (If both list were
6709      * exhausted at the same time, then the operations below will be both 0.)
6710      */
6711     if (count == 0) {
6712         IV copy_count; /* At most one will have a non-zero copy count */
6713         if ((copy_count = len_a - i_a) > 0) {
6714             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6715         }
6716         else if ((copy_count = len_b - i_b) > 0) {
6717             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6718         }
6719     }
6720
6721     /*  We may be removing a reference to one of the inputs */
6722     if (a == *output || b == *output) {
6723         SvREFCNT_dec(*output);
6724     }
6725
6726     *output = u;
6727     return;
6728 }
6729
6730 void
6731 Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
6732 {
6733     /* Take the intersection of two inversion lists and point <i> to it.  *i
6734      * should be defined upon input, and if it points to one of the two lists,
6735      * the reference count to that list will be decremented.
6736      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6737      * Richard Gillam, published by Addison-Wesley, and explained at some
6738      * length there.  The preface says to incorporate its examples into your
6739      * code at your own risk.  In fact, it had bugs
6740      *
6741      * The algorithm is like a merge sort, and is essentially the same as the
6742      * union above
6743      */
6744
6745     UV* array_a;                /* a's array */
6746     UV* array_b;
6747     UV len_a;   /* length of a's array */
6748     UV len_b;
6749
6750     SV* r;                   /* the resulting intersection */
6751     UV* array_r;
6752     UV len_r;
6753
6754     UV i_a = 0;             /* current index into a's array */
6755     UV i_b = 0;
6756     UV i_r = 0;
6757
6758     /* running count, as explained in the algorithm source book; items are
6759      * stopped accumulating and are output when the count changes to/from 2.
6760      * The count is incremented when we start a range that's in the set, and
6761      * decremented when we start a range that's not in the set.  So its range
6762      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6763      */
6764     UV count = 0;
6765
6766     PERL_ARGS_ASSERT__INVLIST_INTERSECTION;
6767     assert(a != b);
6768
6769     /* If either one is empty, the intersection is null */
6770     len_a = invlist_len(a);
6771     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
6772
6773         /* If the result is the same as one of the inputs, the input is being
6774          * overwritten */
6775         if (*i == a) {
6776             SvREFCNT_dec(a);
6777         }
6778         else if (*i == b) {
6779             SvREFCNT_dec(b);
6780         }
6781
6782         *i = _new_invlist(0);
6783         return;
6784     }
6785
6786     /* Here both lists exist and are non-empty */
6787     array_a = invlist_array(a);
6788     array_b = invlist_array(b);
6789
6790     /* Size the intersection for the worst case: that the intersection ends up
6791      * fragmenting everything to be completely disjoint */
6792     r= _new_invlist(len_a + len_b);
6793
6794     /* Will contain U+0000 iff both components do */
6795     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
6796                                      && len_b > 0 && array_b[0] == 0);
6797
6798     /* Go through each list item by item, stopping when exhausted one of
6799      * them */
6800     while (i_a < len_a && i_b < len_b) {
6801         UV cp;      /* The element to potentially add to the intersection's
6802                        array */
6803         bool cp_in_set; /* Is it in the input list's set or not */
6804
6805         /* We need to take one or the other of the two inputs for the
6806          * intersection.  Since we are merging two sorted lists, we take the
6807          * smaller of the next items.  In case of a tie, we take the one that
6808          * is not in its set first (a difference from the union algorithm).  If
6809          * we took one in the set first, it would increment the count, possibly
6810          * to 2 which would cause it to be output as starting a range in the
6811          * intersection, and the next time through we would take that same
6812          * number, and output it again as ending the set.  By doing it the
6813          * opposite of this, there is no possibility that the count will be
6814          * momentarily incremented to 2.  (In a tie and both are in the set or
6815          * both not in the set, it doesn't matter which we take first.) */
6816         if (array_a[i_a] < array_b[i_b]
6817             || (array_a[i_a] == array_b[i_b]
6818                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6819         {
6820             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6821             cp= array_a[i_a++];
6822         }
6823         else {
6824             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6825             cp= array_b[i_b++];
6826         }
6827
6828         /* Here, have chosen which of the two inputs to look at.  Only output
6829          * if the running count changes to/from 2, which marks the
6830          * beginning/end of a range that's in the intersection */
6831         if (cp_in_set) {
6832             count++;
6833             if (count == 2) {
6834                 array_r[i_r++] = cp;
6835             }
6836         }
6837         else {
6838             if (count == 2) {
6839                 array_r[i_r++] = cp;
6840             }
6841             count--;
6842         }
6843     }
6844
6845     /* Here, we are finished going through at least one of the lists, which
6846      * means there is something remaining in at most one.  We check if the list
6847      * that has been exhausted is positioned such that we are in the middle
6848      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
6849      * the ones we care about.)  There are four cases:
6850      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
6851      *     nothing left in the intersection.
6852      *  2) Both were in their sets, count is 2 and perhaps is incremented to
6853      *     above 2.  What should be output is exactly that which is in the
6854      *     non-exhausted set, as everything it has is also in the intersection
6855      *     set, and everything it doesn't have can't be in the intersection
6856      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
6857      *     gets incremented to 2.  Like the previous case, the intersection is
6858      *     everything that remains in the non-exhausted set.
6859      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
6860      *     remains 1.  And the intersection has nothing more. */
6861     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6862         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6863     {
6864         count++;
6865     }
6866
6867     /* The final length is what we've output so far plus what else is in the
6868      * intersection.  At most one of the subexpressions below will be non-zero */
6869     len_r = i_r;
6870     if (count >= 2) {
6871         len_r += (len_a - i_a) + (len_b - i_b);
6872     }
6873
6874     /* Set result to final length, which can change the pointer to array_r, so
6875      * re-find it */
6876     if (len_r != invlist_len(r)) {
6877         invlist_set_len(r, len_r);
6878         invlist_trim(r);
6879         array_r = invlist_array(r);
6880     }
6881
6882     /* Finish outputting any remaining */
6883     if (count >= 2) { /* At most one will have a non-zero copy count */
6884         IV copy_count;
6885         if ((copy_count = len_a - i_a) > 0) {
6886             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6887         }
6888         else if ((copy_count = len_b - i_b) > 0) {
6889             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6890         }
6891     }
6892
6893     /*  We may be removing a reference to one of the inputs */
6894     if (a == *i || b == *i) {
6895         SvREFCNT_dec(*i);
6896     }
6897
6898     *i = r;
6899     return;
6900 }
6901
6902 #endif
6903
6904 STATIC SV*
6905 S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
6906 {
6907     /* Add the range from 'start' to 'end' inclusive to the inversion list's
6908      * set.  A pointer to the inversion list is returned.  This may actually be
6909      * a new list, in which case the passed in one has been destroyed.  The
6910      * passed in inversion list can be NULL, in which case a new one is created
6911      * with just the one range in it */
6912
6913     SV* range_invlist;
6914     UV len;
6915
6916     if (invlist == NULL) {
6917         invlist = _new_invlist(2);
6918         len = 0;
6919     }
6920     else {
6921         len = invlist_len(invlist);
6922     }
6923
6924     /* If comes after the final entry, can just append it to the end */
6925     if (len == 0
6926         || start >= invlist_array(invlist)
6927                                     [invlist_len(invlist) - 1])
6928     {
6929         _append_range_to_invlist(invlist, start, end);
6930         return invlist;
6931     }
6932
6933     /* Here, can't just append things, create and return a new inversion list
6934      * which is the union of this range and the existing inversion list */
6935     range_invlist = _new_invlist(2);
6936     _append_range_to_invlist(range_invlist, start, end);
6937
6938     _invlist_union(invlist, range_invlist, &invlist);
6939
6940     /* The temporary can be freed */
6941     SvREFCNT_dec(range_invlist);
6942
6943     return invlist;
6944 }
6945
6946 PERL_STATIC_INLINE SV*
6947 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
6948     return add_range_to_invlist(invlist, cp, cp);
6949 }
6950
6951 #ifndef PERL_IN_XSUB_RE
6952 void
6953 Perl__invlist_invert(pTHX_ SV* const invlist)
6954 {
6955     /* Complement the input inversion list.  This adds a 0 if the list didn't
6956      * have a zero; removes it otherwise.  As described above, the data
6957      * structure is set up so that this is very efficient */
6958
6959     UV* len_pos = get_invlist_len_addr(invlist);
6960
6961     PERL_ARGS_ASSERT__INVLIST_INVERT;
6962
6963     /* The inverse of matching nothing is matching everything */
6964     if (*len_pos == 0) {
6965         _append_range_to_invlist(invlist, 0, UV_MAX);
6966         return;
6967     }
6968
6969     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
6970      * zero element was a 0, so it is being removed, so the length decrements
6971      * by 1; and vice-versa.  SvCUR is unaffected */
6972     if (*get_invlist_zero_addr(invlist) ^= 1) {
6973         (*len_pos)--;
6974     }
6975     else {
6976         (*len_pos)++;
6977     }
6978 }
6979
6980 void
6981 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
6982 {
6983     /* Complement the input inversion list (which must be a Unicode property,
6984      * all of which don't match above the Unicode maximum code point.)  And
6985      * Perl has chosen to not have the inversion match above that either.  This
6986      * adds a 0x110000 if the list didn't end with it, and removes it if it did
6987      */
6988
6989     UV len;
6990     UV* array;
6991
6992     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
6993
6994     _invlist_invert(invlist);
6995
6996     len = invlist_len(invlist);
6997
6998     if (len != 0) { /* If empty do nothing */
6999         array = invlist_array(invlist);
7000         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7001             /* Add 0x110000.  First, grow if necessary */
7002             len++;
7003             if (invlist_max(invlist) < len) {
7004                 invlist_extend(invlist, len);
7005                 array = invlist_array(invlist);
7006             }
7007             invlist_set_len(invlist, len);
7008             array[len - 1] = PERL_UNICODE_MAX + 1;
7009         }
7010         else {  /* Remove the 0x110000 */
7011             invlist_set_len(invlist, len - 1);
7012         }
7013     }
7014
7015     return;
7016 }
7017 #endif
7018
7019 PERL_STATIC_INLINE SV*
7020 S_invlist_clone(pTHX_ SV* const invlist)
7021 {
7022
7023     /* Return a new inversion list that is a copy of the input one, which is
7024      * unchanged */
7025
7026     /* Need to allocate extra space to accommodate Perl's addition of a
7027      * trailing NUL to SvPV's, since it thinks they are always strings */
7028     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7029     STRLEN length = SvCUR(invlist);
7030
7031     PERL_ARGS_ASSERT_INVLIST_CLONE;
7032
7033     SvCUR_set(new_invlist, length); /* This isn't done automatically */
7034     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7035
7036     return new_invlist;
7037 }
7038
7039 #ifndef PERL_IN_XSUB_RE
7040 void
7041 Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
7042 {
7043     /* Point <result> to an inversion list which consists of all elements in
7044      * <a> that aren't also in <b>.  *result should be defined upon input, and
7045      * if it points to C<b> its reference count will be decremented. */
7046
7047     PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
7048     assert(a != b);
7049
7050     /* Subtracting nothing retains the original */
7051     if (invlist_len(b) == 0) {
7052
7053         if (*result == b) {
7054             SvREFCNT_dec(b);
7055         }
7056
7057         /* If the result is not to be the same variable as the original, create
7058          * a copy */
7059         if (*result != a) {
7060             *result = invlist_clone(a);
7061         }
7062     } else {
7063         SV *b_copy = invlist_clone(b);
7064         _invlist_invert(b_copy);        /* Everything not in 'b' */
7065
7066         if (*result == b) {
7067             SvREFCNT_dec(b);
7068         }
7069
7070         _invlist_intersection(a, b_copy, result);    /* Everything in 'a' not in
7071                                                        'b' */
7072         SvREFCNT_dec(b_copy);
7073     }
7074
7075     return;
7076 }
7077 #endif
7078
7079 PERL_STATIC_INLINE UV*
7080 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7081 {
7082     /* Return the address of the UV that contains the current iteration
7083      * position */
7084
7085     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7086
7087     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7088 }
7089
7090 PERL_STATIC_INLINE void
7091 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
7092 {
7093     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7094
7095     *get_invlist_iter_addr(invlist) = 0;
7096 }
7097
7098 STATIC bool
7099 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7100 {
7101     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7102      * This call sets in <*start> and <*end>, the next range in <invlist>.
7103      * Returns <TRUE> if successful and the next call will return the next
7104      * range; <FALSE> if was already at the end of the list.  If the latter,
7105      * <*start> and <*end> are unchanged, and the next call to this function
7106      * will start over at the beginning of the list */
7107
7108     UV* pos = get_invlist_iter_addr(invlist);
7109     UV len = invlist_len(invlist);
7110     UV *array;
7111
7112     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7113
7114     if (*pos >= len) {
7115         *pos = UV_MAX;  /* Force iternit() to be required next time */
7116         return FALSE;
7117     }
7118
7119     array = invlist_array(invlist);
7120
7121     *start = array[(*pos)++];
7122
7123     if (*pos >= len) {
7124         *end = UV_MAX;
7125     }
7126     else {
7127         *end = array[(*pos)++] - 1;
7128     }
7129
7130     return TRUE;
7131 }
7132
7133 #ifndef PERL_IN_XSUB_RE
7134 SV *
7135 Perl__invlist_contents(pTHX_ SV* const invlist)
7136 {
7137     /* Get the contents of an inversion list into a string SV so that they can
7138      * be printed out.  It uses the format traditionally done for debug tracing
7139      */
7140
7141     UV start, end;
7142     SV* output = newSVpvs("\n");
7143
7144     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7145
7146     invlist_iterinit(invlist);
7147     while (invlist_iternext(invlist, &start, &end)) {
7148         if (end == UV_MAX) {
7149             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7150         }
7151         else if (end != start) {
7152             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7153                     start,       end);
7154         }
7155         else {
7156             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7157         }
7158     }
7159
7160     return output;
7161 }
7162 #endif
7163
7164 #if 0
7165 void
7166 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7167 {
7168     /* Dumps out the ranges in an inversion list.  The string 'header'
7169      * if present is output on a line before the first range */
7170
7171     UV start, end;
7172
7173     if (header && strlen(header)) {
7174         PerlIO_printf(Perl_debug_log, "%s\n", header);
7175     }
7176     invlist_iterinit(invlist);
7177     while (invlist_iternext(invlist, &start, &end)) {
7178         if (end == UV_MAX) {
7179             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7180         }
7181         else {
7182             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7183         }
7184     }
7185 }
7186 #endif
7187
7188 #undef HEADER_LENGTH
7189 #undef INVLIST_INITIAL_LENGTH
7190 #undef TO_INTERNAL_SIZE
7191 #undef FROM_INTERNAL_SIZE
7192 #undef INVLIST_LEN_OFFSET
7193 #undef INVLIST_ZERO_OFFSET
7194 #undef INVLIST_ITER_OFFSET
7195
7196 /* End of inversion list object */
7197
7198 /*
7199  - reg - regular expression, i.e. main body or parenthesized thing
7200  *
7201  * Caller must absorb opening parenthesis.
7202  *
7203  * Combining parenthesis handling with the base level of regular expression
7204  * is a trifle forced, but the need to tie the tails of the branches to what
7205  * follows makes it hard to avoid.
7206  */
7207 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7208 #ifdef DEBUGGING
7209 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7210 #else
7211 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7212 #endif
7213
7214 STATIC regnode *
7215 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7216     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7217 {
7218     dVAR;
7219     register regnode *ret;              /* Will be the head of the group. */
7220     register regnode *br;
7221     register regnode *lastbr;
7222     register regnode *ender = NULL;
7223     register I32 parno = 0;
7224     I32 flags;
7225     U32 oregflags = RExC_flags;
7226     bool have_branch = 0;
7227     bool is_open = 0;
7228     I32 freeze_paren = 0;
7229     I32 after_freeze = 0;
7230
7231     /* for (?g), (?gc), and (?o) warnings; warning
7232        about (?c) will warn about (?g) -- japhy    */
7233
7234 #define WASTED_O  0x01
7235 #define WASTED_G  0x02
7236 #define WASTED_C  0x04
7237 #define WASTED_GC (0x02|0x04)
7238     I32 wastedflags = 0x00;
7239
7240     char * parse_start = RExC_parse; /* MJD */
7241     char * const oregcomp_parse = RExC_parse;
7242
7243     GET_RE_DEBUG_FLAGS_DECL;
7244
7245     PERL_ARGS_ASSERT_REG;
7246     DEBUG_PARSE("reg ");
7247
7248     *flagp = 0;                         /* Tentatively. */
7249
7250
7251     /* Make an OPEN node, if parenthesized. */
7252     if (paren) {
7253         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7254             char *start_verb = RExC_parse;
7255             STRLEN verb_len = 0;
7256             char *start_arg = NULL;
7257             unsigned char op = 0;
7258             int argok = 1;
7259             int internal_argval = 0; /* internal_argval is only useful if !argok */
7260             while ( *RExC_parse && *RExC_parse != ')' ) {
7261                 if ( *RExC_parse == ':' ) {
7262                     start_arg = RExC_parse + 1;
7263                     break;
7264                 }
7265                 RExC_parse++;
7266             }
7267             ++start_verb;
7268             verb_len = RExC_parse - start_verb;
7269             if ( start_arg ) {
7270                 RExC_parse++;
7271                 while ( *RExC_parse && *RExC_parse != ')' )
7272                     RExC_parse++;
7273                 if ( *RExC_parse != ')' )
7274                     vFAIL("Unterminated verb pattern argument");
7275                 if ( RExC_parse == start_arg )
7276                     start_arg = NULL;
7277             } else {
7278                 if ( *RExC_parse != ')' )
7279                     vFAIL("Unterminated verb pattern");
7280             }
7281
7282             switch ( *start_verb ) {
7283             case 'A':  /* (*ACCEPT) */
7284                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7285                     op = ACCEPT;
7286                     internal_argval = RExC_nestroot;
7287                 }
7288                 break;
7289             case 'C':  /* (*COMMIT) */
7290                 if ( memEQs(start_verb,verb_len,"COMMIT") )
7291                     op = COMMIT;
7292                 break;
7293             case 'F':  /* (*FAIL) */
7294                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7295                     op = OPFAIL;
7296                     argok = 0;
7297                 }
7298                 break;
7299             case ':':  /* (*:NAME) */
7300             case 'M':  /* (*MARK:NAME) */
7301                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7302                     op = MARKPOINT;
7303                     argok = -1;
7304                 }
7305                 break;
7306             case 'P':  /* (*PRUNE) */
7307                 if ( memEQs(start_verb,verb_len,"PRUNE") )
7308                     op = PRUNE;
7309                 break;
7310             case 'S':   /* (*SKIP) */
7311                 if ( memEQs(start_verb,verb_len,"SKIP") )
7312                     op = SKIP;
7313                 break;
7314             case 'T':  /* (*THEN) */
7315                 /* [19:06] <TimToady> :: is then */
7316                 if ( memEQs(start_verb,verb_len,"THEN") ) {
7317                     op = CUTGROUP;
7318                     RExC_seen |= REG_SEEN_CUTGROUP;
7319                 }
7320                 break;
7321             }
7322             if ( ! op ) {
7323                 RExC_parse++;
7324                 vFAIL3("Unknown verb pattern '%.*s'",
7325                     verb_len, start_verb);
7326             }
7327             if ( argok ) {
7328                 if ( start_arg && internal_argval ) {
7329                     vFAIL3("Verb pattern '%.*s' may not have an argument",
7330                         verb_len, start_verb);
7331                 } else if ( argok < 0 && !start_arg ) {
7332                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7333                         verb_len, start_verb);
7334                 } else {
7335                     ret = reganode(pRExC_state, op, internal_argval);
7336                     if ( ! internal_argval && ! SIZE_ONLY ) {
7337                         if (start_arg) {
7338                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7339                             ARG(ret) = add_data( pRExC_state, 1, "S" );
7340                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7341                             ret->flags = 0;
7342                         } else {
7343                             ret->flags = 1;
7344                         }
7345                     }
7346                 }
7347                 if (!internal_argval)
7348                     RExC_seen |= REG_SEEN_VERBARG;
7349             } else if ( start_arg ) {
7350                 vFAIL3("Verb pattern '%.*s' may not have an argument",
7351                         verb_len, start_verb);
7352             } else {
7353                 ret = reg_node(pRExC_state, op);
7354             }
7355             nextchar(pRExC_state);
7356             return ret;
7357         } else
7358         if (*RExC_parse == '?') { /* (?...) */
7359             bool is_logical = 0;
7360             const char * const seqstart = RExC_parse;
7361             bool has_use_defaults = FALSE;
7362
7363             RExC_parse++;
7364             paren = *RExC_parse++;
7365             ret = NULL;                 /* For look-ahead/behind. */
7366             switch (paren) {
7367
7368             case 'P':   /* (?P...) variants for those used to PCRE/Python */
7369                 paren = *RExC_parse++;
7370                 if ( paren == '<')         /* (?P<...>) named capture */
7371                     goto named_capture;
7372                 else if (paren == '>') {   /* (?P>name) named recursion */
7373                     goto named_recursion;
7374                 }
7375                 else if (paren == '=') {   /* (?P=...)  named backref */
7376                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
7377                        you change this make sure you change that */
7378                     char* name_start = RExC_parse;
7379                     U32 num = 0;
7380                     SV *sv_dat = reg_scan_name(pRExC_state,
7381                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7382                     if (RExC_parse == name_start || *RExC_parse != ')')
7383                         vFAIL2("Sequence %.3s... not terminated",parse_start);
7384
7385                     if (!SIZE_ONLY) {
7386                         num = add_data( pRExC_state, 1, "S" );
7387                         RExC_rxi->data->data[num]=(void*)sv_dat;
7388                         SvREFCNT_inc_simple_void(sv_dat);
7389                     }
7390                     RExC_sawback = 1;
7391                     ret = reganode(pRExC_state,
7392                                    ((! FOLD)
7393                                      ? NREF
7394                                      : (MORE_ASCII_RESTRICTED)
7395                                        ? NREFFA
7396                                        : (AT_LEAST_UNI_SEMANTICS)
7397                                          ? NREFFU
7398                                          : (LOC)
7399                                            ? NREFFL
7400                                            : NREFF),
7401                                     num);
7402                     *flagp |= HASWIDTH;
7403
7404                     Set_Node_Offset(ret, parse_start+1);
7405                     Set_Node_Cur_Length(ret); /* MJD */
7406
7407                     nextchar(pRExC_state);
7408                     return ret;
7409                 }
7410                 RExC_parse++;
7411                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7412                 /*NOTREACHED*/
7413             case '<':           /* (?<...) */
7414                 if (*RExC_parse == '!')
7415                     paren = ',';
7416                 else if (*RExC_parse != '=')
7417               named_capture:
7418                 {               /* (?<...>) */
7419                     char *name_start;
7420                     SV *svname;
7421                     paren= '>';
7422             case '\'':          /* (?'...') */
7423                     name_start= RExC_parse;
7424                     svname = reg_scan_name(pRExC_state,
7425                         SIZE_ONLY ?  /* reverse test from the others */
7426                         REG_RSN_RETURN_NAME :
7427                         REG_RSN_RETURN_NULL);
7428                     if (RExC_parse == name_start) {
7429                         RExC_parse++;
7430                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7431                         /*NOTREACHED*/
7432                     }
7433                     if (*RExC_parse != paren)
7434                         vFAIL2("Sequence (?%c... not terminated",
7435                             paren=='>' ? '<' : paren);
7436                     if (SIZE_ONLY) {
7437                         HE *he_str;
7438                         SV *sv_dat = NULL;
7439                         if (!svname) /* shouldn't happen */
7440                             Perl_croak(aTHX_
7441                                 "panic: reg_scan_name returned NULL");
7442                         if (!RExC_paren_names) {
7443                             RExC_paren_names= newHV();
7444                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
7445 #ifdef DEBUGGING
7446                             RExC_paren_name_list= newAV();
7447                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7448 #endif
7449                         }
7450                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7451                         if ( he_str )
7452                             sv_dat = HeVAL(he_str);
7453                         if ( ! sv_dat ) {
7454                             /* croak baby croak */
7455                             Perl_croak(aTHX_
7456                                 "panic: paren_name hash element allocation failed");
7457                         } else if ( SvPOK(sv_dat) ) {
7458                             /* (?|...) can mean we have dupes so scan to check
7459                                its already been stored. Maybe a flag indicating
7460                                we are inside such a construct would be useful,
7461                                but the arrays are likely to be quite small, so
7462                                for now we punt -- dmq */
7463                             IV count = SvIV(sv_dat);
7464                             I32 *pv = (I32*)SvPVX(sv_dat);
7465                             IV i;
7466                             for ( i = 0 ; i < count ; i++ ) {
7467                                 if ( pv[i] == RExC_npar ) {
7468                                     count = 0;
7469                                     break;
7470                                 }
7471                             }
7472                             if ( count ) {
7473                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7474                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7475                                 pv[count] = RExC_npar;
7476                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7477                             }
7478                         } else {
7479                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
7480                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7481                             SvIOK_on(sv_dat);
7482                             SvIV_set(sv_dat, 1);
7483                         }
7484 #ifdef DEBUGGING
7485                         /* Yes this does cause a memory leak in debugging Perls */
7486                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7487                             SvREFCNT_dec(svname);
7488 #endif
7489
7490                         /*sv_dump(sv_dat);*/
7491                     }
7492                     nextchar(pRExC_state);
7493                     paren = 1;
7494                     goto capturing_parens;
7495                 }
7496                 RExC_seen |= REG_SEEN_LOOKBEHIND;
7497                 RExC_in_lookbehind++;
7498                 RExC_parse++;
7499             case '=':           /* (?=...) */
7500                 RExC_seen_zerolen++;
7501                 break;
7502             case '!':           /* (?!...) */
7503                 RExC_seen_zerolen++;
7504                 if (*RExC_parse == ')') {
7505                     ret=reg_node(pRExC_state, OPFAIL);
7506                     nextchar(pRExC_state);
7507                     return ret;
7508                 }
7509                 break;
7510             case '|':           /* (?|...) */
7511                 /* branch reset, behave like a (?:...) except that
7512                    buffers in alternations share the same numbers */
7513                 paren = ':';
7514                 after_freeze = freeze_paren = RExC_npar;
7515                 break;
7516             case ':':           /* (?:...) */
7517             case '>':           /* (?>...) */
7518                 break;
7519             case '$':           /* (?$...) */
7520             case '@':           /* (?@...) */
7521                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7522                 break;
7523             case '#':           /* (?#...) */
7524                 while (*RExC_parse && *RExC_parse != ')')
7525                     RExC_parse++;
7526                 if (*RExC_parse != ')')
7527                     FAIL("Sequence (?#... not terminated");
7528                 nextchar(pRExC_state);
7529                 *flagp = TRYAGAIN;
7530                 return NULL;
7531             case '0' :           /* (?0) */
7532             case 'R' :           /* (?R) */
7533                 if (*RExC_parse != ')')
7534                     FAIL("Sequence (?R) not terminated");
7535                 ret = reg_node(pRExC_state, GOSTART);
7536                 *flagp |= POSTPONED;
7537                 nextchar(pRExC_state);
7538                 return ret;
7539                 /*notreached*/
7540             { /* named and numeric backreferences */
7541                 I32 num;
7542             case '&':            /* (?&NAME) */
7543                 parse_start = RExC_parse - 1;
7544               named_recursion:
7545                 {
7546                     SV *sv_dat = reg_scan_name(pRExC_state,
7547                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7548                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7549                 }
7550                 goto gen_recurse_regop;
7551                 /* NOT REACHED */
7552             case '+':
7553                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7554                     RExC_parse++;
7555                     vFAIL("Illegal pattern");
7556                 }
7557                 goto parse_recursion;
7558                 /* NOT REACHED*/
7559             case '-': /* (?-1) */
7560                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7561                     RExC_parse--; /* rewind to let it be handled later */
7562                     goto parse_flags;
7563                 }
7564                 /*FALLTHROUGH */
7565             case '1': case '2': case '3': case '4': /* (?1) */
7566             case '5': case '6': case '7': case '8': case '9':
7567                 RExC_parse--;
7568               parse_recursion:
7569                 num = atoi(RExC_parse);
7570                 parse_start = RExC_parse - 1; /* MJD */
7571                 if (*RExC_parse == '-')
7572                     RExC_parse++;
7573                 while (isDIGIT(*RExC_parse))
7574                         RExC_parse++;
7575                 if (*RExC_parse!=')')
7576                     vFAIL("Expecting close bracket");
7577
7578               gen_recurse_regop:
7579                 if ( paren == '-' ) {
7580                     /*
7581                     Diagram of capture buffer numbering.
7582                     Top line is the normal capture buffer numbers
7583                     Bottom line is the negative indexing as from
7584                     the X (the (?-2))
7585
7586                     +   1 2    3 4 5 X          6 7
7587                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7588                     -   5 4    3 2 1 X          x x
7589
7590                     */
7591                     num = RExC_npar + num;
7592                     if (num < 1)  {
7593                         RExC_parse++;
7594                         vFAIL("Reference to nonexistent group");
7595                     }
7596                 } else if ( paren == '+' ) {
7597                     num = RExC_npar + num - 1;
7598                 }
7599
7600                 ret = reganode(pRExC_state, GOSUB, num);
7601                 if (!SIZE_ONLY) {
7602                     if (num > (I32)RExC_rx->nparens) {
7603                         RExC_parse++;
7604                         vFAIL("Reference to nonexistent group");
7605                     }
7606                     ARG2L_SET( ret, RExC_recurse_count++);
7607                     RExC_emit++;
7608                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7609                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7610                 } else {
7611                     RExC_size++;
7612                 }
7613                 RExC_seen |= REG_SEEN_RECURSE;
7614                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7615                 Set_Node_Offset(ret, parse_start); /* MJD */
7616
7617                 *flagp |= POSTPONED;
7618                 nextchar(pRExC_state);
7619                 return ret;
7620             } /* named and numeric backreferences */
7621             /* NOT REACHED */
7622
7623             case '?':           /* (??...) */
7624                 is_logical = 1;
7625                 if (*RExC_parse != '{') {
7626                     RExC_parse++;
7627                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7628                     /*NOTREACHED*/
7629                 }
7630                 *flagp |= POSTPONED;
7631                 paren = *RExC_parse++;
7632                 /* FALL THROUGH */
7633             case '{':           /* (?{...}) */
7634             {
7635                 I32 count = 1;
7636                 U32 n = 0;
7637                 char c;
7638                 char *s = RExC_parse;
7639
7640                 RExC_seen_zerolen++;
7641                 RExC_seen |= REG_SEEN_EVAL;
7642                 while (count && (c = *RExC_parse)) {
7643                     if (c == '\\') {
7644                         if (RExC_parse[1])
7645                             RExC_parse++;
7646                     }
7647                     else if (c == '{')
7648                         count++;
7649                     else if (c == '}')
7650                         count--;
7651                     RExC_parse++;
7652                 }
7653                 if (*RExC_parse != ')') {
7654                     RExC_parse = s;
7655                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7656                 }
7657                 if (!SIZE_ONLY) {
7658                     PAD *pad;
7659                     OP_4tree *sop, *rop;
7660                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7661
7662                     ENTER;
7663                     Perl_save_re_context(aTHX);
7664                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7665                     sop->op_private |= OPpREFCOUNTED;
7666                     /* re_dup will OpREFCNT_inc */
7667                     OpREFCNT_set(sop, 1);
7668                     LEAVE;
7669
7670                     n = add_data(pRExC_state, 3, "nop");
7671                     RExC_rxi->data->data[n] = (void*)rop;
7672                     RExC_rxi->data->data[n+1] = (void*)sop;
7673                     RExC_rxi->data->data[n+2] = (void*)pad;
7674                     SvREFCNT_dec(sv);
7675                 }
7676                 else {                                          /* First pass */
7677                     if (PL_reginterp_cnt < ++RExC_seen_evals
7678                         && IN_PERL_RUNTIME)
7679                         /* No compiled RE interpolated, has runtime
7680                            components ===> unsafe.  */
7681                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
7682                     if (PL_tainting && PL_tainted)
7683                         FAIL("Eval-group in insecure regular expression");
7684 #if PERL_VERSION > 8
7685                     if (IN_PERL_COMPILETIME)
7686                         PL_cv_has_eval = 1;
7687 #endif
7688                 }
7689
7690                 nextchar(pRExC_state);
7691                 if (is_logical) {
7692                     ret = reg_node(pRExC_state, LOGICAL);
7693                     if (!SIZE_ONLY)
7694                         ret->flags = 2;
7695                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7696                     /* deal with the length of this later - MJD */
7697                     return ret;
7698                 }
7699                 ret = reganode(pRExC_state, EVAL, n);
7700                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7701                 Set_Node_Offset(ret, parse_start);
7702                 return ret;
7703             }
7704             case '(':           /* (?(?{...})...) and (?(?=...)...) */
7705             {
7706                 int is_define= 0;
7707                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
7708                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7709                         || RExC_parse[1] == '<'
7710                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
7711                         I32 flag;
7712
7713                         ret = reg_node(pRExC_state, LOGICAL);
7714                         if (!SIZE_ONLY)
7715                             ret->flags = 1;
7716                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7717                         goto insert_if;
7718                     }
7719                 }
7720                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
7721                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7722                 {
7723                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
7724                     char *name_start= RExC_parse++;
7725                     U32 num = 0;
7726                     SV *sv_dat=reg_scan_name(pRExC_state,
7727                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7728                     if (RExC_parse == name_start || *RExC_parse != ch)
7729                         vFAIL2("Sequence (?(%c... not terminated",
7730                             (ch == '>' ? '<' : ch));
7731                     RExC_parse++;
7732                     if (!SIZE_ONLY) {
7733                         num = add_data( pRExC_state, 1, "S" );
7734                         RExC_rxi->data->data[num]=(void*)sv_dat;
7735                         SvREFCNT_inc_simple_void(sv_dat);
7736                     }
7737                     ret = reganode(pRExC_state,NGROUPP,num);
7738                     goto insert_if_check_paren;
7739                 }
7740                 else if (RExC_parse[0] == 'D' &&
7741                          RExC_parse[1] == 'E' &&
7742                          RExC_parse[2] == 'F' &&
7743                          RExC_parse[3] == 'I' &&
7744                          RExC_parse[4] == 'N' &&
7745                          RExC_parse[5] == 'E')
7746                 {
7747                     ret = reganode(pRExC_state,DEFINEP,0);
7748                     RExC_parse +=6 ;
7749                     is_define = 1;
7750                     goto insert_if_check_paren;
7751                 }
7752                 else if (RExC_parse[0] == 'R') {
7753                     RExC_parse++;
7754                     parno = 0;
7755                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7756                         parno = atoi(RExC_parse++);
7757                         while (isDIGIT(*RExC_parse))
7758                             RExC_parse++;
7759                     } else if (RExC_parse[0] == '&') {
7760                         SV *sv_dat;
7761                         RExC_parse++;
7762                         sv_dat = reg_scan_name(pRExC_state,
7763                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7764                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7765                     }
7766                     ret = reganode(pRExC_state,INSUBP,parno);
7767                     goto insert_if_check_paren;
7768                 }
7769                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7770                     /* (?(1)...) */
7771                     char c;
7772                     parno = atoi(RExC_parse++);
7773
7774                     while (isDIGIT(*RExC_parse))
7775                         RExC_parse++;
7776                     ret = reganode(pRExC_state, GROUPP, parno);
7777
7778                  insert_if_check_paren:
7779                     if ((c = *nextchar(pRExC_state)) != ')')
7780                         vFAIL("Switch condition not recognized");
7781                   insert_if:
7782                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
7783                     br = regbranch(pRExC_state, &flags, 1,depth+1);
7784                     if (br == NULL)
7785                         br = reganode(pRExC_state, LONGJMP, 0);
7786                     else
7787                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
7788                     c = *nextchar(pRExC_state);
7789                     if (flags&HASWIDTH)
7790                         *flagp |= HASWIDTH;
7791                     if (c == '|') {
7792                         if (is_define)
7793                             vFAIL("(?(DEFINE)....) does not allow branches");
7794                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7795                         regbranch(pRExC_state, &flags, 1,depth+1);
7796                         REGTAIL(pRExC_state, ret, lastbr);
7797                         if (flags&HASWIDTH)
7798                             *flagp |= HASWIDTH;
7799                         c = *nextchar(pRExC_state);
7800                     }
7801                     else
7802                         lastbr = NULL;
7803                     if (c != ')')
7804                         vFAIL("Switch (?(condition)... contains too many branches");
7805                     ender = reg_node(pRExC_state, TAIL);
7806                     REGTAIL(pRExC_state, br, ender);
7807                     if (lastbr) {
7808                         REGTAIL(pRExC_state, lastbr, ender);
7809                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7810                     }
7811                     else
7812                         REGTAIL(pRExC_state, ret, ender);
7813                     RExC_size++; /* XXX WHY do we need this?!!
7814                                     For large programs it seems to be required
7815                                     but I can't figure out why. -- dmq*/
7816                     return ret;
7817                 }
7818                 else {
7819                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7820                 }
7821             }
7822             case 0:
7823                 RExC_parse--; /* for vFAIL to print correctly */
7824                 vFAIL("Sequence (? incomplete");
7825                 break;
7826             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
7827                                        that follow */
7828                 has_use_defaults = TRUE;
7829                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7830                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7831                                                 ? REGEX_UNICODE_CHARSET
7832                                                 : REGEX_DEPENDS_CHARSET);
7833                 goto parse_flags;
7834             default:
7835                 --RExC_parse;
7836                 parse_flags:      /* (?i) */
7837             {
7838                 U32 posflags = 0, negflags = 0;
7839                 U32 *flagsp = &posflags;
7840                 char has_charset_modifier = '\0';
7841                 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7842                                     ? REGEX_UNICODE_CHARSET
7843                                     : REGEX_DEPENDS_CHARSET;
7844
7845                 while (*RExC_parse) {
7846                     /* && strchr("iogcmsx", *RExC_parse) */
7847                     /* (?g), (?gc) and (?o) are useless here
7848                        and must be globally applied -- japhy */
7849                     switch (*RExC_parse) {
7850                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7851                     case LOCALE_PAT_MOD:
7852                         if (has_charset_modifier) {
7853                             goto excess_modifier;
7854                         }
7855                         else if (flagsp == &negflags) {
7856                             goto neg_modifier;
7857                         }
7858                         cs = REGEX_LOCALE_CHARSET;
7859                         has_charset_modifier = LOCALE_PAT_MOD;
7860                         RExC_contains_locale = 1;
7861                         break;
7862                     case UNICODE_PAT_MOD:
7863                         if (has_charset_modifier) {
7864                             goto excess_modifier;
7865                         }
7866                         else if (flagsp == &negflags) {
7867                             goto neg_modifier;
7868                         }
7869                         cs = REGEX_UNICODE_CHARSET;
7870                         has_charset_modifier = UNICODE_PAT_MOD;
7871                         break;
7872                     case ASCII_RESTRICT_PAT_MOD:
7873                         if (flagsp == &negflags) {
7874                             goto neg_modifier;
7875                         }
7876                         if (has_charset_modifier) {
7877                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
7878                                 goto excess_modifier;
7879                             }
7880                             /* Doubled modifier implies more restricted */
7881                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7882                         }
7883                         else {
7884                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
7885                         }
7886                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
7887                         break;
7888                     case DEPENDS_PAT_MOD:
7889                         if (has_use_defaults) {
7890                             goto fail_modifiers;
7891                         }
7892                         else if (flagsp == &negflags) {
7893                             goto neg_modifier;
7894                         }
7895                         else if (has_charset_modifier) {
7896                             goto excess_modifier;
7897                         }
7898
7899                         /* The dual charset means unicode semantics if the
7900                          * pattern (or target, not known until runtime) are
7901                          * utf8, or something in the pattern indicates unicode
7902                          * semantics */
7903                         cs = (RExC_utf8 || RExC_uni_semantics)
7904                              ? REGEX_UNICODE_CHARSET
7905                              : REGEX_DEPENDS_CHARSET;
7906                         has_charset_modifier = DEPENDS_PAT_MOD;
7907                         break;
7908                     excess_modifier:
7909                         RExC_parse++;
7910                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
7911                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
7912                         }
7913                         else if (has_charset_modifier == *(RExC_parse - 1)) {
7914                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
7915                         }
7916                         else {
7917                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
7918                         }
7919                         /*NOTREACHED*/
7920                     neg_modifier:
7921                         RExC_parse++;
7922                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
7923                         /*NOTREACHED*/
7924                     case ONCE_PAT_MOD: /* 'o' */
7925                     case GLOBAL_PAT_MOD: /* 'g' */
7926                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7927                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7928                             if (! (wastedflags & wflagbit) ) {
7929                                 wastedflags |= wflagbit;
7930                                 vWARN5(
7931                                     RExC_parse + 1,
7932                                     "Useless (%s%c) - %suse /%c modifier",
7933                                     flagsp == &negflags ? "?-" : "?",
7934                                     *RExC_parse,
7935                                     flagsp == &negflags ? "don't " : "",
7936                                     *RExC_parse
7937                                 );
7938                             }
7939                         }
7940                         break;
7941
7942                     case CONTINUE_PAT_MOD: /* 'c' */
7943                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7944                             if (! (wastedflags & WASTED_C) ) {
7945                                 wastedflags |= WASTED_GC;
7946                                 vWARN3(
7947                                     RExC_parse + 1,
7948                                     "Useless (%sc) - %suse /gc modifier",
7949                                     flagsp == &negflags ? "?-" : "?",
7950                                     flagsp == &negflags ? "don't " : ""
7951                                 );
7952                             }
7953                         }
7954                         break;
7955                     case KEEPCOPY_PAT_MOD: /* 'p' */
7956                         if (flagsp == &negflags) {
7957                             if (SIZE_ONLY)
7958                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7959                         } else {
7960                             *flagsp |= RXf_PMf_KEEPCOPY;
7961                         }
7962                         break;
7963                     case '-':
7964                         /* A flag is a default iff it is following a minus, so
7965                          * if there is a minus, it means will be trying to
7966                          * re-specify a default which is an error */
7967                         if (has_use_defaults || flagsp == &negflags) {
7968             fail_modifiers:
7969                             RExC_parse++;
7970                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7971                             /*NOTREACHED*/
7972                         }
7973                         flagsp = &negflags;
7974                         wastedflags = 0;  /* reset so (?g-c) warns twice */
7975                         break;
7976                     case ':':
7977                         paren = ':';
7978                         /*FALLTHROUGH*/
7979                     case ')':
7980                         RExC_flags |= posflags;
7981                         RExC_flags &= ~negflags;
7982                         set_regex_charset(&RExC_flags, cs);
7983                         if (paren != ':') {
7984                             oregflags |= posflags;
7985                             oregflags &= ~negflags;
7986                             set_regex_charset(&oregflags, cs);
7987                         }
7988                         nextchar(pRExC_state);
7989                         if (paren != ':') {
7990                             *flagp = TRYAGAIN;
7991                             return NULL;
7992                         } else {
7993                             ret = NULL;
7994                             goto parse_rest;
7995                         }
7996                         /*NOTREACHED*/
7997                     default:
7998                         RExC_parse++;
7999                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8000                         /*NOTREACHED*/
8001                     }
8002                     ++RExC_parse;
8003                 }
8004             }} /* one for the default block, one for the switch */
8005         }
8006         else {                  /* (...) */
8007           capturing_parens:
8008             parno = RExC_npar;
8009             RExC_npar++;
8010
8011             ret = reganode(pRExC_state, OPEN, parno);
8012             if (!SIZE_ONLY ){
8013                 if (!RExC_nestroot)
8014                     RExC_nestroot = parno;
8015                 if (RExC_seen & REG_SEEN_RECURSE
8016                     && !RExC_open_parens[parno-1])
8017                 {
8018                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8019                         "Setting open paren #%"IVdf" to %d\n",
8020                         (IV)parno, REG_NODE_NUM(ret)));
8021                     RExC_open_parens[parno-1]= ret;
8022                 }
8023             }
8024             Set_Node_Length(ret, 1); /* MJD */
8025             Set_Node_Offset(ret, RExC_parse); /* MJD */
8026             is_open = 1;
8027         }
8028     }
8029     else                        /* ! paren */
8030         ret = NULL;
8031
8032    parse_rest:
8033     /* Pick up the branches, linking them together. */
8034     parse_start = RExC_parse;   /* MJD */
8035     br = regbranch(pRExC_state, &flags, 1,depth+1);
8036
8037     /*     branch_len = (paren != 0); */
8038
8039     if (br == NULL)
8040         return(NULL);
8041     if (*RExC_parse == '|') {
8042         if (!SIZE_ONLY && RExC_extralen) {
8043             reginsert(pRExC_state, BRANCHJ, br, depth+1);
8044         }
8045         else {                  /* MJD */
8046             reginsert(pRExC_state, BRANCH, br, depth+1);
8047             Set_Node_Length(br, paren != 0);
8048             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8049         }
8050         have_branch = 1;
8051         if (SIZE_ONLY)
8052             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
8053     }
8054     else if (paren == ':') {
8055         *flagp |= flags&SIMPLE;
8056     }
8057     if (is_open) {                              /* Starts with OPEN. */
8058         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
8059     }
8060     else if (paren != '?')              /* Not Conditional */
8061         ret = br;
8062     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8063     lastbr = br;
8064     while (*RExC_parse == '|') {
8065         if (!SIZE_ONLY && RExC_extralen) {
8066             ender = reganode(pRExC_state, LONGJMP,0);
8067             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8068         }
8069         if (SIZE_ONLY)
8070             RExC_extralen += 2;         /* Account for LONGJMP. */
8071         nextchar(pRExC_state);
8072         if (freeze_paren) {
8073             if (RExC_npar > after_freeze)
8074                 after_freeze = RExC_npar;
8075             RExC_npar = freeze_paren;
8076         }
8077         br = regbranch(pRExC_state, &flags, 0, depth+1);
8078
8079         if (br == NULL)
8080             return(NULL);
8081         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
8082         lastbr = br;
8083         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8084     }
8085
8086     if (have_branch || paren != ':') {
8087         /* Make a closing node, and hook it on the end. */
8088         switch (paren) {
8089         case ':':
8090             ender = reg_node(pRExC_state, TAIL);
8091             break;
8092         case 1:
8093             ender = reganode(pRExC_state, CLOSE, parno);
8094             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8095                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8096                         "Setting close paren #%"IVdf" to %d\n",
8097                         (IV)parno, REG_NODE_NUM(ender)));
8098                 RExC_close_parens[parno-1]= ender;
8099                 if (RExC_nestroot == parno)
8100                     RExC_nestroot = 0;
8101             }
8102             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8103             Set_Node_Length(ender,1); /* MJD */
8104             break;
8105         case '<':
8106         case ',':
8107         case '=':
8108         case '!':
8109             *flagp &= ~HASWIDTH;
8110             /* FALL THROUGH */
8111         case '>':
8112             ender = reg_node(pRExC_state, SUCCEED);
8113             break;
8114         case 0:
8115             ender = reg_node(pRExC_state, END);
8116             if (!SIZE_ONLY) {
8117                 assert(!RExC_opend); /* there can only be one! */
8118                 RExC_opend = ender;
8119             }
8120             break;
8121         }
8122         REGTAIL(pRExC_state, lastbr, ender);
8123
8124         if (have_branch && !SIZE_ONLY) {
8125             if (depth==1)
8126                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8127
8128             /* Hook the tails of the branches to the closing node. */
8129             for (br = ret; br; br = regnext(br)) {
8130                 const U8 op = PL_regkind[OP(br)];
8131                 if (op == BRANCH) {
8132                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8133                 }
8134                 else if (op == BRANCHJ) {
8135                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8136                 }
8137             }
8138         }
8139     }
8140
8141     {
8142         const char *p;
8143         static const char parens[] = "=!<,>";
8144
8145         if (paren && (p = strchr(parens, paren))) {
8146             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8147             int flag = (p - parens) > 1;
8148
8149             if (paren == '>')
8150                 node = SUSPEND, flag = 0;
8151             reginsert(pRExC_state, node,ret, depth+1);
8152             Set_Node_Cur_Length(ret);
8153             Set_Node_Offset(ret, parse_start + 1);
8154             ret->flags = flag;
8155             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8156         }
8157     }
8158
8159     /* Check for proper termination. */
8160     if (paren) {
8161         RExC_flags = oregflags;
8162         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8163             RExC_parse = oregcomp_parse;
8164             vFAIL("Unmatched (");
8165         }
8166     }
8167     else if (!paren && RExC_parse < RExC_end) {
8168         if (*RExC_parse == ')') {
8169             RExC_parse++;
8170             vFAIL("Unmatched )");
8171         }
8172         else
8173             FAIL("Junk on end of regexp");      /* "Can't happen". */
8174         /* NOTREACHED */
8175     }
8176
8177     if (RExC_in_lookbehind) {
8178         RExC_in_lookbehind--;
8179     }
8180     if (after_freeze > RExC_npar)
8181         RExC_npar = after_freeze;
8182     return(ret);
8183 }
8184
8185 /*
8186  - regbranch - one alternative of an | operator
8187  *
8188  * Implements the concatenation operator.
8189  */
8190 STATIC regnode *
8191 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8192 {
8193     dVAR;
8194     register regnode *ret;
8195     register regnode *chain = NULL;
8196     register regnode *latest;
8197     I32 flags = 0, c = 0;
8198     GET_RE_DEBUG_FLAGS_DECL;
8199
8200     PERL_ARGS_ASSERT_REGBRANCH;
8201
8202     DEBUG_PARSE("brnc");
8203
8204     if (first)
8205         ret = NULL;
8206     else {
8207         if (!SIZE_ONLY && RExC_extralen)
8208             ret = reganode(pRExC_state, BRANCHJ,0);
8209         else {
8210             ret = reg_node(pRExC_state, BRANCH);
8211             Set_Node_Length(ret, 1);
8212         }
8213     }
8214
8215     if (!first && SIZE_ONLY)
8216         RExC_extralen += 1;                     /* BRANCHJ */
8217
8218     *flagp = WORST;                     /* Tentatively. */
8219
8220     RExC_parse--;
8221     nextchar(pRExC_state);
8222     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8223         flags &= ~TRYAGAIN;
8224         latest = regpiece(pRExC_state, &flags,depth+1);
8225         if (latest == NULL) {
8226             if (flags & TRYAGAIN)
8227                 continue;
8228             return(NULL);
8229         }
8230         else if (ret == NULL)
8231             ret = latest;
8232         *flagp |= flags&(HASWIDTH|POSTPONED);
8233         if (chain == NULL)      /* First piece. */
8234             *flagp |= flags&SPSTART;
8235         else {
8236             RExC_naughty++;
8237             REGTAIL(pRExC_state, chain, latest);
8238         }
8239         chain = latest;
8240         c++;
8241     }
8242     if (chain == NULL) {        /* Loop ran zero times. */
8243         chain = reg_node(pRExC_state, NOTHING);
8244         if (ret == NULL)
8245             ret = chain;
8246     }
8247     if (c == 1) {
8248         *flagp |= flags&SIMPLE;
8249     }
8250
8251     return ret;
8252 }
8253
8254 /*
8255  - regpiece - something followed by possible [*+?]
8256  *
8257  * Note that the branching code sequences used for ? and the general cases
8258  * of * and + are somewhat optimized:  they use the same NOTHING node as
8259  * both the endmarker for their branch list and the body of the last branch.
8260  * It might seem that this node could be dispensed with entirely, but the
8261  * endmarker role is not redundant.
8262  */
8263 STATIC regnode *
8264 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8265 {
8266     dVAR;
8267     register regnode *ret;
8268     register char op;
8269     register char *next;
8270     I32 flags;
8271     const char * const origparse = RExC_parse;
8272     I32 min;
8273     I32 max = REG_INFTY;
8274 #ifdef RE_TRACK_PATTERN_OFFSETS
8275     char *parse_start;
8276 #endif
8277     const char *maxpos = NULL;
8278     GET_RE_DEBUG_FLAGS_DECL;
8279
8280     PERL_ARGS_ASSERT_REGPIECE;
8281
8282     DEBUG_PARSE("piec");
8283
8284     ret = regatom(pRExC_state, &flags,depth+1);
8285     if (ret == NULL) {
8286         if (flags & TRYAGAIN)
8287             *flagp |= TRYAGAIN;
8288         return(NULL);
8289     }
8290
8291     op = *RExC_parse;
8292
8293     if (op == '{' && regcurly(RExC_parse)) {
8294         maxpos = NULL;
8295 #ifdef RE_TRACK_PATTERN_OFFSETS
8296         parse_start = RExC_parse; /* MJD */
8297 #endif
8298         next = RExC_parse + 1;
8299         while (isDIGIT(*next) || *next == ',') {
8300             if (*next == ',') {
8301                 if (maxpos)
8302                     break;
8303                 else
8304                     maxpos = next;
8305             }
8306             next++;
8307         }
8308         if (*next == '}') {             /* got one */
8309             if (!maxpos)
8310                 maxpos = next;
8311             RExC_parse++;
8312             min = atoi(RExC_parse);
8313             if (*maxpos == ',')
8314                 maxpos++;
8315             else
8316                 maxpos = RExC_parse;
8317             max = atoi(maxpos);
8318             if (!max && *maxpos != '0')
8319                 max = REG_INFTY;                /* meaning "infinity" */
8320             else if (max >= REG_INFTY)
8321                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8322             RExC_parse = next;
8323             nextchar(pRExC_state);
8324
8325         do_curly:
8326             if ((flags&SIMPLE)) {
8327                 RExC_naughty += 2 + RExC_naughty / 2;
8328                 reginsert(pRExC_state, CURLY, ret, depth+1);
8329                 Set_Node_Offset(ret, parse_start+1); /* MJD */
8330                 Set_Node_Cur_Length(ret);
8331             }
8332             else {
8333                 regnode * const w = reg_node(pRExC_state, WHILEM);
8334
8335                 w->flags = 0;
8336                 REGTAIL(pRExC_state, ret, w);
8337                 if (!SIZE_ONLY && RExC_extralen) {
8338                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
8339                     reginsert(pRExC_state, NOTHING,ret, depth+1);
8340                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
8341                 }
8342                 reginsert(pRExC_state, CURLYX,ret, depth+1);
8343                                 /* MJD hk */
8344                 Set_Node_Offset(ret, parse_start+1);
8345                 Set_Node_Length(ret,
8346                                 op == '{' ? (RExC_parse - parse_start) : 1);
8347
8348                 if (!SIZE_ONLY && RExC_extralen)
8349                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
8350                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
8351                 if (SIZE_ONLY)
8352                     RExC_whilem_seen++, RExC_extralen += 3;
8353                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
8354             }
8355             ret->flags = 0;
8356
8357             if (min > 0)
8358                 *flagp = WORST;
8359             if (max > 0)
8360                 *flagp |= HASWIDTH;
8361             if (max < min)
8362                 vFAIL("Can't do {n,m} with n > m");
8363             if (!SIZE_ONLY) {
8364                 ARG1_SET(ret, (U16)min);
8365                 ARG2_SET(ret, (U16)max);
8366             }
8367
8368             goto nest_check;
8369         }
8370     }
8371
8372     if (!ISMULT1(op)) {
8373         *flagp = flags;
8374         return(ret);
8375     }
8376
8377 #if 0                           /* Now runtime fix should be reliable. */
8378
8379     /* if this is reinstated, don't forget to put this back into perldiag:
8380
8381             =item Regexp *+ operand could be empty at {#} in regex m/%s/
8382
8383            (F) The part of the regexp subject to either the * or + quantifier
8384            could match an empty string. The {#} shows in the regular
8385            expression about where the problem was discovered.
8386
8387     */
8388
8389     if (!(flags&HASWIDTH) && op != '?')
8390       vFAIL("Regexp *+ operand could be empty");
8391 #endif
8392
8393 #ifdef RE_TRACK_PATTERN_OFFSETS
8394     parse_start = RExC_parse;
8395 #endif
8396     nextchar(pRExC_state);
8397
8398     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8399
8400     if (op == '*' && (flags&SIMPLE)) {
8401         reginsert(pRExC_state, STAR, ret, depth+1);
8402         ret->flags = 0;
8403         RExC_naughty += 4;
8404     }
8405     else if (op == '*') {
8406         min = 0;
8407         goto do_curly;
8408     }
8409     else if (op == '+' && (flags&SIMPLE)) {
8410         reginsert(pRExC_state, PLUS, ret, depth+1);
8411         ret->flags = 0;
8412         RExC_naughty += 3;
8413     }
8414     else if (op == '+') {
8415         min = 1;
8416         goto do_curly;
8417     }
8418     else if (op == '?') {
8419         min = 0; max = 1;
8420         goto do_curly;
8421     }
8422   nest_check:
8423     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8424         ckWARN3reg(RExC_parse,
8425                    "%.*s matches null string many times",
8426                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8427                    origparse);
8428     }
8429
8430     if (RExC_parse < RExC_end && *RExC_parse == '?') {
8431         nextchar(pRExC_state);
8432         reginsert(pRExC_state, MINMOD, ret, depth+1);
8433         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8434     }
8435 #ifndef REG_ALLOW_MINMOD_SUSPEND
8436     else
8437 #endif
8438     if (RExC_parse < RExC_end && *RExC_parse == '+') {
8439         regnode *ender;
8440         nextchar(pRExC_state);
8441         ender = reg_node(pRExC_state, SUCCEED);
8442         REGTAIL(pRExC_state, ret, ender);
8443         reginsert(pRExC_state, SUSPEND, ret, depth+1);
8444         ret->flags = 0;
8445         ender = reg_node(pRExC_state, TAIL);
8446         REGTAIL(pRExC_state, ret, ender);
8447         /*ret= ender;*/
8448     }
8449
8450     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8451         RExC_parse++;
8452         vFAIL("Nested quantifiers");
8453     }
8454
8455     return(ret);
8456 }
8457
8458
8459 /* reg_namedseq(pRExC_state,UVp, UV depth)
8460
8461    This is expected to be called by a parser routine that has
8462    recognized '\N' and needs to handle the rest. RExC_parse is
8463    expected to point at the first char following the N at the time
8464    of the call.
8465
8466    The \N may be inside (indicated by valuep not being NULL) or outside a
8467    character class.
8468
8469    \N may begin either a named sequence, or if outside a character class, mean
8470    to match a non-newline.  For non single-quoted regexes, the tokenizer has
8471    attempted to decide which, and in the case of a named sequence converted it
8472    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8473    where c1... are the characters in the sequence.  For single-quoted regexes,
8474    the tokenizer passes the \N sequence through unchanged; this code will not
8475    attempt to determine this nor expand those.  The net effect is that if the
8476    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8477    signals that this \N occurrence means to match a non-newline.
8478
8479    Only the \N{U+...} form should occur in a character class, for the same
8480    reason that '.' inside a character class means to just match a period: it
8481    just doesn't make sense.
8482
8483    If valuep is non-null then it is assumed that we are parsing inside
8484    of a charclass definition and the first codepoint in the resolved
8485    string is returned via *valuep and the routine will return NULL.
8486    In this mode if a multichar string is returned from the charnames
8487    handler, a warning will be issued, and only the first char in the
8488    sequence will be examined. If the string returned is zero length
8489    then the value of *valuep is undefined and NON-NULL will
8490    be returned to indicate failure. (This will NOT be a valid pointer
8491    to a regnode.)
8492
8493    If valuep is null then it is assumed that we are parsing normal text and a
8494    new EXACT node is inserted into the program containing the resolved string,
8495    and a pointer to the new node is returned.  But if the string is zero length
8496    a NOTHING node is emitted instead.
8497
8498    On success RExC_parse is set to the char following the endbrace.
8499    Parsing failures will generate a fatal error via vFAIL(...)
8500  */
8501 STATIC regnode *
8502 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8503 {
8504     char * endbrace;    /* '}' following the name */
8505     regnode *ret = NULL;
8506     char* p;
8507
8508     GET_RE_DEBUG_FLAGS_DECL;
8509
8510     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8511
8512     GET_RE_DEBUG_FLAGS;
8513
8514     /* The [^\n] meaning of \N ignores spaces and comments under the /x
8515      * modifier.  The other meaning does not */
8516     p = (RExC_flags & RXf_PMf_EXTENDED)
8517         ? regwhite( pRExC_state, RExC_parse )
8518         : RExC_parse;
8519
8520     /* Disambiguate between \N meaning a named character versus \N meaning
8521      * [^\n].  The former is assumed when it can't be the latter. */
8522     if (*p != '{' || regcurly(p)) {
8523         RExC_parse = p;
8524         if (valuep) {
8525             /* no bare \N in a charclass */
8526             vFAIL("\\N in a character class must be a named character: \\N{...}");
8527         }
8528         nextchar(pRExC_state);
8529         ret = reg_node(pRExC_state, REG_ANY);
8530         *flagp |= HASWIDTH|SIMPLE;
8531         RExC_naughty++;
8532         RExC_parse--;
8533         Set_Node_Length(ret, 1); /* MJD */
8534         return ret;
8535     }
8536
8537     /* Here, we have decided it should be a named sequence */
8538
8539     /* The test above made sure that the next real character is a '{', but
8540      * under the /x modifier, it could be separated by space (or a comment and
8541      * \n) and this is not allowed (for consistency with \x{...} and the
8542      * tokenizer handling of \N{NAME}). */
8543     if (*RExC_parse != '{') {
8544         vFAIL("Missing braces on \\N{}");
8545     }
8546
8547     RExC_parse++;       /* Skip past the '{' */
8548
8549     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8550         || ! (endbrace == RExC_parse            /* nothing between the {} */
8551               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
8552                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8553     {
8554         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
8555         vFAIL("\\N{NAME} must be resolved by the lexer");
8556     }
8557
8558     if (endbrace == RExC_parse) {   /* empty: \N{} */
8559         if (! valuep) {
8560             RExC_parse = endbrace + 1;
8561             return reg_node(pRExC_state,NOTHING);
8562         }
8563
8564         if (SIZE_ONLY) {
8565             ckWARNreg(RExC_parse,
8566                     "Ignoring zero length \\N{} in character class"
8567             );
8568             RExC_parse = endbrace + 1;
8569         }
8570         *valuep = 0;
8571         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8572     }
8573
8574     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
8575     RExC_parse += 2;    /* Skip past the 'U+' */
8576
8577     if (valuep) {   /* In a bracketed char class */
8578         /* We only pay attention to the first char of
8579         multichar strings being returned. I kinda wonder
8580         if this makes sense as it does change the behaviour
8581         from earlier versions, OTOH that behaviour was broken
8582         as well. XXX Solution is to recharacterize as
8583         [rest-of-class]|multi1|multi2... */
8584
8585         STRLEN length_of_hex;
8586         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8587             | PERL_SCAN_DISALLOW_PREFIX
8588             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8589
8590         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8591         if (endchar < endbrace) {
8592             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8593         }
8594
8595         length_of_hex = (STRLEN)(endchar - RExC_parse);
8596         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8597
8598         /* The tokenizer should have guaranteed validity, but it's possible to
8599          * bypass it by using single quoting, so check */
8600         if (length_of_hex == 0
8601             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8602         {
8603             RExC_parse += length_of_hex;        /* Includes all the valid */
8604             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
8605                             ? UTF8SKIP(RExC_parse)
8606                             : 1;
8607             /* Guard against malformed utf8 */
8608             if (RExC_parse >= endchar) RExC_parse = endchar;
8609             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8610         }
8611
8612         RExC_parse = endbrace + 1;
8613         if (endchar == endbrace) return NULL;
8614
8615         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
8616     }
8617     else {      /* Not a char class */
8618
8619         /* What is done here is to convert this to a sub-pattern of the form
8620          * (?:\x{char1}\x{char2}...)
8621          * and then call reg recursively.  That way, it retains its atomicness,
8622          * while not having to worry about special handling that some code
8623          * points may have.  toke.c has converted the original Unicode values
8624          * to native, so that we can just pass on the hex values unchanged.  We
8625          * do have to set a flag to keep recoding from happening in the
8626          * recursion */
8627
8628         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8629         STRLEN len;
8630         char *endchar;      /* Points to '.' or '}' ending cur char in the input
8631                                stream */
8632         char *orig_end = RExC_end;
8633
8634         while (RExC_parse < endbrace) {
8635
8636             /* Code points are separated by dots.  If none, there is only one
8637              * code point, and is terminated by the brace */
8638             endchar = RExC_parse + strcspn(RExC_parse, ".}");
8639
8640             /* Convert to notation the rest of the code understands */
8641             sv_catpv(substitute_parse, "\\x{");
8642             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8643             sv_catpv(substitute_parse, "}");
8644
8645             /* Point to the beginning of the next character in the sequence. */
8646             RExC_parse = endchar + 1;
8647         }
8648         sv_catpv(substitute_parse, ")");
8649
8650         RExC_parse = SvPV(substitute_parse, len);
8651
8652         /* Don't allow empty number */
8653         if (len < 8) {
8654             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8655         }
8656         RExC_end = RExC_parse + len;
8657
8658         /* The values are Unicode, and therefore not subject to recoding */
8659         RExC_override_recoding = 1;
8660
8661         ret = reg(pRExC_state, 1, flagp, depth+1);
8662
8663         RExC_parse = endbrace;
8664         RExC_end = orig_end;
8665         RExC_override_recoding = 0;
8666
8667         nextchar(pRExC_state);
8668     }
8669
8670     return ret;
8671 }
8672
8673
8674 /*
8675  * reg_recode
8676  *
8677  * It returns the code point in utf8 for the value in *encp.
8678  *    value: a code value in the source encoding
8679  *    encp:  a pointer to an Encode object
8680  *
8681  * If the result from Encode is not a single character,
8682  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8683  */
8684 STATIC UV
8685 S_reg_recode(pTHX_ const char value, SV **encp)
8686 {
8687     STRLEN numlen = 1;
8688     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
8689     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
8690     const STRLEN newlen = SvCUR(sv);
8691     UV uv = UNICODE_REPLACEMENT;
8692
8693     PERL_ARGS_ASSERT_REG_RECODE;
8694
8695     if (newlen)
8696         uv = SvUTF8(sv)
8697              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8698              : *(U8*)s;
8699
8700     if (!newlen || numlen != newlen) {
8701         uv = UNICODE_REPLACEMENT;
8702         *encp = NULL;
8703     }
8704     return uv;
8705 }
8706
8707
8708 /*
8709  - regatom - the lowest level
8710
8711    Try to identify anything special at the start of the pattern. If there
8712    is, then handle it as required. This may involve generating a single regop,
8713    such as for an assertion; or it may involve recursing, such as to
8714    handle a () structure.
8715
8716    If the string doesn't start with something special then we gobble up
8717    as much literal text as we can.
8718
8719    Once we have been able to handle whatever type of thing started the
8720    sequence, we return.
8721
8722    Note: we have to be careful with escapes, as they can be both literal
8723    and special, and in the case of \10 and friends can either, depending
8724    on context. Specifically there are two separate switches for handling
8725    escape sequences, with the one for handling literal escapes requiring
8726    a dummy entry for all of the special escapes that are actually handled
8727    by the other.
8728 */
8729
8730 STATIC regnode *
8731 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8732 {
8733     dVAR;
8734     register regnode *ret = NULL;
8735     I32 flags;
8736     char *parse_start = RExC_parse;
8737     U8 op;
8738     GET_RE_DEBUG_FLAGS_DECL;
8739     DEBUG_PARSE("atom");
8740     *flagp = WORST;             /* Tentatively. */
8741
8742     PERL_ARGS_ASSERT_REGATOM;
8743
8744 tryagain:
8745     switch ((U8)*RExC_parse) {
8746     case '^':
8747         RExC_seen_zerolen++;
8748         nextchar(pRExC_state);
8749         if (RExC_flags & RXf_PMf_MULTILINE)
8750             ret = reg_node(pRExC_state, MBOL);
8751         else if (RExC_flags & RXf_PMf_SINGLELINE)
8752             ret = reg_node(pRExC_state, SBOL);
8753         else
8754             ret = reg_node(pRExC_state, BOL);
8755         Set_Node_Length(ret, 1); /* MJD */
8756         break;
8757     case '$':
8758         nextchar(pRExC_state);
8759         if (*RExC_parse)
8760             RExC_seen_zerolen++;
8761         if (RExC_flags & RXf_PMf_MULTILINE)
8762             ret = reg_node(pRExC_state, MEOL);
8763         else if (RExC_flags & RXf_PMf_SINGLELINE)
8764             ret = reg_node(pRExC_state, SEOL);
8765         else
8766             ret = reg_node(pRExC_state, EOL);
8767         Set_Node_Length(ret, 1); /* MJD */
8768         break;
8769     case '.':
8770         nextchar(pRExC_state);
8771         if (RExC_flags & RXf_PMf_SINGLELINE)
8772             ret = reg_node(pRExC_state, SANY);
8773         else
8774             ret = reg_node(pRExC_state, REG_ANY);
8775         *flagp |= HASWIDTH|SIMPLE;
8776         RExC_naughty++;
8777         Set_Node_Length(ret, 1); /* MJD */
8778         break;
8779     case '[':
8780     {
8781         char * const oregcomp_parse = ++RExC_parse;
8782         ret = regclass(pRExC_state,depth+1);
8783         if (*RExC_parse != ']') {
8784             RExC_parse = oregcomp_parse;
8785             vFAIL("Unmatched [");
8786         }
8787         nextchar(pRExC_state);
8788         *flagp |= HASWIDTH|SIMPLE;
8789         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8790         break;
8791     }
8792     case '(':
8793         nextchar(pRExC_state);
8794         ret = reg(pRExC_state, 1, &flags,depth+1);
8795         if (ret == NULL) {
8796                 if (flags & TRYAGAIN) {
8797                     if (RExC_parse == RExC_end) {
8798                          /* Make parent create an empty node if needed. */
8799                         *flagp |= TRYAGAIN;
8800                         return(NULL);
8801                     }
8802                     goto tryagain;
8803                 }
8804                 return(NULL);
8805         }
8806         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8807         break;
8808     case '|':
8809     case ')':
8810         if (flags & TRYAGAIN) {
8811             *flagp |= TRYAGAIN;
8812             return NULL;
8813         }
8814         vFAIL("Internal urp");
8815                                 /* Supposed to be caught earlier. */
8816         break;
8817     case '{':
8818         if (!regcurly(RExC_parse)) {
8819             RExC_parse++;
8820             goto defchar;
8821         }
8822         /* FALL THROUGH */
8823     case '?':
8824     case '+':
8825     case '*':
8826         RExC_parse++;
8827         vFAIL("Quantifier follows nothing");
8828         break;
8829     case '\\':
8830         /* Special Escapes
8831
8832            This switch handles escape sequences that resolve to some kind
8833            of special regop and not to literal text. Escape sequnces that
8834            resolve to literal text are handled below in the switch marked
8835            "Literal Escapes".
8836
8837            Every entry in this switch *must* have a corresponding entry
8838            in the literal escape switch. However, the opposite is not
8839            required, as the default for this switch is to jump to the
8840            literal text handling code.
8841         */
8842         switch ((U8)*++RExC_parse) {
8843         /* Special Escapes */
8844         case 'A':
8845             RExC_seen_zerolen++;
8846             ret = reg_node(pRExC_state, SBOL);
8847             *flagp |= SIMPLE;
8848             goto finish_meta_pat;
8849         case 'G':
8850             ret = reg_node(pRExC_state, GPOS);
8851             RExC_seen |= REG_SEEN_GPOS;
8852             *flagp |= SIMPLE;
8853             goto finish_meta_pat;
8854         case 'K':
8855             RExC_seen_zerolen++;
8856             ret = reg_node(pRExC_state, KEEPS);
8857             *flagp |= SIMPLE;
8858             /* XXX:dmq : disabling in-place substitution seems to
8859              * be necessary here to avoid cases of memory corruption, as
8860              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8861              */
8862             RExC_seen |= REG_SEEN_LOOKBEHIND;
8863             goto finish_meta_pat;
8864         case 'Z':
8865             ret = reg_node(pRExC_state, SEOL);
8866             *flagp |= SIMPLE;
8867             RExC_seen_zerolen++;                /* Do not optimize RE away */
8868             goto finish_meta_pat;
8869         case 'z':
8870             ret = reg_node(pRExC_state, EOS);
8871             *flagp |= SIMPLE;
8872             RExC_seen_zerolen++;                /* Do not optimize RE away */
8873             goto finish_meta_pat;
8874         case 'C':
8875             ret = reg_node(pRExC_state, CANY);
8876             RExC_seen |= REG_SEEN_CANY;
8877             *flagp |= HASWIDTH|SIMPLE;
8878             goto finish_meta_pat;
8879         case 'X':
8880             ret = reg_node(pRExC_state, CLUMP);
8881             *flagp |= HASWIDTH;
8882             goto finish_meta_pat;
8883         case 'w':
8884             switch (get_regex_charset(RExC_flags)) {
8885                 case REGEX_LOCALE_CHARSET:
8886                     op = ALNUML;
8887                     break;
8888                 case REGEX_UNICODE_CHARSET:
8889                     op = ALNUMU;
8890                     break;
8891                 case REGEX_ASCII_RESTRICTED_CHARSET:
8892                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8893                     op = ALNUMA;
8894                     break;
8895                 case REGEX_DEPENDS_CHARSET:
8896                     op = ALNUM;
8897                     break;
8898                 default:
8899                     goto bad_charset;
8900             }
8901             ret = reg_node(pRExC_state, op);
8902             *flagp |= HASWIDTH|SIMPLE;
8903             goto finish_meta_pat;
8904         case 'W':
8905             switch (get_regex_charset(RExC_flags)) {
8906                 case REGEX_LOCALE_CHARSET:
8907                     op = NALNUML;
8908                     break;
8909                 case REGEX_UNICODE_CHARSET:
8910                     op = NALNUMU;
8911                     break;
8912                 case REGEX_ASCII_RESTRICTED_CHARSET:
8913                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8914                     op = NALNUMA;
8915                     break;
8916                 case REGEX_DEPENDS_CHARSET:
8917                     op = NALNUM;
8918                     break;
8919                 default:
8920                     goto bad_charset;
8921             }
8922             ret = reg_node(pRExC_state, op);
8923             *flagp |= HASWIDTH|SIMPLE;
8924             goto finish_meta_pat;
8925         case 'b':
8926             RExC_seen_zerolen++;
8927             RExC_seen |= REG_SEEN_LOOKBEHIND;
8928             switch (get_regex_charset(RExC_flags)) {
8929                 case REGEX_LOCALE_CHARSET:
8930                     op = BOUNDL;
8931                     break;
8932                 case REGEX_UNICODE_CHARSET:
8933                     op = BOUNDU;
8934                     break;
8935                 case REGEX_ASCII_RESTRICTED_CHARSET:
8936                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8937                     op = BOUNDA;
8938                     break;
8939                 case REGEX_DEPENDS_CHARSET:
8940                     op = BOUND;
8941                     break;
8942                 default:
8943                     goto bad_charset;
8944             }
8945             ret = reg_node(pRExC_state, op);
8946             FLAGS(ret) = get_regex_charset(RExC_flags);
8947             *flagp |= SIMPLE;
8948             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8949                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8950             }
8951             goto finish_meta_pat;
8952         case 'B':
8953             RExC_seen_zerolen++;
8954             RExC_seen |= REG_SEEN_LOOKBEHIND;
8955             switch (get_regex_charset(RExC_flags)) {
8956                 case REGEX_LOCALE_CHARSET:
8957                     op = NBOUNDL;
8958                     break;
8959                 case REGEX_UNICODE_CHARSET:
8960                     op = NBOUNDU;
8961                     break;
8962                 case REGEX_ASCII_RESTRICTED_CHARSET:
8963                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8964                     op = NBOUNDA;
8965                     break;
8966                 case REGEX_DEPENDS_CHARSET:
8967                     op = NBOUND;
8968                     break;
8969                 default:
8970                     goto bad_charset;
8971             }
8972             ret = reg_node(pRExC_state, op);
8973             FLAGS(ret) = get_regex_charset(RExC_flags);
8974             *flagp |= SIMPLE;
8975             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8976                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8977             }
8978             goto finish_meta_pat;
8979         case 's':
8980             switch (get_regex_charset(RExC_flags)) {
8981                 case REGEX_LOCALE_CHARSET:
8982                     op = SPACEL;
8983                     break;
8984                 case REGEX_UNICODE_CHARSET:
8985                     op = SPACEU;
8986                     break;
8987                 case REGEX_ASCII_RESTRICTED_CHARSET:
8988                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8989                     op = SPACEA;
8990                     break;
8991                 case REGEX_DEPENDS_CHARSET:
8992                     op = SPACE;
8993                     break;
8994                 default:
8995                     goto bad_charset;
8996             }
8997             ret = reg_node(pRExC_state, op);
8998             *flagp |= HASWIDTH|SIMPLE;
8999             goto finish_meta_pat;
9000         case 'S':
9001             switch (get_regex_charset(RExC_flags)) {
9002                 case REGEX_LOCALE_CHARSET:
9003                     op = NSPACEL;
9004                     break;
9005                 case REGEX_UNICODE_CHARSET:
9006                     op = NSPACEU;
9007                     break;
9008                 case REGEX_ASCII_RESTRICTED_CHARSET:
9009                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9010                     op = NSPACEA;
9011                     break;
9012                 case REGEX_DEPENDS_CHARSET:
9013                     op = NSPACE;
9014                     break;
9015                 default:
9016                     goto bad_charset;
9017             }
9018             ret = reg_node(pRExC_state, op);
9019             *flagp |= HASWIDTH|SIMPLE;
9020             goto finish_meta_pat;
9021         case 'd':
9022             switch (get_regex_charset(RExC_flags)) {
9023                 case REGEX_LOCALE_CHARSET:
9024                     op = DIGITL;
9025                     break;
9026                 case REGEX_ASCII_RESTRICTED_CHARSET:
9027                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9028                     op = DIGITA;
9029                     break;
9030                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9031                 case REGEX_UNICODE_CHARSET:
9032                     op = DIGIT;
9033                     break;
9034                 default:
9035                     goto bad_charset;
9036             }
9037             ret = reg_node(pRExC_state, op);
9038             *flagp |= HASWIDTH|SIMPLE;
9039             goto finish_meta_pat;
9040         case 'D':
9041             switch (get_regex_charset(RExC_flags)) {
9042                 case REGEX_LOCALE_CHARSET:
9043                     op = NDIGITL;
9044                     break;
9045                 case REGEX_ASCII_RESTRICTED_CHARSET:
9046                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9047                     op = NDIGITA;
9048                     break;
9049                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9050                 case REGEX_UNICODE_CHARSET:
9051                     op = NDIGIT;
9052                     break;
9053                 default:
9054                     goto bad_charset;
9055             }
9056             ret = reg_node(pRExC_state, op);
9057             *flagp |= HASWIDTH|SIMPLE;
9058             goto finish_meta_pat;
9059         case 'R':
9060             ret = reg_node(pRExC_state, LNBREAK);
9061             *flagp |= HASWIDTH|SIMPLE;
9062             goto finish_meta_pat;
9063         case 'h':
9064             ret = reg_node(pRExC_state, HORIZWS);
9065             *flagp |= HASWIDTH|SIMPLE;
9066             goto finish_meta_pat;
9067         case 'H':
9068             ret = reg_node(pRExC_state, NHORIZWS);
9069             *flagp |= HASWIDTH|SIMPLE;
9070             goto finish_meta_pat;
9071         case 'v':
9072             ret = reg_node(pRExC_state, VERTWS);
9073             *flagp |= HASWIDTH|SIMPLE;
9074             goto finish_meta_pat;
9075         case 'V':
9076             ret = reg_node(pRExC_state, NVERTWS);
9077             *flagp |= HASWIDTH|SIMPLE;
9078          finish_meta_pat:
9079             nextchar(pRExC_state);
9080             Set_Node_Length(ret, 2); /* MJD */
9081             break;
9082         case 'p':
9083         case 'P':
9084             {
9085                 char* const oldregxend = RExC_end;
9086 #ifdef DEBUGGING
9087                 char* parse_start = RExC_parse - 2;
9088 #endif
9089
9090                 if (RExC_parse[1] == '{') {
9091                   /* a lovely hack--pretend we saw [\pX] instead */
9092                     RExC_end = strchr(RExC_parse, '}');
9093                     if (!RExC_end) {
9094                         const U8 c = (U8)*RExC_parse;
9095                         RExC_parse += 2;
9096                         RExC_end = oldregxend;
9097                         vFAIL2("Missing right brace on \\%c{}", c);
9098                     }
9099                     RExC_end++;
9100                 }
9101                 else {
9102                     RExC_end = RExC_parse + 2;
9103                     if (RExC_end > oldregxend)
9104                         RExC_end = oldregxend;
9105                 }
9106                 RExC_parse--;
9107
9108                 ret = regclass(pRExC_state,depth+1);
9109
9110                 RExC_end = oldregxend;
9111                 RExC_parse--;
9112
9113                 Set_Node_Offset(ret, parse_start + 2);
9114                 Set_Node_Cur_Length(ret);
9115                 nextchar(pRExC_state);
9116                 *flagp |= HASWIDTH|SIMPLE;
9117             }
9118             break;
9119         case 'N':
9120             /* Handle \N and \N{NAME} here and not below because it can be
9121             multicharacter. join_exact() will join them up later on.
9122             Also this makes sure that things like /\N{BLAH}+/ and
9123             \N{BLAH} being multi char Just Happen. dmq*/
9124             ++RExC_parse;
9125             ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9126             break;
9127         case 'k':    /* Handle \k<NAME> and \k'NAME' */
9128         parse_named_seq:
9129         {
9130             char ch= RExC_parse[1];
9131             if (ch != '<' && ch != '\'' && ch != '{') {
9132                 RExC_parse++;
9133                 vFAIL2("Sequence %.2s... not terminated",parse_start);
9134             } else {
9135                 /* this pretty much dupes the code for (?P=...) in reg(), if
9136                    you change this make sure you change that */
9137                 char* name_start = (RExC_parse += 2);
9138                 U32 num = 0;
9139                 SV *sv_dat = reg_scan_name(pRExC_state,
9140                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9141                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9142                 if (RExC_parse == name_start || *RExC_parse != ch)
9143                     vFAIL2("Sequence %.3s... not terminated",parse_start);
9144
9145                 if (!SIZE_ONLY) {
9146                     num = add_data( pRExC_state, 1, "S" );
9147                     RExC_rxi->data->data[num]=(void*)sv_dat;
9148                     SvREFCNT_inc_simple_void(sv_dat);
9149                 }
9150
9151                 RExC_sawback = 1;
9152                 ret = reganode(pRExC_state,
9153                                ((! FOLD)
9154                                  ? NREF
9155                                  : (MORE_ASCII_RESTRICTED)
9156                                    ? NREFFA
9157                                    : (AT_LEAST_UNI_SEMANTICS)
9158                                      ? NREFFU
9159                                      : (LOC)
9160                                        ? NREFFL
9161                                        : NREFF),
9162                                 num);
9163                 *flagp |= HASWIDTH;
9164
9165                 /* override incorrect value set in reganode MJD */
9166                 Set_Node_Offset(ret, parse_start+1);
9167                 Set_Node_Cur_Length(ret); /* MJD */
9168                 nextchar(pRExC_state);
9169
9170             }
9171             break;
9172         }
9173         case 'g':
9174         case '1': case '2': case '3': case '4':
9175         case '5': case '6': case '7': case '8': case '9':
9176             {
9177                 I32 num;
9178                 bool isg = *RExC_parse == 'g';
9179                 bool isrel = 0;
9180                 bool hasbrace = 0;
9181                 if (isg) {
9182                     RExC_parse++;
9183                     if (*RExC_parse == '{') {
9184                         RExC_parse++;
9185                         hasbrace = 1;
9186                     }
9187                     if (*RExC_parse == '-') {
9188                         RExC_parse++;
9189                         isrel = 1;
9190                     }
9191                     if (hasbrace && !isDIGIT(*RExC_parse)) {
9192                         if (isrel) RExC_parse--;
9193                         RExC_parse -= 2;
9194                         goto parse_named_seq;
9195                 }   }
9196                 num = atoi(RExC_parse);
9197                 if (isg && num == 0)
9198                     vFAIL("Reference to invalid group 0");
9199                 if (isrel) {
9200                     num = RExC_npar - num;
9201                     if (num < 1)
9202                         vFAIL("Reference to nonexistent or unclosed group");
9203                 }
9204                 if (!isg && num > 9 && num >= RExC_npar)
9205                     goto defchar;
9206                 else {
9207                     char * const parse_start = RExC_parse - 1; /* MJD */
9208                     while (isDIGIT(*RExC_parse))
9209                         RExC_parse++;
9210                     if (parse_start == RExC_parse - 1)
9211                         vFAIL("Unterminated \\g... pattern");
9212                     if (hasbrace) {
9213                         if (*RExC_parse != '}')
9214                             vFAIL("Unterminated \\g{...} pattern");
9215                         RExC_parse++;
9216                     }
9217                     if (!SIZE_ONLY) {
9218                         if (num > (I32)RExC_rx->nparens)
9219                             vFAIL("Reference to nonexistent group");
9220                     }
9221                     RExC_sawback = 1;
9222                     ret = reganode(pRExC_state,
9223                                    ((! FOLD)
9224                                      ? REF
9225                                      : (MORE_ASCII_RESTRICTED)
9226                                        ? REFFA
9227                                        : (AT_LEAST_UNI_SEMANTICS)
9228                                          ? REFFU
9229                                          : (LOC)
9230                                            ? REFFL
9231                                            : REFF),
9232                                     num);
9233                     *flagp |= HASWIDTH;
9234
9235                     /* override incorrect value set in reganode MJD */
9236                     Set_Node_Offset(ret, parse_start+1);
9237                     Set_Node_Cur_Length(ret); /* MJD */
9238                     RExC_parse--;
9239                     nextchar(pRExC_state);
9240                 }
9241             }
9242             break;
9243         case '\0':
9244             if (RExC_parse >= RExC_end)
9245                 FAIL("Trailing \\");
9246             /* FALL THROUGH */
9247         default:
9248             /* Do not generate "unrecognized" warnings here, we fall
9249                back into the quick-grab loop below */
9250             parse_start--;
9251             goto defchar;
9252         }
9253         break;
9254
9255     case '#':
9256         if (RExC_flags & RXf_PMf_EXTENDED) {
9257             if ( reg_skipcomment( pRExC_state ) )
9258                 goto tryagain;
9259         }
9260         /* FALL THROUGH */
9261
9262     default:
9263
9264             parse_start = RExC_parse - 1;
9265
9266             RExC_parse++;
9267
9268         defchar: {
9269             register STRLEN len;
9270             register UV ender;
9271             register char *p;
9272             char *s;
9273             STRLEN foldlen;
9274             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9275             regnode * orig_emit;
9276             U8 node_type;
9277
9278             /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
9279              * it is folded to 'ss' even if not utf8 */
9280             bool is_exactfu_sharp_s;
9281
9282             ender = 0;
9283             orig_emit = RExC_emit; /* Save the original output node position in
9284                                       case we need to output a different node
9285                                       type */
9286             node_type = ((! FOLD) ? EXACT
9287                         : (LOC)
9288                           ? EXACTFL
9289                           : (MORE_ASCII_RESTRICTED)
9290                             ? EXACTFA
9291                             : (AT_LEAST_UNI_SEMANTICS)
9292                               ? EXACTFU
9293                               : EXACTF);
9294             ret = reg_node(pRExC_state, node_type);
9295             s = STRING(ret);
9296
9297             /* XXX The node can hold up to 255 bytes, yet this only goes to
9298              * 127.  I (khw) do not know why.  Keeping it somewhat less than
9299              * 255 allows us to not have to worry about overflow due to
9300              * converting to utf8 and fold expansion, but that value is
9301              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
9302              * split up by this limit into a single one using the real max of
9303              * 255.  Even at 127, this breaks under rare circumstances.  If
9304              * folding, we do not want to split a node at a character that is a
9305              * non-final in a multi-char fold, as an input string could just
9306              * happen to want to match across the node boundary.  The join
9307              * would solve that problem if the join actually happens.  But a
9308              * series of more than two nodes in a row each of 127 would cause
9309              * the first join to succeed to get to 254, but then there wouldn't
9310              * be room for the next one, which could at be one of those split
9311              * multi-char folds.  I don't know of any fool-proof solution.  One
9312              * could back off to end with only a code point that isn't such a
9313              * non-final, but it is possible for there not to be any in the
9314              * entire node. */
9315             for (len = 0, p = RExC_parse - 1;
9316                  len < 127 && p < RExC_end;
9317                  len++)
9318             {
9319                 char * const oldp = p;
9320
9321                 if (RExC_flags & RXf_PMf_EXTENDED)
9322                     p = regwhite( pRExC_state, p );
9323                 switch ((U8)*p) {
9324                 case '^':
9325                 case '$':
9326                 case '.':
9327                 case '[':
9328                 case '(':
9329                 case ')':
9330                 case '|':
9331                     goto loopdone;
9332                 case '\\':
9333                     /* Literal Escapes Switch
9334
9335                        This switch is meant to handle escape sequences that
9336                        resolve to a literal character.
9337
9338                        Every escape sequence that represents something
9339                        else, like an assertion or a char class, is handled
9340                        in the switch marked 'Special Escapes' above in this
9341                        routine, but also has an entry here as anything that
9342                        isn't explicitly mentioned here will be treated as
9343                        an unescaped equivalent literal.
9344                     */
9345
9346                     switch ((U8)*++p) {
9347                     /* These are all the special escapes. */
9348                     case 'A':             /* Start assertion */
9349                     case 'b': case 'B':   /* Word-boundary assertion*/
9350                     case 'C':             /* Single char !DANGEROUS! */
9351                     case 'd': case 'D':   /* digit class */
9352                     case 'g': case 'G':   /* generic-backref, pos assertion */
9353                     case 'h': case 'H':   /* HORIZWS */
9354                     case 'k': case 'K':   /* named backref, keep marker */
9355                     case 'N':             /* named char sequence */
9356                     case 'p': case 'P':   /* Unicode property */
9357                               case 'R':   /* LNBREAK */
9358                     case 's': case 'S':   /* space class */
9359                     case 'v': case 'V':   /* VERTWS */
9360                     case 'w': case 'W':   /* word class */
9361                     case 'X':             /* eXtended Unicode "combining character sequence" */
9362                     case 'z': case 'Z':   /* End of line/string assertion */
9363                         --p;
9364                         goto loopdone;
9365
9366                     /* Anything after here is an escape that resolves to a
9367                        literal. (Except digits, which may or may not)
9368                      */
9369                     case 'n':
9370                         ender = '\n';
9371                         p++;
9372                         break;
9373                     case 'r':
9374                         ender = '\r';
9375                         p++;
9376                         break;
9377                     case 't':
9378                         ender = '\t';
9379                         p++;
9380                         break;
9381                     case 'f':
9382                         ender = '\f';
9383                         p++;
9384                         break;
9385                     case 'e':
9386                           ender = ASCII_TO_NATIVE('\033');
9387                         p++;
9388                         break;
9389                     case 'a':
9390                           ender = ASCII_TO_NATIVE('\007');
9391                         p++;
9392                         break;
9393                     case 'o':
9394                         {
9395                             STRLEN brace_len = len;
9396                             UV result;
9397                             const char* error_msg;
9398
9399                             bool valid = grok_bslash_o(p,
9400                                                        &result,
9401                                                        &brace_len,
9402                                                        &error_msg,
9403                                                        1);
9404                             p += brace_len;
9405                             if (! valid) {
9406                                 RExC_parse = p; /* going to die anyway; point
9407                                                    to exact spot of failure */
9408                                 vFAIL(error_msg);
9409                             }
9410                             else
9411                             {
9412                                 ender = result;
9413                             }
9414                             if (PL_encoding && ender < 0x100) {
9415                                 goto recode_encoding;
9416                             }
9417                             if (ender > 0xff) {
9418                                 REQUIRE_UTF8;
9419                             }
9420                             break;
9421                         }
9422                     case 'x':
9423                         if (*++p == '{') {
9424                             char* const e = strchr(p, '}');
9425
9426                             if (!e) {
9427                                 RExC_parse = p + 1;
9428                                 vFAIL("Missing right brace on \\x{}");
9429                             }
9430                             else {
9431                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9432                                     | PERL_SCAN_DISALLOW_PREFIX;
9433                                 STRLEN numlen = e - p - 1;
9434                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9435                                 if (ender > 0xff)
9436                                     REQUIRE_UTF8;
9437                                 p = e + 1;
9438                             }
9439                         }
9440                         else {
9441                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9442                             STRLEN numlen = 2;
9443                             ender = grok_hex(p, &numlen, &flags, NULL);
9444                             p += numlen;
9445                         }
9446                         if (PL_encoding && ender < 0x100)
9447                             goto recode_encoding;
9448                         break;
9449                     case 'c':
9450                         p++;
9451                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9452                         break;
9453                     case '0': case '1': case '2': case '3':case '4':
9454                     case '5': case '6': case '7': case '8':case '9':
9455                         if (*p == '0' ||
9456                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9457                         {
9458                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9459                             STRLEN numlen = 3;
9460                             ender = grok_oct(p, &numlen, &flags, NULL);
9461                             if (ender > 0xff) {
9462                                 REQUIRE_UTF8;
9463                             }
9464                             p += numlen;
9465                         }
9466                         else {
9467                             --p;
9468                             goto loopdone;
9469                         }
9470                         if (PL_encoding && ender < 0x100)
9471                             goto recode_encoding;
9472                         break;
9473                     recode_encoding:
9474                         if (! RExC_override_recoding) {
9475                             SV* enc = PL_encoding;
9476                             ender = reg_recode((const char)(U8)ender, &enc);
9477                             if (!enc && SIZE_ONLY)
9478                                 ckWARNreg(p, "Invalid escape in the specified encoding");
9479                             REQUIRE_UTF8;
9480                         }
9481                         break;
9482                     case '\0':
9483                         if (p >= RExC_end)
9484                             FAIL("Trailing \\");
9485                         /* FALL THROUGH */
9486                     default:
9487                         if (!SIZE_ONLY&& isALPHA(*p)) {
9488                             /* Include any { following the alpha to emphasize
9489                              * that it could be part of an escape at some point
9490                              * in the future */
9491                             int len = (*(p + 1) == '{') ? 2 : 1;
9492                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
9493                         }
9494                         goto normal_default;
9495                     }
9496                     break;
9497                 default:
9498                   normal_default:
9499                     if (UTF8_IS_START(*p) && UTF) {
9500                         STRLEN numlen;
9501                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9502                                                &numlen, UTF8_ALLOW_DEFAULT);
9503                         p += numlen;
9504                     }
9505                     else
9506                         ender = (U8) *p++;
9507                     break;
9508                 } /* End of switch on the literal */
9509
9510                 is_exactfu_sharp_s = (node_type == EXACTFU
9511                                       && ender == LATIN_SMALL_LETTER_SHARP_S);
9512                 if ( RExC_flags & RXf_PMf_EXTENDED)
9513                     p = regwhite( pRExC_state, p );
9514                 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9515                     /* Prime the casefolded buffer.  Locale rules, which apply
9516                      * only to code points < 256, aren't known until execution,
9517                      * so for them, just output the original character using
9518                      * utf8.  If we start to fold non-UTF patterns, be sure to
9519                      * update join_exact() */
9520                     if (LOC && ender < 256) {
9521                         if (UNI_IS_INVARIANT(ender)) {
9522                             *tmpbuf = (U8) ender;
9523                             foldlen = 1;
9524                         } else {
9525                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9526                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9527                             foldlen = 2;
9528                         }
9529                     }
9530                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
9531                                                  */
9532                         ender = toLOWER(ender);
9533                         *tmpbuf = (U8) ender;
9534                         foldlen = 1;
9535                     }
9536                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9537
9538                         /* Locale and /aa require more selectivity about the
9539                          * fold, so are handled below.  Otherwise, here, just
9540                          * use the fold */
9541                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9542                     }
9543                     else {
9544                         /* Under locale rules or /aa we are not to mix,
9545                          * respectively, ords < 256 or ASCII with non-.  So
9546                          * reject folds that mix them, using only the
9547                          * non-folded code point.  So do the fold to a
9548                          * temporary, and inspect each character in it. */
9549                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9550                         U8* s = trialbuf;
9551                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9552                         U8* e = s + foldlen;
9553                         bool fold_ok = TRUE;
9554
9555                         while (s < e) {
9556                             if (isASCII(*s)
9557                                 || (LOC && (UTF8_IS_INVARIANT(*s)
9558                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
9559                             {
9560                                 fold_ok = FALSE;
9561                                 break;
9562                             }
9563                             s += UTF8SKIP(s);
9564                         }
9565                         if (fold_ok) {
9566                             Copy(trialbuf, tmpbuf, foldlen, U8);
9567                             ender = tmpender;
9568                         }
9569                         else {
9570                             uvuni_to_utf8(tmpbuf, ender);
9571                             foldlen = UNISKIP(ender);
9572                         }
9573                     }
9574                 }
9575                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9576                     if (len)
9577                         p = oldp;
9578                     else if (UTF || is_exactfu_sharp_s) {
9579                          if (FOLD) {
9580                               /* Emit all the Unicode characters. */
9581                               STRLEN numlen;
9582                               for (foldbuf = tmpbuf;
9583                                    foldlen;
9584                                    foldlen -= numlen) {
9585                                    ender = utf8_to_uvchr(foldbuf, &numlen);
9586                                    if (numlen > 0) {
9587                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
9588                                         s       += unilen;
9589                                         len     += unilen;
9590                                         /* In EBCDIC the numlen
9591                                          * and unilen can differ. */
9592                                         foldbuf += numlen;
9593                                         if (numlen >= foldlen)
9594                                              break;
9595                                    }
9596                                    else
9597                                         break; /* "Can't happen." */
9598                               }
9599                          }
9600                          else {
9601                               const STRLEN unilen = reguni(pRExC_state, ender, s);
9602                               if (unilen > 0) {
9603                                    s   += unilen;
9604                                    len += unilen;
9605                               }
9606                          }
9607                     }
9608                     else {
9609                         len++;
9610                         REGC((char)ender, s++);
9611                     }
9612                     break;
9613                 }
9614                 if (UTF || is_exactfu_sharp_s) {
9615                      if (FOLD) {
9616                           /* Emit all the Unicode characters. */
9617                           STRLEN numlen;
9618                           for (foldbuf = tmpbuf;
9619                                foldlen;
9620                                foldlen -= numlen) {
9621                                ender = utf8_to_uvchr(foldbuf, &numlen);
9622                                if (numlen > 0) {
9623                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
9624                                     len     += unilen;
9625                                     s       += unilen;
9626                                     /* In EBCDIC the numlen
9627                                      * and unilen can differ. */
9628                                     foldbuf += numlen;
9629                                     if (numlen >= foldlen)
9630                                          break;
9631                                }
9632                                else
9633                                     break;
9634                           }
9635                      }
9636                      else {
9637                           const STRLEN unilen = reguni(pRExC_state, ender, s);
9638                           if (unilen > 0) {
9639                                s   += unilen;
9640                                len += unilen;
9641                           }
9642                      }
9643                      len--;
9644                 }
9645                 else {
9646                     REGC((char)ender, s++);
9647                 }
9648             }
9649         loopdone:   /* Jumped to when encounters something that shouldn't be in
9650                        the node */
9651             RExC_parse = p - 1;
9652             Set_Node_Cur_Length(ret); /* MJD */
9653             nextchar(pRExC_state);
9654             {
9655                 /* len is STRLEN which is unsigned, need to copy to signed */
9656                 IV iv = len;
9657                 if (iv < 0)
9658                     vFAIL("Internal disaster");
9659             }
9660             if (len > 0)
9661                 *flagp |= HASWIDTH;
9662             if (len == 1 && UNI_IS_INVARIANT(ender))
9663                 *flagp |= SIMPLE;
9664
9665             if (SIZE_ONLY)
9666                 RExC_size += STR_SZ(len);
9667             else {
9668                 STR_LEN(ret) = len;
9669                 RExC_emit += STR_SZ(len);
9670             }
9671         }
9672         break;
9673     }
9674
9675     return(ret);
9676
9677 /* Jumped to when an unrecognized character set is encountered */
9678 bad_charset:
9679     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9680     return(NULL);
9681 }
9682
9683 STATIC char *
9684 S_regwhite( RExC_state_t *pRExC_state, char *p )
9685 {
9686     const char *e = RExC_end;
9687
9688     PERL_ARGS_ASSERT_REGWHITE;
9689
9690     while (p < e) {
9691         if (isSPACE(*p))
9692             ++p;
9693         else if (*p == '#') {
9694             bool ended = 0;
9695             do {
9696                 if (*p++ == '\n') {
9697                     ended = 1;
9698                     break;
9699                 }
9700             } while (p < e);
9701             if (!ended)
9702                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9703         }
9704         else
9705             break;
9706     }
9707     return p;
9708 }
9709
9710 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9711    Character classes ([:foo:]) can also be negated ([:^foo:]).
9712    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9713    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9714    but trigger failures because they are currently unimplemented. */
9715
9716 #define POSIXCC_DONE(c)   ((c) == ':')
9717 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9718 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9719
9720 STATIC I32
9721 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9722 {
9723     dVAR;
9724     I32 namedclass = OOB_NAMEDCLASS;
9725
9726     PERL_ARGS_ASSERT_REGPPOSIXCC;
9727
9728     if (value == '[' && RExC_parse + 1 < RExC_end &&
9729         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9730         POSIXCC(UCHARAT(RExC_parse))) {
9731         const char c = UCHARAT(RExC_parse);
9732         char* const s = RExC_parse++;
9733
9734         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9735             RExC_parse++;
9736         if (RExC_parse == RExC_end)
9737             /* Grandfather lone [:, [=, [. */
9738             RExC_parse = s;
9739         else {
9740             const char* const t = RExC_parse++; /* skip over the c */
9741             assert(*t == c);
9742
9743             if (UCHARAT(RExC_parse) == ']') {
9744                 const char *posixcc = s + 1;
9745                 RExC_parse++; /* skip over the ending ] */
9746
9747                 if (*s == ':') {
9748                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9749                     const I32 skip = t - posixcc;
9750
9751                     /* Initially switch on the length of the name.  */
9752                     switch (skip) {
9753                     case 4:
9754                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9755                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9756                         break;
9757                     case 5:
9758                         /* Names all of length 5.  */
9759                         /* alnum alpha ascii blank cntrl digit graph lower
9760                            print punct space upper  */
9761                         /* Offset 4 gives the best switch position.  */
9762                         switch (posixcc[4]) {
9763                         case 'a':
9764                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9765                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9766                             break;
9767                         case 'e':
9768                             if (memEQ(posixcc, "spac", 4)) /* space */
9769                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9770                             break;
9771                         case 'h':
9772                             if (memEQ(posixcc, "grap", 4)) /* graph */
9773                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9774                             break;
9775                         case 'i':
9776                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9777                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9778                             break;
9779                         case 'k':
9780                             if (memEQ(posixcc, "blan", 4)) /* blank */
9781                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9782                             break;
9783                         case 'l':
9784                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9785                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9786                             break;
9787                         case 'm':
9788                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9789                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9790                             break;
9791                         case 'r':
9792                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9793                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9794                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9795                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9796                             break;
9797                         case 't':
9798                             if (memEQ(posixcc, "digi", 4)) /* digit */
9799                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9800                             else if (memEQ(posixcc, "prin", 4)) /* print */
9801                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9802                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9803                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9804                             break;
9805                         }
9806                         break;
9807                     case 6:
9808                         if (memEQ(posixcc, "xdigit", 6))
9809                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9810                         break;
9811                     }
9812
9813                     if (namedclass == OOB_NAMEDCLASS)
9814                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9815                                       t - s - 1, s + 1);
9816                     assert (posixcc[skip] == ':');
9817                     assert (posixcc[skip+1] == ']');
9818                 } else if (!SIZE_ONLY) {
9819                     /* [[=foo=]] and [[.foo.]] are still future. */
9820
9821                     /* adjust RExC_parse so the warning shows after
9822                        the class closes */
9823                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9824                         RExC_parse++;
9825                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9826                 }
9827             } else {
9828                 /* Maternal grandfather:
9829                  * "[:" ending in ":" but not in ":]" */
9830                 RExC_parse = s;
9831             }
9832         }
9833     }
9834
9835     return namedclass;
9836 }
9837
9838 STATIC void
9839 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9840 {
9841     dVAR;
9842
9843     PERL_ARGS_ASSERT_CHECKPOSIXCC;
9844
9845     if (POSIXCC(UCHARAT(RExC_parse))) {
9846         const char *s = RExC_parse;
9847         const char  c = *s++;
9848
9849         while (isALNUM(*s))
9850             s++;
9851         if (*s && c == *s && s[1] == ']') {
9852             ckWARN3reg(s+2,
9853                        "POSIX syntax [%c %c] belongs inside character classes",
9854                        c, c);
9855
9856             /* [[=foo=]] and [[.foo.]] are still future. */
9857             if (POSIXCC_NOTYET(c)) {
9858                 /* adjust RExC_parse so the error shows after
9859                    the class closes */
9860                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9861                     NOOP;
9862                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9863             }
9864         }
9865     }
9866 }
9867
9868 /* No locale test, and always Unicode semantics, no ignore-case differences */
9869 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
9870 ANYOF_##NAME:                                                                  \
9871         for (value = 0; value < 256; value++)                                  \
9872             if (TEST)                                                          \
9873             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9874     yesno = '+';                                                               \
9875     what = WORD;                                                               \
9876     break;                                                                     \
9877 case ANYOF_N##NAME:                                                            \
9878         for (value = 0; value < 256; value++)                                  \
9879             if (!TEST)                                                         \
9880             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9881     yesno = '!';                                                               \
9882     what = WORD;                                                               \
9883     break
9884
9885 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9886  * there are two tests passed in, to use depending on that. There aren't any
9887  * cases where the label is different from the name, so no need for that
9888  * parameter.
9889  * Sets 'what' to WORD which is the property name for non-bitmap code points;
9890  * But, uses FOLD_WORD instead if /i has been selected, to allow a different
9891  * property name */
9892 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD)                         \
9893 ANYOF_##NAME:                                                                  \
9894     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
9895     else if (UNI_SEMANTICS) {                                                  \
9896         for (value = 0; value < 256; value++) {                                \
9897             if (TEST_8(value)) stored +=                                       \
9898                       set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9899         }                                                                      \
9900     }                                                                          \
9901     else {                                                                     \
9902         for (value = 0; value < 128; value++) {                                \
9903             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
9904                 set_regclass_bit(pRExC_state, ret,                     \
9905                                    (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
9906         }                                                                      \
9907     }                                                                          \
9908     yesno = '+';                                                               \
9909     if (FOLD) {                                                                \
9910         what = FOLD_WORD;                                                      \
9911     }                                                                          \
9912     else {                                                                     \
9913         what = WORD;                                                           \
9914     }                                                                          \
9915     break;                                                                     \
9916 case ANYOF_N##NAME:                                                            \
9917     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
9918     else if (UNI_SEMANTICS) {                                                  \
9919         for (value = 0; value < 256; value++) {                                \
9920             if (! TEST_8(value)) stored +=                                     \
9921                     set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
9922         }                                                                      \
9923     }                                                                          \
9924     else {                                                                     \
9925         for (value = 0; value < 128; value++) {                                \
9926             if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
9927                         pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
9928         }                                                                      \
9929         if (AT_LEAST_ASCII_RESTRICTED) {                                       \
9930             for (value = 128; value < 256; value++) {                          \
9931              stored += set_regclass_bit(                                     \
9932                            pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9933             }                                                                  \
9934             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
9935         }                                                                      \
9936         else {                                                                 \
9937             /* For a non-ut8 target string with DEPENDS semantics, all above   \
9938              * ASCII Latin1 code points match the complement of any of the     \
9939              * classes.  But in utf8, they have their Unicode semantics, so    \
9940              * can't just set them in the bitmap, or else regexec.c will think \
9941              * they matched when they shouldn't. */                            \
9942             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
9943         }                                                                      \
9944     }                                                                          \
9945     yesno = '!';                                                               \
9946     if (FOLD) {                                                                \
9947         what = FOLD_WORD;                                                      \
9948     }                                                                          \
9949     else {                                                                     \
9950         what = WORD;                                                           \
9951     }                                                                          \
9952     break
9953
9954 STATIC U8
9955 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
9956 {
9957
9958     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9959      * Locale folding is done at run-time, so this function should not be
9960      * called for nodes that are for locales.
9961      *
9962      * This function sets the bit corresponding to the fold of the input
9963      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
9964      * 'F' is 'f'.
9965      *
9966      * It also knows about the characters that are in the bitmap that have
9967      * folds that are matchable only outside it, and sets the appropriate lists
9968      * and flags.
9969      *
9970      * It returns the number of bits that actually changed from 0 to 1 */
9971
9972     U8 stored = 0;
9973     U8 fold;
9974
9975     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9976
9977     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9978                                     : PL_fold[value];
9979
9980     /* It assumes the bit for 'value' has already been set */
9981     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9982         ANYOF_BITMAP_SET(node, fold);
9983         stored++;
9984     }
9985     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9986         /* Certain Latin1 characters have matches outside the bitmap.  To get
9987          * here, 'value' is one of those characters.   None of these matches is
9988          * valid for ASCII characters under /aa, which have been excluded by
9989          * the 'if' above.  The matches fall into three categories:
9990          * 1) They are singly folded-to or -from an above 255 character, as
9991          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9992          *    WITH DIAERESIS;
9993          * 2) They are part of a multi-char fold with another character in the
9994          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9995          * 3) They are part of a multi-char fold with a character not in the
9996          *    bitmap, such as various ligatures.
9997          * We aren't dealing fully with multi-char folds, except we do deal
9998          * with the pattern containing a character that has a multi-char fold
9999          * (not so much the inverse).
10000          * For types 1) and 3), the matches only happen when the target string
10001          * is utf8; that's not true for 2), and we set a flag for it.
10002          *
10003          * The code below adds to the passed in inversion list the single fold
10004          * closures for 'value'.  The values are hard-coded here so that an
10005          * innocent-looking character class, like /[ks]/i won't have to go out
10006          * to disk to find the possible matches.  XXX It would be better to
10007          * generate these via regen, in case a new version of the Unicode
10008          * standard adds new mappings, though that is not really likely. */
10009         switch (value) {
10010             case 'k':
10011             case 'K':
10012                 /* KELVIN SIGN */
10013                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10014                 break;
10015             case 's':
10016             case 'S':
10017                 /* LATIN SMALL LETTER LONG S */
10018                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10019                 break;
10020             case MICRO_SIGN:
10021                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10022                                                  GREEK_SMALL_LETTER_MU);
10023                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10024                                                  GREEK_CAPITAL_LETTER_MU);
10025                 break;
10026             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10027             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10028                 /* ANGSTROM SIGN */
10029                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10030                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
10031                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10032                                                      PL_fold_latin1[value]);
10033                 }
10034                 break;
10035             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10036                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10037                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10038                 break;
10039             case LATIN_SMALL_LETTER_SHARP_S:
10040                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10041                                         LATIN_CAPITAL_LETTER_SHARP_S);
10042
10043                 /* Under /a, /d, and /u, this can match the two chars "ss" */
10044                 if (! MORE_ASCII_RESTRICTED) {
10045                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
10046
10047                     /* And under /u or /a, it can match even if the target is
10048                      * not utf8 */
10049                     if (AT_LEAST_UNI_SEMANTICS) {
10050                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10051                     }
10052                 }
10053                 break;
10054             case 'F': case 'f':
10055             case 'I': case 'i':
10056             case 'L': case 'l':
10057             case 'T': case 't':
10058             case 'A': case 'a':
10059             case 'H': case 'h':
10060             case 'J': case 'j':
10061             case 'N': case 'n':
10062             case 'W': case 'w':
10063             case 'Y': case 'y':
10064                 /* These all are targets of multi-character folds from code
10065                  * points that require UTF8 to express, so they can't match
10066                  * unless the target string is in UTF-8, so no action here is
10067                  * necessary, as regexec.c properly handles the general case
10068                  * for UTF-8 matching */
10069                 break;
10070             default:
10071                 /* Use deprecated warning to increase the chances of this
10072                  * being output */
10073                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10074                 break;
10075         }
10076     }
10077     else if (DEPENDS_SEMANTICS
10078             && ! isASCII(value)
10079             && PL_fold_latin1[value] != value)
10080     {
10081            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10082             * folds only when the target string is in UTF-8.  We add the fold
10083             * here to the list of things to match outside the bitmap, which
10084             * won't be looked at unless it is UTF8 (or else if something else
10085             * says to look even if not utf8, but those things better not happen
10086             * under DEPENDS semantics. */
10087         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10088     }
10089
10090     return stored;
10091 }
10092
10093
10094 PERL_STATIC_INLINE U8
10095 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10096 {
10097     /* This inline function sets a bit in the bitmap if not already set, and if
10098      * appropriate, its fold, returning the number of bits that actually
10099      * changed from 0 to 1 */
10100
10101     U8 stored;
10102
10103     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10104
10105     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
10106         return 0;
10107     }
10108
10109     ANYOF_BITMAP_SET(node, value);
10110     stored = 1;
10111
10112     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
10113         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10114     }
10115
10116     return stored;
10117 }
10118
10119 STATIC void
10120 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10121 {
10122     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10123      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
10124      * the multi-character folds of characters in the node */
10125     SV *sv;
10126
10127     PERL_ARGS_ASSERT_ADD_ALTERNATE;
10128
10129     if (! *alternate_ptr) {
10130         *alternate_ptr = newAV();
10131     }
10132     sv = newSVpvn_utf8((char*)string, len, TRUE);
10133     av_push(*alternate_ptr, sv);
10134     return;
10135 }
10136
10137 /*
10138    parse a class specification and produce either an ANYOF node that
10139    matches the pattern or perhaps will be optimized into an EXACTish node
10140    instead. The node contains a bit map for the first 256 characters, with the
10141    corresponding bit set if that character is in the list.  For characters
10142    above 255, a range list is used */
10143
10144 STATIC regnode *
10145 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10146 {
10147     dVAR;
10148     register UV nextvalue;
10149     register IV prevvalue = OOB_UNICODE;
10150     register IV range = 0;
10151     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10152     register regnode *ret;
10153     STRLEN numlen;
10154     IV namedclass;
10155     char *rangebegin = NULL;
10156     bool need_class = 0;
10157     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
10158     SV *listsv = NULL;
10159     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10160                                       than just initialized.  */
10161     SV* properties = NULL;    /* Code points that match \p{} \P{} */
10162     UV element_count = 0;   /* Number of distinct elements in the class.
10163                                Optimizations may be possible if this is tiny */
10164     UV n;
10165
10166     /* Unicode properties are stored in a swash; this holds the current one
10167      * being parsed.  If this swash is the only above-latin1 component of the
10168      * character class, an optimization is to pass it directly on to the
10169      * execution engine.  Otherwise, it is set to NULL to indicate that there
10170      * are other things in the class that have to be dealt with at execution
10171      * time */
10172     SV* swash = NULL;           /* Code points that match \p{} \P{} */
10173
10174     /* Set if a component of this character class is user-defined; just passed
10175      * on to the engine */
10176     UV has_user_defined_property = 0;
10177
10178     /* code points this node matches that can't be stored in the bitmap */
10179     SV* nonbitmap = NULL;
10180
10181     /* The items that are to match that aren't stored in the bitmap, but are a
10182      * result of things that are stored there.  This is the fold closure of
10183      * such a character, either because it has DEPENDS semantics and shouldn't
10184      * be matched unless the target string is utf8, or is a code point that is
10185      * too large for the bit map, as for example, the fold of the MICRO SIGN is
10186      * above 255.  This all is solely for performance reasons.  By having this
10187      * code know the outside-the-bitmap folds that the bitmapped characters are
10188      * involved with, we don't have to go out to disk to find the list of
10189      * matches, unless the character class includes code points that aren't
10190      * storable in the bit map.  That means that a character class with an 's'
10191      * in it, for example, doesn't need to go out to disk to find everything
10192      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
10193      * empty unless there is something whose fold we don't know about, and will
10194      * have to go out to the disk to find. */
10195     SV* l1_fold_invlist = NULL;
10196
10197     /* List of multi-character folds that are matched by this node */
10198     AV* unicode_alternate  = NULL;
10199 #ifdef EBCDIC
10200     UV literal_endpoint = 0;
10201 #endif
10202     UV stored = 0;  /* how many chars stored in the bitmap */
10203
10204     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10205         case we need to change the emitted regop to an EXACT. */
10206     const char * orig_parse = RExC_parse;
10207     GET_RE_DEBUG_FLAGS_DECL;
10208
10209     PERL_ARGS_ASSERT_REGCLASS;
10210 #ifndef DEBUGGING
10211     PERL_UNUSED_ARG(depth);
10212 #endif
10213
10214     DEBUG_PARSE("clas");
10215
10216     /* Assume we are going to generate an ANYOF node. */
10217     ret = reganode(pRExC_state, ANYOF, 0);
10218
10219
10220     if (!SIZE_ONLY) {
10221         ANYOF_FLAGS(ret) = 0;
10222     }
10223
10224     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
10225         RExC_naughty++;
10226         RExC_parse++;
10227         if (!SIZE_ONLY)
10228             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10229
10230         /* We have decided to not allow multi-char folds in inverted character
10231          * classes, due to the confusion that can happen, especially with
10232          * classes that are designed for a non-Unicode world:  You have the
10233          * peculiar case that:
10234             "s s" =~ /^[^\xDF]+$/i => Y
10235             "ss"  =~ /^[^\xDF]+$/i => N
10236          *
10237          * See [perl #89750] */
10238         allow_full_fold = FALSE;
10239     }
10240
10241     if (SIZE_ONLY) {
10242         RExC_size += ANYOF_SKIP;
10243         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10244     }
10245     else {
10246         RExC_emit += ANYOF_SKIP;
10247         if (LOC) {
10248             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10249         }
10250         ANYOF_BITMAP_ZERO(ret);
10251         listsv = newSVpvs("# comment\n");
10252         initial_listsv_len = SvCUR(listsv);
10253     }
10254
10255     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10256
10257     if (!SIZE_ONLY && POSIXCC(nextvalue))
10258         checkposixcc(pRExC_state);
10259
10260     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10261     if (UCHARAT(RExC_parse) == ']')
10262         goto charclassloop;
10263
10264 parseit:
10265     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10266
10267     charclassloop:
10268
10269         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10270
10271         if (!range) {
10272             rangebegin = RExC_parse;
10273             element_count++;
10274         }
10275         if (UTF) {
10276             value = utf8n_to_uvchr((U8*)RExC_parse,
10277                                    RExC_end - RExC_parse,
10278                                    &numlen, UTF8_ALLOW_DEFAULT);
10279             RExC_parse += numlen;
10280         }
10281         else
10282             value = UCHARAT(RExC_parse++);
10283
10284         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10285         if (value == '[' && POSIXCC(nextvalue))
10286             namedclass = regpposixcc(pRExC_state, value);
10287         else if (value == '\\') {
10288             if (UTF) {
10289                 value = utf8n_to_uvchr((U8*)RExC_parse,
10290                                    RExC_end - RExC_parse,
10291                                    &numlen, UTF8_ALLOW_DEFAULT);
10292                 RExC_parse += numlen;
10293             }
10294             else
10295                 value = UCHARAT(RExC_parse++);
10296             /* Some compilers cannot handle switching on 64-bit integer
10297              * values, therefore value cannot be an UV.  Yes, this will
10298              * be a problem later if we want switch on Unicode.
10299              * A similar issue a little bit later when switching on
10300              * namedclass. --jhi */
10301             switch ((I32)value) {
10302             case 'w':   namedclass = ANYOF_ALNUM;       break;
10303             case 'W':   namedclass = ANYOF_NALNUM;      break;
10304             case 's':   namedclass = ANYOF_SPACE;       break;
10305             case 'S':   namedclass = ANYOF_NSPACE;      break;
10306             case 'd':   namedclass = ANYOF_DIGIT;       break;
10307             case 'D':   namedclass = ANYOF_NDIGIT;      break;
10308             case 'v':   namedclass = ANYOF_VERTWS;      break;
10309             case 'V':   namedclass = ANYOF_NVERTWS;     break;
10310             case 'h':   namedclass = ANYOF_HORIZWS;     break;
10311             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
10312             case 'N':  /* Handle \N{NAME} in class */
10313                 {
10314                     /* We only pay attention to the first char of
10315                     multichar strings being returned. I kinda wonder
10316                     if this makes sense as it does change the behaviour
10317                     from earlier versions, OTOH that behaviour was broken
10318                     as well. */
10319                     UV v; /* value is register so we cant & it /grrr */
10320                     if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10321                         goto parseit;
10322                     }
10323                     value= v;
10324                 }
10325                 break;
10326             case 'p':
10327             case 'P':
10328                 {
10329                 char *e;
10330                 if (RExC_parse >= RExC_end)
10331                     vFAIL2("Empty \\%c{}", (U8)value);
10332                 if (*RExC_parse == '{') {
10333                     const U8 c = (U8)value;
10334                     e = strchr(RExC_parse++, '}');
10335                     if (!e)
10336                         vFAIL2("Missing right brace on \\%c{}", c);
10337                     while (isSPACE(UCHARAT(RExC_parse)))
10338                         RExC_parse++;
10339                     if (e == RExC_parse)
10340                         vFAIL2("Empty \\%c{}", c);
10341                     n = e - RExC_parse;
10342                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10343                         n--;
10344                 }
10345                 else {
10346                     e = RExC_parse;
10347                     n = 1;
10348                 }
10349                 if (!SIZE_ONLY) {
10350                     SV** invlistsvp;
10351                     SV* invlist;
10352                     char* name;
10353                     if (UCHARAT(RExC_parse) == '^') {
10354                          RExC_parse++;
10355                          n--;
10356                          value = value == 'p' ? 'P' : 'p'; /* toggle */
10357                          while (isSPACE(UCHARAT(RExC_parse))) {
10358                               RExC_parse++;
10359                               n--;
10360                          }
10361                     }
10362                     /* Try to get the definition of the property into
10363                      * <invlist>.  If /i is in effect, the effective property
10364                      * will have its name be <__NAME_i>.  The design is
10365                      * discussed in commit
10366                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10367                     Newx(name, n + sizeof("_i__\n"), char);
10368
10369                     sprintf(name, "%s%.*s%s\n",
10370                                     (FOLD) ? "__" : "",
10371                                     (int)n,
10372                                     RExC_parse,
10373                                     (FOLD) ? "_i" : ""
10374                     );
10375
10376                     /* Look up the property name, and get its swash and
10377                      * inversion list, if the property is found  */
10378                     if (swash) {
10379                         SvREFCNT_dec(swash);
10380                     }
10381                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
10382                                              1, /* binary */
10383                                              0, /* not tr/// */
10384                                              TRUE, /* this routine will handle
10385                                                       undefined properties */
10386                                              NULL, FALSE /* No inversion list */
10387                                             );
10388                     if (   ! swash
10389                         || ! SvROK(swash)
10390                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10391                         || ! (invlistsvp =
10392                                 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10393                                 "INVLIST", FALSE))
10394                         || ! (invlist = *invlistsvp))
10395                     {
10396                         if (swash) {
10397                             SvREFCNT_dec(swash);
10398                             swash = NULL;
10399                         }
10400
10401                         /* Here didn't find it.  It could be a user-defined
10402                          * property that will be available at run-time.  Add it
10403                          * to the list to look up then */
10404                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10405                                         (value == 'p' ? '+' : '!'),
10406                                         name);
10407                         has_user_defined_property = 1;
10408
10409                         /* We don't know yet, so have to assume that the
10410                          * property could match something in the Latin1 range,
10411                          * hence something that isn't utf8 */
10412                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10413                     }
10414                     else {
10415
10416                         /* Here, did get the swash and its inversion list.  If
10417                          * the swash is from a user-defined property, then this
10418                          * whole character class should be regarded as such */
10419                         SV** user_defined_svp =
10420                                             hv_fetchs(MUTABLE_HV(SvRV(swash)),
10421                                                         "USER_DEFINED", FALSE);
10422                         if (user_defined_svp) {
10423                             has_user_defined_property
10424                                                     |= SvUV(*user_defined_svp);
10425                         }
10426
10427                         /* Invert if asking for the complement */
10428                         if (value == 'P') {
10429
10430                             /* Add to any existing list */
10431                             if (! properties) {
10432                                 properties = invlist_clone(invlist);
10433                                 _invlist_invert(properties);
10434                             }
10435                             else {
10436                                 invlist = invlist_clone(invlist);
10437                                 _invlist_invert(invlist);
10438                                 _invlist_union(properties, invlist, &properties);
10439                                 SvREFCNT_dec(invlist);
10440                             }
10441
10442                             /* The swash can't be used as-is, because we've
10443                              * inverted things; delay removing it to here after
10444                              * have copied its invlist above */
10445                             SvREFCNT_dec(swash);
10446                             swash = NULL;
10447                         }
10448                         else {
10449                             if (! properties) {
10450                                 properties = invlist_clone(invlist);
10451                             }
10452                             else {
10453                                 _invlist_union(properties, invlist, &properties);
10454                             }
10455                         }
10456                     }
10457                     Safefree(name);
10458                 }
10459                 RExC_parse = e + 1;
10460                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
10461
10462                 /* \p means they want Unicode semantics */
10463                 RExC_uni_semantics = 1;
10464                 }
10465                 break;
10466             case 'n':   value = '\n';                   break;
10467             case 'r':   value = '\r';                   break;
10468             case 't':   value = '\t';                   break;
10469             case 'f':   value = '\f';                   break;
10470             case 'b':   value = '\b';                   break;
10471             case 'e':   value = ASCII_TO_NATIVE('\033');break;
10472             case 'a':   value = ASCII_TO_NATIVE('\007');break;
10473             case 'o':
10474                 RExC_parse--;   /* function expects to be pointed at the 'o' */
10475                 {
10476                     const char* error_msg;
10477                     bool valid = grok_bslash_o(RExC_parse,
10478                                                &value,
10479                                                &numlen,
10480                                                &error_msg,
10481                                                SIZE_ONLY);
10482                     RExC_parse += numlen;
10483                     if (! valid) {
10484                         vFAIL(error_msg);
10485                     }
10486                 }
10487                 if (PL_encoding && value < 0x100) {
10488                     goto recode_encoding;
10489                 }
10490                 break;
10491             case 'x':
10492                 if (*RExC_parse == '{') {
10493                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10494                         | PERL_SCAN_DISALLOW_PREFIX;
10495                     char * const e = strchr(RExC_parse++, '}');
10496                     if (!e)
10497                         vFAIL("Missing right brace on \\x{}");
10498
10499                     numlen = e - RExC_parse;
10500                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10501                     RExC_parse = e + 1;
10502                 }
10503                 else {
10504                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10505                     numlen = 2;
10506                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10507                     RExC_parse += numlen;
10508                 }
10509                 if (PL_encoding && value < 0x100)
10510                     goto recode_encoding;
10511                 break;
10512             case 'c':
10513                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10514                 break;
10515             case '0': case '1': case '2': case '3': case '4':
10516             case '5': case '6': case '7':
10517                 {
10518                     /* Take 1-3 octal digits */
10519                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10520                     numlen = 3;
10521                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10522                     RExC_parse += numlen;
10523                     if (PL_encoding && value < 0x100)
10524                         goto recode_encoding;
10525                     break;
10526                 }
10527             recode_encoding:
10528                 if (! RExC_override_recoding) {
10529                     SV* enc = PL_encoding;
10530                     value = reg_recode((const char)(U8)value, &enc);
10531                     if (!enc && SIZE_ONLY)
10532                         ckWARNreg(RExC_parse,
10533                                   "Invalid escape in the specified encoding");
10534                     break;
10535                 }
10536             default:
10537                 /* Allow \_ to not give an error */
10538                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10539                     ckWARN2reg(RExC_parse,
10540                                "Unrecognized escape \\%c in character class passed through",
10541                                (int)value);
10542                 }
10543                 break;
10544             }
10545         } /* end of \blah */
10546 #ifdef EBCDIC
10547         else
10548             literal_endpoint++;
10549 #endif
10550
10551         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10552
10553             /* What matches in a locale is not known until runtime, so need to
10554              * (one time per class) allocate extra space to pass to regexec.
10555              * The space will contain a bit for each named class that is to be
10556              * matched against.  This isn't needed for \p{} and pseudo-classes,
10557              * as they are not affected by locale, and hence are dealt with
10558              * separately */
10559             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10560                 need_class = 1;
10561                 if (SIZE_ONLY) {
10562                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10563                 }
10564                 else {
10565                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10566                     ANYOF_CLASS_ZERO(ret);
10567                 }
10568                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10569             }
10570
10571             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
10572              * literal, as is the character that began the false range, i.e.
10573              * the 'a' in the examples */
10574             if (range) {
10575                 if (!SIZE_ONLY) {
10576                     const int w =
10577                         RExC_parse >= rangebegin ?
10578                         RExC_parse - rangebegin : 0;
10579                     ckWARN4reg(RExC_parse,
10580                                "False [] range \"%*.*s\"",
10581                                w, w, rangebegin);
10582
10583                     stored +=
10584                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10585                     if (prevvalue < 256) {
10586                         stored +=
10587                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10588                     }
10589                     else {
10590                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10591                     }
10592                 }
10593
10594                 range = 0; /* this was not a true range */
10595             }
10596
10597             if (!SIZE_ONLY) {
10598                 const char *what = NULL;
10599                 char yesno = 0;
10600
10601                 /* Possible truncation here but in some 64-bit environments
10602                  * the compiler gets heartburn about switch on 64-bit values.
10603                  * A similar issue a little earlier when switching on value.
10604                  * --jhi */
10605                 switch ((I32)namedclass) {
10606
10607                 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum");
10608                 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha");
10609                 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank");
10610                 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl");
10611                 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph");
10612                 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i");
10613                 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint");
10614                 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace");
10615                 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct");
10616                 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i");
10617                 /* \s, \w match all unicode if utf8. */
10618                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl");
10619                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word");
10620                 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit");
10621                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
10622                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
10623                 case ANYOF_ASCII:
10624                     if (LOC)
10625                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
10626                     else {
10627                         for (value = 0; value < 128; value++)
10628                             stored +=
10629                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
10630                     }
10631                     yesno = '+';
10632                     what = NULL;        /* Doesn't match outside ascii, so
10633                                            don't want to add +utf8:: */
10634                     break;
10635                 case ANYOF_NASCII:
10636                     if (LOC)
10637                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
10638                     else {
10639                         for (value = 128; value < 256; value++)
10640                             stored +=
10641                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
10642                     }
10643                     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
10644                     yesno = '!';
10645                     what = "ASCII";
10646                     break;
10647                 case ANYOF_DIGIT:
10648                     if (LOC)
10649                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
10650                     else {
10651                         /* consecutive digits assumed */
10652                         for (value = '0'; value <= '9'; value++)
10653                             stored +=
10654                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
10655                     }
10656                     yesno = '+';
10657                     what = "Digit";
10658                     break;
10659                 case ANYOF_NDIGIT:
10660                     if (LOC)
10661                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
10662                     else {
10663                         /* consecutive digits assumed */
10664                         for (value = 0; value < '0'; value++)
10665                             stored +=
10666                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
10667                         for (value = '9' + 1; value < 256; value++)
10668                             stored +=
10669                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
10670                     }
10671                     yesno = '!';
10672                     what = "Digit";
10673                     if (AT_LEAST_ASCII_RESTRICTED ) {
10674                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
10675                     }
10676                     break;
10677                 case ANYOF_MAX:
10678                     /* this is to handle \p and \P */
10679                     break;
10680                 default:
10681                     vFAIL("Invalid [::] class");
10682                     break;
10683                 }
10684                 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
10685                     /* Strings such as "+utf8::isWord\n" */
10686                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what);
10687                 }
10688
10689                 continue;
10690             }
10691         } /* end of namedclass \blah */
10692
10693         if (range) {
10694             if (prevvalue > (IV)value) /* b-a */ {
10695                 const int w = RExC_parse - rangebegin;
10696                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
10697                 range = 0; /* not a valid range */
10698             }
10699         }
10700         else {
10701             prevvalue = value; /* save the beginning of the range */
10702             if (RExC_parse+1 < RExC_end
10703                 && *RExC_parse == '-'
10704                 && RExC_parse[1] != ']')
10705             {
10706                 RExC_parse++;
10707
10708                 /* a bad range like \w-, [:word:]- ? */
10709                 if (namedclass > OOB_NAMEDCLASS) {
10710                     if (ckWARN(WARN_REGEXP)) {
10711                         const int w =
10712                             RExC_parse >= rangebegin ?
10713                             RExC_parse - rangebegin : 0;
10714                         vWARN4(RExC_parse,
10715                                "False [] range \"%*.*s\"",
10716                                w, w, rangebegin);
10717                     }
10718                     if (!SIZE_ONLY)
10719                         stored +=
10720                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10721                 } else
10722                     range = 1;  /* yeah, it's a range! */
10723                 continue;       /* but do it the next time */
10724             }
10725         }
10726
10727         /* non-Latin1 code point implies unicode semantics.  Must be set in
10728          * pass1 so is there for the whole of pass 2 */
10729         if (value > 255) {
10730             RExC_uni_semantics = 1;
10731         }
10732
10733         /* now is the next time */
10734         if (!SIZE_ONLY) {
10735             if (prevvalue < 256) {
10736                 const IV ceilvalue = value < 256 ? value : 255;
10737                 IV i;
10738 #ifdef EBCDIC
10739                 /* In EBCDIC [\x89-\x91] should include
10740                  * the \x8e but [i-j] should not. */
10741                 if (literal_endpoint == 2 &&
10742                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
10743                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
10744                 {
10745                     if (isLOWER(prevvalue)) {
10746                         for (i = prevvalue; i <= ceilvalue; i++)
10747                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
10748                                 stored +=
10749                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10750                             }
10751                     } else {
10752                         for (i = prevvalue; i <= ceilvalue; i++)
10753                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
10754                                 stored +=
10755                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10756                             }
10757                     }
10758                 }
10759                 else
10760 #endif
10761                       for (i = prevvalue; i <= ceilvalue; i++) {
10762                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10763                       }
10764           }
10765           if (value > 255) {
10766             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
10767             const UV natvalue      = NATIVE_TO_UNI(value);
10768             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
10769         }
10770 #ifdef EBCDIC
10771             literal_endpoint = 0;
10772 #endif
10773         }
10774
10775         range = 0; /* this range (if it was one) is done now */
10776     }
10777
10778
10779
10780     if (SIZE_ONLY)
10781         return ret;
10782     /****** !SIZE_ONLY AFTER HERE *********/
10783
10784     /* If folding and there are code points above 255, we calculate all
10785      * characters that could fold to or from the ones already on the list */
10786     if (FOLD && nonbitmap) {
10787         UV start, end;  /* End points of code point ranges */
10788
10789         SV* fold_intersection = NULL;
10790
10791         /* This is a list of all the characters that participate in folds
10792             * (except marks, etc in multi-char folds */
10793         if (! PL_utf8_foldable) {
10794             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10795             PL_utf8_foldable = _swash_to_invlist(swash);
10796             SvREFCNT_dec(swash);
10797         }
10798
10799         /* This is a hash that for a particular fold gives all characters
10800             * that are involved in it */
10801         if (! PL_utf8_foldclosures) {
10802
10803             /* If we were unable to find any folds, then we likely won't be
10804              * able to find the closures.  So just create an empty list.
10805              * Folding will effectively be restricted to the non-Unicode rules
10806              * hard-coded into Perl.  (This case happens legitimately during
10807              * compilation of Perl itself before the Unicode tables are
10808              * generated) */
10809             if (invlist_len(PL_utf8_foldable) == 0) {
10810                 PL_utf8_foldclosures = newHV();
10811             } else {
10812                 /* If the folds haven't been read in, call a fold function
10813                     * to force that */
10814                 if (! PL_utf8_tofold) {
10815                     U8 dummy[UTF8_MAXBYTES+1];
10816                     STRLEN dummy_len;
10817
10818                     /* This particular string is above \xff in both UTF-8 and
10819                      * UTFEBCDIC */
10820                     to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
10821                     assert(PL_utf8_tofold); /* Verify that worked */
10822                 }
10823                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10824             }
10825         }
10826
10827         /* Only the characters in this class that participate in folds need be
10828          * checked.  Get the intersection of this class and all the possible
10829          * characters that are foldable.  This can quickly narrow down a large
10830          * class */
10831         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
10832
10833         /* Now look at the foldable characters in this class individually */
10834         invlist_iterinit(fold_intersection);
10835         while (invlist_iternext(fold_intersection, &start, &end)) {
10836             UV j;
10837
10838             /* Look at every character in the range */
10839             for (j = start; j <= end; j++) {
10840
10841                 /* Get its fold */
10842                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10843                 STRLEN foldlen;
10844                 const UV f =
10845                     _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
10846
10847                 if (foldlen > (STRLEN)UNISKIP(f)) {
10848
10849                     /* Any multicharacter foldings (disallowed in lookbehind
10850                      * patterns) require the following transform: [ABCDEF] ->
10851                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
10852                      * folds into "rst", all other characters fold to single
10853                      * characters.  We save away these multicharacter foldings,
10854                      * to be later saved as part of the additional "s" data. */
10855                     if (! RExC_in_lookbehind) {
10856                         U8* loc = foldbuf;
10857                         U8* e = foldbuf + foldlen;
10858
10859                         /* If any of the folded characters of this are in the
10860                          * Latin1 range, tell the regex engine that this can
10861                          * match a non-utf8 target string.  The only multi-byte
10862                          * fold whose source is in the Latin1 range (U+00DF)
10863                          * applies only when the target string is utf8, or
10864                          * under unicode rules */
10865                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10866                             while (loc < e) {
10867
10868                                 /* Can't mix ascii with non- under /aa */
10869                                 if (MORE_ASCII_RESTRICTED
10870                                     && (isASCII(*loc) != isASCII(j)))
10871                                 {
10872                                     goto end_multi_fold;
10873                                 }
10874                                 if (UTF8_IS_INVARIANT(*loc)
10875                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
10876                                 {
10877                                     /* Can't mix above and below 256 under LOC
10878                                      */
10879                                     if (LOC) {
10880                                         goto end_multi_fold;
10881                                     }
10882                                     ANYOF_FLAGS(ret)
10883                                             |= ANYOF_NONBITMAP_NON_UTF8;
10884                                     break;
10885                                 }
10886                                 loc += UTF8SKIP(loc);
10887                             }
10888                         }
10889
10890                         add_alternate(&unicode_alternate, foldbuf, foldlen);
10891                     end_multi_fold: ;
10892                     }
10893
10894                     /* This is special-cased, as it is the only letter which
10895                      * has both a multi-fold and single-fold in Latin1.  All
10896                      * the other chars that have single and multi-folds are
10897                      * always in utf8, and the utf8 folding algorithm catches
10898                      * them */
10899                     if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
10900                         stored += set_regclass_bit(pRExC_state,
10901                                         ret,
10902                                         LATIN_SMALL_LETTER_SHARP_S,
10903                                         &l1_fold_invlist, &unicode_alternate);
10904                     }
10905                 }
10906                 else {
10907                     /* Single character fold.  Add everything in its fold
10908                      * closure to the list that this node should match */
10909                     SV** listp;
10910
10911                     /* The fold closures data structure is a hash with the keys
10912                      * being every character that is folded to, like 'k', and
10913                      * the values each an array of everything that folds to its
10914                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
10915                     if ((listp = hv_fetch(PL_utf8_foldclosures,
10916                                     (char *) foldbuf, foldlen, FALSE)))
10917                     {
10918                         AV* list = (AV*) *listp;
10919                         IV k;
10920                         for (k = 0; k <= av_len(list); k++) {
10921                             SV** c_p = av_fetch(list, k, FALSE);
10922                             UV c;
10923                             if (c_p == NULL) {
10924                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10925                             }
10926                             c = SvUV(*c_p);
10927
10928                             /* /aa doesn't allow folds between ASCII and non-;
10929                              * /l doesn't allow them between above and below
10930                              * 256 */
10931                             if ((MORE_ASCII_RESTRICTED
10932                                  && (isASCII(c) != isASCII(j)))
10933                                     || (LOC && ((c < 256) != (j < 256))))
10934                             {
10935                                 continue;
10936                             }
10937
10938                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10939                                 stored += set_regclass_bit(pRExC_state,
10940                                         ret,
10941                                         (U8) c,
10942                                         &l1_fold_invlist, &unicode_alternate);
10943                             }
10944                                 /* It may be that the code point is already in
10945                                  * this range or already in the bitmap, in
10946                                  * which case we need do nothing */
10947                             else if ((c < start || c > end)
10948                                         && (c > 255
10949                                             || ! ANYOF_BITMAP_TEST(ret, c)))
10950                             {
10951                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10952                             }
10953                         }
10954                     }
10955                 }
10956             }
10957         }
10958         SvREFCNT_dec(fold_intersection);
10959     }
10960
10961     /* Combine the two lists into one. */
10962     if (l1_fold_invlist) {
10963         if (nonbitmap) {
10964             _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
10965             SvREFCNT_dec(l1_fold_invlist);
10966         }
10967         else {
10968             nonbitmap = l1_fold_invlist;
10969         }
10970     }
10971
10972     /* And combine the result (if any) with any inversion list from properties.
10973      * The lists are kept separate up to now because we don't want to fold the
10974      * properties */
10975     if (properties) {
10976         if (nonbitmap) {
10977             _invlist_union(nonbitmap, properties, &nonbitmap);
10978             SvREFCNT_dec(properties);
10979         }
10980         else {
10981             nonbitmap = properties;
10982         }
10983     }
10984
10985     /* Here, <nonbitmap> contains all the code points we can determine at
10986      * compile time that we haven't put into the bitmap.  Go through it, and
10987      * for things that belong in the bitmap, put them there, and delete from
10988      * <nonbitmap> */
10989     if (nonbitmap) {
10990
10991         /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
10992          * possibly only should match when the target string is UTF-8 */
10993         UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
10994
10995         /* This gets set if we actually need to modify things */
10996         bool change_invlist = FALSE;
10997
10998         UV start, end;
10999
11000         /* Start looking through <nonbitmap> */
11001         invlist_iterinit(nonbitmap);
11002         while (invlist_iternext(nonbitmap, &start, &end)) {
11003             UV high;
11004             int i;
11005
11006             /* Quit if are above what we should change */
11007             if (start > max_cp_to_set) {
11008                 break;
11009             }
11010
11011             change_invlist = TRUE;
11012
11013             /* Set all the bits in the range, up to the max that we are doing */
11014             high = (end < max_cp_to_set) ? end : max_cp_to_set;
11015             for (i = start; i <= (int) high; i++) {
11016                 if (! ANYOF_BITMAP_TEST(ret, i)) {
11017                     ANYOF_BITMAP_SET(ret, i);
11018                     stored++;
11019                     prevvalue = value;
11020                     value = i;
11021                 }
11022             }
11023         }
11024
11025         /* Done with loop; set <nonbitmap> to not include any code points that
11026          * are in the bitmap */
11027         if (change_invlist) {
11028             SV* keep_list = _new_invlist(2);
11029             _append_range_to_invlist(keep_list, max_cp_to_set + 1, UV_MAX);
11030             _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
11031             SvREFCNT_dec(keep_list);
11032         }
11033
11034         /* If have completely emptied it, remove it completely */
11035         if (invlist_len(nonbitmap) == 0) {
11036             SvREFCNT_dec(nonbitmap);
11037             nonbitmap = NULL;
11038         }
11039     }
11040
11041     /* Here, we have calculated what code points should be in the character
11042      * class.  <nonbitmap> does not overlap the bitmap except possibly in the
11043      * case of DEPENDS rules.
11044      *
11045      * Now we can see about various optimizations.  Fold calculation (which we
11046      * did above) needs to take place before inversion.  Otherwise /[^k]/i
11047      * would invert to include K, which under /i would match k, which it
11048      * shouldn't. */
11049
11050     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
11051      * set the FOLD flag yet, so this does optimize those.  It doesn't
11052      * optimize locale.  Doing so perhaps could be done as long as there is
11053      * nothing like \w in it; some thought also would have to be given to the
11054      * interaction with above 0x100 chars */
11055     if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11056         && ! LOC
11057         && ! unicode_alternate
11058         /* In case of /d, there are some things that should match only when in
11059          * not in the bitmap, i.e., they require UTF8 to match.  These are
11060          * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11061          * case, they don't require UTF8, so can invert here */
11062         && (! nonbitmap
11063             || ! DEPENDS_SEMANTICS
11064             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11065         && SvCUR(listsv) == initial_listsv_len)
11066     {
11067         int i;
11068         if (! nonbitmap) {
11069             for (i = 0; i < 256; ++i) {
11070                 if (ANYOF_BITMAP_TEST(ret, i)) {
11071                     ANYOF_BITMAP_CLEAR(ret, i);
11072                 }
11073                 else {
11074                     ANYOF_BITMAP_SET(ret, i);
11075                     prevvalue = value;
11076                     value = i;
11077                 }
11078             }
11079             /* The inversion means that everything above 255 is matched */
11080             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11081         }
11082         else {
11083             /* Here, also has things outside the bitmap that may overlap with
11084              * the bitmap.  We have to sync them up, so that they get inverted
11085              * in both places.  Earlier, we removed all overlaps except in the
11086              * case of /d rules, so no syncing is needed except for this case
11087              */
11088             SV *remove_list = NULL;
11089
11090             if (DEPENDS_SEMANTICS) {
11091                 UV start, end;
11092
11093                 /* Set the bits that correspond to the ones that aren't in the
11094                  * bitmap.  Otherwise, when we invert, we'll miss these.
11095                  * Earlier, we removed from the nonbitmap all code points
11096                  * < 128, so there is no extra work here */
11097                 invlist_iterinit(nonbitmap);
11098                 while (invlist_iternext(nonbitmap, &start, &end)) {
11099                     if (start > 255) {  /* The bit map goes to 255 */
11100                         break;
11101                     }
11102                     if (end > 255) {
11103                         end = 255;
11104                     }
11105                     for (i = start; i <= (int) end; ++i) {
11106                         ANYOF_BITMAP_SET(ret, i);
11107                         prevvalue = value;
11108                         value = i;
11109                     }
11110                 }
11111             }
11112
11113             /* Now invert both the bitmap and the nonbitmap.  Anything in the
11114              * bitmap has to also be removed from the non-bitmap, but again,
11115              * there should not be overlap unless is /d rules. */
11116             _invlist_invert(nonbitmap);
11117
11118             for (i = 0; i < 256; ++i) {
11119                 if (ANYOF_BITMAP_TEST(ret, i)) {
11120                     ANYOF_BITMAP_CLEAR(ret, i);
11121                     if (DEPENDS_SEMANTICS) {
11122                         if (! remove_list) {
11123                             remove_list = _new_invlist(2);
11124                         }
11125                         remove_list = add_cp_to_invlist(remove_list, i);
11126                     }
11127                 }
11128                 else {
11129                     ANYOF_BITMAP_SET(ret, i);
11130                     prevvalue = value;
11131                     value = i;
11132                 }
11133             }
11134
11135             /* And do the removal */
11136             if (DEPENDS_SEMANTICS) {
11137                 if (remove_list) {
11138                     _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11139                     SvREFCNT_dec(remove_list);
11140                 }
11141             }
11142             else {
11143                 /* There is no overlap for non-/d, so just delete anything
11144                  * below 256 */
11145                 SV* keep_list = _new_invlist(2);
11146                 _append_range_to_invlist(keep_list, 256, UV_MAX);
11147                 _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
11148                 SvREFCNT_dec(keep_list);
11149             }
11150         }
11151
11152         stored = 256 - stored;
11153
11154         /* Clear the invert flag since have just done it here */
11155         ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11156     }
11157
11158     /* Folding in the bitmap is taken care of above, but not for locale (for
11159      * which we have to wait to see what folding is in effect at runtime), and
11160      * for some things not in the bitmap (only the upper latin folds in this
11161      * case, as all other single-char folding has been set above).  Set
11162      * run-time fold flag for these */
11163     if (FOLD && (LOC
11164                 || (DEPENDS_SEMANTICS
11165                     && nonbitmap
11166                     && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11167                 || unicode_alternate))
11168     {
11169         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11170     }
11171
11172     /* A single character class can be "optimized" into an EXACTish node.
11173      * Note that since we don't currently count how many characters there are
11174      * outside the bitmap, we are XXX missing optimization possibilities for
11175      * them.  This optimization can't happen unless this is a truly single
11176      * character class, which means that it can't be an inversion into a
11177      * many-character class, and there must be no possibility of there being
11178      * things outside the bitmap.  'stored' (only) for locales doesn't include
11179      * \w, etc, so have to make a special test that they aren't present
11180      *
11181      * Similarly A 2-character class of the very special form like [bB] can be
11182      * optimized into an EXACTFish node, but only for non-locales, and for
11183      * characters which only have the two folds; so things like 'fF' and 'Ii'
11184      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11185      * FI'. */
11186     if (! nonbitmap
11187         && ! unicode_alternate
11188         && SvCUR(listsv) == initial_listsv_len
11189         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11190         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11191                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11192             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11193                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11194                                  /* If the latest code point has a fold whose
11195                                   * bit is set, it must be the only other one */
11196                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11197                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11198     {
11199         /* Note that the information needed to decide to do this optimization
11200          * is not currently available until the 2nd pass, and that the actually
11201          * used EXACTish node takes less space than the calculated ANYOF node,
11202          * and hence the amount of space calculated in the first pass is larger
11203          * than actually used, so this optimization doesn't gain us any space.
11204          * But an EXACT node is faster than an ANYOF node, and can be combined
11205          * with any adjacent EXACT nodes later by the optimizer for further
11206          * gains.  The speed of executing an EXACTF is similar to an ANYOF
11207          * node, so the optimization advantage comes from the ability to join
11208          * it to adjacent EXACT nodes */
11209
11210         const char * cur_parse= RExC_parse;
11211         U8 op;
11212         RExC_emit = (regnode *)orig_emit;
11213         RExC_parse = (char *)orig_parse;
11214
11215         if (stored == 1) {
11216
11217             /* A locale node with one point can be folded; all the other cases
11218              * with folding will have two points, since we calculate them above
11219              */
11220             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11221                  op = EXACTFL;
11222             }
11223             else {
11224                 op = EXACT;
11225             }
11226         }
11227         else {   /* else 2 chars in the bit map: the folds of each other */
11228
11229             /* Use the folded value, which for the cases where we get here,
11230              * is just the lower case of the current one (which may resolve to
11231              * itself, or to the other one */
11232             value = toLOWER_LATIN1(value);
11233
11234             /* To join adjacent nodes, they must be the exact EXACTish type.
11235              * Try to use the most likely type, by using EXACTFA if possible,
11236              * then EXACTFU if the regex calls for it, or is required because
11237              * the character is non-ASCII.  (If <value> is ASCII, its fold is
11238              * also ASCII for the cases where we get here.) */
11239             if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11240                 op = EXACTFA;
11241             }
11242             else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11243                 op = EXACTFU;
11244             }
11245             else {    /* Otherwise, more likely to be EXACTF type */
11246                 op = EXACTF;
11247             }
11248         }
11249
11250         ret = reg_node(pRExC_state, op);
11251         RExC_parse = (char *)cur_parse;
11252         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11253             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11254             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11255             STR_LEN(ret)= 2;
11256             RExC_emit += STR_SZ(2);
11257         }
11258         else {
11259             *STRING(ret)= (char)value;
11260             STR_LEN(ret)= 1;
11261             RExC_emit += STR_SZ(1);
11262         }
11263         SvREFCNT_dec(listsv);
11264         return ret;
11265     }
11266
11267     /* If there is a swash and more than one element, we can't use the swash in
11268      * the optimization below. */
11269     if (swash && element_count > 1) {
11270         SvREFCNT_dec(swash);
11271         swash = NULL;
11272     }
11273     if (! nonbitmap
11274         && SvCUR(listsv) == initial_listsv_len
11275         && ! unicode_alternate)
11276     {
11277         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11278         SvREFCNT_dec(listsv);
11279         SvREFCNT_dec(unicode_alternate);
11280     }
11281     else {
11282         /* av[0] stores the character class description in its textual form:
11283          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
11284          *       appropriate swash, and is also useful for dumping the regnode.
11285          * av[1] if NULL, is a placeholder to later contain the swash computed
11286          *       from av[0].  But if no further computation need be done, the
11287          *       swash is stored there now.
11288          * av[2] stores the multicharacter foldings, used later in
11289          *       regexec.c:S_reginclass().
11290          * av[3] stores the nonbitmap inversion list for use in addition or
11291          *       instead of av[0]; not used if av[1] isn't NULL
11292          * av[4] is set if any component of the class is from a user-defined
11293          *       property; not used if av[1] isn't NULL */
11294         AV * const av = newAV();
11295         SV *rv;
11296
11297         av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11298                         ? &PL_sv_undef
11299                         : listsv);
11300         if (swash) {
11301             av_store(av, 1, swash);
11302             SvREFCNT_dec(nonbitmap);
11303         }
11304         else {
11305             av_store(av, 1, NULL);
11306             if (nonbitmap) {
11307                 av_store(av, 3, nonbitmap);
11308                 av_store(av, 4, newSVuv(has_user_defined_property));
11309             }
11310         }
11311
11312         /* Store any computed multi-char folds only if we are allowing
11313          * them */
11314         if (allow_full_fold) {
11315             av_store(av, 2, MUTABLE_SV(unicode_alternate));
11316             if (unicode_alternate) { /* This node is variable length */
11317                 OP(ret) = ANYOFV;
11318             }
11319         }
11320         else {
11321             av_store(av, 2, NULL);
11322         }
11323         rv = newRV_noinc(MUTABLE_SV(av));
11324         n = add_data(pRExC_state, 1, "s");
11325         RExC_rxi->data->data[n] = (void*)rv;
11326         ARG_SET(ret, n);
11327     }
11328     return ret;
11329 }
11330 #undef _C_C_T_
11331
11332
11333 /* reg_skipcomment()
11334
11335    Absorbs an /x style # comments from the input stream.
11336    Returns true if there is more text remaining in the stream.
11337    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11338    terminates the pattern without including a newline.
11339
11340    Note its the callers responsibility to ensure that we are
11341    actually in /x mode
11342
11343 */
11344
11345 STATIC bool
11346 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11347 {
11348     bool ended = 0;
11349
11350     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11351
11352     while (RExC_parse < RExC_end)
11353         if (*RExC_parse++ == '\n') {
11354             ended = 1;
11355             break;
11356         }
11357     if (!ended) {
11358         /* we ran off the end of the pattern without ending
11359            the comment, so we have to add an \n when wrapping */
11360         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11361         return 0;
11362     } else
11363         return 1;
11364 }
11365
11366 /* nextchar()
11367
11368    Advances the parse position, and optionally absorbs
11369    "whitespace" from the inputstream.
11370
11371    Without /x "whitespace" means (?#...) style comments only,
11372    with /x this means (?#...) and # comments and whitespace proper.
11373
11374    Returns the RExC_parse point from BEFORE the scan occurs.
11375
11376    This is the /x friendly way of saying RExC_parse++.
11377 */
11378
11379 STATIC char*
11380 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11381 {
11382     char* const retval = RExC_parse++;
11383
11384     PERL_ARGS_ASSERT_NEXTCHAR;
11385
11386     for (;;) {
11387         if (RExC_end - RExC_parse >= 3
11388             && *RExC_parse == '('
11389             && RExC_parse[1] == '?'
11390             && RExC_parse[2] == '#')
11391         {
11392             while (*RExC_parse != ')') {
11393                 if (RExC_parse == RExC_end)
11394                     FAIL("Sequence (?#... not terminated");
11395                 RExC_parse++;
11396             }
11397             RExC_parse++;
11398             continue;
11399         }
11400         if (RExC_flags & RXf_PMf_EXTENDED) {
11401             if (isSPACE(*RExC_parse)) {
11402                 RExC_parse++;
11403                 continue;
11404             }
11405             else if (*RExC_parse == '#') {
11406                 if ( reg_skipcomment( pRExC_state ) )
11407                     continue;
11408             }
11409         }
11410         return retval;
11411     }
11412 }
11413
11414 /*
11415 - reg_node - emit a node
11416 */
11417 STATIC regnode *                        /* Location. */
11418 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11419 {
11420     dVAR;
11421     register regnode *ptr;
11422     regnode * const ret = RExC_emit;
11423     GET_RE_DEBUG_FLAGS_DECL;
11424
11425     PERL_ARGS_ASSERT_REG_NODE;
11426
11427     if (SIZE_ONLY) {
11428         SIZE_ALIGN(RExC_size);
11429         RExC_size += 1;
11430         return(ret);
11431     }
11432     if (RExC_emit >= RExC_emit_bound)
11433         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11434                    op, RExC_emit, RExC_emit_bound);
11435
11436     NODE_ALIGN_FILL(ret);
11437     ptr = ret;
11438     FILL_ADVANCE_NODE(ptr, op);
11439     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
11440 #ifdef RE_TRACK_PATTERN_OFFSETS
11441     if (RExC_offsets) {         /* MJD */
11442         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11443               "reg_node", __LINE__,
11444               PL_reg_name[op],
11445               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11446                 ? "Overwriting end of array!\n" : "OK",
11447               (UV)(RExC_emit - RExC_emit_start),
11448               (UV)(RExC_parse - RExC_start),
11449               (UV)RExC_offsets[0]));
11450         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11451     }
11452 #endif
11453     RExC_emit = ptr;
11454     return(ret);
11455 }
11456
11457 /*
11458 - reganode - emit a node with an argument
11459 */
11460 STATIC regnode *                        /* Location. */
11461 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11462 {
11463     dVAR;
11464     register regnode *ptr;
11465     regnode * const ret = RExC_emit;
11466     GET_RE_DEBUG_FLAGS_DECL;
11467
11468     PERL_ARGS_ASSERT_REGANODE;
11469
11470     if (SIZE_ONLY) {
11471         SIZE_ALIGN(RExC_size);
11472         RExC_size += 2;
11473         /*
11474            We can't do this:
11475
11476            assert(2==regarglen[op]+1);
11477
11478            Anything larger than this has to allocate the extra amount.
11479            If we changed this to be:
11480
11481            RExC_size += (1 + regarglen[op]);
11482
11483            then it wouldn't matter. Its not clear what side effect
11484            might come from that so its not done so far.
11485            -- dmq
11486         */
11487         return(ret);
11488     }
11489     if (RExC_emit >= RExC_emit_bound)
11490         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11491                    op, RExC_emit, RExC_emit_bound);
11492
11493     NODE_ALIGN_FILL(ret);
11494     ptr = ret;
11495     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11496     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
11497 #ifdef RE_TRACK_PATTERN_OFFSETS
11498     if (RExC_offsets) {         /* MJD */
11499         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11500               "reganode",
11501               __LINE__,
11502               PL_reg_name[op],
11503               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11504               "Overwriting end of array!\n" : "OK",
11505               (UV)(RExC_emit - RExC_emit_start),
11506               (UV)(RExC_parse - RExC_start),
11507               (UV)RExC_offsets[0]));
11508         Set_Cur_Node_Offset;
11509     }
11510 #endif
11511     RExC_emit = ptr;
11512     return(ret);
11513 }
11514
11515 /*
11516 - reguni - emit (if appropriate) a Unicode character
11517 */
11518 STATIC STRLEN
11519 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11520 {
11521     dVAR;
11522
11523     PERL_ARGS_ASSERT_REGUNI;
11524
11525     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11526 }
11527
11528 /*
11529 - reginsert - insert an operator in front of already-emitted operand
11530 *
11531 * Means relocating the operand.
11532 */
11533 STATIC void
11534 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11535 {
11536     dVAR;
11537     register regnode *src;
11538     register regnode *dst;
11539     register regnode *place;
11540     const int offset = regarglen[(U8)op];
11541     const int size = NODE_STEP_REGNODE + offset;
11542     GET_RE_DEBUG_FLAGS_DECL;
11543
11544     PERL_ARGS_ASSERT_REGINSERT;
11545     PERL_UNUSED_ARG(depth);
11546 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11547     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11548     if (SIZE_ONLY) {
11549         RExC_size += size;
11550         return;
11551     }
11552
11553     src = RExC_emit;
11554     RExC_emit += size;
11555     dst = RExC_emit;
11556     if (RExC_open_parens) {
11557         int paren;
11558         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11559         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11560             if ( RExC_open_parens[paren] >= opnd ) {
11561                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11562                 RExC_open_parens[paren] += size;
11563             } else {
11564                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11565             }
11566             if ( RExC_close_parens[paren] >= opnd ) {
11567                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11568                 RExC_close_parens[paren] += size;
11569             } else {
11570                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11571             }
11572         }
11573     }
11574
11575     while (src > opnd) {
11576         StructCopy(--src, --dst, regnode);
11577 #ifdef RE_TRACK_PATTERN_OFFSETS
11578         if (RExC_offsets) {     /* MJD 20010112 */
11579             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11580                   "reg_insert",
11581                   __LINE__,
11582                   PL_reg_name[op],
11583                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11584                     ? "Overwriting end of array!\n" : "OK",
11585                   (UV)(src - RExC_emit_start),
11586                   (UV)(dst - RExC_emit_start),
11587                   (UV)RExC_offsets[0]));
11588             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
11589             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
11590         }
11591 #endif
11592     }
11593
11594
11595     place = opnd;               /* Op node, where operand used to be. */
11596 #ifdef RE_TRACK_PATTERN_OFFSETS
11597     if (RExC_offsets) {         /* MJD */
11598         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11599               "reginsert",
11600               __LINE__,
11601               PL_reg_name[op],
11602               (UV)(place - RExC_emit_start) > RExC_offsets[0]
11603               ? "Overwriting end of array!\n" : "OK",
11604               (UV)(place - RExC_emit_start),
11605               (UV)(RExC_parse - RExC_start),
11606               (UV)RExC_offsets[0]));
11607         Set_Node_Offset(place, RExC_parse);
11608         Set_Node_Length(place, 1);
11609     }
11610 #endif
11611     src = NEXTOPER(place);
11612     FILL_ADVANCE_NODE(place, op);
11613     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
11614     Zero(src, offset, regnode);
11615 }
11616
11617 /*
11618 - regtail - set the next-pointer at the end of a node chain of p to val.
11619 - SEE ALSO: regtail_study
11620 */
11621 /* TODO: All three parms should be const */
11622 STATIC void
11623 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11624 {
11625     dVAR;
11626     register regnode *scan;
11627     GET_RE_DEBUG_FLAGS_DECL;
11628
11629     PERL_ARGS_ASSERT_REGTAIL;
11630 #ifndef DEBUGGING
11631     PERL_UNUSED_ARG(depth);
11632 #endif
11633
11634     if (SIZE_ONLY)
11635         return;
11636
11637     /* Find last node. */
11638     scan = p;
11639     for (;;) {
11640         regnode * const temp = regnext(scan);
11641         DEBUG_PARSE_r({
11642             SV * const mysv=sv_newmortal();
11643             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
11644             regprop(RExC_rx, mysv, scan);
11645             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
11646                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
11647                     (temp == NULL ? "->" : ""),
11648                     (temp == NULL ? PL_reg_name[OP(val)] : "")
11649             );
11650         });
11651         if (temp == NULL)
11652             break;
11653         scan = temp;
11654     }
11655
11656     if (reg_off_by_arg[OP(scan)]) {
11657         ARG_SET(scan, val - scan);
11658     }
11659     else {
11660         NEXT_OFF(scan) = val - scan;
11661     }
11662 }
11663
11664 #ifdef DEBUGGING
11665 /*
11666 - regtail_study - set the next-pointer at the end of a node chain of p to val.
11667 - Look for optimizable sequences at the same time.
11668 - currently only looks for EXACT chains.
11669
11670 This is experimental code. The idea is to use this routine to perform
11671 in place optimizations on branches and groups as they are constructed,
11672 with the long term intention of removing optimization from study_chunk so
11673 that it is purely analytical.
11674
11675 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
11676 to control which is which.
11677
11678 */
11679 /* TODO: All four parms should be const */
11680
11681 STATIC U8
11682 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11683 {
11684     dVAR;
11685     register regnode *scan;
11686     U8 exact = PSEUDO;
11687 #ifdef EXPERIMENTAL_INPLACESCAN
11688     I32 min = 0;
11689 #endif
11690     GET_RE_DEBUG_FLAGS_DECL;
11691
11692     PERL_ARGS_ASSERT_REGTAIL_STUDY;
11693
11694
11695     if (SIZE_ONLY)
11696         return exact;
11697
11698     /* Find last node. */
11699
11700     scan = p;
11701     for (;;) {
11702         regnode * const temp = regnext(scan);
11703 #ifdef EXPERIMENTAL_INPLACESCAN
11704         if (PL_regkind[OP(scan)] == EXACT) {
11705             bool has_exactf_sharp_s;    /* Unexamined in this routine */
11706             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
11707                 return EXACT;
11708         }
11709 #endif
11710         if ( exact ) {
11711             switch (OP(scan)) {
11712                 case EXACT:
11713                 case EXACTF:
11714                 case EXACTFA:
11715                 case EXACTFU:
11716                 case EXACTFU_SS:
11717                 case EXACTFU_NO_TRIE:
11718                 case EXACTFL:
11719                         if( exact == PSEUDO )
11720                             exact= OP(scan);
11721                         else if ( exact != OP(scan) )
11722                             exact= 0;
11723                 case NOTHING:
11724                     break;
11725                 default:
11726                     exact= 0;
11727             }
11728         }
11729         DEBUG_PARSE_r({
11730             SV * const mysv=sv_newmortal();
11731             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
11732             regprop(RExC_rx, mysv, scan);
11733             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
11734                 SvPV_nolen_const(mysv),
11735                 REG_NODE_NUM(scan),
11736                 PL_reg_name[exact]);
11737         });
11738         if (temp == NULL)
11739             break;
11740         scan = temp;
11741     }
11742     DEBUG_PARSE_r({
11743         SV * const mysv_val=sv_newmortal();
11744         DEBUG_PARSE_MSG("");
11745         regprop(RExC_rx, mysv_val, val);
11746         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
11747                       SvPV_nolen_const(mysv_val),
11748                       (IV)REG_NODE_NUM(val),
11749                       (IV)(val - scan)
11750         );
11751     });
11752     if (reg_off_by_arg[OP(scan)]) {
11753         ARG_SET(scan, val - scan);
11754     }
11755     else {
11756         NEXT_OFF(scan) = val - scan;
11757     }
11758
11759     return exact;
11760 }
11761 #endif
11762
11763 /*
11764  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
11765  */
11766 #ifdef DEBUGGING
11767 static void
11768 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
11769 {
11770     int bit;
11771     int set=0;
11772     regex_charset cs;
11773
11774     for (bit=0; bit<32; bit++) {
11775         if (flags & (1<<bit)) {
11776             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
11777                 continue;
11778             }
11779             if (!set++ && lead)
11780                 PerlIO_printf(Perl_debug_log, "%s",lead);
11781             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
11782         }
11783     }
11784     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
11785             if (!set++ && lead) {
11786                 PerlIO_printf(Perl_debug_log, "%s",lead);
11787             }
11788             switch (cs) {
11789                 case REGEX_UNICODE_CHARSET:
11790                     PerlIO_printf(Perl_debug_log, "UNICODE");
11791                     break;
11792                 case REGEX_LOCALE_CHARSET:
11793                     PerlIO_printf(Perl_debug_log, "LOCALE");
11794                     break;
11795                 case REGEX_ASCII_RESTRICTED_CHARSET:
11796                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
11797                     break;
11798                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
11799                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
11800                     break;
11801                 default:
11802                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
11803                     break;
11804             }
11805     }
11806     if (lead)  {
11807         if (set)
11808             PerlIO_printf(Perl_debug_log, "\n");
11809         else
11810             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
11811     }
11812 }
11813 #endif
11814
11815 void
11816 Perl_regdump(pTHX_ const regexp *r)
11817 {
11818 #ifdef DEBUGGING
11819     dVAR;
11820     SV * const sv = sv_newmortal();
11821     SV *dsv= sv_newmortal();
11822     RXi_GET_DECL(r,ri);
11823     GET_RE_DEBUG_FLAGS_DECL;
11824
11825     PERL_ARGS_ASSERT_REGDUMP;
11826
11827     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
11828
11829     /* Header fields of interest. */
11830     if (r->anchored_substr) {
11831         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
11832             RE_SV_DUMPLEN(r->anchored_substr), 30);
11833         PerlIO_printf(Perl_debug_log,
11834                       "anchored %s%s at %"IVdf" ",
11835                       s, RE_SV_TAIL(r->anchored_substr),
11836                       (IV)r->anchored_offset);
11837     } else if (r->anchored_utf8) {
11838         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
11839             RE_SV_DUMPLEN(r->anchored_utf8), 30);
11840         PerlIO_printf(Perl_debug_log,
11841                       "anchored utf8 %s%s at %"IVdf" ",
11842                       s, RE_SV_TAIL(r->anchored_utf8),
11843                       (IV)r->anchored_offset);
11844     }
11845     if (r->float_substr) {
11846         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
11847             RE_SV_DUMPLEN(r->float_substr), 30);
11848         PerlIO_printf(Perl_debug_log,
11849                       "floating %s%s at %"IVdf"..%"UVuf" ",
11850                       s, RE_SV_TAIL(r->float_substr),
11851                       (IV)r->float_min_offset, (UV)r->float_max_offset);
11852     } else if (r->float_utf8) {
11853         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
11854             RE_SV_DUMPLEN(r->float_utf8), 30);
11855         PerlIO_printf(Perl_debug_log,
11856                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
11857                       s, RE_SV_TAIL(r->float_utf8),
11858                       (IV)r->float_min_offset, (UV)r->float_max_offset);
11859     }
11860     if (r->check_substr || r->check_utf8)
11861         PerlIO_printf(Perl_debug_log,
11862                       (const char *)
11863                       (r->check_substr == r->float_substr
11864                        && r->check_utf8 == r->float_utf8
11865                        ? "(checking floating" : "(checking anchored"));
11866     if (r->extflags & RXf_NOSCAN)
11867         PerlIO_printf(Perl_debug_log, " noscan");
11868     if (r->extflags & RXf_CHECK_ALL)
11869         PerlIO_printf(Perl_debug_log, " isall");
11870     if (r->check_substr || r->check_utf8)
11871         PerlIO_printf(Perl_debug_log, ") ");
11872
11873     if (ri->regstclass) {
11874         regprop(r, sv, ri->regstclass);
11875         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
11876     }
11877     if (r->extflags & RXf_ANCH) {
11878         PerlIO_printf(Perl_debug_log, "anchored");
11879         if (r->extflags & RXf_ANCH_BOL)
11880             PerlIO_printf(Perl_debug_log, "(BOL)");
11881         if (r->extflags & RXf_ANCH_MBOL)
11882             PerlIO_printf(Perl_debug_log, "(MBOL)");
11883         if (r->extflags & RXf_ANCH_SBOL)
11884             PerlIO_printf(Perl_debug_log, "(SBOL)");
11885         if (r->extflags & RXf_ANCH_GPOS)
11886             PerlIO_printf(Perl_debug_log, "(GPOS)");
11887         PerlIO_putc(Perl_debug_log, ' ');
11888     }
11889     if (r->extflags & RXf_GPOS_SEEN)
11890         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
11891     if (r->intflags & PREGf_SKIP)
11892         PerlIO_printf(Perl_debug_log, "plus ");
11893     if (r->intflags & PREGf_IMPLICIT)
11894         PerlIO_printf(Perl_debug_log, "implicit ");
11895     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
11896     if (r->extflags & RXf_EVAL_SEEN)
11897         PerlIO_printf(Perl_debug_log, "with eval ");
11898     PerlIO_printf(Perl_debug_log, "\n");
11899     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
11900 #else
11901     PERL_ARGS_ASSERT_REGDUMP;
11902     PERL_UNUSED_CONTEXT;
11903     PERL_UNUSED_ARG(r);
11904 #endif  /* DEBUGGING */
11905 }
11906
11907 /*
11908 - regprop - printable representation of opcode
11909 */
11910 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
11911 STMT_START { \
11912         if (do_sep) {                           \
11913             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
11914             if (flags & ANYOF_INVERT)           \
11915                 /*make sure the invert info is in each */ \
11916                 sv_catpvs(sv, "^");             \
11917             do_sep = 0;                         \
11918         }                                       \
11919 } STMT_END
11920
11921 void
11922 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
11923 {
11924 #ifdef DEBUGGING
11925     dVAR;
11926     register int k;
11927     RXi_GET_DECL(prog,progi);
11928     GET_RE_DEBUG_FLAGS_DECL;
11929
11930     PERL_ARGS_ASSERT_REGPROP;
11931
11932     sv_setpvs(sv, "");
11933
11934     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
11935         /* It would be nice to FAIL() here, but this may be called from
11936            regexec.c, and it would be hard to supply pRExC_state. */
11937         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
11938     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
11939
11940     k = PL_regkind[OP(o)];
11941
11942     if (k == EXACT) {
11943         sv_catpvs(sv, " ");
11944         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
11945          * is a crude hack but it may be the best for now since
11946          * we have no flag "this EXACTish node was UTF-8"
11947          * --jhi */
11948         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
11949                   PERL_PV_ESCAPE_UNI_DETECT |
11950                   PERL_PV_ESCAPE_NONASCII   |
11951                   PERL_PV_PRETTY_ELLIPSES   |
11952                   PERL_PV_PRETTY_LTGT       |
11953                   PERL_PV_PRETTY_NOCLEAR
11954                   );
11955     } else if (k == TRIE) {
11956         /* print the details of the trie in dumpuntil instead, as
11957          * progi->data isn't available here */
11958         const char op = OP(o);
11959         const U32 n = ARG(o);
11960         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11961                (reg_ac_data *)progi->data->data[n] :
11962                NULL;
11963         const reg_trie_data * const trie
11964             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11965
11966         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11967         DEBUG_TRIE_COMPILE_r(
11968             Perl_sv_catpvf(aTHX_ sv,
11969                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11970                 (UV)trie->startstate,
11971                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11972                 (UV)trie->wordcount,
11973                 (UV)trie->minlen,
11974                 (UV)trie->maxlen,
11975                 (UV)TRIE_CHARCOUNT(trie),
11976                 (UV)trie->uniquecharcount
11977             )
11978         );
11979         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11980             int i;
11981             int rangestart = -1;
11982             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11983             sv_catpvs(sv, "[");
11984             for (i = 0; i <= 256; i++) {
11985                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
11986                     if (rangestart == -1)
11987                         rangestart = i;
11988                 } else if (rangestart != -1) {
11989                     if (i <= rangestart + 3)
11990                         for (; rangestart < i; rangestart++)
11991                             put_byte(sv, rangestart);
11992                     else {
11993                         put_byte(sv, rangestart);
11994                         sv_catpvs(sv, "-");
11995                         put_byte(sv, i - 1);
11996                     }
11997                     rangestart = -1;
11998                 }
11999             }
12000             sv_catpvs(sv, "]");
12001         }
12002
12003     } else if (k == CURLY) {
12004         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12005             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12006         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12007     }
12008     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
12009         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12010     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12011         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
12012         if ( RXp_PAREN_NAMES(prog) ) {
12013             if ( k != REF || (OP(o) < NREF)) {
12014                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12015                 SV **name= av_fetch(list, ARG(o), 0 );
12016                 if (name)
12017                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12018             }
12019             else {
12020                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12021                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12022                 I32 *nums=(I32*)SvPVX(sv_dat);
12023                 SV **name= av_fetch(list, nums[0], 0 );
12024                 I32 n;
12025                 if (name) {
12026                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
12027                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12028                                     (n ? "," : ""), (IV)nums[n]);
12029                     }
12030                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12031                 }
12032             }
12033         }
12034     } else if (k == GOSUB)
12035         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12036     else if (k == VERB) {
12037         if (!o->flags)
12038             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12039                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12040     } else if (k == LOGICAL)
12041         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
12042     else if (k == ANYOF) {
12043         int i, rangestart = -1;
12044         const U8 flags = ANYOF_FLAGS(o);
12045         int do_sep = 0;
12046
12047         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12048         static const char * const anyofs[] = {
12049             "\\w",
12050             "\\W",
12051             "\\s",
12052             "\\S",
12053             "\\d",
12054             "\\D",
12055             "[:alnum:]",
12056             "[:^alnum:]",
12057             "[:alpha:]",
12058             "[:^alpha:]",
12059             "[:ascii:]",
12060             "[:^ascii:]",
12061             "[:cntrl:]",
12062             "[:^cntrl:]",
12063             "[:graph:]",
12064             "[:^graph:]",
12065             "[:lower:]",
12066             "[:^lower:]",
12067             "[:print:]",
12068             "[:^print:]",
12069             "[:punct:]",
12070             "[:^punct:]",
12071             "[:upper:]",
12072             "[:^upper:]",
12073             "[:xdigit:]",
12074             "[:^xdigit:]",
12075             "[:space:]",
12076             "[:^space:]",
12077             "[:blank:]",
12078             "[:^blank:]"
12079         };
12080
12081         if (flags & ANYOF_LOCALE)
12082             sv_catpvs(sv, "{loc}");
12083         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12084             sv_catpvs(sv, "{i}");
12085         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12086         if (flags & ANYOF_INVERT)
12087             sv_catpvs(sv, "^");
12088
12089         /* output what the standard cp 0-255 bitmap matches */
12090         for (i = 0; i <= 256; i++) {
12091             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12092                 if (rangestart == -1)
12093                     rangestart = i;
12094             } else if (rangestart != -1) {
12095                 if (i <= rangestart + 3)
12096                     for (; rangestart < i; rangestart++)
12097                         put_byte(sv, rangestart);
12098                 else {
12099                     put_byte(sv, rangestart);
12100                     sv_catpvs(sv, "-");
12101                     put_byte(sv, i - 1);
12102                 }
12103                 do_sep = 1;
12104                 rangestart = -1;
12105             }
12106         }
12107
12108         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12109         /* output any special charclass tests (used entirely under use locale) */
12110         if (ANYOF_CLASS_TEST_ANY_SET(o))
12111             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12112                 if (ANYOF_CLASS_TEST(o,i)) {
12113                     sv_catpv(sv, anyofs[i]);
12114                     do_sep = 1;
12115                 }
12116
12117         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12118
12119         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12120             sv_catpvs(sv, "{non-utf8-latin1-all}");
12121         }
12122
12123         /* output information about the unicode matching */
12124         if (flags & ANYOF_UNICODE_ALL)
12125             sv_catpvs(sv, "{unicode_all}");
12126         else if (ANYOF_NONBITMAP(o))
12127             sv_catpvs(sv, "{unicode}");
12128         if (flags & ANYOF_NONBITMAP_NON_UTF8)
12129             sv_catpvs(sv, "{outside bitmap}");
12130
12131         if (ANYOF_NONBITMAP(o)) {
12132             SV *lv; /* Set if there is something outside the bit map */
12133             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12134             bool byte_output = FALSE;   /* If something in the bitmap has been
12135                                            output */
12136
12137             if (lv && lv != &PL_sv_undef) {
12138                 if (sw) {
12139                     U8 s[UTF8_MAXBYTES_CASE+1];
12140
12141                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12142                         uvchr_to_utf8(s, i);
12143
12144                         if (i < 256
12145                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
12146                                                                things already
12147                                                                output as part
12148                                                                of the bitmap */
12149                             && swash_fetch(sw, s, TRUE))
12150                         {
12151                             if (rangestart == -1)
12152                                 rangestart = i;
12153                         } else if (rangestart != -1) {
12154                             byte_output = TRUE;
12155                             if (i <= rangestart + 3)
12156                                 for (; rangestart < i; rangestart++) {
12157                                     put_byte(sv, rangestart);
12158                                 }
12159                             else {
12160                                 put_byte(sv, rangestart);
12161                                 sv_catpvs(sv, "-");
12162                                 put_byte(sv, i-1);
12163                             }
12164                             rangestart = -1;
12165                         }
12166                     }
12167                 }
12168
12169                 {
12170                     char *s = savesvpv(lv);
12171                     char * const origs = s;
12172
12173                     while (*s && *s != '\n')
12174                         s++;
12175
12176                     if (*s == '\n') {
12177                         const char * const t = ++s;
12178
12179                         if (byte_output) {
12180                             sv_catpvs(sv, " ");
12181                         }
12182
12183                         while (*s) {
12184                             if (*s == '\n') {
12185
12186                                 /* Truncate very long output */
12187                                 if (s - origs > 256) {
12188                                     Perl_sv_catpvf(aTHX_ sv,
12189                                                    "%.*s...",
12190                                                    (int) (s - origs - 1),
12191                                                    t);
12192                                     goto out_dump;
12193                                 }
12194                                 *s = ' ';
12195                             }
12196                             else if (*s == '\t') {
12197                                 *s = '-';
12198                             }
12199                             s++;
12200                         }
12201                         if (s[-1] == ' ')
12202                             s[-1] = 0;
12203
12204                         sv_catpv(sv, t);
12205                     }
12206
12207                 out_dump:
12208
12209                     Safefree(origs);
12210                 }
12211                 SvREFCNT_dec(lv);
12212             }
12213         }
12214
12215         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12216     }
12217     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12218         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12219 #else
12220     PERL_UNUSED_CONTEXT;
12221     PERL_UNUSED_ARG(sv);
12222     PERL_UNUSED_ARG(o);
12223     PERL_UNUSED_ARG(prog);
12224 #endif  /* DEBUGGING */
12225 }
12226
12227 SV *
12228 Perl_re_intuit_string(pTHX_ REGEXP * const r)
12229 {                               /* Assume that RE_INTUIT is set */
12230     dVAR;
12231     struct regexp *const prog = (struct regexp *)SvANY(r);
12232     GET_RE_DEBUG_FLAGS_DECL;
12233
12234     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12235     PERL_UNUSED_CONTEXT;
12236
12237     DEBUG_COMPILE_r(
12238         {
12239             const char * const s = SvPV_nolen_const(prog->check_substr
12240                       ? prog->check_substr : prog->check_utf8);
12241
12242             if (!PL_colorset) reginitcolors();
12243             PerlIO_printf(Perl_debug_log,
12244                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12245                       PL_colors[4],
12246                       prog->check_substr ? "" : "utf8 ",
12247                       PL_colors[5],PL_colors[0],
12248                       s,
12249                       PL_colors[1],
12250                       (strlen(s) > 60 ? "..." : ""));
12251         } );
12252
12253     return prog->check_substr ? prog->check_substr : prog->check_utf8;
12254 }
12255
12256 /*
12257    pregfree()
12258
12259    handles refcounting and freeing the perl core regexp structure. When
12260    it is necessary to actually free the structure the first thing it
12261    does is call the 'free' method of the regexp_engine associated to
12262    the regexp, allowing the handling of the void *pprivate; member
12263    first. (This routine is not overridable by extensions, which is why
12264    the extensions free is called first.)
12265
12266    See regdupe and regdupe_internal if you change anything here.
12267 */
12268 #ifndef PERL_IN_XSUB_RE
12269 void
12270 Perl_pregfree(pTHX_ REGEXP *r)
12271 {
12272     SvREFCNT_dec(r);
12273 }
12274
12275 void
12276 Perl_pregfree2(pTHX_ REGEXP *rx)
12277 {
12278     dVAR;
12279     struct regexp *const r = (struct regexp *)SvANY(rx);
12280     GET_RE_DEBUG_FLAGS_DECL;
12281
12282     PERL_ARGS_ASSERT_PREGFREE2;
12283
12284     if (r->mother_re) {
12285         ReREFCNT_dec(r->mother_re);
12286     } else {
12287         CALLREGFREE_PVT(rx); /* free the private data */
12288         SvREFCNT_dec(RXp_PAREN_NAMES(r));
12289     }
12290     if (r->substrs) {
12291         SvREFCNT_dec(r->anchored_substr);
12292         SvREFCNT_dec(r->anchored_utf8);
12293         SvREFCNT_dec(r->float_substr);
12294         SvREFCNT_dec(r->float_utf8);
12295         Safefree(r->substrs);
12296     }
12297     RX_MATCH_COPY_FREE(rx);
12298 #ifdef PERL_OLD_COPY_ON_WRITE
12299     SvREFCNT_dec(r->saved_copy);
12300 #endif
12301     Safefree(r->offs);
12302 }
12303
12304 /*  reg_temp_copy()
12305
12306     This is a hacky workaround to the structural issue of match results
12307     being stored in the regexp structure which is in turn stored in
12308     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12309     could be PL_curpm in multiple contexts, and could require multiple
12310     result sets being associated with the pattern simultaneously, such
12311     as when doing a recursive match with (??{$qr})
12312
12313     The solution is to make a lightweight copy of the regexp structure
12314     when a qr// is returned from the code executed by (??{$qr}) this
12315     lightweight copy doesn't actually own any of its data except for
12316     the starp/end and the actual regexp structure itself.
12317
12318 */
12319
12320
12321 REGEXP *
12322 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12323 {
12324     struct regexp *ret;
12325     struct regexp *const r = (struct regexp *)SvANY(rx);
12326     register const I32 npar = r->nparens+1;
12327
12328     PERL_ARGS_ASSERT_REG_TEMP_COPY;
12329
12330     if (!ret_x)
12331         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12332     ret = (struct regexp *)SvANY(ret_x);
12333
12334     (void)ReREFCNT_inc(rx);
12335     /* We can take advantage of the existing "copied buffer" mechanism in SVs
12336        by pointing directly at the buffer, but flagging that the allocated
12337        space in the copy is zero. As we've just done a struct copy, it's now
12338        a case of zero-ing that, rather than copying the current length.  */
12339     SvPV_set(ret_x, RX_WRAPPED(rx));
12340     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12341     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12342            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12343     SvLEN_set(ret_x, 0);
12344     SvSTASH_set(ret_x, NULL);
12345     SvMAGIC_set(ret_x, NULL);
12346     Newx(ret->offs, npar, regexp_paren_pair);
12347     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12348     if (r->substrs) {
12349         Newx(ret->substrs, 1, struct reg_substr_data);
12350         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12351
12352         SvREFCNT_inc_void(ret->anchored_substr);
12353         SvREFCNT_inc_void(ret->anchored_utf8);
12354         SvREFCNT_inc_void(ret->float_substr);
12355         SvREFCNT_inc_void(ret->float_utf8);
12356
12357         /* check_substr and check_utf8, if non-NULL, point to either their
12358            anchored or float namesakes, and don't hold a second reference.  */
12359     }
12360     RX_MATCH_COPIED_off(ret_x);
12361 #ifdef PERL_OLD_COPY_ON_WRITE
12362     ret->saved_copy = NULL;
12363 #endif
12364     ret->mother_re = rx;
12365
12366     return ret_x;
12367 }
12368 #endif
12369
12370 /* regfree_internal()
12371
12372    Free the private data in a regexp. This is overloadable by
12373    extensions. Perl takes care of the regexp structure in pregfree(),
12374    this covers the *pprivate pointer which technically perl doesn't
12375    know about, however of course we have to handle the
12376    regexp_internal structure when no extension is in use.
12377
12378    Note this is called before freeing anything in the regexp
12379    structure.
12380  */
12381
12382 void
12383 Perl_regfree_internal(pTHX_ REGEXP * const rx)
12384 {
12385     dVAR;
12386     struct regexp *const r = (struct regexp *)SvANY(rx);
12387     RXi_GET_DECL(r,ri);
12388     GET_RE_DEBUG_FLAGS_DECL;
12389
12390     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12391
12392     DEBUG_COMPILE_r({
12393         if (!PL_colorset)
12394             reginitcolors();
12395         {
12396             SV *dsv= sv_newmortal();
12397             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12398                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12399             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12400                 PL_colors[4],PL_colors[5],s);
12401         }
12402     });
12403 #ifdef RE_TRACK_PATTERN_OFFSETS
12404     if (ri->u.offsets)
12405         Safefree(ri->u.offsets);             /* 20010421 MJD */
12406 #endif
12407     if (ri->data) {
12408         int n = ri->data->count;
12409         PAD* new_comppad = NULL;
12410         PAD* old_comppad;
12411         PADOFFSET refcnt;
12412
12413         while (--n >= 0) {
12414           /* If you add a ->what type here, update the comment in regcomp.h */
12415             switch (ri->data->what[n]) {
12416             case 'a':
12417             case 's':
12418             case 'S':
12419             case 'u':
12420                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12421                 break;
12422             case 'f':
12423                 Safefree(ri->data->data[n]);
12424                 break;
12425             case 'p':
12426                 new_comppad = MUTABLE_AV(ri->data->data[n]);
12427                 break;
12428             case 'o':
12429                 if (new_comppad == NULL)
12430                     Perl_croak(aTHX_ "panic: pregfree comppad");
12431                 PAD_SAVE_LOCAL(old_comppad,
12432                     /* Watch out for global destruction's random ordering. */
12433                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12434                 );
12435                 OP_REFCNT_LOCK;
12436                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12437                 OP_REFCNT_UNLOCK;
12438                 if (!refcnt)
12439                     op_free((OP_4tree*)ri->data->data[n]);
12440
12441                 PAD_RESTORE_LOCAL(old_comppad);
12442                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12443                 new_comppad = NULL;
12444                 break;
12445             case 'n':
12446                 break;
12447             case 'T':
12448                 { /* Aho Corasick add-on structure for a trie node.
12449                      Used in stclass optimization only */
12450                     U32 refcount;
12451                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12452                     OP_REFCNT_LOCK;
12453                     refcount = --aho->refcount;
12454                     OP_REFCNT_UNLOCK;
12455                     if ( !refcount ) {
12456                         PerlMemShared_free(aho->states);
12457                         PerlMemShared_free(aho->fail);
12458                          /* do this last!!!! */
12459                         PerlMemShared_free(ri->data->data[n]);
12460                         PerlMemShared_free(ri->regstclass);
12461                     }
12462                 }
12463                 break;
12464             case 't':
12465                 {
12466                     /* trie structure. */
12467                     U32 refcount;
12468                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12469                     OP_REFCNT_LOCK;
12470                     refcount = --trie->refcount;
12471                     OP_REFCNT_UNLOCK;
12472                     if ( !refcount ) {
12473                         PerlMemShared_free(trie->charmap);
12474                         PerlMemShared_free(trie->states);
12475                         PerlMemShared_free(trie->trans);
12476                         if (trie->bitmap)
12477                             PerlMemShared_free(trie->bitmap);
12478                         if (trie->jump)
12479                             PerlMemShared_free(trie->jump);
12480                         PerlMemShared_free(trie->wordinfo);
12481                         /* do this last!!!! */
12482                         PerlMemShared_free(ri->data->data[n]);
12483                     }
12484                 }
12485                 break;
12486             default:
12487                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12488             }
12489         }
12490         Safefree(ri->data->what);
12491         Safefree(ri->data);
12492     }
12493
12494     Safefree(ri);
12495 }
12496
12497 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12498 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
12499 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
12500
12501 /*
12502    re_dup - duplicate a regexp.
12503
12504    This routine is expected to clone a given regexp structure. It is only
12505    compiled under USE_ITHREADS.
12506
12507    After all of the core data stored in struct regexp is duplicated
12508    the regexp_engine.dupe method is used to copy any private data
12509    stored in the *pprivate pointer. This allows extensions to handle
12510    any duplication it needs to do.
12511
12512    See pregfree() and regfree_internal() if you change anything here.
12513 */
12514 #if defined(USE_ITHREADS)
12515 #ifndef PERL_IN_XSUB_RE
12516 void
12517 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12518 {
12519     dVAR;
12520     I32 npar;
12521     const struct regexp *r = (const struct regexp *)SvANY(sstr);
12522     struct regexp *ret = (struct regexp *)SvANY(dstr);
12523
12524     PERL_ARGS_ASSERT_RE_DUP_GUTS;
12525
12526     npar = r->nparens+1;
12527     Newx(ret->offs, npar, regexp_paren_pair);
12528     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12529     if(ret->swap) {
12530         /* no need to copy these */
12531         Newx(ret->swap, npar, regexp_paren_pair);
12532     }
12533
12534     if (ret->substrs) {
12535         /* Do it this way to avoid reading from *r after the StructCopy().
12536            That way, if any of the sv_dup_inc()s dislodge *r from the L1
12537            cache, it doesn't matter.  */
12538         const bool anchored = r->check_substr
12539             ? r->check_substr == r->anchored_substr
12540             : r->check_utf8 == r->anchored_utf8;
12541         Newx(ret->substrs, 1, struct reg_substr_data);
12542         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12543
12544         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12545         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12546         ret->float_substr = sv_dup_inc(ret->float_substr, param);
12547         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12548
12549         /* check_substr and check_utf8, if non-NULL, point to either their
12550            anchored or float namesakes, and don't hold a second reference.  */
12551
12552         if (ret->check_substr) {
12553             if (anchored) {
12554                 assert(r->check_utf8 == r->anchored_utf8);
12555                 ret->check_substr = ret->anchored_substr;
12556                 ret->check_utf8 = ret->anchored_utf8;
12557             } else {
12558                 assert(r->check_substr == r->float_substr);
12559                 assert(r->check_utf8 == r->float_utf8);
12560                 ret->check_substr = ret->float_substr;
12561                 ret->check_utf8 = ret->float_utf8;
12562             }
12563         } else if (ret->check_utf8) {
12564             if (anchored) {
12565                 ret->check_utf8 = ret->anchored_utf8;
12566             } else {
12567                 ret->check_utf8 = ret->float_utf8;
12568             }
12569         }
12570     }
12571
12572     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12573
12574     if (ret->pprivate)
12575         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12576
12577     if (RX_MATCH_COPIED(dstr))
12578         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
12579     else
12580         ret->subbeg = NULL;
12581 #ifdef PERL_OLD_COPY_ON_WRITE
12582     ret->saved_copy = NULL;
12583 #endif
12584
12585     if (ret->mother_re) {
12586         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12587             /* Our storage points directly to our mother regexp, but that's
12588                1: a buffer in a different thread
12589                2: something we no longer hold a reference on
12590                so we need to copy it locally.  */
12591             /* Note we need to use SvCUR(), rather than
12592                SvLEN(), on our mother_re, because it, in
12593                turn, may well be pointing to its own mother_re.  */
12594             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
12595                                    SvCUR(ret->mother_re)+1));
12596             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
12597         }
12598         ret->mother_re      = NULL;
12599     }
12600     ret->gofs = 0;
12601 }
12602 #endif /* PERL_IN_XSUB_RE */
12603
12604 /*
12605    regdupe_internal()
12606
12607    This is the internal complement to regdupe() which is used to copy
12608    the structure pointed to by the *pprivate pointer in the regexp.
12609    This is the core version of the extension overridable cloning hook.
12610    The regexp structure being duplicated will be copied by perl prior
12611    to this and will be provided as the regexp *r argument, however
12612    with the /old/ structures pprivate pointer value. Thus this routine
12613    may override any copying normally done by perl.
12614
12615    It returns a pointer to the new regexp_internal structure.
12616 */
12617
12618 void *
12619 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
12620 {
12621     dVAR;
12622     struct regexp *const r = (struct regexp *)SvANY(rx);
12623     regexp_internal *reti;
12624     int len;
12625     RXi_GET_DECL(r,ri);
12626
12627     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
12628
12629     len = ProgLen(ri);
12630
12631     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
12632     Copy(ri->program, reti->program, len+1, regnode);
12633
12634
12635     reti->regstclass = NULL;
12636
12637     if (ri->data) {
12638         struct reg_data *d;
12639         const int count = ri->data->count;
12640         int i;
12641
12642         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
12643                 char, struct reg_data);
12644         Newx(d->what, count, U8);
12645
12646         d->count = count;
12647         for (i = 0; i < count; i++) {
12648             d->what[i] = ri->data->what[i];
12649             switch (d->what[i]) {
12650                 /* legal options are one of: sSfpontTua
12651                    see also regcomp.h and pregfree() */
12652             case 'a': /* actually an AV, but the dup function is identical.  */
12653             case 's':
12654             case 'S':
12655             case 'p': /* actually an AV, but the dup function is identical.  */
12656             case 'u': /* actually an HV, but the dup function is identical.  */
12657                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
12658                 break;
12659             case 'f':
12660                 /* This is cheating. */
12661                 Newx(d->data[i], 1, struct regnode_charclass_class);
12662                 StructCopy(ri->data->data[i], d->data[i],
12663                             struct regnode_charclass_class);
12664                 reti->regstclass = (regnode*)d->data[i];
12665                 break;
12666             case 'o':
12667                 /* Compiled op trees are readonly and in shared memory,
12668                    and can thus be shared without duplication. */
12669                 OP_REFCNT_LOCK;
12670                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
12671                 OP_REFCNT_UNLOCK;
12672                 break;
12673             case 'T':
12674                 /* Trie stclasses are readonly and can thus be shared
12675                  * without duplication. We free the stclass in pregfree
12676                  * when the corresponding reg_ac_data struct is freed.
12677                  */
12678                 reti->regstclass= ri->regstclass;
12679                 /* Fall through */
12680             case 't':
12681                 OP_REFCNT_LOCK;
12682                 ((reg_trie_data*)ri->data->data[i])->refcount++;
12683                 OP_REFCNT_UNLOCK;
12684                 /* Fall through */
12685             case 'n':
12686                 d->data[i] = ri->data->data[i];
12687                 break;
12688             default:
12689                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
12690             }
12691         }
12692
12693         reti->data = d;
12694     }
12695     else
12696         reti->data = NULL;
12697
12698     reti->name_list_idx = ri->name_list_idx;
12699
12700 #ifdef RE_TRACK_PATTERN_OFFSETS
12701     if (ri->u.offsets) {
12702         Newx(reti->u.offsets, 2*len+1, U32);
12703         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
12704     }
12705 #else
12706     SetProgLen(reti,len);
12707 #endif
12708
12709     return (void*)reti;
12710 }
12711
12712 #endif    /* USE_ITHREADS */
12713
12714 #ifndef PERL_IN_XSUB_RE
12715
12716 /*
12717  - regnext - dig the "next" pointer out of a node
12718  */
12719 regnode *
12720 Perl_regnext(pTHX_ register regnode *p)
12721 {
12722     dVAR;
12723     register I32 offset;
12724
12725     if (!p)
12726         return(NULL);
12727
12728     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
12729         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
12730     }
12731
12732     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
12733     if (offset == 0)
12734         return(NULL);
12735
12736     return(p+offset);
12737 }
12738 #endif
12739
12740 STATIC void
12741 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
12742 {
12743     va_list args;
12744     STRLEN l1 = strlen(pat1);
12745     STRLEN l2 = strlen(pat2);
12746     char buf[512];
12747     SV *msv;
12748     const char *message;
12749
12750     PERL_ARGS_ASSERT_RE_CROAK2;
12751
12752     if (l1 > 510)
12753         l1 = 510;
12754     if (l1 + l2 > 510)
12755         l2 = 510 - l1;
12756     Copy(pat1, buf, l1 , char);
12757     Copy(pat2, buf + l1, l2 , char);
12758     buf[l1 + l2] = '\n';
12759     buf[l1 + l2 + 1] = '\0';
12760 #ifdef I_STDARG
12761     /* ANSI variant takes additional second argument */
12762     va_start(args, pat2);
12763 #else
12764     va_start(args);
12765 #endif
12766     msv = vmess(buf, &args);
12767     va_end(args);
12768     message = SvPV_const(msv,l1);
12769     if (l1 > 512)
12770         l1 = 512;
12771     Copy(message, buf, l1 , char);
12772     buf[l1-1] = '\0';                   /* Overwrite \n */
12773     Perl_croak(aTHX_ "%s", buf);
12774 }
12775
12776 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
12777
12778 #ifndef PERL_IN_XSUB_RE
12779 void
12780 Perl_save_re_context(pTHX)
12781 {
12782     dVAR;
12783
12784     struct re_save_state *state;
12785
12786     SAVEVPTR(PL_curcop);
12787     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
12788
12789     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
12790     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
12791     SSPUSHUV(SAVEt_RE_STATE);
12792
12793     Copy(&PL_reg_state, state, 1, struct re_save_state);
12794
12795     PL_reg_start_tmp = 0;
12796     PL_reg_start_tmpl = 0;
12797     PL_reg_oldsaved = NULL;
12798     PL_reg_oldsavedlen = 0;
12799     PL_reg_maxiter = 0;
12800     PL_reg_leftiter = 0;
12801     PL_reg_poscache = NULL;
12802     PL_reg_poscache_size = 0;
12803 #ifdef PERL_OLD_COPY_ON_WRITE
12804     PL_nrs = NULL;
12805 #endif
12806
12807     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
12808     if (PL_curpm) {
12809         const REGEXP * const rx = PM_GETRE(PL_curpm);
12810         if (rx) {
12811             U32 i;
12812             for (i = 1; i <= RX_NPARENS(rx); i++) {
12813                 char digits[TYPE_CHARS(long)];
12814                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
12815                 GV *const *const gvp
12816                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
12817
12818                 if (gvp) {
12819                     GV * const gv = *gvp;
12820                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
12821                         save_scalar(gv);
12822                 }
12823             }
12824         }
12825     }
12826 }
12827 #endif
12828
12829 static void
12830 clear_re(pTHX_ void *r)
12831 {
12832     dVAR;
12833     ReREFCNT_dec((REGEXP *)r);
12834 }
12835
12836 #ifdef DEBUGGING
12837
12838 STATIC void
12839 S_put_byte(pTHX_ SV *sv, int c)
12840 {
12841     PERL_ARGS_ASSERT_PUT_BYTE;
12842
12843     /* Our definition of isPRINT() ignores locales, so only bytes that are
12844        not part of UTF-8 are considered printable. I assume that the same
12845        holds for UTF-EBCDIC.
12846        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
12847        which Wikipedia says:
12848
12849        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
12850        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
12851        identical, to the ASCII delete (DEL) or rubout control character.
12852        ) So the old condition can be simplified to !isPRINT(c)  */
12853     if (!isPRINT(c)) {
12854         if (c < 256) {
12855             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
12856         }
12857         else {
12858             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
12859         }
12860     }
12861     else {
12862         const char string = c;
12863         if (c == '-' || c == ']' || c == '\\' || c == '^')
12864             sv_catpvs(sv, "\\");
12865         sv_catpvn(sv, &string, 1);
12866     }
12867 }
12868
12869
12870 #define CLEAR_OPTSTART \
12871     if (optstart) STMT_START { \
12872             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
12873             optstart=NULL; \
12874     } STMT_END
12875
12876 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
12877
12878 STATIC const regnode *
12879 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
12880             const regnode *last, const regnode *plast,
12881             SV* sv, I32 indent, U32 depth)
12882 {
12883     dVAR;
12884     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
12885     register const regnode *next;
12886     const regnode *optstart= NULL;
12887
12888     RXi_GET_DECL(r,ri);
12889     GET_RE_DEBUG_FLAGS_DECL;
12890
12891     PERL_ARGS_ASSERT_DUMPUNTIL;
12892
12893 #ifdef DEBUG_DUMPUNTIL
12894     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
12895         last ? last-start : 0,plast ? plast-start : 0);
12896 #endif
12897
12898     if (plast && plast < last)
12899         last= plast;
12900
12901     while (PL_regkind[op] != END && (!last || node < last)) {
12902         /* While that wasn't END last time... */
12903         NODE_ALIGN(node);
12904         op = OP(node);
12905         if (op == CLOSE || op == WHILEM)
12906             indent--;
12907         next = regnext((regnode *)node);
12908
12909         /* Where, what. */
12910         if (OP(node) == OPTIMIZED) {
12911             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
12912                 optstart = node;
12913             else
12914                 goto after_print;
12915         } else
12916             CLEAR_OPTSTART;
12917
12918         regprop(r, sv, node);
12919         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
12920                       (int)(2*indent + 1), "", SvPVX_const(sv));
12921
12922         if (OP(node) != OPTIMIZED) {
12923             if (next == NULL)           /* Next ptr. */
12924                 PerlIO_printf(Perl_debug_log, " (0)");
12925             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
12926                 PerlIO_printf(Perl_debug_log, " (FAIL)");
12927             else
12928                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
12929             (void)PerlIO_putc(Perl_debug_log, '\n');
12930         }
12931
12932       after_print:
12933         if (PL_regkind[(U8)op] == BRANCHJ) {
12934             assert(next);
12935             {
12936                 register const regnode *nnode = (OP(next) == LONGJMP
12937                                              ? regnext((regnode *)next)
12938                                              : next);
12939                 if (last && nnode > last)
12940                     nnode = last;
12941                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
12942             }
12943         }
12944         else if (PL_regkind[(U8)op] == BRANCH) {
12945             assert(next);
12946             DUMPUNTIL(NEXTOPER(node), next);
12947         }
12948         else if ( PL_regkind[(U8)op]  == TRIE ) {
12949             const regnode *this_trie = node;
12950             const char op = OP(node);
12951             const U32 n = ARG(node);
12952             const reg_ac_data * const ac = op>=AHOCORASICK ?
12953                (reg_ac_data *)ri->data->data[n] :
12954                NULL;
12955             const reg_trie_data * const trie =
12956                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
12957 #ifdef DEBUGGING
12958             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
12959 #endif
12960             const regnode *nextbranch= NULL;
12961             I32 word_idx;
12962             sv_setpvs(sv, "");
12963             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
12964                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
12965
12966                 PerlIO_printf(Perl_debug_log, "%*s%s ",
12967                    (int)(2*(indent+3)), "",
12968                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12969                             PL_colors[0], PL_colors[1],
12970                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12971                             PERL_PV_PRETTY_ELLIPSES    |
12972                             PERL_PV_PRETTY_LTGT
12973                             )
12974                             : "???"
12975                 );
12976                 if (trie->jump) {
12977                     U16 dist= trie->jump[word_idx+1];
12978                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12979                                   (UV)((dist ? this_trie + dist : next) - start));
12980                     if (dist) {
12981                         if (!nextbranch)
12982                             nextbranch= this_trie + trie->jump[0];
12983                         DUMPUNTIL(this_trie + dist, nextbranch);
12984                     }
12985                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12986                         nextbranch= regnext((regnode *)nextbranch);
12987                 } else {
12988                     PerlIO_printf(Perl_debug_log, "\n");
12989                 }
12990             }
12991             if (last && next > last)
12992                 node= last;
12993             else
12994                 node= next;
12995         }
12996         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
12997             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12998                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12999         }
13000         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13001             assert(next);
13002             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13003         }
13004         else if ( op == PLUS || op == STAR) {
13005             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13006         }
13007         else if (PL_regkind[(U8)op] == ANYOF) {
13008             /* arglen 1 + class block */
13009             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13010                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13011             node = NEXTOPER(node);
13012         }
13013         else if (PL_regkind[(U8)op] == EXACT) {
13014             /* Literal string, where present. */
13015             node += NODE_SZ_STR(node) - 1;
13016             node = NEXTOPER(node);
13017         }
13018         else {
13019             node = NEXTOPER(node);
13020             node += regarglen[(U8)op];
13021         }
13022         if (op == CURLYX || op == OPEN)
13023             indent++;
13024     }
13025     CLEAR_OPTSTART;
13026 #ifdef DEBUG_DUMPUNTIL
13027     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13028 #endif
13029     return node;
13030 }
13031
13032 #endif  /* DEBUGGING */
13033
13034 /*
13035  * Local variables:
13036  * c-indentation-style: bsd
13037  * c-basic-offset: 4
13038  * indent-tabs-mode: t
13039  * End:
13040  *
13041  * ex: set ts=8 sts=4 sw=4 noet:
13042  */