regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     I32         in_lookbehind;
 145 #if ADD_TO_REGEXEC
 146     char        *starttry;              /* -Dr: where regtry was called. */
 147 #define RExC_starttry   (pRExC_state->starttry)
 148 #endif
 149 #ifdef DEBUGGING
 150     const char  *lastparse;
 151     I32         lastnum;
 152     AV          *paren_name_list;       /* idx -> name */
 153 #define RExC_lastparse  (pRExC_state->lastparse)
 154 #define RExC_lastnum    (pRExC_state->lastnum)
 155 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 156 #endif
 157 } RExC_state_t;
 158
 159 #define RExC_flags      (pRExC_state->flags)
 160 #define RExC_precomp    (pRExC_state->precomp)
 161 #define RExC_rx_sv      (pRExC_state->rx_sv)
 162 #define RExC_rx         (pRExC_state->rx)
 163 #define RExC_rxi        (pRExC_state->rxi)
 164 #define RExC_start      (pRExC_state->start)
 165 #define RExC_end        (pRExC_state->end)
 166 #define RExC_parse      (pRExC_state->parse)
 167 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 168 #ifdef RE_TRACK_PATTERN_OFFSETS
 169 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 170 #endif
 171 #define RExC_emit       (pRExC_state->emit)
 172 #define RExC_emit_start (pRExC_state->emit_start)
 173 #define RExC_emit_bound (pRExC_state->emit_bound)
 174 #define RExC_naughty    (pRExC_state->naughty)
 175 #define RExC_sawback    (pRExC_state->sawback)
 176 #define RExC_seen       (pRExC_state->seen)
 177 #define RExC_size       (pRExC_state->size)
 178 #define RExC_npar       (pRExC_state->npar)
 179 #define RExC_nestroot   (pRExC_state->nestroot)
 180 #define RExC_extralen   (pRExC_state->extralen)
 181 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 182 #define RExC_seen_evals (pRExC_state->seen_evals)
 183 #define RExC_utf8       (pRExC_state->utf8)
 184 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 185 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 186 #define RExC_open_parens        (pRExC_state->open_parens)
 187 #define RExC_close_parens       (pRExC_state->close_parens)
 188 #define RExC_opend      (pRExC_state->opend)
 189 #define RExC_paren_names        (pRExC_state->paren_names)
 190 #define RExC_recurse    (pRExC_state->recurse)
 191 #define RExC_recurse_count      (pRExC_state->recurse_count)
 192 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 193
 194
 195 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 196 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 197         ((*s) == '{' && regcurly(s)))
 198
 199 #ifdef SPSTART
 200 #undef SPSTART          /* dratted cpp namespace... */
 201 #endif
 202 /*
 203  * Flags to be passed up and down.
 204  */
 205 #define WORST           0       /* Worst case. */
 206 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 207
 208 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 209  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 210 #define SIMPLE          0x02
 211 #define SPSTART         0x04    /* Starts with * or +. */
 212 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 213 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 214
 215 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 216
 217 /* whether trie related optimizations are enabled */
 218 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 219 #define TRIE_STUDY_OPT
 220 #define FULL_TRIE_STUDY
 221 #define TRIE_STCLASS
 222 #endif
 223
 224
 225
 226 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 227 #define PBITVAL(paren) (1 << ((paren) & 7))
 228 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 229 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 230 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 231
 232 /* If not already in utf8, do a longjmp back to the beginning */
 233 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 234 #define REQUIRE_UTF8    STMT_START {                                       \
 235                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 236                         } STMT_END
 237
 238 /* About scan_data_t.
 239
 240   During optimisation we recurse through the regexp program performing
 241   various inplace (keyhole style) optimisations. In addition study_chunk
 242   and scan_commit populate this data structure with information about
 243   what strings MUST appear in the pattern. We look for the longest
 244   string that must appear at a fixed location, and we look for the
 245   longest string that may appear at a floating location. So for instance
 246   in the pattern:
 247
 248     /FOO[xX]A.*B[xX]BAR/
 249
 250   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 251   strings (because they follow a .* construct). study_chunk will identify
 252   both FOO and BAR as being the longest fixed and floating strings respectively.
 253
 254   The strings can be composites, for instance
 255
 256      /(f)(o)(o)/
 257
 258   will result in a composite fixed substring 'foo'.
 259
 260   For each string some basic information is maintained:
 261
 262   - offset or min_offset
 263     This is the position the string must appear at, or not before.
 264     It also implicitly (when combined with minlenp) tells us how many
 265     characters must match before the string we are searching for.
 266     Likewise when combined with minlenp and the length of the string it
 267     tells us how many characters must appear after the string we have
 268     found.
 269
 270   - max_offset
 271     Only used for floating strings. This is the rightmost point that
 272     the string can appear at. If set to I32 max it indicates that the
 273     string can occur infinitely far to the right.
 274
 275   - minlenp
 276     A pointer to the minimum length of the pattern that the string
 277     was found inside. This is important as in the case of positive
 278     lookahead or positive lookbehind we can have multiple patterns
 279     involved. Consider
 280
 281     /(?=FOO).*F/
 282
 283     The minimum length of the pattern overall is 3, the minimum length
 284     of the lookahead part is 3, but the minimum length of the part that
 285     will actually match is 1. So 'FOO's minimum length is 3, but the
 286     minimum length for the F is 1. This is important as the minimum length
 287     is used to determine offsets in front of and behind the string being
 288     looked for.  Since strings can be composites this is the length of the
 289     pattern at the time it was committed with a scan_commit. Note that
 290     the length is calculated by study_chunk, so that the minimum lengths
 291     are not known until the full pattern has been compiled, thus the
 292     pointer to the value.
 293
 294   - lookbehind
 295
 296     In the case of lookbehind the string being searched for can be
 297     offset past the start point of the final matching string.
 298     If this value was just blithely removed from the min_offset it would
 299     invalidate some of the calculations for how many chars must match
 300     before or after (as they are derived from min_offset and minlen and
 301     the length of the string being searched for).
 302     When the final pattern is compiled and the data is moved from the
 303     scan_data_t structure into the regexp structure the information
 304     about lookbehind is factored in, with the information that would
 305     have been lost precalculated in the end_shift field for the
 306     associated string.
 307
 308   The fields pos_min and pos_delta are used to store the minimum offset
 309   and the delta to the maximum offset at the current point in the pattern.
 310
 311 */
 312
 313 typedef struct scan_data_t {
 314     /*I32 len_min;      unused */
 315     /*I32 len_delta;    unused */
 316     I32 pos_min;
 317     I32 pos_delta;
 318     SV *last_found;
 319     I32 last_end;           /* min value, <0 unless valid. */
 320     I32 last_start_min;
 321     I32 last_start_max;
 322     SV **longest;           /* Either &l_fixed, or &l_float. */
 323     SV *longest_fixed;      /* longest fixed string found in pattern */
 324     I32 offset_fixed;       /* offset where it starts */
 325     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 326     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 327     SV *longest_float;      /* longest floating string found in pattern */
 328     I32 offset_float_min;   /* earliest point in string it can appear */
 329     I32 offset_float_max;   /* latest point in string it can appear */
 330     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 331     I32 lookbehind_float;   /* is the position of the string modified by LB */
 332     I32 flags;
 333     I32 whilem_c;
 334     I32 *last_closep;
 335     struct regnode_charclass_class *start_class;
 336 } scan_data_t;
 337
 338 /*
 339  * Forward declarations for pregcomp()'s friends.
 340  */
 341
 342 static const scan_data_t zero_scan_data =
 343   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 344
 345 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 346 #define SF_BEFORE_SEOL          0x0001
 347 #define SF_BEFORE_MEOL          0x0002
 348 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 349 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 350
 351 #ifdef NO_UNARY_PLUS
 352 #  define SF_FIX_SHIFT_EOL      (0+2)
 353 #  define SF_FL_SHIFT_EOL               (0+4)
 354 #else
 355 #  define SF_FIX_SHIFT_EOL      (+2)
 356 #  define SF_FL_SHIFT_EOL               (+4)
 357 #endif
 358
 359 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 360 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 361
 362 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 363 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 364 #define SF_IS_INF               0x0040
 365 #define SF_HAS_PAR              0x0080
 366 #define SF_IN_PAR               0x0100
 367 #define SF_HAS_EVAL             0x0200
 368 #define SCF_DO_SUBSTR           0x0400
 369 #define SCF_DO_STCLASS_AND      0x0800
 370 #define SCF_DO_STCLASS_OR       0x1000
 371 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 372 #define SCF_WHILEM_VISITED_POS  0x2000
 373
 374 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 375 #define SCF_SEEN_ACCEPT         0x8000
 376
 377 #define UTF cBOOL(RExC_utf8)
 378 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 379 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 380 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 381 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 382 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 383 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 384 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 385
 386 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 387
 388 #define OOB_UNICODE             12345678
 389 #define OOB_NAMEDCLASS          -1
 390
 391 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 392 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 393
 394
 395 /* length of regex to show in messages that don't mark a position within */
 396 #define RegexLengthToShowInErrorMessages 127
 397
 398 /*
 399  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 400  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 401  * op/pragma/warn/regcomp.
 402  */
 403 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 404 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 405
 406 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 407
 408 /*
 409  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 410  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 411  * "...".
 412  */
 413 #define _FAIL(code) STMT_START {                                        \
 414     const char *ellipses = "";                                          \
 415     IV len = RExC_end - RExC_precomp;                                   \
 416                                                                         \
 417     if (!SIZE_ONLY)                                                     \
 418         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 419     if (len > RegexLengthToShowInErrorMessages) {                       \
 420         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 421         len = RegexLengthToShowInErrorMessages - 10;                    \
 422         ellipses = "...";                                               \
 423     }                                                                   \
 424     code;                                                               \
 425 } STMT_END
 426
 427 #define FAIL(msg) _FAIL(                            \
 428     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 429             msg, (int)len, RExC_precomp, ellipses))
 430
 431 #define FAIL2(msg,arg) _FAIL(                       \
 432     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 433             arg, (int)len, RExC_precomp, ellipses))
 434
 435 /*
 436  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 437  */
 438 #define Simple_vFAIL(m) STMT_START {                                    \
 439     const IV offset = RExC_parse - RExC_precomp;                        \
 440     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 441             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 442 } STMT_END
 443
 444 /*
 445  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 446  */
 447 #define vFAIL(m) STMT_START {                           \
 448     if (!SIZE_ONLY)                                     \
 449         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 450     Simple_vFAIL(m);                                    \
 451 } STMT_END
 452
 453 /*
 454  * Like Simple_vFAIL(), but accepts two arguments.
 455  */
 456 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 457     const IV offset = RExC_parse - RExC_precomp;                        \
 458     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 459             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 460 } STMT_END
 461
 462 /*
 463  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 464  */
 465 #define vFAIL2(m,a1) STMT_START {                       \
 466     if (!SIZE_ONLY)                                     \
 467         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 468     Simple_vFAIL2(m, a1);                               \
 469 } STMT_END
 470
 471
 472 /*
 473  * Like Simple_vFAIL(), but accepts three arguments.
 474  */
 475 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 476     const IV offset = RExC_parse - RExC_precomp;                \
 477     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 478             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 479 } STMT_END
 480
 481 /*
 482  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 483  */
 484 #define vFAIL3(m,a1,a2) STMT_START {                    \
 485     if (!SIZE_ONLY)                                     \
 486         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 487     Simple_vFAIL3(m, a1, a2);                           \
 488 } STMT_END
 489
 490 /*
 491  * Like Simple_vFAIL(), but accepts four arguments.
 492  */
 493 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 494     const IV offset = RExC_parse - RExC_precomp;                \
 495     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 496             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 497 } STMT_END
 498
 499 #define ckWARNreg(loc,m) STMT_START {                                   \
 500     const IV offset = loc - RExC_precomp;                               \
 501     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 502             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 503 } STMT_END
 504
 505 #define ckWARNregdep(loc,m) STMT_START {                                \
 506     const IV offset = loc - RExC_precomp;                               \
 507     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 508             m REPORT_LOCATION,                                          \
 509             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 510 } STMT_END
 511
 512 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 515             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 516 } STMT_END
 517
 518 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 521             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 522 } STMT_END
 523
 524 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 527             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 528 } STMT_END
 529
 530 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 533             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 534 } STMT_END
 535
 536 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 537     const IV offset = loc - RExC_precomp;                               \
 538     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 539             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 540 } STMT_END
 541
 542 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 543     const IV offset = loc - RExC_precomp;                               \
 544     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 545             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 546 } STMT_END
 547
 548
 549 /* Allow for side effects in s */
 550 #define REGC(c,s) STMT_START {                  \
 551     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 552 } STMT_END
 553
 554 /* Macros for recording node offsets.   20001227 mjd@plover.com
 555  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 556  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 557  * Element 0 holds the number n.
 558  * Position is 1 indexed.
 559  */
 560 #ifndef RE_TRACK_PATTERN_OFFSETS
 561 #define Set_Node_Offset_To_R(node,byte)
 562 #define Set_Node_Offset(node,byte)
 563 #define Set_Cur_Node_Offset
 564 #define Set_Node_Length_To_R(node,len)
 565 #define Set_Node_Length(node,len)
 566 #define Set_Node_Cur_Length(node)
 567 #define Node_Offset(n)
 568 #define Node_Length(n)
 569 #define Set_Node_Offset_Length(node,offset,len)
 570 #define ProgLen(ri) ri->u.proglen
 571 #define SetProgLen(ri,x) ri->u.proglen = x
 572 #else
 573 #define ProgLen(ri) ri->u.offsets[0]
 574 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 575 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 576     if (! SIZE_ONLY) {                                                  \
 577         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 578                     __LINE__, (int)(node), (int)(byte)));               \
 579         if((node) < 0) {                                                \
 580             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 581         } else {                                                        \
 582             RExC_offsets[2*(node)-1] = (byte);                          \
 583         }                                                               \
 584     }                                                                   \
 585 } STMT_END
 586
 587 #define Set_Node_Offset(node,byte) \
 588     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 589 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 590
 591 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 592     if (! SIZE_ONLY) {                                                  \
 593         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 594                 __LINE__, (int)(node), (int)(len)));                    \
 595         if((node) < 0) {                                                \
 596             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 597         } else {                                                        \
 598             RExC_offsets[2*(node)] = (len);                             \
 599         }                                                               \
 600     }                                                                   \
 601 } STMT_END
 602
 603 #define Set_Node_Length(node,len) \
 604     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 605 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 606 #define Set_Node_Cur_Length(node) \
 607     Set_Node_Length(node, RExC_parse - parse_start)
 608
 609 /* Get offsets and lengths */
 610 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 611 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 612
 613 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 614     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 615     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 616 } STMT_END
 617 #endif
 618
 619 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 620 #define EXPERIMENTAL_INPLACESCAN
 621 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 622
 623 #define DEBUG_STUDYDATA(str,data,depth)                              \
 624 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 625     PerlIO_printf(Perl_debug_log,                                    \
 626         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 627         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 628         (int)(depth)*2, "",                                          \
 629         (IV)((data)->pos_min),                                       \
 630         (IV)((data)->pos_delta),                                     \
 631         (UV)((data)->flags),                                         \
 632         (IV)((data)->whilem_c),                                      \
 633         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 634         is_inf ? "INF " : ""                                         \
 635     );                                                               \
 636     if ((data)->last_found)                                          \
 637         PerlIO_printf(Perl_debug_log,                                \
 638             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 639             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 640             SvPVX_const((data)->last_found),                         \
 641             (IV)((data)->last_end),                                  \
 642             (IV)((data)->last_start_min),                            \
 643             (IV)((data)->last_start_max),                            \
 644             ((data)->longest &&                                      \
 645              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 646             SvPVX_const((data)->longest_fixed),                      \
 647             (IV)((data)->offset_fixed),                              \
 648             ((data)->longest &&                                      \
 649              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 650             SvPVX_const((data)->longest_float),                      \
 651             (IV)((data)->offset_float_min),                          \
 652             (IV)((data)->offset_float_max)                           \
 653         );                                                           \
 654     PerlIO_printf(Perl_debug_log,"\n");                              \
 655 });
 656
 657 static void clear_re(pTHX_ void *r);
 658
 659 /* Mark that we cannot extend a found fixed substring at this point.
 660    Update the longest found anchored substring and the longest found
 661    floating substrings if needed. */
 662
 663 STATIC void
 664 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 665 {
 666     const STRLEN l = CHR_SVLEN(data->last_found);
 667     const STRLEN old_l = CHR_SVLEN(*data->longest);
 668     GET_RE_DEBUG_FLAGS_DECL;
 669
 670     PERL_ARGS_ASSERT_SCAN_COMMIT;
 671
 672     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 673         SvSetMagicSV(*data->longest, data->last_found);
 674         if (*data->longest == data->longest_fixed) {
 675             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 676             if (data->flags & SF_BEFORE_EOL)
 677                 data->flags
 678                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 679             else
 680                 data->flags &= ~SF_FIX_BEFORE_EOL;
 681             data->minlen_fixed=minlenp;
 682             data->lookbehind_fixed=0;
 683         }
 684         else { /* *data->longest == data->longest_float */
 685             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 686             data->offset_float_max = (l
 687                                       ? data->last_start_max
 688                                       : data->pos_min + data->pos_delta);
 689             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 690                 data->offset_float_max = I32_MAX;
 691             if (data->flags & SF_BEFORE_EOL)
 692                 data->flags
 693                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 694             else
 695                 data->flags &= ~SF_FL_BEFORE_EOL;
 696             data->minlen_float=minlenp;
 697             data->lookbehind_float=0;
 698         }
 699     }
 700     SvCUR_set(data->last_found, 0);
 701     {
 702         SV * const sv = data->last_found;
 703         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 704             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 705             if (mg)
 706                 mg->mg_len = 0;
 707         }
 708     }
 709     data->last_end = -1;
 710     data->flags &= ~SF_BEFORE_EOL;
 711     DEBUG_STUDYDATA("commit: ",data,0);
 712 }
 713
 714 /* Can match anything (initialization) */
 715 STATIC void
 716 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 717 {
 718     PERL_ARGS_ASSERT_CL_ANYTHING;
 719
 720     ANYOF_CLASS_ZERO(cl);
 721     ANYOF_BITMAP_SETALL(cl);
 722     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 723     if (LOC)
 724         cl->flags |= ANYOF_LOCALE;
 725 }
 726
 727 /* Can match anything (initialization) */
 728 STATIC int
 729 S_cl_is_anything(const struct regnode_charclass_class *cl)
 730 {
 731     int value;
 732
 733     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 734
 735     for (value = 0; value <= ANYOF_MAX; value += 2)
 736         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 737             return 1;
 738     if (!(cl->flags & ANYOF_UNICODE_ALL))
 739         return 0;
 740     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 741         return 0;
 742     return 1;
 743 }
 744
 745 /* Can match anything (initialization) */
 746 STATIC void
 747 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 748 {
 749     PERL_ARGS_ASSERT_CL_INIT;
 750
 751     Zero(cl, 1, struct regnode_charclass_class);
 752     cl->type = ANYOF;
 753     cl_anything(pRExC_state, cl);
 754 }
 755
 756 STATIC void
 757 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 758 {
 759     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 760
 761     Zero(cl, 1, struct regnode_charclass_class);
 762     cl->type = ANYOF;
 763     cl_anything(pRExC_state, cl);
 764     if (LOC)
 765         cl->flags |= ANYOF_LOCALE;
 766 }
 767
 768 /* 'And' a given class with another one.  Can create false positives */
 769 /* We assume that cl is not inverted */
 770 STATIC void
 771 S_cl_and(struct regnode_charclass_class *cl,
 772         const struct regnode_charclass_class *and_with)
 773 {
 774     PERL_ARGS_ASSERT_CL_AND;
 775
 776     assert(and_with->type == ANYOF);
 777
 778     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 779         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 780         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 781         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 782         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 783         int i;
 784
 785         if (and_with->flags & ANYOF_INVERT)
 786             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 787                 cl->bitmap[i] &= ~and_with->bitmap[i];
 788         else
 789             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 790                 cl->bitmap[i] &= and_with->bitmap[i];
 791     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 792     if (!(and_with->flags & ANYOF_EOS))
 793         cl->flags &= ~ANYOF_EOS;
 794
 795     if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
 796         cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
 797     if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
 798         cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 799
 800     if (cl->flags & ANYOF_UNICODE_ALL
 801         && and_with->flags & ANYOF_NONBITMAP
 802         && !(and_with->flags & ANYOF_INVERT))
 803     {
 804         if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 805             cl->flags &= ~ANYOF_UNICODE_ALL;
 806         }
 807         cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
 808                                                            only the one(s)
 809                                                            actually set */
 810         ARG_SET(cl, ARG(and_with));
 811     }
 812     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 813         !(and_with->flags & ANYOF_INVERT))
 814         cl->flags &= ~ANYOF_UNICODE_ALL;
 815     if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
 816         !(and_with->flags & ANYOF_INVERT))
 817         cl->flags &= ~ANYOF_NONBITMAP;
 818 }
 819
 820 /* 'OR' a given class with another one.  Can create false positives */
 821 /* We assume that cl is not inverted */
 822 STATIC void
 823 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 824 {
 825     PERL_ARGS_ASSERT_CL_OR;
 826
 827     if (or_with->flags & ANYOF_INVERT) {
 828         /* We do not use
 829          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 830          *   <= (B1 | !B2) | (CL1 | !CL2)
 831          * which is wasteful if CL2 is small, but we ignore CL2:
 832          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 833          * XXXX Can we handle case-fold?  Unclear:
 834          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 835          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 836          */
 837         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 838              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 839              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 840             int i;
 841
 842             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 843                 cl->bitmap[i] |= ~or_with->bitmap[i];
 844         } /* XXXX: logic is complicated otherwise */
 845         else {
 846             cl_anything(pRExC_state, cl);
 847         }
 848     } else {
 849         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 850         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 851              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 852                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 853             int i;
 854
 855             /* OR char bitmap and class bitmap separately */
 856             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 857                 cl->bitmap[i] |= or_with->bitmap[i];
 858             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 859                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 860                     cl->classflags[i] |= or_with->classflags[i];
 861                 cl->flags |= ANYOF_CLASS;
 862             }
 863         }
 864         else { /* XXXX: logic is complicated, leave it along for a moment. */
 865             cl_anything(pRExC_state, cl);
 866         }
 867     }
 868     if (or_with->flags & ANYOF_EOS)
 869         cl->flags |= ANYOF_EOS;
 870     if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
 871         cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
 872
 873     if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 874         cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
 875
 876     /* If both nodes match something outside the bitmap, but what they match
 877      * outside is not the same pointer, and hence not easily compared, give up
 878      * and allow the start class to match everything outside the bitmap */
 879     if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
 880         ARG(cl) != ARG(or_with)) {
 881         cl->flags |= ANYOF_UNICODE_ALL;
 882     }
 883
 884     if (or_with->flags & ANYOF_UNICODE_ALL) {
 885         cl->flags |= ANYOF_UNICODE_ALL;
 886     }
 887 }
 888
 889 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 890 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 891 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 892 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 893
 894
 895 #ifdef DEBUGGING
 896 /*
 897    dump_trie(trie,widecharmap,revcharmap)
 898    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 899    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 900
 901    These routines dump out a trie in a somewhat readable format.
 902    The _interim_ variants are used for debugging the interim
 903    tables that are used to generate the final compressed
 904    representation which is what dump_trie expects.
 905
 906    Part of the reason for their existence is to provide a form
 907    of documentation as to how the different representations function.
 908
 909 */
 910
 911 /*
 912   Dumps the final compressed table form of the trie to Perl_debug_log.
 913   Used for debugging make_trie().
 914 */
 915
 916 STATIC void
 917 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 918             AV *revcharmap, U32 depth)
 919 {
 920     U32 state;
 921     SV *sv=sv_newmortal();
 922     int colwidth= widecharmap ? 6 : 4;
 923     U16 word;
 924     GET_RE_DEBUG_FLAGS_DECL;
 925
 926     PERL_ARGS_ASSERT_DUMP_TRIE;
 927
 928     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 929         (int)depth * 2 + 2,"",
 930         "Match","Base","Ofs" );
 931
 932     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 933         SV ** const tmp = av_fetch( revcharmap, state, 0);
 934         if ( tmp ) {
 935             PerlIO_printf( Perl_debug_log, "%*s",
 936                 colwidth,
 937                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 938                             PL_colors[0], PL_colors[1],
 939                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 940                             PERL_PV_ESCAPE_FIRSTCHAR
 941                 )
 942             );
 943         }
 944     }
 945     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 946         (int)depth * 2 + 2,"");
 947
 948     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 949         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 950     PerlIO_printf( Perl_debug_log, "\n");
 951
 952     for( state = 1 ; state < trie->statecount ; state++ ) {
 953         const U32 base = trie->states[ state ].trans.base;
 954
 955         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 956
 957         if ( trie->states[ state ].wordnum ) {
 958             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 959         } else {
 960             PerlIO_printf( Perl_debug_log, "%6s", "" );
 961         }
 962
 963         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 964
 965         if ( base ) {
 966             U32 ofs = 0;
 967
 968             while( ( base + ofs  < trie->uniquecharcount ) ||
 969                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 970                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 971                     ofs++;
 972
 973             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 974
 975             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 976                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 977                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 978                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 979                 {
 980                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 981                     colwidth,
 982                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 983                 } else {
 984                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 985                 }
 986             }
 987
 988             PerlIO_printf( Perl_debug_log, "]");
 989
 990         }
 991         PerlIO_printf( Perl_debug_log, "\n" );
 992     }
 993     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 994     for (word=1; word <= trie->wordcount; word++) {
 995         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 996             (int)word, (int)(trie->wordinfo[word].prev),
 997             (int)(trie->wordinfo[word].len));
 998     }
 999     PerlIO_printf(Perl_debug_log, "\n" );
1000 }
1001 /*
1002   Dumps a fully constructed but uncompressed trie in list form.
1003   List tries normally only are used for construction when the number of
1004   possible chars (trie->uniquecharcount) is very high.
1005   Used for debugging make_trie().
1006 */
1007 STATIC void
1008 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1009                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1010                          U32 depth)
1011 {
1012     U32 state;
1013     SV *sv=sv_newmortal();
1014     int colwidth= widecharmap ? 6 : 4;
1015     GET_RE_DEBUG_FLAGS_DECL;
1016
1017     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1018
1019     /* print out the table precompression.  */
1020     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1021         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1022         "------:-----+-----------------\n" );
1023
1024     for( state=1 ; state < next_alloc ; state ++ ) {
1025         U16 charid;
1026
1027         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1028             (int)depth * 2 + 2,"", (UV)state  );
1029         if ( ! trie->states[ state ].wordnum ) {
1030             PerlIO_printf( Perl_debug_log, "%5s| ","");
1031         } else {
1032             PerlIO_printf( Perl_debug_log, "W%4x| ",
1033                 trie->states[ state ].wordnum
1034             );
1035         }
1036         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1037             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1038             if ( tmp ) {
1039                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1040                     colwidth,
1041                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1042                             PL_colors[0], PL_colors[1],
1043                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1044                             PERL_PV_ESCAPE_FIRSTCHAR
1045                     ) ,
1046                     TRIE_LIST_ITEM(state,charid).forid,
1047                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1048                 );
1049                 if (!(charid % 10))
1050                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1051                         (int)((depth * 2) + 14), "");
1052             }
1053         }
1054         PerlIO_printf( Perl_debug_log, "\n");
1055     }
1056 }
1057
1058 /*
1059   Dumps a fully constructed but uncompressed trie in table form.
1060   This is the normal DFA style state transition table, with a few
1061   twists to facilitate compression later.
1062   Used for debugging make_trie().
1063 */
1064 STATIC void
1065 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1066                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1067                           U32 depth)
1068 {
1069     U32 state;
1070     U16 charid;
1071     SV *sv=sv_newmortal();
1072     int colwidth= widecharmap ? 6 : 4;
1073     GET_RE_DEBUG_FLAGS_DECL;
1074
1075     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1076
1077     /*
1078        print out the table precompression so that we can do a visual check
1079        that they are identical.
1080      */
1081
1082     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1083
1084     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1085         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1086         if ( tmp ) {
1087             PerlIO_printf( Perl_debug_log, "%*s",
1088                 colwidth,
1089                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1090                             PL_colors[0], PL_colors[1],
1091                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1092                             PERL_PV_ESCAPE_FIRSTCHAR
1093                 )
1094             );
1095         }
1096     }
1097
1098     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1099
1100     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1101         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1102     }
1103
1104     PerlIO_printf( Perl_debug_log, "\n" );
1105
1106     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1107
1108         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1109             (int)depth * 2 + 2,"",
1110             (UV)TRIE_NODENUM( state ) );
1111
1112         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1113             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1114             if (v)
1115                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1116             else
1117                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1118         }
1119         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1120             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1121         } else {
1122             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1123             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1124         }
1125     }
1126 }
1127
1128 #endif
1129
1130
1131 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1132   startbranch: the first branch in the whole branch sequence
1133   first      : start branch of sequence of branch-exact nodes.
1134                May be the same as startbranch
1135   last       : Thing following the last branch.
1136                May be the same as tail.
1137   tail       : item following the branch sequence
1138   count      : words in the sequence
1139   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1140   depth      : indent depth
1141
1142 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1143
1144 A trie is an N'ary tree where the branches are determined by digital
1145 decomposition of the key. IE, at the root node you look up the 1st character and
1146 follow that branch repeat until you find the end of the branches. Nodes can be
1147 marked as "accepting" meaning they represent a complete word. Eg:
1148
1149   /he|she|his|hers/
1150
1151 would convert into the following structure. Numbers represent states, letters
1152 following numbers represent valid transitions on the letter from that state, if
1153 the number is in square brackets it represents an accepting state, otherwise it
1154 will be in parenthesis.
1155
1156       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1157       |    |
1158       |   (2)
1159       |    |
1160      (1)   +-i->(6)-+-s->[7]
1161       |
1162       +-s->(3)-+-h->(4)-+-e->[5]
1163
1164       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1165
1166 This shows that when matching against the string 'hers' we will begin at state 1
1167 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1168 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1169 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1170 single traverse. We store a mapping from accepting to state to which word was
1171 matched, and then when we have multiple possibilities we try to complete the
1172 rest of the regex in the order in which they occured in the alternation.
1173
1174 The only prior NFA like behaviour that would be changed by the TRIE support is
1175 the silent ignoring of duplicate alternations which are of the form:
1176
1177  / (DUPE|DUPE) X? (?{ ... }) Y /x
1178
1179 Thus EVAL blocks following a trie may be called a different number of times with
1180 and without the optimisation. With the optimisations dupes will be silently
1181 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1182 the following demonstrates:
1183
1184  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1185
1186 which prints out 'word' three times, but
1187
1188  'words'=~/(word|word|word)(?{ print $1 })S/
1189
1190 which doesnt print it out at all. This is due to other optimisations kicking in.
1191
1192 Example of what happens on a structural level:
1193
1194 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1195
1196    1: CURLYM[1] {1,32767}(18)
1197    5:   BRANCH(8)
1198    6:     EXACT <ac>(16)
1199    8:   BRANCH(11)
1200    9:     EXACT <ad>(16)
1201   11:   BRANCH(14)
1202   12:     EXACT <ab>(16)
1203   16:   SUCCEED(0)
1204   17:   NOTHING(18)
1205   18: END(0)
1206
1207 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1208 and should turn into:
1209
1210    1: CURLYM[1] {1,32767}(18)
1211    5:   TRIE(16)
1212         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1213           <ac>
1214           <ad>
1215           <ab>
1216   16:   SUCCEED(0)
1217   17:   NOTHING(18)
1218   18: END(0)
1219
1220 Cases where tail != last would be like /(?foo|bar)baz/:
1221
1222    1: BRANCH(4)
1223    2:   EXACT <foo>(8)
1224    4: BRANCH(7)
1225    5:   EXACT <bar>(8)
1226    7: TAIL(8)
1227    8: EXACT <baz>(10)
1228   10: END(0)
1229
1230 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1231 and would end up looking like:
1232
1233     1: TRIE(8)
1234       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1235         <foo>
1236         <bar>
1237    7: TAIL(8)
1238    8: EXACT <baz>(10)
1239   10: END(0)
1240
1241     d = uvuni_to_utf8_flags(d, uv, 0);
1242
1243 is the recommended Unicode-aware way of saying
1244
1245     *(d++) = uv;
1246 */
1247
1248 #define TRIE_STORE_REVCHAR                                                 \
1249     STMT_START {                                                           \
1250         if (UTF) {                                                         \
1251             SV *zlopp = newSV(2);                                          \
1252             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1253             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1254             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1255             SvPOK_on(zlopp);                                               \
1256             SvUTF8_on(zlopp);                                              \
1257             av_push(revcharmap, zlopp);                                    \
1258         } else {                                                           \
1259             char ooooff = (char)uvc;                                               \
1260             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1261         }                                                                  \
1262         } STMT_END
1263
1264 #define TRIE_READ_CHAR STMT_START {                                           \
1265     wordlen++;                                                                \
1266     if ( UTF ) {                                                              \
1267         if ( folder ) {                                                       \
1268             if ( foldlen > 0 ) {                                              \
1269                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1270                foldlen -= len;                                                \
1271                scan += len;                                                   \
1272                len = 0;                                                       \
1273             } else {                                                          \
1274                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1275                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1276                 foldlen -= UNISKIP( uvc );                                    \
1277                 scan = foldbuf + UNISKIP( uvc );                              \
1278             }                                                                 \
1279         } else {                                                              \
1280             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1281         }                                                                     \
1282     } else {                                                                  \
1283         uvc = (U32)*uc;                                                       \
1284         len = 1;                                                              \
1285     }                                                                         \
1286 } STMT_END
1287
1288
1289
1290 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1291     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1292         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1293         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1294     }                                                           \
1295     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1296     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1297     TRIE_LIST_CUR( state )++;                                   \
1298 } STMT_END
1299
1300 #define TRIE_LIST_NEW(state) STMT_START {                       \
1301     Newxz( trie->states[ state ].trans.list,               \
1302         4, reg_trie_trans_le );                                 \
1303      TRIE_LIST_CUR( state ) = 1;                                \
1304      TRIE_LIST_LEN( state ) = 4;                                \
1305 } STMT_END
1306
1307 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1308     U16 dupe= trie->states[ state ].wordnum;                    \
1309     regnode * const noper_next = regnext( noper );              \
1310                                                                 \
1311     DEBUG_r({                                                   \
1312         /* store the word for dumping */                        \
1313         SV* tmp;                                                \
1314         if (OP(noper) != NOTHING)                               \
1315             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1316         else                                                    \
1317             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1318         av_push( trie_words, tmp );                             \
1319     });                                                         \
1320                                                                 \
1321     curword++;                                                  \
1322     trie->wordinfo[curword].prev   = 0;                         \
1323     trie->wordinfo[curword].len    = wordlen;                   \
1324     trie->wordinfo[curword].accept = state;                     \
1325                                                                 \
1326     if ( noper_next < tail ) {                                  \
1327         if (!trie->jump)                                        \
1328             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1329         trie->jump[curword] = (U16)(noper_next - convert);      \
1330         if (!jumper)                                            \
1331             jumper = noper_next;                                \
1332         if (!nextbranch)                                        \
1333             nextbranch= regnext(cur);                           \
1334     }                                                           \
1335                                                                 \
1336     if ( dupe ) {                                               \
1337         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1338         /* chain, so that when the bits of chain are later    */\
1339         /* linked together, the dups appear in the chain      */\
1340         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1341         trie->wordinfo[dupe].prev = curword;                    \
1342     } else {                                                    \
1343         /* we haven't inserted this word yet.                */ \
1344         trie->states[ state ].wordnum = curword;                \
1345     }                                                           \
1346 } STMT_END
1347
1348
1349 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1350      ( ( base + charid >=  ucharcount                                   \
1351          && base + charid < ubound                                      \
1352          && state == trie->trans[ base - ucharcount + charid ].check    \
1353          && trie->trans[ base - ucharcount + charid ].next )            \
1354            ? trie->trans[ base - ucharcount + charid ].next             \
1355            : ( state==1 ? special : 0 )                                 \
1356       )
1357
1358 #define MADE_TRIE       1
1359 #define MADE_JUMP_TRIE  2
1360 #define MADE_EXACT_TRIE 4
1361
1362 STATIC I32
1363 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1364 {
1365     dVAR;
1366     /* first pass, loop through and scan words */
1367     reg_trie_data *trie;
1368     HV *widecharmap = NULL;
1369     AV *revcharmap = newAV();
1370     regnode *cur;
1371     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1372     STRLEN len = 0;
1373     UV uvc = 0;
1374     U16 curword = 0;
1375     U32 next_alloc = 0;
1376     regnode *jumper = NULL;
1377     regnode *nextbranch = NULL;
1378     regnode *convert = NULL;
1379     U32 *prev_states; /* temp array mapping each state to previous one */
1380     /* we just use folder as a flag in utf8 */
1381     const U8 * folder = NULL;
1382
1383 #ifdef DEBUGGING
1384     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1385     AV *trie_words = NULL;
1386     /* along with revcharmap, this only used during construction but both are
1387      * useful during debugging so we store them in the struct when debugging.
1388      */
1389 #else
1390     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1391     STRLEN trie_charcount=0;
1392 #endif
1393     SV *re_trie_maxbuff;
1394     GET_RE_DEBUG_FLAGS_DECL;
1395
1396     PERL_ARGS_ASSERT_MAKE_TRIE;
1397 #ifndef DEBUGGING
1398     PERL_UNUSED_ARG(depth);
1399 #endif
1400
1401     switch (flags) {
1402         case EXACTFA:
1403         case EXACTFU: folder = PL_fold_latin1; break;
1404         case EXACTF:  folder = PL_fold; break;
1405         case EXACTFL: folder = PL_fold_locale; break;
1406     }
1407
1408     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1409     trie->refcount = 1;
1410     trie->startstate = 1;
1411     trie->wordcount = word_count;
1412     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1413     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1414     if (!(UTF && folder))
1415         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1416     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1417                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1418
1419     DEBUG_r({
1420         trie_words = newAV();
1421     });
1422
1423     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1424     if (!SvIOK(re_trie_maxbuff)) {
1425         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1426     }
1427     DEBUG_OPTIMISE_r({
1428                 PerlIO_printf( Perl_debug_log,
1429                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1430                   (int)depth * 2 + 2, "",
1431                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1432                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1433                   (int)depth);
1434     });
1435
1436    /* Find the node we are going to overwrite */
1437     if ( first == startbranch && OP( last ) != BRANCH ) {
1438         /* whole branch chain */
1439         convert = first;
1440     } else {
1441         /* branch sub-chain */
1442         convert = NEXTOPER( first );
1443     }
1444
1445     /*  -- First loop and Setup --
1446
1447        We first traverse the branches and scan each word to determine if it
1448        contains widechars, and how many unique chars there are, this is
1449        important as we have to build a table with at least as many columns as we
1450        have unique chars.
1451
1452        We use an array of integers to represent the character codes 0..255
1453        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1454        native representation of the character value as the key and IV's for the
1455        coded index.
1456
1457        *TODO* If we keep track of how many times each character is used we can
1458        remap the columns so that the table compression later on is more
1459        efficient in terms of memory by ensuring the most common value is in the
1460        middle and the least common are on the outside.  IMO this would be better
1461        than a most to least common mapping as theres a decent chance the most
1462        common letter will share a node with the least common, meaning the node
1463        will not be compressible. With a middle is most common approach the worst
1464        case is when we have the least common nodes twice.
1465
1466      */
1467
1468     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1469         regnode * const noper = NEXTOPER( cur );
1470         const U8 *uc = (U8*)STRING( noper );
1471         const U8 * const e  = uc + STR_LEN( noper );
1472         STRLEN foldlen = 0;
1473         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1474         const U8 *scan = (U8*)NULL;
1475         U32 wordlen      = 0;         /* required init */
1476         STRLEN chars = 0;
1477         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1478
1479         if (OP(noper) == NOTHING) {
1480             trie->minlen= 0;
1481             continue;
1482         }
1483         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1484             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1485                                           regardless of encoding */
1486
1487         for ( ; uc < e ; uc += len ) {
1488             TRIE_CHARCOUNT(trie)++;
1489             TRIE_READ_CHAR;
1490             chars++;
1491             if ( uvc < 256 ) {
1492                 if ( !trie->charmap[ uvc ] ) {
1493                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1494                     if ( folder )
1495                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1496                     TRIE_STORE_REVCHAR;
1497                 }
1498                 if ( set_bit ) {
1499                     /* store the codepoint in the bitmap, and its folded
1500                      * equivalent. */
1501                     TRIE_BITMAP_SET(trie,uvc);
1502
1503                     /* store the folded codepoint */
1504                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1505
1506                     if ( !UTF ) {
1507                         /* store first byte of utf8 representation of
1508                            variant codepoints */
1509                         if (! UNI_IS_INVARIANT(uvc)) {
1510                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1511                         }
1512                     }
1513                     set_bit = 0; /* We've done our bit :-) */
1514                 }
1515             } else {
1516                 SV** svpp;
1517                 if ( !widecharmap )
1518                     widecharmap = newHV();
1519
1520                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1521
1522                 if ( !svpp )
1523                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1524
1525                 if ( !SvTRUE( *svpp ) ) {
1526                     sv_setiv( *svpp, ++trie->uniquecharcount );
1527                     TRIE_STORE_REVCHAR;
1528                 }
1529             }
1530         }
1531         if( cur == first ) {
1532             trie->minlen=chars;
1533             trie->maxlen=chars;
1534         } else if (chars < trie->minlen) {
1535             trie->minlen=chars;
1536         } else if (chars > trie->maxlen) {
1537             trie->maxlen=chars;
1538         }
1539
1540     } /* end first pass */
1541     DEBUG_TRIE_COMPILE_r(
1542         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1543                 (int)depth * 2 + 2,"",
1544                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1545                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1546                 (int)trie->minlen, (int)trie->maxlen )
1547     );
1548
1549     /*
1550         We now know what we are dealing with in terms of unique chars and
1551         string sizes so we can calculate how much memory a naive
1552         representation using a flat table  will take. If it's over a reasonable
1553         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1554         conservative but potentially much slower representation using an array
1555         of lists.
1556
1557         At the end we convert both representations into the same compressed
1558         form that will be used in regexec.c for matching with. The latter
1559         is a form that cannot be used to construct with but has memory
1560         properties similar to the list form and access properties similar
1561         to the table form making it both suitable for fast searches and
1562         small enough that its feasable to store for the duration of a program.
1563
1564         See the comment in the code where the compressed table is produced
1565         inplace from the flat tabe representation for an explanation of how
1566         the compression works.
1567
1568     */
1569
1570
1571     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1572     prev_states[1] = 0;
1573
1574     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1575         /*
1576             Second Pass -- Array Of Lists Representation
1577
1578             Each state will be represented by a list of charid:state records
1579             (reg_trie_trans_le) the first such element holds the CUR and LEN
1580             points of the allocated array. (See defines above).
1581
1582             We build the initial structure using the lists, and then convert
1583             it into the compressed table form which allows faster lookups
1584             (but cant be modified once converted).
1585         */
1586
1587         STRLEN transcount = 1;
1588
1589         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1590             "%*sCompiling trie using list compiler\n",
1591             (int)depth * 2 + 2, ""));
1592
1593         trie->states = (reg_trie_state *)
1594             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1595                                   sizeof(reg_trie_state) );
1596         TRIE_LIST_NEW(1);
1597         next_alloc = 2;
1598
1599         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1600
1601             regnode * const noper = NEXTOPER( cur );
1602             U8 *uc           = (U8*)STRING( noper );
1603             const U8 * const e = uc + STR_LEN( noper );
1604             U32 state        = 1;         /* required init */
1605             U16 charid       = 0;         /* sanity init */
1606             U8 *scan         = (U8*)NULL; /* sanity init */
1607             STRLEN foldlen   = 0;         /* required init */
1608             U32 wordlen      = 0;         /* required init */
1609             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1610
1611             if (OP(noper) != NOTHING) {
1612                 for ( ; uc < e ; uc += len ) {
1613
1614                     TRIE_READ_CHAR;
1615
1616                     if ( uvc < 256 ) {
1617                         charid = trie->charmap[ uvc ];
1618                     } else {
1619                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1620                         if ( !svpp ) {
1621                             charid = 0;
1622                         } else {
1623                             charid=(U16)SvIV( *svpp );
1624                         }
1625                     }
1626                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1627                     if ( charid ) {
1628
1629                         U16 check;
1630                         U32 newstate = 0;
1631
1632                         charid--;
1633                         if ( !trie->states[ state ].trans.list ) {
1634                             TRIE_LIST_NEW( state );
1635                         }
1636                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1637                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1638                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1639                                 break;
1640                             }
1641                         }
1642                         if ( ! newstate ) {
1643                             newstate = next_alloc++;
1644                             prev_states[newstate] = state;
1645                             TRIE_LIST_PUSH( state, charid, newstate );
1646                             transcount++;
1647                         }
1648                         state = newstate;
1649                     } else {
1650                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1651                     }
1652                 }
1653             }
1654             TRIE_HANDLE_WORD(state);
1655
1656         } /* end second pass */
1657
1658         /* next alloc is the NEXT state to be allocated */
1659         trie->statecount = next_alloc;
1660         trie->states = (reg_trie_state *)
1661             PerlMemShared_realloc( trie->states,
1662                                    next_alloc
1663                                    * sizeof(reg_trie_state) );
1664
1665         /* and now dump it out before we compress it */
1666         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1667                                                          revcharmap, next_alloc,
1668                                                          depth+1)
1669         );
1670
1671         trie->trans = (reg_trie_trans *)
1672             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1673         {
1674             U32 state;
1675             U32 tp = 0;
1676             U32 zp = 0;
1677
1678
1679             for( state=1 ; state < next_alloc ; state ++ ) {
1680                 U32 base=0;
1681
1682                 /*
1683                 DEBUG_TRIE_COMPILE_MORE_r(
1684                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1685                 );
1686                 */
1687
1688                 if (trie->states[state].trans.list) {
1689                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1690                     U16 maxid=minid;
1691                     U16 idx;
1692
1693                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1694                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1695                         if ( forid < minid ) {
1696                             minid=forid;
1697                         } else if ( forid > maxid ) {
1698                             maxid=forid;
1699                         }
1700                     }
1701                     if ( transcount < tp + maxid - minid + 1) {
1702                         transcount *= 2;
1703                         trie->trans = (reg_trie_trans *)
1704                             PerlMemShared_realloc( trie->trans,
1705                                                      transcount
1706                                                      * sizeof(reg_trie_trans) );
1707                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1708                     }
1709                     base = trie->uniquecharcount + tp - minid;
1710                     if ( maxid == minid ) {
1711                         U32 set = 0;
1712                         for ( ; zp < tp ; zp++ ) {
1713                             if ( ! trie->trans[ zp ].next ) {
1714                                 base = trie->uniquecharcount + zp - minid;
1715                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1716                                 trie->trans[ zp ].check = state;
1717                                 set = 1;
1718                                 break;
1719                             }
1720                         }
1721                         if ( !set ) {
1722                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1723                             trie->trans[ tp ].check = state;
1724                             tp++;
1725                             zp = tp;
1726                         }
1727                     } else {
1728                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1729                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1730                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1731                             trie->trans[ tid ].check = state;
1732                         }
1733                         tp += ( maxid - minid + 1 );
1734                     }
1735                     Safefree(trie->states[ state ].trans.list);
1736                 }
1737                 /*
1738                 DEBUG_TRIE_COMPILE_MORE_r(
1739                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1740                 );
1741                 */
1742                 trie->states[ state ].trans.base=base;
1743             }
1744             trie->lasttrans = tp + 1;
1745         }
1746     } else {
1747         /*
1748            Second Pass -- Flat Table Representation.
1749
1750            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1751            We know that we will need Charcount+1 trans at most to store the data
1752            (one row per char at worst case) So we preallocate both structures
1753            assuming worst case.
1754
1755            We then construct the trie using only the .next slots of the entry
1756            structs.
1757
1758            We use the .check field of the first entry of the node temporarily to
1759            make compression both faster and easier by keeping track of how many non
1760            zero fields are in the node.
1761
1762            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1763            transition.
1764
1765            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1766            number representing the first entry of the node, and state as a
1767            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1768            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1769            are 2 entrys per node. eg:
1770
1771              A B       A B
1772           1. 2 4    1. 3 7
1773           2. 0 3    3. 0 5
1774           3. 0 0    5. 0 0
1775           4. 0 0    7. 0 0
1776
1777            The table is internally in the right hand, idx form. However as we also
1778            have to deal with the states array which is indexed by nodenum we have to
1779            use TRIE_NODENUM() to convert.
1780
1781         */
1782         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1783             "%*sCompiling trie using table compiler\n",
1784             (int)depth * 2 + 2, ""));
1785
1786         trie->trans = (reg_trie_trans *)
1787             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1788                                   * trie->uniquecharcount + 1,
1789                                   sizeof(reg_trie_trans) );
1790         trie->states = (reg_trie_state *)
1791             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1792                                   sizeof(reg_trie_state) );
1793         next_alloc = trie->uniquecharcount + 1;
1794
1795
1796         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1797
1798             regnode * const noper   = NEXTOPER( cur );
1799             const U8 *uc     = (U8*)STRING( noper );
1800             const U8 * const e = uc + STR_LEN( noper );
1801
1802             U32 state        = 1;         /* required init */
1803
1804             U16 charid       = 0;         /* sanity init */
1805             U32 accept_state = 0;         /* sanity init */
1806             U8 *scan         = (U8*)NULL; /* sanity init */
1807
1808             STRLEN foldlen   = 0;         /* required init */
1809             U32 wordlen      = 0;         /* required init */
1810             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1811
1812             if ( OP(noper) != NOTHING ) {
1813                 for ( ; uc < e ; uc += len ) {
1814
1815                     TRIE_READ_CHAR;
1816
1817                     if ( uvc < 256 ) {
1818                         charid = trie->charmap[ uvc ];
1819                     } else {
1820                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1821                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1822                     }
1823                     if ( charid ) {
1824                         charid--;
1825                         if ( !trie->trans[ state + charid ].next ) {
1826                             trie->trans[ state + charid ].next = next_alloc;
1827                             trie->trans[ state ].check++;
1828                             prev_states[TRIE_NODENUM(next_alloc)]
1829                                     = TRIE_NODENUM(state);
1830                             next_alloc += trie->uniquecharcount;
1831                         }
1832                         state = trie->trans[ state + charid ].next;
1833                     } else {
1834                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1835                     }
1836                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1837                 }
1838             }
1839             accept_state = TRIE_NODENUM( state );
1840             TRIE_HANDLE_WORD(accept_state);
1841
1842         } /* end second pass */
1843
1844         /* and now dump it out before we compress it */
1845         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1846                                                           revcharmap,
1847                                                           next_alloc, depth+1));
1848
1849         {
1850         /*
1851            * Inplace compress the table.*
1852
1853            For sparse data sets the table constructed by the trie algorithm will
1854            be mostly 0/FAIL transitions or to put it another way mostly empty.
1855            (Note that leaf nodes will not contain any transitions.)
1856
1857            This algorithm compresses the tables by eliminating most such
1858            transitions, at the cost of a modest bit of extra work during lookup:
1859
1860            - Each states[] entry contains a .base field which indicates the
1861            index in the state[] array wheres its transition data is stored.
1862
1863            - If .base is 0 there are no valid transitions from that node.
1864
1865            - If .base is nonzero then charid is added to it to find an entry in
1866            the trans array.
1867
1868            -If trans[states[state].base+charid].check!=state then the
1869            transition is taken to be a 0/Fail transition. Thus if there are fail
1870            transitions at the front of the node then the .base offset will point
1871            somewhere inside the previous nodes data (or maybe even into a node
1872            even earlier), but the .check field determines if the transition is
1873            valid.
1874
1875            XXX - wrong maybe?
1876            The following process inplace converts the table to the compressed
1877            table: We first do not compress the root node 1,and mark all its
1878            .check pointers as 1 and set its .base pointer as 1 as well. This
1879            allows us to do a DFA construction from the compressed table later,
1880            and ensures that any .base pointers we calculate later are greater
1881            than 0.
1882
1883            - We set 'pos' to indicate the first entry of the second node.
1884
1885            - We then iterate over the columns of the node, finding the first and
1886            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1887            and set the .check pointers accordingly, and advance pos
1888            appropriately and repreat for the next node. Note that when we copy
1889            the next pointers we have to convert them from the original
1890            NODEIDX form to NODENUM form as the former is not valid post
1891            compression.
1892
1893            - If a node has no transitions used we mark its base as 0 and do not
1894            advance the pos pointer.
1895
1896            - If a node only has one transition we use a second pointer into the
1897            structure to fill in allocated fail transitions from other states.
1898            This pointer is independent of the main pointer and scans forward
1899            looking for null transitions that are allocated to a state. When it
1900            finds one it writes the single transition into the "hole".  If the
1901            pointer doesnt find one the single transition is appended as normal.
1902
1903            - Once compressed we can Renew/realloc the structures to release the
1904            excess space.
1905
1906            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1907            specifically Fig 3.47 and the associated pseudocode.
1908
1909            demq
1910         */
1911         const U32 laststate = TRIE_NODENUM( next_alloc );
1912         U32 state, charid;
1913         U32 pos = 0, zp=0;
1914         trie->statecount = laststate;
1915
1916         for ( state = 1 ; state < laststate ; state++ ) {
1917             U8 flag = 0;
1918             const U32 stateidx = TRIE_NODEIDX( state );
1919             const U32 o_used = trie->trans[ stateidx ].check;
1920             U32 used = trie->trans[ stateidx ].check;
1921             trie->trans[ stateidx ].check = 0;
1922
1923             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1924                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1925                     if ( trie->trans[ stateidx + charid ].next ) {
1926                         if (o_used == 1) {
1927                             for ( ; zp < pos ; zp++ ) {
1928                                 if ( ! trie->trans[ zp ].next ) {
1929                                     break;
1930                                 }
1931                             }
1932                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1933                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1934                             trie->trans[ zp ].check = state;
1935                             if ( ++zp > pos ) pos = zp;
1936                             break;
1937                         }
1938                         used--;
1939                     }
1940                     if ( !flag ) {
1941                         flag = 1;
1942                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1943                     }
1944                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1945                     trie->trans[ pos ].check = state;
1946                     pos++;
1947                 }
1948             }
1949         }
1950         trie->lasttrans = pos + 1;
1951         trie->states = (reg_trie_state *)
1952             PerlMemShared_realloc( trie->states, laststate
1953                                    * sizeof(reg_trie_state) );
1954         DEBUG_TRIE_COMPILE_MORE_r(
1955                 PerlIO_printf( Perl_debug_log,
1956                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1957                     (int)depth * 2 + 2,"",
1958                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1959                     (IV)next_alloc,
1960                     (IV)pos,
1961                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1962             );
1963
1964         } /* end table compress */
1965     }
1966     DEBUG_TRIE_COMPILE_MORE_r(
1967             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1968                 (int)depth * 2 + 2, "",
1969                 (UV)trie->statecount,
1970                 (UV)trie->lasttrans)
1971     );
1972     /* resize the trans array to remove unused space */
1973     trie->trans = (reg_trie_trans *)
1974         PerlMemShared_realloc( trie->trans, trie->lasttrans
1975                                * sizeof(reg_trie_trans) );
1976
1977     {   /* Modify the program and insert the new TRIE node */
1978         U8 nodetype =(U8)(flags & 0xFF);
1979         char *str=NULL;
1980
1981 #ifdef DEBUGGING
1982         regnode *optimize = NULL;
1983 #ifdef RE_TRACK_PATTERN_OFFSETS
1984
1985         U32 mjd_offset = 0;
1986         U32 mjd_nodelen = 0;
1987 #endif /* RE_TRACK_PATTERN_OFFSETS */
1988 #endif /* DEBUGGING */
1989         /*
1990            This means we convert either the first branch or the first Exact,
1991            depending on whether the thing following (in 'last') is a branch
1992            or not and whther first is the startbranch (ie is it a sub part of
1993            the alternation or is it the whole thing.)
1994            Assuming its a sub part we convert the EXACT otherwise we convert
1995            the whole branch sequence, including the first.
1996          */
1997         /* Find the node we are going to overwrite */
1998         if ( first != startbranch || OP( last ) == BRANCH ) {
1999             /* branch sub-chain */
2000             NEXT_OFF( first ) = (U16)(last - first);
2001 #ifdef RE_TRACK_PATTERN_OFFSETS
2002             DEBUG_r({
2003                 mjd_offset= Node_Offset((convert));
2004                 mjd_nodelen= Node_Length((convert));
2005             });
2006 #endif
2007             /* whole branch chain */
2008         }
2009 #ifdef RE_TRACK_PATTERN_OFFSETS
2010         else {
2011             DEBUG_r({
2012                 const  regnode *nop = NEXTOPER( convert );
2013                 mjd_offset= Node_Offset((nop));
2014                 mjd_nodelen= Node_Length((nop));
2015             });
2016         }
2017         DEBUG_OPTIMISE_r(
2018             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2019                 (int)depth * 2 + 2, "",
2020                 (UV)mjd_offset, (UV)mjd_nodelen)
2021         );
2022 #endif
2023         /* But first we check to see if there is a common prefix we can
2024            split out as an EXACT and put in front of the TRIE node.  */
2025         trie->startstate= 1;
2026         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2027             U32 state;
2028             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2029                 U32 ofs = 0;
2030                 I32 idx = -1;
2031                 U32 count = 0;
2032                 const U32 base = trie->states[ state ].trans.base;
2033
2034                 if ( trie->states[state].wordnum )
2035                         count = 1;
2036
2037                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2038                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2039                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2040                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2041                     {
2042                         if ( ++count > 1 ) {
2043                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2044                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2045                             if ( state == 1 ) break;
2046                             if ( count == 2 ) {
2047                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2048                                 DEBUG_OPTIMISE_r(
2049                                     PerlIO_printf(Perl_debug_log,
2050                                         "%*sNew Start State=%"UVuf" Class: [",
2051                                         (int)depth * 2 + 2, "",
2052                                         (UV)state));
2053                                 if (idx >= 0) {
2054                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2055                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2056
2057                                     TRIE_BITMAP_SET(trie,*ch);
2058                                     if ( folder )
2059                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2060                                     DEBUG_OPTIMISE_r(
2061                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2062                                     );
2063                                 }
2064                             }
2065                             TRIE_BITMAP_SET(trie,*ch);
2066                             if ( folder )
2067                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2068                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2069                         }
2070                         idx = ofs;
2071                     }
2072                 }
2073                 if ( count == 1 ) {
2074                     SV **tmp = av_fetch( revcharmap, idx, 0);
2075                     STRLEN len;
2076                     char *ch = SvPV( *tmp, len );
2077                     DEBUG_OPTIMISE_r({
2078                         SV *sv=sv_newmortal();
2079                         PerlIO_printf( Perl_debug_log,
2080                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2081                             (int)depth * 2 + 2, "",
2082                             (UV)state, (UV)idx,
2083                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2084                                 PL_colors[0], PL_colors[1],
2085                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2086                                 PERL_PV_ESCAPE_FIRSTCHAR
2087                             )
2088                         );
2089                     });
2090                     if ( state==1 ) {
2091                         OP( convert ) = nodetype;
2092                         str=STRING(convert);
2093                         STR_LEN(convert)=0;
2094                     }
2095                     STR_LEN(convert) += len;
2096                     while (len--)
2097                         *str++ = *ch++;
2098                 } else {
2099 #ifdef DEBUGGING
2100                     if (state>1)
2101                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2102 #endif
2103                     break;
2104                 }
2105             }
2106             trie->prefixlen = (state-1);
2107             if (str) {
2108                 regnode *n = convert+NODE_SZ_STR(convert);
2109                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2110                 trie->startstate = state;
2111                 trie->minlen -= (state - 1);
2112                 trie->maxlen -= (state - 1);
2113 #ifdef DEBUGGING
2114                /* At least the UNICOS C compiler choked on this
2115                 * being argument to DEBUG_r(), so let's just have
2116                 * it right here. */
2117                if (
2118 #ifdef PERL_EXT_RE_BUILD
2119                    1
2120 #else
2121                    DEBUG_r_TEST
2122 #endif
2123                    ) {
2124                    regnode *fix = convert;
2125                    U32 word = trie->wordcount;
2126                    mjd_nodelen++;
2127                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2128                    while( ++fix < n ) {
2129                        Set_Node_Offset_Length(fix, 0, 0);
2130                    }
2131                    while (word--) {
2132                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2133                        if (tmp) {
2134                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2135                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2136                            else
2137                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2138                        }
2139                    }
2140                }
2141 #endif
2142                 if (trie->maxlen) {
2143                     convert = n;
2144                 } else {
2145                     NEXT_OFF(convert) = (U16)(tail - convert);
2146                     DEBUG_r(optimize= n);
2147                 }
2148             }
2149         }
2150         if (!jumper)
2151             jumper = last;
2152         if ( trie->maxlen ) {
2153             NEXT_OFF( convert ) = (U16)(tail - convert);
2154             ARG_SET( convert, data_slot );
2155             /* Store the offset to the first unabsorbed branch in
2156                jump[0], which is otherwise unused by the jump logic.
2157                We use this when dumping a trie and during optimisation. */
2158             if (trie->jump)
2159                 trie->jump[0] = (U16)(nextbranch - convert);
2160
2161             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2162              *   and there is a bitmap
2163              *   and the first "jump target" node we found leaves enough room
2164              * then convert the TRIE node into a TRIEC node, with the bitmap
2165              * embedded inline in the opcode - this is hypothetically faster.
2166              */
2167             if ( !trie->states[trie->startstate].wordnum
2168                  && trie->bitmap
2169                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2170             {
2171                 OP( convert ) = TRIEC;
2172                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2173                 PerlMemShared_free(trie->bitmap);
2174                 trie->bitmap= NULL;
2175             } else
2176                 OP( convert ) = TRIE;
2177
2178             /* store the type in the flags */
2179             convert->flags = nodetype;
2180             DEBUG_r({
2181             optimize = convert
2182                       + NODE_STEP_REGNODE
2183                       + regarglen[ OP( convert ) ];
2184             });
2185             /* XXX We really should free up the resource in trie now,
2186                    as we won't use them - (which resources?) dmq */
2187         }
2188         /* needed for dumping*/
2189         DEBUG_r(if (optimize) {
2190             regnode *opt = convert;
2191
2192             while ( ++opt < optimize) {
2193                 Set_Node_Offset_Length(opt,0,0);
2194             }
2195             /*
2196                 Try to clean up some of the debris left after the
2197                 optimisation.
2198              */
2199             while( optimize < jumper ) {
2200                 mjd_nodelen += Node_Length((optimize));
2201                 OP( optimize ) = OPTIMIZED;
2202                 Set_Node_Offset_Length(optimize,0,0);
2203                 optimize++;
2204             }
2205             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2206         });
2207     } /* end node insert */
2208
2209     /*  Finish populating the prev field of the wordinfo array.  Walk back
2210      *  from each accept state until we find another accept state, and if
2211      *  so, point the first word's .prev field at the second word. If the
2212      *  second already has a .prev field set, stop now. This will be the
2213      *  case either if we've already processed that word's accept state,
2214      *  or that state had multiple words, and the overspill words were
2215      *  already linked up earlier.
2216      */
2217     {
2218         U16 word;
2219         U32 state;
2220         U16 prev;
2221
2222         for (word=1; word <= trie->wordcount; word++) {
2223             prev = 0;
2224             if (trie->wordinfo[word].prev)
2225                 continue;
2226             state = trie->wordinfo[word].accept;
2227             while (state) {
2228                 state = prev_states[state];
2229                 if (!state)
2230                     break;
2231                 prev = trie->states[state].wordnum;
2232                 if (prev)
2233                     break;
2234             }
2235             trie->wordinfo[word].prev = prev;
2236         }
2237         Safefree(prev_states);
2238     }
2239
2240
2241     /* and now dump out the compressed format */
2242     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2243
2244     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2245 #ifdef DEBUGGING
2246     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2247     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2248 #else
2249     SvREFCNT_dec(revcharmap);
2250 #endif
2251     return trie->jump
2252            ? MADE_JUMP_TRIE
2253            : trie->startstate>1
2254              ? MADE_EXACT_TRIE
2255              : MADE_TRIE;
2256 }
2257
2258 STATIC void
2259 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2260 {
2261 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2262
2263    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2264    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2265    ISBN 0-201-10088-6
2266
2267    We find the fail state for each state in the trie, this state is the longest proper
2268    suffix of the current state's 'word' that is also a proper prefix of another word in our
2269    trie. State 1 represents the word '' and is thus the default fail state. This allows
2270    the DFA not to have to restart after its tried and failed a word at a given point, it
2271    simply continues as though it had been matching the other word in the first place.
2272    Consider
2273       'abcdgu'=~/abcdefg|cdgu/
2274    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2275    fail, which would bring us to the state representing 'd' in the second word where we would
2276    try 'g' and succeed, proceeding to match 'cdgu'.
2277  */
2278  /* add a fail transition */
2279     const U32 trie_offset = ARG(source);
2280     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2281     U32 *q;
2282     const U32 ucharcount = trie->uniquecharcount;
2283     const U32 numstates = trie->statecount;
2284     const U32 ubound = trie->lasttrans + ucharcount;
2285     U32 q_read = 0;
2286     U32 q_write = 0;
2287     U32 charid;
2288     U32 base = trie->states[ 1 ].trans.base;
2289     U32 *fail;
2290     reg_ac_data *aho;
2291     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2292     GET_RE_DEBUG_FLAGS_DECL;
2293
2294     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2295 #ifndef DEBUGGING
2296     PERL_UNUSED_ARG(depth);
2297 #endif
2298
2299
2300     ARG_SET( stclass, data_slot );
2301     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2302     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2303     aho->trie=trie_offset;
2304     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2305     Copy( trie->states, aho->states, numstates, reg_trie_state );
2306     Newxz( q, numstates, U32);
2307     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2308     aho->refcount = 1;
2309     fail = aho->fail;
2310     /* initialize fail[0..1] to be 1 so that we always have
2311        a valid final fail state */
2312     fail[ 0 ] = fail[ 1 ] = 1;
2313
2314     for ( charid = 0; charid < ucharcount ; charid++ ) {
2315         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2316         if ( newstate ) {
2317             q[ q_write ] = newstate;
2318             /* set to point at the root */
2319             fail[ q[ q_write++ ] ]=1;
2320         }
2321     }
2322     while ( q_read < q_write) {
2323         const U32 cur = q[ q_read++ % numstates ];
2324         base = trie->states[ cur ].trans.base;
2325
2326         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2327             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2328             if (ch_state) {
2329                 U32 fail_state = cur;
2330                 U32 fail_base;
2331                 do {
2332                     fail_state = fail[ fail_state ];
2333                     fail_base = aho->states[ fail_state ].trans.base;
2334                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2335
2336                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2337                 fail[ ch_state ] = fail_state;
2338                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2339                 {
2340                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2341                 }
2342                 q[ q_write++ % numstates] = ch_state;
2343             }
2344         }
2345     }
2346     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2347        when we fail in state 1, this allows us to use the
2348        charclass scan to find a valid start char. This is based on the principle
2349        that theres a good chance the string being searched contains lots of stuff
2350        that cant be a start char.
2351      */
2352     fail[ 0 ] = fail[ 1 ] = 0;
2353     DEBUG_TRIE_COMPILE_r({
2354         PerlIO_printf(Perl_debug_log,
2355                       "%*sStclass Failtable (%"UVuf" states): 0",
2356                       (int)(depth * 2), "", (UV)numstates
2357         );
2358         for( q_read=1; q_read<numstates; q_read++ ) {
2359             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2360         }
2361         PerlIO_printf(Perl_debug_log, "\n");
2362     });
2363     Safefree(q);
2364     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2365 }
2366
2367
2368 /*
2369  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2370  * These need to be revisited when a newer toolchain becomes available.
2371  */
2372 #if defined(__sparc64__) && defined(__GNUC__)
2373 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2374 #       undef  SPARC64_GCC_WORKAROUND
2375 #       define SPARC64_GCC_WORKAROUND 1
2376 #   endif
2377 #endif
2378
2379 #define DEBUG_PEEP(str,scan,depth) \
2380     DEBUG_OPTIMISE_r({if (scan){ \
2381        SV * const mysv=sv_newmortal(); \
2382        regnode *Next = regnext(scan); \
2383        regprop(RExC_rx, mysv, scan); \
2384        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2385        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2386        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2387    }});
2388
2389
2390
2391
2392
2393 #define JOIN_EXACT(scan,min,flags) \
2394     if (PL_regkind[OP(scan)] == EXACT) \
2395         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2396
2397 STATIC U32
2398 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2399     /* Merge several consecutive EXACTish nodes into one. */
2400     regnode *n = regnext(scan);
2401     U32 stringok = 1;
2402     regnode *next = scan + NODE_SZ_STR(scan);
2403     U32 merged = 0;
2404     U32 stopnow = 0;
2405 #ifdef DEBUGGING
2406     regnode *stop = scan;
2407     GET_RE_DEBUG_FLAGS_DECL;
2408 #else
2409     PERL_UNUSED_ARG(depth);
2410 #endif
2411
2412     PERL_ARGS_ASSERT_JOIN_EXACT;
2413 #ifndef EXPERIMENTAL_INPLACESCAN
2414     PERL_UNUSED_ARG(flags);
2415     PERL_UNUSED_ARG(val);
2416 #endif
2417     DEBUG_PEEP("join",scan,depth);
2418
2419     /* Skip NOTHING, merge EXACT*. */
2420     while (n &&
2421            ( PL_regkind[OP(n)] == NOTHING ||
2422              (stringok && (OP(n) == OP(scan))))
2423            && NEXT_OFF(n)
2424            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2425
2426         if (OP(n) == TAIL || n > next)
2427             stringok = 0;
2428         if (PL_regkind[OP(n)] == NOTHING) {
2429             DEBUG_PEEP("skip:",n,depth);
2430             NEXT_OFF(scan) += NEXT_OFF(n);
2431             next = n + NODE_STEP_REGNODE;
2432 #ifdef DEBUGGING
2433             if (stringok)
2434                 stop = n;
2435 #endif
2436             n = regnext(n);
2437         }
2438         else if (stringok) {
2439             const unsigned int oldl = STR_LEN(scan);
2440             regnode * const nnext = regnext(n);
2441
2442             DEBUG_PEEP("merg",n,depth);
2443
2444             merged++;
2445             if (oldl + STR_LEN(n) > U8_MAX)
2446                 break;
2447             NEXT_OFF(scan) += NEXT_OFF(n);
2448             STR_LEN(scan) += STR_LEN(n);
2449             next = n + NODE_SZ_STR(n);
2450             /* Now we can overwrite *n : */
2451             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2452 #ifdef DEBUGGING
2453             stop = next - 1;
2454 #endif
2455             n = nnext;
2456             if (stopnow) break;
2457         }
2458
2459 #ifdef EXPERIMENTAL_INPLACESCAN
2460         if (flags && !NEXT_OFF(n)) {
2461             DEBUG_PEEP("atch", val, depth);
2462             if (reg_off_by_arg[OP(n)]) {
2463                 ARG_SET(n, val - n);
2464             }
2465             else {
2466                 NEXT_OFF(n) = val - n;
2467             }
2468             stopnow = 1;
2469         }
2470 #endif
2471     }
2472 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2473 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2474 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2475 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2476
2477     if (UTF
2478         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2479         && ( STR_LEN(scan) >= 6 ) )
2480     {
2481     /*
2482     Two problematic code points in Unicode casefolding of EXACT nodes:
2483
2484     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2485     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2486
2487     which casefold to
2488
2489     Unicode                      UTF-8
2490
2491     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2492     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2493
2494     This means that in case-insensitive matching (or "loose matching",
2495     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2496     length of the above casefolded versions) can match a target string
2497     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2498     This would rather mess up the minimum length computation.
2499
2500     What we'll do is to look for the tail four bytes, and then peek
2501     at the preceding two bytes to see whether we need to decrease
2502     the minimum length by four (six minus two).
2503
2504     Thanks to the design of UTF-8, there cannot be false matches:
2505     A sequence of valid UTF-8 bytes cannot be a subsequence of
2506     another valid sequence of UTF-8 bytes.
2507
2508     */
2509          char * const s0 = STRING(scan), *s, *t;
2510          char * const s1 = s0 + STR_LEN(scan) - 1;
2511          char * const s2 = s1 - 4;
2512 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2513          const char t0[] = "\xaf\x49\xaf\x42";
2514 #else
2515          const char t0[] = "\xcc\x88\xcc\x81";
2516 #endif
2517          const char * const t1 = t0 + 3;
2518
2519          for (s = s0 + 2;
2520               s < s2 && (t = ninstr(s, s1, t0, t1));
2521               s = t + 4) {
2522 #ifdef EBCDIC
2523               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2524                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2525 #else
2526               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2527                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2528 #endif
2529                    *min -= 4;
2530          }
2531     }
2532
2533 #ifdef DEBUGGING
2534     /* Allow dumping */
2535     n = scan + NODE_SZ_STR(scan);
2536     while (n <= stop) {
2537         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2538             OP(n) = OPTIMIZED;
2539             NEXT_OFF(n) = 0;
2540         }
2541         n++;
2542     }
2543 #endif
2544     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2545     return stopnow;
2546 }
2547
2548 /* REx optimizer.  Converts nodes into quicker variants "in place".
2549    Finds fixed substrings.  */
2550
2551 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2552    to the position after last scanned or to NULL. */
2553
2554 #define INIT_AND_WITHP \
2555     assert(!and_withp); \
2556     Newx(and_withp,1,struct regnode_charclass_class); \
2557     SAVEFREEPV(and_withp)
2558
2559 /* this is a chain of data about sub patterns we are processing that
2560    need to be handled separately/specially in study_chunk. Its so
2561    we can simulate recursion without losing state.  */
2562 struct scan_frame;
2563 typedef struct scan_frame {
2564     regnode *last;  /* last node to process in this frame */
2565     regnode *next;  /* next node to process when last is reached */
2566     struct scan_frame *prev; /*previous frame*/
2567     I32 stop; /* what stopparen do we use */
2568 } scan_frame;
2569
2570
2571 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2572
2573 #define CASE_SYNST_FNC(nAmE)                                       \
2574 case nAmE:                                                         \
2575     if (flags & SCF_DO_STCLASS_AND) {                              \
2576             for (value = 0; value < 256; value++)                  \
2577                 if (!is_ ## nAmE ## _cp(value))                       \
2578                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2579     }                                                              \
2580     else {                                                         \
2581             for (value = 0; value < 256; value++)                  \
2582                 if (is_ ## nAmE ## _cp(value))                        \
2583                     ANYOF_BITMAP_SET(data->start_class, value);    \
2584     }                                                              \
2585     break;                                                         \
2586 case N ## nAmE:                                                    \
2587     if (flags & SCF_DO_STCLASS_AND) {                              \
2588             for (value = 0; value < 256; value++)                   \
2589                 if (is_ ## nAmE ## _cp(value))                         \
2590                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2591     }                                                               \
2592     else {                                                          \
2593             for (value = 0; value < 256; value++)                   \
2594                 if (!is_ ## nAmE ## _cp(value))                        \
2595                     ANYOF_BITMAP_SET(data->start_class, value);     \
2596     }                                                               \
2597     break
2598
2599
2600
2601 STATIC I32
2602 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2603                         I32 *minlenp, I32 *deltap,
2604                         regnode *last,
2605                         scan_data_t *data,
2606                         I32 stopparen,
2607                         U8* recursed,
2608                         struct regnode_charclass_class *and_withp,
2609                         U32 flags, U32 depth)
2610                         /* scanp: Start here (read-write). */
2611                         /* deltap: Write maxlen-minlen here. */
2612                         /* last: Stop before this one. */
2613                         /* data: string data about the pattern */
2614                         /* stopparen: treat close N as END */
2615                         /* recursed: which subroutines have we recursed into */
2616                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2617 {
2618     dVAR;
2619     I32 min = 0, pars = 0, code;
2620     regnode *scan = *scanp, *next;
2621     I32 delta = 0;
2622     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2623     int is_inf_internal = 0;            /* The studied chunk is infinite */
2624     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2625     scan_data_t data_fake;
2626     SV *re_trie_maxbuff = NULL;
2627     regnode *first_non_open = scan;
2628     I32 stopmin = I32_MAX;
2629     scan_frame *frame = NULL;
2630     GET_RE_DEBUG_FLAGS_DECL;
2631
2632     PERL_ARGS_ASSERT_STUDY_CHUNK;
2633
2634 #ifdef DEBUGGING
2635     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2636 #endif
2637
2638     if ( depth == 0 ) {
2639         while (first_non_open && OP(first_non_open) == OPEN)
2640             first_non_open=regnext(first_non_open);
2641     }
2642
2643
2644   fake_study_recurse:
2645     while ( scan && OP(scan) != END && scan < last ){
2646         /* Peephole optimizer: */
2647         DEBUG_STUDYDATA("Peep:", data,depth);
2648         DEBUG_PEEP("Peep",scan,depth);
2649         JOIN_EXACT(scan,&min,0);
2650
2651         /* Follow the next-chain of the current node and optimize
2652            away all the NOTHINGs from it.  */
2653         if (OP(scan) != CURLYX) {
2654             const int max = (reg_off_by_arg[OP(scan)]
2655                        ? I32_MAX
2656                        /* I32 may be smaller than U16 on CRAYs! */
2657                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2658             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2659             int noff;
2660             regnode *n = scan;
2661
2662             /* Skip NOTHING and LONGJMP. */
2663             while ((n = regnext(n))
2664                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2665                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2666                    && off + noff < max)
2667                 off += noff;
2668             if (reg_off_by_arg[OP(scan)])
2669                 ARG(scan) = off;
2670             else
2671                 NEXT_OFF(scan) = off;
2672         }
2673
2674
2675
2676         /* The principal pseudo-switch.  Cannot be a switch, since we
2677            look into several different things.  */
2678         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2679                    || OP(scan) == IFTHEN) {
2680             next = regnext(scan);
2681             code = OP(scan);
2682             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2683
2684             if (OP(next) == code || code == IFTHEN) {
2685                 /* NOTE - There is similar code to this block below for handling
2686                    TRIE nodes on a re-study.  If you change stuff here check there
2687                    too. */
2688                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2689                 struct regnode_charclass_class accum;
2690                 regnode * const startbranch=scan;
2691
2692                 if (flags & SCF_DO_SUBSTR)
2693                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2694                 if (flags & SCF_DO_STCLASS)
2695                     cl_init_zero(pRExC_state, &accum);
2696
2697                 while (OP(scan) == code) {
2698                     I32 deltanext, minnext, f = 0, fake;
2699                     struct regnode_charclass_class this_class;
2700
2701                     num++;
2702                     data_fake.flags = 0;
2703                     if (data) {
2704                         data_fake.whilem_c = data->whilem_c;
2705                         data_fake.last_closep = data->last_closep;
2706                     }
2707                     else
2708                         data_fake.last_closep = &fake;
2709
2710                     data_fake.pos_delta = delta;
2711                     next = regnext(scan);
2712                     scan = NEXTOPER(scan);
2713                     if (code != BRANCH)
2714                         scan = NEXTOPER(scan);
2715                     if (flags & SCF_DO_STCLASS) {
2716                         cl_init(pRExC_state, &this_class);
2717                         data_fake.start_class = &this_class;
2718                         f = SCF_DO_STCLASS_AND;
2719                     }
2720                     if (flags & SCF_WHILEM_VISITED_POS)
2721                         f |= SCF_WHILEM_VISITED_POS;
2722
2723                     /* we suppose the run is continuous, last=next...*/
2724                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2725                                           next, &data_fake,
2726                                           stopparen, recursed, NULL, f,depth+1);
2727                     if (min1 > minnext)
2728                         min1 = minnext;
2729                     if (max1 < minnext + deltanext)
2730                         max1 = minnext + deltanext;
2731                     if (deltanext == I32_MAX)
2732                         is_inf = is_inf_internal = 1;
2733                     scan = next;
2734                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2735                         pars++;
2736                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2737                         if ( stopmin > minnext)
2738                             stopmin = min + min1;
2739                         flags &= ~SCF_DO_SUBSTR;
2740                         if (data)
2741                             data->flags |= SCF_SEEN_ACCEPT;
2742                     }
2743                     if (data) {
2744                         if (data_fake.flags & SF_HAS_EVAL)
2745                             data->flags |= SF_HAS_EVAL;
2746                         data->whilem_c = data_fake.whilem_c;
2747                     }
2748                     if (flags & SCF_DO_STCLASS)
2749                         cl_or(pRExC_state, &accum, &this_class);
2750                 }
2751                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2752                     min1 = 0;
2753                 if (flags & SCF_DO_SUBSTR) {
2754                     data->pos_min += min1;
2755                     data->pos_delta += max1 - min1;
2756                     if (max1 != min1 || is_inf)
2757                         data->longest = &(data->longest_float);
2758                 }
2759                 min += min1;
2760                 delta += max1 - min1;
2761                 if (flags & SCF_DO_STCLASS_OR) {
2762                     cl_or(pRExC_state, data->start_class, &accum);
2763                     if (min1) {
2764                         cl_and(data->start_class, and_withp);
2765                         flags &= ~SCF_DO_STCLASS;
2766                     }
2767                 }
2768                 else if (flags & SCF_DO_STCLASS_AND) {
2769                     if (min1) {
2770                         cl_and(data->start_class, &accum);
2771                         flags &= ~SCF_DO_STCLASS;
2772                     }
2773                     else {
2774                         /* Switch to OR mode: cache the old value of
2775                          * data->start_class */
2776                         INIT_AND_WITHP;
2777                         StructCopy(data->start_class, and_withp,
2778                                    struct regnode_charclass_class);
2779                         flags &= ~SCF_DO_STCLASS_AND;
2780                         StructCopy(&accum, data->start_class,
2781                                    struct regnode_charclass_class);
2782                         flags |= SCF_DO_STCLASS_OR;
2783                         data->start_class->flags |= ANYOF_EOS;
2784                     }
2785                 }
2786
2787                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2788                 /* demq.
2789
2790                    Assuming this was/is a branch we are dealing with: 'scan' now
2791                    points at the item that follows the branch sequence, whatever
2792                    it is. We now start at the beginning of the sequence and look
2793                    for subsequences of
2794
2795                    BRANCH->EXACT=>x1
2796                    BRANCH->EXACT=>x2
2797                    tail
2798
2799                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2800
2801                    If we can find such a subsequence we need to turn the first
2802                    element into a trie and then add the subsequent branch exact
2803                    strings to the trie.
2804
2805                    We have two cases
2806
2807                      1. patterns where the whole set of branches can be converted.
2808
2809                      2. patterns where only a subset can be converted.
2810
2811                    In case 1 we can replace the whole set with a single regop
2812                    for the trie. In case 2 we need to keep the start and end
2813                    branches so
2814
2815                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2816                      becomes BRANCH TRIE; BRANCH X;
2817
2818                   There is an additional case, that being where there is a
2819                   common prefix, which gets split out into an EXACT like node
2820                   preceding the TRIE node.
2821
2822                   If x(1..n)==tail then we can do a simple trie, if not we make
2823                   a "jump" trie, such that when we match the appropriate word
2824                   we "jump" to the appropriate tail node. Essentially we turn
2825                   a nested if into a case structure of sorts.
2826
2827                 */
2828
2829                     int made=0;
2830                     if (!re_trie_maxbuff) {
2831                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2832                         if (!SvIOK(re_trie_maxbuff))
2833                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2834                     }
2835                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2836                         regnode *cur;
2837                         regnode *first = (regnode *)NULL;
2838                         regnode *last = (regnode *)NULL;
2839                         regnode *tail = scan;
2840                         U8 optype = 0;
2841                         U32 count=0;
2842
2843 #ifdef DEBUGGING
2844                         SV * const mysv = sv_newmortal();       /* for dumping */
2845 #endif
2846                         /* var tail is used because there may be a TAIL
2847                            regop in the way. Ie, the exacts will point to the
2848                            thing following the TAIL, but the last branch will
2849                            point at the TAIL. So we advance tail. If we
2850                            have nested (?:) we may have to move through several
2851                            tails.
2852                          */
2853
2854                         while ( OP( tail ) == TAIL ) {
2855                             /* this is the TAIL generated by (?:) */
2856                             tail = regnext( tail );
2857                         }
2858
2859
2860                         DEBUG_OPTIMISE_r({
2861                             regprop(RExC_rx, mysv, tail );
2862                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2863                                 (int)depth * 2 + 2, "",
2864                                 "Looking for TRIE'able sequences. Tail node is: ",
2865                                 SvPV_nolen_const( mysv )
2866                             );
2867                         });
2868
2869                         /*
2870
2871                            step through the branches, cur represents each
2872                            branch, noper is the first thing to be matched
2873                            as part of that branch and noper_next is the
2874                            regnext() of that node. if noper is an EXACT
2875                            and noper_next is the same as scan (our current
2876                            position in the regex) then the EXACT branch is
2877                            a possible optimization target. Once we have
2878                            two or more consecutive such branches we can
2879                            create a trie of the EXACT's contents and stich
2880                            it in place. If the sequence represents all of
2881                            the branches we eliminate the whole thing and
2882                            replace it with a single TRIE. If it is a
2883                            subsequence then we need to stitch it in. This
2884                            means the first branch has to remain, and needs
2885                            to be repointed at the item on the branch chain
2886                            following the last branch optimized. This could
2887                            be either a BRANCH, in which case the
2888                            subsequence is internal, or it could be the
2889                            item following the branch sequence in which
2890                            case the subsequence is at the end.
2891
2892                         */
2893
2894                         /* dont use tail as the end marker for this traverse */
2895                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2896                             regnode * const noper = NEXTOPER( cur );
2897 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2898                             regnode * const noper_next = regnext( noper );
2899 #endif
2900
2901                             DEBUG_OPTIMISE_r({
2902                                 regprop(RExC_rx, mysv, cur);
2903                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2904                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2905
2906                                 regprop(RExC_rx, mysv, noper);
2907                                 PerlIO_printf( Perl_debug_log, " -> %s",
2908                                     SvPV_nolen_const(mysv));
2909
2910                                 if ( noper_next ) {
2911                                   regprop(RExC_rx, mysv, noper_next );
2912                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2913                                     SvPV_nolen_const(mysv));
2914                                 }
2915                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2916                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2917                             });
2918                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2919                                          : PL_regkind[ OP( noper ) ] == EXACT )
2920                                   || OP(noper) == NOTHING )
2921 #ifdef NOJUMPTRIE
2922                                   && noper_next == tail
2923 #endif
2924                                   && count < U16_MAX)
2925                             {
2926                                 count++;
2927                                 if ( !first || optype == NOTHING ) {
2928                                     if (!first) first = cur;
2929                                     optype = OP( noper );
2930                                 } else {
2931                                     last = cur;
2932                                 }
2933                             } else {
2934 /*
2935     Currently we do not believe that the trie logic can
2936     handle case insensitive matching properly when the
2937     pattern is not unicode (thus forcing unicode semantics).
2938
2939     If/when this is fixed the following define can be swapped
2940     in below to fully enable trie logic.
2941
2942 #define TRIE_TYPE_IS_SAFE 1
2943
2944 */
2945 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2946
2947                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2948                                     make_trie( pRExC_state,
2949                                             startbranch, first, cur, tail, count,
2950                                             optype, depth+1 );
2951                                 }
2952                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2953 #ifdef NOJUMPTRIE
2954                                      && noper_next == tail
2955 #endif
2956                                 ){
2957                                     count = 1;
2958                                     first = cur;
2959                                     optype = OP( noper );
2960                                 } else {
2961                                     count = 0;
2962                                     first = NULL;
2963                                     optype = 0;
2964                                 }
2965                                 last = NULL;
2966                             }
2967                         }
2968                         DEBUG_OPTIMISE_r({
2969                             regprop(RExC_rx, mysv, cur);
2970                             PerlIO_printf( Perl_debug_log,
2971                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2972                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2973
2974                         });
2975
2976                         if ( last && TRIE_TYPE_IS_SAFE ) {
2977                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2978 #ifdef TRIE_STUDY_OPT
2979                             if ( ((made == MADE_EXACT_TRIE &&
2980                                  startbranch == first)
2981                                  || ( first_non_open == first )) &&
2982                                  depth==0 ) {
2983                                 flags |= SCF_TRIE_RESTUDY;
2984                                 if ( startbranch == first
2985                                      && scan == tail )
2986                                 {
2987                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2988                                 }
2989                             }
2990 #endif
2991                         }
2992                     }
2993
2994                 } /* do trie */
2995
2996             }
2997             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2998                 scan = NEXTOPER(NEXTOPER(scan));
2999             } else                      /* single branch is optimized. */
3000                 scan = NEXTOPER(scan);
3001             continue;
3002         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3003             scan_frame *newframe = NULL;
3004             I32 paren;
3005             regnode *start;
3006             regnode *end;
3007
3008             if (OP(scan) != SUSPEND) {
3009             /* set the pointer */
3010                 if (OP(scan) == GOSUB) {
3011                     paren = ARG(scan);
3012                     RExC_recurse[ARG2L(scan)] = scan;
3013                     start = RExC_open_parens[paren-1];
3014                     end   = RExC_close_parens[paren-1];
3015                 } else {
3016                     paren = 0;
3017                     start = RExC_rxi->program + 1;
3018                     end   = RExC_opend;
3019                 }
3020                 if (!recursed) {
3021                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3022                     SAVEFREEPV(recursed);
3023                 }
3024                 if (!PAREN_TEST(recursed,paren+1)) {
3025                     PAREN_SET(recursed,paren+1);
3026                     Newx(newframe,1,scan_frame);
3027                 } else {
3028                     if (flags & SCF_DO_SUBSTR) {
3029                         SCAN_COMMIT(pRExC_state,data,minlenp);
3030                         data->longest = &(data->longest_float);
3031                     }
3032                     is_inf = is_inf_internal = 1;
3033                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3034                         cl_anything(pRExC_state, data->start_class);
3035                     flags &= ~SCF_DO_STCLASS;
3036                 }
3037             } else {
3038                 Newx(newframe,1,scan_frame);
3039                 paren = stopparen;
3040                 start = scan+2;
3041                 end = regnext(scan);
3042             }
3043             if (newframe) {
3044                 assert(start);
3045                 assert(end);
3046                 SAVEFREEPV(newframe);
3047                 newframe->next = regnext(scan);
3048                 newframe->last = last;
3049                 newframe->stop = stopparen;
3050                 newframe->prev = frame;
3051
3052                 frame = newframe;
3053                 scan =  start;
3054                 stopparen = paren;
3055                 last = end;
3056
3057                 continue;
3058             }
3059         }
3060         else if (OP(scan) == EXACT) {
3061             I32 l = STR_LEN(scan);
3062             UV uc;
3063             if (UTF) {
3064                 const U8 * const s = (U8*)STRING(scan);
3065                 l = utf8_length(s, s + l);
3066                 uc = utf8_to_uvchr(s, NULL);
3067             } else {
3068                 uc = *((U8*)STRING(scan));
3069             }
3070             min += l;
3071             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3072                 /* The code below prefers earlier match for fixed
3073                    offset, later match for variable offset.  */
3074                 if (data->last_end == -1) { /* Update the start info. */
3075                     data->last_start_min = data->pos_min;
3076                     data->last_start_max = is_inf
3077                         ? I32_MAX : data->pos_min + data->pos_delta;
3078                 }
3079                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3080                 if (UTF)
3081                     SvUTF8_on(data->last_found);
3082                 {
3083                     SV * const sv = data->last_found;
3084                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3085                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3086                     if (mg && mg->mg_len >= 0)
3087                         mg->mg_len += utf8_length((U8*)STRING(scan),
3088                                                   (U8*)STRING(scan)+STR_LEN(scan));
3089                 }
3090                 data->last_end = data->pos_min + l;
3091                 data->pos_min += l; /* As in the first entry. */
3092                 data->flags &= ~SF_BEFORE_EOL;
3093             }
3094             if (flags & SCF_DO_STCLASS_AND) {
3095                 /* Check whether it is compatible with what we know already! */
3096                 int compat = 1;
3097
3098
3099                 /* If compatible, we or it in below.  It is compatible if is
3100                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3101                  * it's for a locale.  Even if there isn't unicode semantics
3102                  * here, at runtime there may be because of matching against a
3103                  * utf8 string, so accept a possible false positive for
3104                  * latin1-range folds */
3105                 if (uc >= 0x100 ||
3106                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3107                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3108                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3109                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3110                     )
3111                 {
3112                     compat = 0;
3113                 }
3114                 ANYOF_CLASS_ZERO(data->start_class);
3115                 ANYOF_BITMAP_ZERO(data->start_class);
3116                 if (compat)
3117                     ANYOF_BITMAP_SET(data->start_class, uc);
3118                 else if (uc >= 0x100) {
3119                     int i;
3120
3121                     /* Some Unicode code points fold to the Latin1 range; as
3122                      * XXX temporary code, instead of figuring out if this is
3123                      * one, just assume it is and set all the start class bits
3124                      * that could be some such above 255 code point's fold
3125                      * which will generate fals positives.  As the code
3126                      * elsewhere that does compute the fold settles down, it
3127                      * can be extracted out and re-used here */
3128                     for (i = 0; i < 256; i++){
3129                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3130                             ANYOF_BITMAP_SET(data->start_class, i);
3131                         }
3132                     }
3133                 }
3134                 data->start_class->flags &= ~ANYOF_EOS;
3135                 if (uc < 0x100)
3136                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3137             }
3138             else if (flags & SCF_DO_STCLASS_OR) {
3139                 /* false positive possible if the class is case-folded */
3140                 if (uc < 0x100)
3141                     ANYOF_BITMAP_SET(data->start_class, uc);
3142                 else
3143                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3144                 data->start_class->flags &= ~ANYOF_EOS;
3145                 cl_and(data->start_class, and_withp);
3146             }
3147             flags &= ~SCF_DO_STCLASS;
3148         }
3149         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3150             I32 l = STR_LEN(scan);
3151             UV uc = *((U8*)STRING(scan));
3152
3153             /* Search for fixed substrings supports EXACT only. */
3154             if (flags & SCF_DO_SUBSTR) {
3155                 assert(data);
3156                 SCAN_COMMIT(pRExC_state, data, minlenp);
3157             }
3158             if (UTF) {
3159                 const U8 * const s = (U8 *)STRING(scan);
3160                 l = utf8_length(s, s + l);
3161                 uc = utf8_to_uvchr(s, NULL);
3162             }
3163             min += l;
3164             if (flags & SCF_DO_SUBSTR)
3165                 data->pos_min += l;
3166             if (flags & SCF_DO_STCLASS_AND) {
3167                 /* Check whether it is compatible with what we know already! */
3168                 int compat = 1;
3169                 if (uc >= 0x100 ||
3170                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3171                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3172                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3173                 {
3174                     compat = 0;
3175                 }
3176                 ANYOF_CLASS_ZERO(data->start_class);
3177                 ANYOF_BITMAP_ZERO(data->start_class);
3178                 if (compat) {
3179                     ANYOF_BITMAP_SET(data->start_class, uc);
3180                     data->start_class->flags &= ~ANYOF_EOS;
3181                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3182                     if (OP(scan) == EXACTFL) {
3183                         data->start_class->flags |= ANYOF_LOCALE;
3184                     }
3185                     else {
3186
3187                         /* Also set the other member of the fold pair.  In case
3188                          * that unicode semantics is called for at runtime, use
3189                          * the full latin1 fold.  (Can't do this for locale,
3190                          * because not known until runtime */
3191                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3192                     }
3193                 }
3194                 else if (uc >= 0x100) {
3195                     int i;
3196                     for (i = 0; i < 256; i++){
3197                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3198                             ANYOF_BITMAP_SET(data->start_class, i);
3199                         }
3200                     }
3201                 }
3202             }
3203             else if (flags & SCF_DO_STCLASS_OR) {
3204                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3205                     /* false positive possible if the class is case-folded.
3206                        Assume that the locale settings are the same... */
3207                     if (uc < 0x100) {
3208                         ANYOF_BITMAP_SET(data->start_class, uc);
3209                         if (OP(scan) != EXACTFL) {
3210
3211                             /* And set the other member of the fold pair, but
3212                              * can't do that in locale because not known until
3213                              * run-time */
3214                             ANYOF_BITMAP_SET(data->start_class,
3215                                              PL_fold_latin1[uc]);
3216                         }
3217                     }
3218                     data->start_class->flags &= ~ANYOF_EOS;
3219                 }
3220                 cl_and(data->start_class, and_withp);
3221             }
3222             flags &= ~SCF_DO_STCLASS;
3223         }
3224         else if (REGNODE_VARIES(OP(scan))) {
3225             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3226             I32 f = flags, pos_before = 0;
3227             regnode * const oscan = scan;
3228             struct regnode_charclass_class this_class;
3229             struct regnode_charclass_class *oclass = NULL;
3230             I32 next_is_eval = 0;
3231
3232             switch (PL_regkind[OP(scan)]) {
3233             case WHILEM:                /* End of (?:...)* . */
3234                 scan = NEXTOPER(scan);
3235                 goto finish;
3236             case PLUS:
3237                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3238                     next = NEXTOPER(scan);
3239                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3240                         mincount = 1;
3241                         maxcount = REG_INFTY;
3242                         next = regnext(scan);
3243                         scan = NEXTOPER(scan);
3244                         goto do_curly;
3245                     }
3246                 }
3247                 if (flags & SCF_DO_SUBSTR)
3248                     data->pos_min++;
3249                 min++;
3250                 /* Fall through. */
3251             case STAR:
3252                 if (flags & SCF_DO_STCLASS) {
3253                     mincount = 0;
3254                     maxcount = REG_INFTY;
3255                     next = regnext(scan);
3256                     scan = NEXTOPER(scan);
3257                     goto do_curly;
3258                 }
3259                 is_inf = is_inf_internal = 1;
3260                 scan = regnext(scan);
3261                 if (flags & SCF_DO_SUBSTR) {
3262                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3263                     data->longest = &(data->longest_float);
3264                 }
3265                 goto optimize_curly_tail;
3266             case CURLY:
3267                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3268                     && (scan->flags == stopparen))
3269                 {
3270                     mincount = 1;
3271                     maxcount = 1;
3272                 } else {
3273                     mincount = ARG1(scan);
3274                     maxcount = ARG2(scan);
3275                 }
3276                 next = regnext(scan);
3277                 if (OP(scan) == CURLYX) {
3278                     I32 lp = (data ? *(data->last_closep) : 0);
3279                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3280                 }
3281                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3282                 next_is_eval = (OP(scan) == EVAL);
3283               do_curly:
3284                 if (flags & SCF_DO_SUBSTR) {
3285                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3286                     pos_before = data->pos_min;
3287                 }
3288                 if (data) {
3289                     fl = data->flags;
3290                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3291                     if (is_inf)
3292                         data->flags |= SF_IS_INF;
3293                 }
3294                 if (flags & SCF_DO_STCLASS) {
3295                     cl_init(pRExC_state, &this_class);
3296                     oclass = data->start_class;
3297                     data->start_class = &this_class;
3298                     f |= SCF_DO_STCLASS_AND;
3299                     f &= ~SCF_DO_STCLASS_OR;
3300                 }
3301                 /* Exclude from super-linear cache processing any {n,m}
3302                    regops for which the combination of input pos and regex
3303                    pos is not enough information to determine if a match
3304                    will be possible.
3305
3306                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3307                    regex pos at the \s*, the prospects for a match depend not
3308                    only on the input position but also on how many (bar\s*)
3309                    repeats into the {4,8} we are. */
3310                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3311                     f &= ~SCF_WHILEM_VISITED_POS;
3312
3313                 /* This will finish on WHILEM, setting scan, or on NULL: */
3314                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3315                                       last, data, stopparen, recursed, NULL,
3316                                       (mincount == 0
3317                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3318
3319                 if (flags & SCF_DO_STCLASS)
3320                     data->start_class = oclass;
3321                 if (mincount == 0 || minnext == 0) {
3322                     if (flags & SCF_DO_STCLASS_OR) {
3323                         cl_or(pRExC_state, data->start_class, &this_class);
3324                     }
3325                     else if (flags & SCF_DO_STCLASS_AND) {
3326                         /* Switch to OR mode: cache the old value of
3327                          * data->start_class */
3328                         INIT_AND_WITHP;
3329                         StructCopy(data->start_class, and_withp,
3330                                    struct regnode_charclass_class);
3331                         flags &= ~SCF_DO_STCLASS_AND;
3332                         StructCopy(&this_class, data->start_class,
3333                                    struct regnode_charclass_class);
3334                         flags |= SCF_DO_STCLASS_OR;
3335                         data->start_class->flags |= ANYOF_EOS;
3336                     }
3337                 } else {                /* Non-zero len */
3338                     if (flags & SCF_DO_STCLASS_OR) {
3339                         cl_or(pRExC_state, data->start_class, &this_class);
3340                         cl_and(data->start_class, and_withp);
3341                     }
3342                     else if (flags & SCF_DO_STCLASS_AND)
3343                         cl_and(data->start_class, &this_class);
3344                     flags &= ~SCF_DO_STCLASS;
3345                 }
3346                 if (!scan)              /* It was not CURLYX, but CURLY. */
3347                     scan = next;
3348                 if ( /* ? quantifier ok, except for (?{ ... }) */
3349                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3350                     && (minnext == 0) && (deltanext == 0)
3351                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3352                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3353                 {
3354                     ckWARNreg(RExC_parse,
3355                               "Quantifier unexpected on zero-length expression");
3356                 }
3357
3358                 min += minnext * mincount;
3359                 is_inf_internal |= ((maxcount == REG_INFTY
3360                                      && (minnext + deltanext) > 0)
3361                                     || deltanext == I32_MAX);
3362                 is_inf |= is_inf_internal;
3363                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3364
3365                 /* Try powerful optimization CURLYX => CURLYN. */
3366                 if (  OP(oscan) == CURLYX && data
3367                       && data->flags & SF_IN_PAR
3368                       && !(data->flags & SF_HAS_EVAL)
3369                       && !deltanext && minnext == 1 ) {
3370                     /* Try to optimize to CURLYN.  */
3371                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3372                     regnode * const nxt1 = nxt;
3373 #ifdef DEBUGGING
3374                     regnode *nxt2;
3375 #endif
3376
3377                     /* Skip open. */
3378                     nxt = regnext(nxt);
3379                     if (!REGNODE_SIMPLE(OP(nxt))
3380                         && !(PL_regkind[OP(nxt)] == EXACT
3381                              && STR_LEN(nxt) == 1))
3382                         goto nogo;
3383 #ifdef DEBUGGING
3384                     nxt2 = nxt;
3385 #endif
3386                     nxt = regnext(nxt);
3387                     if (OP(nxt) != CLOSE)
3388                         goto nogo;
3389                     if (RExC_open_parens) {
3390                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3391                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3392                     }
3393                     /* Now we know that nxt2 is the only contents: */
3394                     oscan->flags = (U8)ARG(nxt);
3395                     OP(oscan) = CURLYN;
3396                     OP(nxt1) = NOTHING; /* was OPEN. */
3397
3398 #ifdef DEBUGGING
3399                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3400                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3401                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3402                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3403                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3404                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3405 #endif
3406                 }
3407               nogo:
3408
3409                 /* Try optimization CURLYX => CURLYM. */
3410                 if (  OP(oscan) == CURLYX && data
3411                       && !(data->flags & SF_HAS_PAR)
3412                       && !(data->flags & SF_HAS_EVAL)
3413                       && !deltanext     /* atom is fixed width */
3414                       && minnext != 0   /* CURLYM can't handle zero width */
3415                 ) {
3416                     /* XXXX How to optimize if data == 0? */
3417                     /* Optimize to a simpler form.  */
3418                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3419                     regnode *nxt2;
3420
3421                     OP(oscan) = CURLYM;
3422                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3423                             && (OP(nxt2) != WHILEM))
3424                         nxt = nxt2;
3425                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3426                     /* Need to optimize away parenths. */
3427                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3428                         /* Set the parenth number.  */
3429                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3430
3431                         oscan->flags = (U8)ARG(nxt);
3432                         if (RExC_open_parens) {
3433                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3434                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3435                         }
3436                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3437                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3438
3439 #ifdef DEBUGGING
3440                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3441                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3442                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3443                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3444 #endif
3445 #if 0
3446                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3447                             regnode *nnxt = regnext(nxt1);
3448                             if (nnxt == nxt) {
3449                                 if (reg_off_by_arg[OP(nxt1)])
3450                                     ARG_SET(nxt1, nxt2 - nxt1);
3451                                 else if (nxt2 - nxt1 < U16_MAX)
3452                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3453                                 else
3454                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3455                             }
3456                             nxt1 = nnxt;
3457                         }
3458 #endif
3459                         /* Optimize again: */
3460                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3461                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3462                     }
3463                     else
3464                         oscan->flags = 0;
3465                 }
3466                 else if ((OP(oscan) == CURLYX)
3467                          && (flags & SCF_WHILEM_VISITED_POS)
3468                          /* See the comment on a similar expression above.
3469                             However, this time it's not a subexpression
3470                             we care about, but the expression itself. */
3471                          && (maxcount == REG_INFTY)
3472                          && data && ++data->whilem_c < 16) {
3473                     /* This stays as CURLYX, we can put the count/of pair. */
3474                     /* Find WHILEM (as in regexec.c) */
3475                     regnode *nxt = oscan + NEXT_OFF(oscan);
3476
3477                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3478                         nxt += ARG(nxt);
3479                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3480                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3481                 }
3482                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3483                     pars++;
3484                 if (flags & SCF_DO_SUBSTR) {
3485                     SV *last_str = NULL;
3486                     int counted = mincount != 0;
3487
3488                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3489 #if defined(SPARC64_GCC_WORKAROUND)
3490                         I32 b = 0;
3491                         STRLEN l = 0;
3492                         const char *s = NULL;
3493                         I32 old = 0;
3494
3495                         if (pos_before >= data->last_start_min)
3496                             b = pos_before;
3497                         else
3498                             b = data->last_start_min;
3499
3500                         l = 0;
3501                         s = SvPV_const(data->last_found, l);
3502                         old = b - data->last_start_min;
3503
3504 #else
3505                         I32 b = pos_before >= data->last_start_min
3506                             ? pos_before : data->last_start_min;
3507                         STRLEN l;
3508                         const char * const s = SvPV_const(data->last_found, l);
3509                         I32 old = b - data->last_start_min;
3510 #endif
3511
3512                         if (UTF)
3513                             old = utf8_hop((U8*)s, old) - (U8*)s;
3514                         l -= old;
3515                         /* Get the added string: */
3516                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3517                         if (deltanext == 0 && pos_before == b) {
3518                             /* What was added is a constant string */
3519                             if (mincount > 1) {
3520                                 SvGROW(last_str, (mincount * l) + 1);
3521                                 repeatcpy(SvPVX(last_str) + l,
3522                                           SvPVX_const(last_str), l, mincount - 1);
3523                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3524                                 /* Add additional parts. */
3525                                 SvCUR_set(data->last_found,
3526                                           SvCUR(data->last_found) - l);
3527                                 sv_catsv(data->last_found, last_str);
3528                                 {
3529                                     SV * sv = data->last_found;
3530                                     MAGIC *mg =
3531                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3532                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3533                                     if (mg && mg->mg_len >= 0)
3534                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3535                                 }
3536                                 data->last_end += l * (mincount - 1);
3537                             }
3538                         } else {
3539                             /* start offset must point into the last copy */
3540                             data->last_start_min += minnext * (mincount - 1);
3541                             data->last_start_max += is_inf ? I32_MAX
3542                                 : (maxcount - 1) * (minnext + data->pos_delta);
3543                         }
3544                     }
3545                     /* It is counted once already... */
3546                     data->pos_min += minnext * (mincount - counted);
3547                     data->pos_delta += - counted * deltanext +
3548                         (minnext + deltanext) * maxcount - minnext * mincount;
3549                     if (mincount != maxcount) {
3550                          /* Cannot extend fixed substrings found inside
3551                             the group.  */
3552                         SCAN_COMMIT(pRExC_state,data,minlenp);
3553                         if (mincount && last_str) {
3554                             SV * const sv = data->last_found;
3555                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3556                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3557
3558                             if (mg)
3559                                 mg->mg_len = -1;
3560                             sv_setsv(sv, last_str);
3561                             data->last_end = data->pos_min;
3562                             data->last_start_min =
3563                                 data->pos_min - CHR_SVLEN(last_str);
3564                             data->last_start_max = is_inf
3565                                 ? I32_MAX
3566                                 : data->pos_min + data->pos_delta
3567                                 - CHR_SVLEN(last_str);
3568                         }
3569                         data->longest = &(data->longest_float);
3570                     }
3571                     SvREFCNT_dec(last_str);
3572                 }
3573                 if (data && (fl & SF_HAS_EVAL))
3574                     data->flags |= SF_HAS_EVAL;
3575               optimize_curly_tail:
3576                 if (OP(oscan) != CURLYX) {
3577                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3578                            && NEXT_OFF(next))
3579                         NEXT_OFF(oscan) += NEXT_OFF(next);
3580                 }
3581                 continue;
3582             default:                    /* REF, ANYOFV, and CLUMP only? */
3583                 if (flags & SCF_DO_SUBSTR) {
3584                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3585                     data->longest = &(data->longest_float);
3586                 }
3587                 is_inf = is_inf_internal = 1;
3588                 if (flags & SCF_DO_STCLASS_OR)
3589                     cl_anything(pRExC_state, data->start_class);
3590                 flags &= ~SCF_DO_STCLASS;
3591                 break;
3592             }
3593         }
3594         else if (OP(scan) == LNBREAK) {
3595             if (flags & SCF_DO_STCLASS) {
3596                 int value = 0;
3597                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3598                 if (flags & SCF_DO_STCLASS_AND) {
3599                     for (value = 0; value < 256; value++)
3600                         if (!is_VERTWS_cp(value))
3601                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3602                 }
3603                 else {
3604                     for (value = 0; value < 256; value++)
3605                         if (is_VERTWS_cp(value))
3606                             ANYOF_BITMAP_SET(data->start_class, value);
3607                 }
3608                 if (flags & SCF_DO_STCLASS_OR)
3609                     cl_and(data->start_class, and_withp);
3610                 flags &= ~SCF_DO_STCLASS;
3611             }
3612             min += 1;
3613             delta += 1;
3614             if (flags & SCF_DO_SUBSTR) {
3615                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3616                 data->pos_min += 1;
3617                 data->pos_delta += 1;
3618                 data->longest = &(data->longest_float);
3619             }
3620         }
3621         else if (OP(scan) == FOLDCHAR) {
3622             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3623             flags &= ~SCF_DO_STCLASS;
3624             min += 1;
3625             delta += d;
3626             if (flags & SCF_DO_SUBSTR) {
3627                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3628                 data->pos_min += 1;
3629                 data->pos_delta += d;
3630                 data->longest = &(data->longest_float);
3631             }
3632         }
3633         else if (REGNODE_SIMPLE(OP(scan))) {
3634             int value = 0;
3635
3636             if (flags & SCF_DO_SUBSTR) {
3637                 SCAN_COMMIT(pRExC_state,data,minlenp);
3638                 data->pos_min++;
3639             }
3640             min++;
3641             if (flags & SCF_DO_STCLASS) {
3642                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3643
3644                 /* Some of the logic below assumes that switching
3645                    locale on will only add false positives. */
3646                 switch (PL_regkind[OP(scan)]) {
3647                 case SANY:
3648                 default:
3649                   do_default:
3650                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3651                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3652                         cl_anything(pRExC_state, data->start_class);
3653                     break;
3654                 case REG_ANY:
3655                     if (OP(scan) == SANY)
3656                         goto do_default;
3657                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3658                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3659                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3660                         cl_anything(pRExC_state, data->start_class);
3661                     }
3662                     if (flags & SCF_DO_STCLASS_AND || !value)
3663                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3664                     break;
3665                 case ANYOF:
3666                     if (flags & SCF_DO_STCLASS_AND)
3667                         cl_and(data->start_class,
3668                                (struct regnode_charclass_class*)scan);
3669                     else
3670                         cl_or(pRExC_state, data->start_class,
3671                               (struct regnode_charclass_class*)scan);
3672                     break;
3673                 case ALNUM:
3674                     if (flags & SCF_DO_STCLASS_AND) {
3675                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3676                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3677                             if (OP(scan) == ALNUMU) {
3678                                 for (value = 0; value < 256; value++) {
3679                                     if (!isWORDCHAR_L1(value)) {
3680                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3681                                     }
3682                                 }
3683                             } else {
3684                                 for (value = 0; value < 256; value++) {
3685                                     if (!isALNUM(value)) {
3686                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3687                                     }
3688                                 }
3689                             }
3690                         }
3691                     }
3692                     else {
3693                         if (data->start_class->flags & ANYOF_LOCALE)
3694                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3695                         else if (OP(scan) == ALNUMU) {
3696                             for (value = 0; value < 256; value++) {
3697                                 if (isWORDCHAR_L1(value)) {
3698                                     ANYOF_BITMAP_SET(data->start_class, value);
3699                                 }
3700                             }
3701                         } else {
3702                             for (value = 0; value < 256; value++) {
3703                                 if (isALNUM(value)) {
3704                                     ANYOF_BITMAP_SET(data->start_class, value);
3705                                 }
3706                             }
3707                         }
3708                     }
3709                     break;
3710                 case NALNUM:
3711                     if (flags & SCF_DO_STCLASS_AND) {
3712                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3713                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3714                             if (OP(scan) == NALNUMU) {
3715                                 for (value = 0; value < 256; value++) {
3716                                     if (isWORDCHAR_L1(value)) {
3717                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3718                                     }
3719                                 }
3720                             } else {
3721                                 for (value = 0; value < 256; value++) {
3722                                     if (isALNUM(value)) {
3723                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3724                                     }
3725                                 }
3726                             }
3727                         }
3728                     }
3729                     else {
3730                         if (data->start_class->flags & ANYOF_LOCALE)
3731                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3732                         else {
3733                             if (OP(scan) == NALNUMU) {
3734                                 for (value = 0; value < 256; value++) {
3735                                     if (! isWORDCHAR_L1(value)) {
3736                                         ANYOF_BITMAP_SET(data->start_class, value);
3737                                     }
3738                                 }
3739                             } else {
3740                                 for (value = 0; value < 256; value++) {
3741                                     if (! isALNUM(value)) {
3742                                         ANYOF_BITMAP_SET(data->start_class, value);
3743                                     }
3744                                 }
3745                             }
3746                         }
3747                     }
3748                     break;
3749                 case SPACE:
3750                     if (flags & SCF_DO_STCLASS_AND) {
3751                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3752                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3753                             if (OP(scan) == SPACEU) {
3754                                 for (value = 0; value < 256; value++) {
3755                                     if (!isSPACE_L1(value)) {
3756                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3757                                     }
3758                                 }
3759                             } else {
3760                                 for (value = 0; value < 256; value++) {
3761                                     if (!isSPACE(value)) {
3762                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3763                                     }
3764                                 }
3765                             }
3766                         }
3767                     }
3768                     else {
3769                         if (data->start_class->flags & ANYOF_LOCALE) {
3770                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3771                         }
3772                         else if (OP(scan) == SPACEU) {
3773                             for (value = 0; value < 256; value++) {
3774                                 if (isSPACE_L1(value)) {
3775                                     ANYOF_BITMAP_SET(data->start_class, value);
3776                                 }
3777                             }
3778                         } else {
3779                             for (value = 0; value < 256; value++) {
3780                                 if (isSPACE(value)) {
3781                                     ANYOF_BITMAP_SET(data->start_class, value);
3782                                 }
3783                             }
3784                         }
3785                     }
3786                     break;
3787                 case NSPACE:
3788                     if (flags & SCF_DO_STCLASS_AND) {
3789                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3790                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3791                             if (OP(scan) == NSPACEU) {
3792                                 for (value = 0; value < 256; value++) {
3793                                     if (isSPACE_L1(value)) {
3794                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3795                                     }
3796                                 }
3797                             } else {
3798                                 for (value = 0; value < 256; value++) {
3799                                     if (isSPACE(value)) {
3800                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3801                                     }
3802                                 }
3803                             }
3804                         }
3805                     }
3806                     else {
3807                         if (data->start_class->flags & ANYOF_LOCALE)
3808                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3809                         else if (OP(scan) == NSPACEU) {
3810                             for (value = 0; value < 256; value++) {
3811                                 if (!isSPACE_L1(value)) {
3812                                     ANYOF_BITMAP_SET(data->start_class, value);
3813                                 }
3814                             }
3815                         }
3816                         else {
3817                             for (value = 0; value < 256; value++) {
3818                                 if (!isSPACE(value)) {
3819                                     ANYOF_BITMAP_SET(data->start_class, value);
3820                                 }
3821                             }
3822                         }
3823                     }
3824                     break;
3825                 case DIGIT:
3826                     if (flags & SCF_DO_STCLASS_AND) {
3827                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3828                         for (value = 0; value < 256; value++)
3829                             if (!isDIGIT(value))
3830                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3831                     }
3832                     else {
3833                         if (data->start_class->flags & ANYOF_LOCALE)
3834                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3835                         else {
3836                             for (value = 0; value < 256; value++)
3837                                 if (isDIGIT(value))
3838                                     ANYOF_BITMAP_SET(data->start_class, value);
3839                         }
3840                     }
3841                     break;
3842                 case NDIGIT:
3843                     if (flags & SCF_DO_STCLASS_AND) {
3844                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3845                         for (value = 0; value < 256; value++)
3846                             if (isDIGIT(value))
3847                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3848                     }
3849                     else {
3850                         if (data->start_class->flags & ANYOF_LOCALE)
3851                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3852                         else {
3853                             for (value = 0; value < 256; value++)
3854                                 if (!isDIGIT(value))
3855                                     ANYOF_BITMAP_SET(data->start_class, value);
3856                         }
3857                     }
3858                     break;
3859                 CASE_SYNST_FNC(VERTWS);
3860                 CASE_SYNST_FNC(HORIZWS);
3861
3862                 }
3863                 if (flags & SCF_DO_STCLASS_OR)
3864                     cl_and(data->start_class, and_withp);
3865                 flags &= ~SCF_DO_STCLASS;
3866             }
3867         }
3868         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3869             data->flags |= (OP(scan) == MEOL
3870                             ? SF_BEFORE_MEOL
3871                             : SF_BEFORE_SEOL);
3872         }
3873         else if (  PL_regkind[OP(scan)] == BRANCHJ
3874                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3875                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3876                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3877             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3878                 || OP(scan) == UNLESSM )
3879             {
3880                 /* Negative Lookahead/lookbehind
3881                    In this case we can't do fixed string optimisation.
3882                 */
3883
3884                 I32 deltanext, minnext, fake = 0;
3885                 regnode *nscan;
3886                 struct regnode_charclass_class intrnl;
3887                 int f = 0;
3888
3889                 data_fake.flags = 0;
3890                 if (data) {
3891                     data_fake.whilem_c = data->whilem_c;
3892                     data_fake.last_closep = data->last_closep;
3893                 }
3894                 else
3895                     data_fake.last_closep = &fake;
3896                 data_fake.pos_delta = delta;
3897                 if ( flags & SCF_DO_STCLASS && !scan->flags
3898                      && OP(scan) == IFMATCH ) { /* Lookahead */
3899                     cl_init(pRExC_state, &intrnl);
3900                     data_fake.start_class = &intrnl;
3901                     f |= SCF_DO_STCLASS_AND;
3902                 }
3903                 if (flags & SCF_WHILEM_VISITED_POS)
3904                     f |= SCF_WHILEM_VISITED_POS;
3905                 next = regnext(scan);
3906                 nscan = NEXTOPER(NEXTOPER(scan));
3907                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3908                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3909                 if (scan->flags) {
3910                     if (deltanext) {
3911                         FAIL("Variable length lookbehind not implemented");
3912                     }
3913                     else if (minnext > (I32)U8_MAX) {
3914                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3915                     }
3916                     scan->flags = (U8)minnext;
3917                 }
3918                 if (data) {
3919                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3920                         pars++;
3921                     if (data_fake.flags & SF_HAS_EVAL)
3922                         data->flags |= SF_HAS_EVAL;
3923                     data->whilem_c = data_fake.whilem_c;
3924                 }
3925                 if (f & SCF_DO_STCLASS_AND) {
3926                     if (flags & SCF_DO_STCLASS_OR) {
3927                         /* OR before, AND after: ideally we would recurse with
3928                          * data_fake to get the AND applied by study of the
3929                          * remainder of the pattern, and then derecurse;
3930                          * *** HACK *** for now just treat as "no information".
3931                          * See [perl #56690].
3932                          */
3933                         cl_init(pRExC_state, data->start_class);
3934                     }  else {
3935                         /* AND before and after: combine and continue */
3936                         const int was = (data->start_class->flags & ANYOF_EOS);
3937
3938                         cl_and(data->start_class, &intrnl);
3939                         if (was)
3940                             data->start_class->flags |= ANYOF_EOS;
3941                     }
3942                 }
3943             }
3944 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3945             else {
3946                 /* Positive Lookahead/lookbehind
3947                    In this case we can do fixed string optimisation,
3948                    but we must be careful about it. Note in the case of
3949                    lookbehind the positions will be offset by the minimum
3950                    length of the pattern, something we won't know about
3951                    until after the recurse.
3952                 */
3953                 I32 deltanext, fake = 0;
3954                 regnode *nscan;
3955                 struct regnode_charclass_class intrnl;
3956                 int f = 0;
3957                 /* We use SAVEFREEPV so that when the full compile
3958                     is finished perl will clean up the allocated
3959                     minlens when it's all done. This way we don't
3960                     have to worry about freeing them when we know
3961                     they wont be used, which would be a pain.
3962                  */
3963                 I32 *minnextp;
3964                 Newx( minnextp, 1, I32 );
3965                 SAVEFREEPV(minnextp);
3966
3967                 if (data) {
3968                     StructCopy(data, &data_fake, scan_data_t);
3969                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3970                         f |= SCF_DO_SUBSTR;
3971                         if (scan->flags)
3972                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3973                         data_fake.last_found=newSVsv(data->last_found);
3974                     }
3975                 }
3976                 else
3977                     data_fake.last_closep = &fake;
3978                 data_fake.flags = 0;
3979                 data_fake.pos_delta = delta;
3980                 if (is_inf)
3981                     data_fake.flags |= SF_IS_INF;
3982                 if ( flags & SCF_DO_STCLASS && !scan->flags
3983                      && OP(scan) == IFMATCH ) { /* Lookahead */
3984                     cl_init(pRExC_state, &intrnl);
3985                     data_fake.start_class = &intrnl;
3986                     f |= SCF_DO_STCLASS_AND;
3987                 }
3988                 if (flags & SCF_WHILEM_VISITED_POS)
3989                     f |= SCF_WHILEM_VISITED_POS;
3990                 next = regnext(scan);
3991                 nscan = NEXTOPER(NEXTOPER(scan));
3992
3993                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3994                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3995                 if (scan->flags) {
3996                     if (deltanext) {
3997                         FAIL("Variable length lookbehind not implemented");
3998                     }
3999                     else if (*minnextp > (I32)U8_MAX) {
4000                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4001                     }
4002                     scan->flags = (U8)*minnextp;
4003                 }
4004
4005                 *minnextp += min;
4006
4007                 if (f & SCF_DO_STCLASS_AND) {
4008                     const int was = (data->start_class->flags & ANYOF_EOS);
4009
4010                     cl_and(data->start_class, &intrnl);
4011                     if (was)
4012                         data->start_class->flags |= ANYOF_EOS;
4013                 }
4014                 if (data) {
4015                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4016                         pars++;
4017                     if (data_fake.flags & SF_HAS_EVAL)
4018                         data->flags |= SF_HAS_EVAL;
4019                     data->whilem_c = data_fake.whilem_c;
4020                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4021                         if (RExC_rx->minlen<*minnextp)
4022                             RExC_rx->minlen=*minnextp;
4023                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4024                         SvREFCNT_dec(data_fake.last_found);
4025
4026                         if ( data_fake.minlen_fixed != minlenp )
4027                         {
4028                             data->offset_fixed= data_fake.offset_fixed;
4029                             data->minlen_fixed= data_fake.minlen_fixed;
4030                             data->lookbehind_fixed+= scan->flags;
4031                         }
4032                         if ( data_fake.minlen_float != minlenp )
4033                         {
4034                             data->minlen_float= data_fake.minlen_float;
4035                             data->offset_float_min=data_fake.offset_float_min;
4036                             data->offset_float_max=data_fake.offset_float_max;
4037                             data->lookbehind_float+= scan->flags;
4038                         }
4039                     }
4040                 }
4041
4042
4043             }
4044 #endif
4045         }
4046         else if (OP(scan) == OPEN) {
4047             if (stopparen != (I32)ARG(scan))
4048                 pars++;
4049         }
4050         else if (OP(scan) == CLOSE) {
4051             if (stopparen == (I32)ARG(scan)) {
4052                 break;
4053             }
4054             if ((I32)ARG(scan) == is_par) {
4055                 next = regnext(scan);
4056
4057                 if ( next && (OP(next) != WHILEM) && next < last)
4058                     is_par = 0;         /* Disable optimization */
4059             }
4060             if (data)
4061                 *(data->last_closep) = ARG(scan);
4062         }
4063         else if (OP(scan) == EVAL) {
4064                 if (data)
4065                     data->flags |= SF_HAS_EVAL;
4066         }
4067         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4068             if (flags & SCF_DO_SUBSTR) {
4069                 SCAN_COMMIT(pRExC_state,data,minlenp);
4070                 flags &= ~SCF_DO_SUBSTR;
4071             }
4072             if (data && OP(scan)==ACCEPT) {
4073                 data->flags |= SCF_SEEN_ACCEPT;
4074                 if (stopmin > min)
4075                     stopmin = min;
4076             }
4077         }
4078         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4079         {
4080                 if (flags & SCF_DO_SUBSTR) {
4081                     SCAN_COMMIT(pRExC_state,data,minlenp);
4082                     data->longest = &(data->longest_float);
4083                 }
4084                 is_inf = is_inf_internal = 1;
4085                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4086                     cl_anything(pRExC_state, data->start_class);
4087                 flags &= ~SCF_DO_STCLASS;
4088         }
4089         else if (OP(scan) == GPOS) {
4090             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4091                 !(delta || is_inf || (data && data->pos_delta)))
4092             {
4093                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4094                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4095                 if (RExC_rx->gofs < (U32)min)
4096                     RExC_rx->gofs = min;
4097             } else {
4098                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4099                 RExC_rx->gofs = 0;
4100             }
4101         }
4102 #ifdef TRIE_STUDY_OPT
4103 #ifdef FULL_TRIE_STUDY
4104         else if (PL_regkind[OP(scan)] == TRIE) {
4105             /* NOTE - There is similar code to this block above for handling
4106                BRANCH nodes on the initial study.  If you change stuff here
4107                check there too. */
4108             regnode *trie_node= scan;
4109             regnode *tail= regnext(scan);
4110             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4111             I32 max1 = 0, min1 = I32_MAX;
4112             struct regnode_charclass_class accum;
4113
4114             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4115                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4116             if (flags & SCF_DO_STCLASS)
4117                 cl_init_zero(pRExC_state, &accum);
4118
4119             if (!trie->jump) {
4120                 min1= trie->minlen;
4121                 max1= trie->maxlen;
4122             } else {
4123                 const regnode *nextbranch= NULL;
4124                 U32 word;
4125
4126                 for ( word=1 ; word <= trie->wordcount ; word++)
4127                 {
4128                     I32 deltanext=0, minnext=0, f = 0, fake;
4129                     struct regnode_charclass_class this_class;
4130
4131                     data_fake.flags = 0;
4132                     if (data) {
4133                         data_fake.whilem_c = data->whilem_c;
4134                         data_fake.last_closep = data->last_closep;
4135                     }
4136                     else
4137                         data_fake.last_closep = &fake;
4138                     data_fake.pos_delta = delta;
4139                     if (flags & SCF_DO_STCLASS) {
4140                         cl_init(pRExC_state, &this_class);
4141                         data_fake.start_class = &this_class;
4142                         f = SCF_DO_STCLASS_AND;
4143                     }
4144                     if (flags & SCF_WHILEM_VISITED_POS)
4145                         f |= SCF_WHILEM_VISITED_POS;
4146
4147                     if (trie->jump[word]) {
4148                         if (!nextbranch)
4149                             nextbranch = trie_node + trie->jump[0];
4150                         scan= trie_node + trie->jump[word];
4151                         /* We go from the jump point to the branch that follows
4152                            it. Note this means we need the vestigal unused branches
4153                            even though they arent otherwise used.
4154                          */
4155                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4156                             &deltanext, (regnode *)nextbranch, &data_fake,
4157                             stopparen, recursed, NULL, f,depth+1);
4158                     }
4159                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4160                         nextbranch= regnext((regnode*)nextbranch);
4161
4162                     if (min1 > (I32)(minnext + trie->minlen))
4163                         min1 = minnext + trie->minlen;
4164                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4165                         max1 = minnext + deltanext + trie->maxlen;
4166                     if (deltanext == I32_MAX)
4167                         is_inf = is_inf_internal = 1;
4168
4169                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4170                         pars++;
4171                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4172                         if ( stopmin > min + min1)
4173                             stopmin = min + min1;
4174                         flags &= ~SCF_DO_SUBSTR;
4175                         if (data)
4176                             data->flags |= SCF_SEEN_ACCEPT;
4177                     }
4178                     if (data) {
4179                         if (data_fake.flags & SF_HAS_EVAL)
4180                             data->flags |= SF_HAS_EVAL;
4181                         data->whilem_c = data_fake.whilem_c;
4182                     }
4183                     if (flags & SCF_DO_STCLASS)
4184                         cl_or(pRExC_state, &accum, &this_class);
4185                 }
4186             }
4187             if (flags & SCF_DO_SUBSTR) {
4188                 data->pos_min += min1;
4189                 data->pos_delta += max1 - min1;
4190                 if (max1 != min1 || is_inf)
4191                     data->longest = &(data->longest_float);
4192             }
4193             min += min1;
4194             delta += max1 - min1;
4195             if (flags & SCF_DO_STCLASS_OR) {
4196                 cl_or(pRExC_state, data->start_class, &accum);
4197                 if (min1) {
4198                     cl_and(data->start_class, and_withp);
4199                     flags &= ~SCF_DO_STCLASS;
4200                 }
4201             }
4202             else if (flags & SCF_DO_STCLASS_AND) {
4203                 if (min1) {
4204                     cl_and(data->start_class, &accum);
4205                     flags &= ~SCF_DO_STCLASS;
4206                 }
4207                 else {
4208                     /* Switch to OR mode: cache the old value of
4209                      * data->start_class */
4210                     INIT_AND_WITHP;
4211                     StructCopy(data->start_class, and_withp,
4212                                struct regnode_charclass_class);
4213                     flags &= ~SCF_DO_STCLASS_AND;
4214                     StructCopy(&accum, data->start_class,
4215                                struct regnode_charclass_class);
4216                     flags |= SCF_DO_STCLASS_OR;
4217                     data->start_class->flags |= ANYOF_EOS;
4218                 }
4219             }
4220             scan= tail;
4221             continue;
4222         }
4223 #else
4224         else if (PL_regkind[OP(scan)] == TRIE) {
4225             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4226             U8*bang=NULL;
4227
4228             min += trie->minlen;
4229             delta += (trie->maxlen - trie->minlen);
4230             flags &= ~SCF_DO_STCLASS; /* xxx */
4231             if (flags & SCF_DO_SUBSTR) {
4232                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4233                 data->pos_min += trie->minlen;
4234                 data->pos_delta += (trie->maxlen - trie->minlen);
4235                 if (trie->maxlen != trie->minlen)
4236                     data->longest = &(data->longest_float);
4237             }
4238             if (trie->jump) /* no more substrings -- for now /grr*/
4239                 flags &= ~SCF_DO_SUBSTR;
4240         }
4241 #endif /* old or new */
4242 #endif /* TRIE_STUDY_OPT */
4243
4244         /* Else: zero-length, ignore. */
4245         scan = regnext(scan);
4246     }
4247     if (frame) {
4248         last = frame->last;
4249         scan = frame->next;
4250         stopparen = frame->stop;
4251         frame = frame->prev;
4252         goto fake_study_recurse;
4253     }
4254
4255   finish:
4256     assert(!frame);
4257     DEBUG_STUDYDATA("pre-fin:",data,depth);
4258
4259     *scanp = scan;
4260     *deltap = is_inf_internal ? I32_MAX : delta;
4261     if (flags & SCF_DO_SUBSTR && is_inf)
4262         data->pos_delta = I32_MAX - data->pos_min;
4263     if (is_par > (I32)U8_MAX)
4264         is_par = 0;
4265     if (is_par && pars==1 && data) {
4266         data->flags |= SF_IN_PAR;
4267         data->flags &= ~SF_HAS_PAR;
4268     }
4269     else if (pars && data) {
4270         data->flags |= SF_HAS_PAR;
4271         data->flags &= ~SF_IN_PAR;
4272     }
4273     if (flags & SCF_DO_STCLASS_OR)
4274         cl_and(data->start_class, and_withp);
4275     if (flags & SCF_TRIE_RESTUDY)
4276         data->flags |=  SCF_TRIE_RESTUDY;
4277
4278     DEBUG_STUDYDATA("post-fin:",data,depth);
4279
4280     return min < stopmin ? min : stopmin;
4281 }
4282
4283 STATIC U32
4284 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4285 {
4286     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4287
4288     PERL_ARGS_ASSERT_ADD_DATA;
4289
4290     Renewc(RExC_rxi->data,
4291            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4292            char, struct reg_data);
4293     if(count)
4294         Renew(RExC_rxi->data->what, count + n, U8);
4295     else
4296         Newx(RExC_rxi->data->what, n, U8);
4297     RExC_rxi->data->count = count + n;
4298     Copy(s, RExC_rxi->data->what + count, n, U8);
4299     return count;
4300 }
4301
4302 /*XXX: todo make this not included in a non debugging perl */
4303 #ifndef PERL_IN_XSUB_RE
4304 void
4305 Perl_reginitcolors(pTHX)
4306 {
4307     dVAR;
4308     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4309     if (s) {
4310         char *t = savepv(s);
4311         int i = 0;
4312         PL_colors[0] = t;
4313         while (++i < 6) {
4314             t = strchr(t, '\t');
4315             if (t) {
4316                 *t = '\0';
4317                 PL_colors[i] = ++t;
4318             }
4319             else
4320                 PL_colors[i] = t = (char *)"";
4321         }
4322     } else {
4323         int i = 0;
4324         while (i < 6)
4325             PL_colors[i++] = (char *)"";
4326     }
4327     PL_colorset = 1;
4328 }
4329 #endif
4330
4331
4332 #ifdef TRIE_STUDY_OPT
4333 #define CHECK_RESTUDY_GOTO                                  \
4334         if (                                                \
4335               (data.flags & SCF_TRIE_RESTUDY)               \
4336               && ! restudied++                              \
4337         )     goto reStudy
4338 #else
4339 #define CHECK_RESTUDY_GOTO
4340 #endif
4341
4342 /*
4343  - pregcomp - compile a regular expression into internal code
4344  *
4345  * We can't allocate space until we know how big the compiled form will be,
4346  * but we can't compile it (and thus know how big it is) until we've got a
4347  * place to put the code.  So we cheat:  we compile it twice, once with code
4348  * generation turned off and size counting turned on, and once "for real".
4349  * This also means that we don't allocate space until we are sure that the
4350  * thing really will compile successfully, and we never have to move the
4351  * code and thus invalidate pointers into it.  (Note that it has to be in
4352  * one piece because free() must be able to free it all.) [NB: not true in perl]
4353  *
4354  * Beware that the optimization-preparation code in here knows about some
4355  * of the structure of the compiled regexp.  [I'll say.]
4356  */
4357
4358
4359
4360 #ifndef PERL_IN_XSUB_RE
4361 #define RE_ENGINE_PTR &PL_core_reg_engine
4362 #else
4363 extern const struct regexp_engine my_reg_engine;
4364 #define RE_ENGINE_PTR &my_reg_engine
4365 #endif
4366
4367 #ifndef PERL_IN_XSUB_RE
4368 REGEXP *
4369 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4370 {
4371     dVAR;
4372     HV * const table = GvHV(PL_hintgv);
4373
4374     PERL_ARGS_ASSERT_PREGCOMP;
4375
4376     /* Dispatch a request to compile a regexp to correct
4377        regexp engine. */
4378     if (table) {
4379         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4380         GET_RE_DEBUG_FLAGS_DECL;
4381         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4382             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4383             DEBUG_COMPILE_r({
4384                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4385                     SvIV(*ptr));
4386             });
4387             return CALLREGCOMP_ENG(eng, pattern, flags);
4388         }
4389     }
4390     return Perl_re_compile(aTHX_ pattern, flags);
4391 }
4392 #endif
4393
4394 REGEXP *
4395 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4396 {
4397     dVAR;
4398     REGEXP *rx;
4399     struct regexp *r;
4400     register regexp_internal *ri;
4401     STRLEN plen;
4402     char  *exp;
4403     char* xend;
4404     regnode *scan;
4405     I32 flags;
4406     I32 minlen = 0;
4407     U32 pm_flags;
4408
4409     /* these are all flags - maybe they should be turned
4410      * into a single int with different bit masks */
4411     I32 sawlookahead = 0;
4412     I32 sawplus = 0;
4413     I32 sawopen = 0;
4414     bool used_setjump = FALSE;
4415
4416     U8 jump_ret = 0;
4417     dJMPENV;
4418     scan_data_t data;
4419     RExC_state_t RExC_state;
4420     RExC_state_t * const pRExC_state = &RExC_state;
4421 #ifdef TRIE_STUDY_OPT
4422     int restudied;
4423     RExC_state_t copyRExC_state;
4424 #endif
4425     GET_RE_DEBUG_FLAGS_DECL;
4426
4427     PERL_ARGS_ASSERT_RE_COMPILE;
4428
4429     DEBUG_r(if (!PL_colorset) reginitcolors());
4430
4431     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4432     RExC_uni_semantics = 0;
4433
4434     /****************** LONG JUMP TARGET HERE***********************/
4435     /* Longjmp back to here if have to switch in midstream to utf8 */
4436     if (! RExC_orig_utf8) {
4437         JMPENV_PUSH(jump_ret);
4438         used_setjump = TRUE;
4439     }
4440
4441     if (jump_ret == 0) {    /* First time through */
4442         exp = SvPV(pattern, plen);
4443         xend = exp + plen;
4444         /* ignore the utf8ness if the pattern is 0 length */
4445         if (plen == 0) {
4446             RExC_utf8 = RExC_orig_utf8 = 0;
4447         }
4448
4449         DEBUG_COMPILE_r({
4450             SV *dsv= sv_newmortal();
4451             RE_PV_QUOTED_DECL(s, RExC_utf8,
4452                 dsv, exp, plen, 60);
4453             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4454                            PL_colors[4],PL_colors[5],s);
4455         });
4456     }
4457     else {  /* longjumped back */
4458         STRLEN len = plen;
4459
4460         /* If the cause for the longjmp was other than changing to utf8, pop
4461          * our own setjmp, and longjmp to the correct handler */
4462         if (jump_ret != UTF8_LONGJMP) {
4463             JMPENV_POP;
4464             JMPENV_JUMP(jump_ret);
4465         }
4466
4467         GET_RE_DEBUG_FLAGS;
4468
4469         /* It's possible to write a regexp in ascii that represents Unicode
4470         codepoints outside of the byte range, such as via \x{100}. If we
4471         detect such a sequence we have to convert the entire pattern to utf8
4472         and then recompile, as our sizing calculation will have been based
4473         on 1 byte == 1 character, but we will need to use utf8 to encode
4474         at least some part of the pattern, and therefore must convert the whole
4475         thing.
4476         -- dmq */
4477         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4478             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4479         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4480         xend = exp + len;
4481         RExC_orig_utf8 = RExC_utf8 = 1;
4482         SAVEFREEPV(exp);
4483     }
4484
4485 #ifdef TRIE_STUDY_OPT
4486     restudied = 0;
4487 #endif
4488
4489     /* Set to use unicode semantics if the pattern is in utf8 and has the
4490      * 'depends' charset specified, as it means unicode when utf8  */
4491     pm_flags = orig_pm_flags;
4492
4493     if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4494         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4495     }
4496
4497     RExC_precomp = exp;
4498     RExC_flags = pm_flags;
4499     RExC_sawback = 0;
4500
4501     RExC_seen = 0;
4502     RExC_in_lookbehind = 0;
4503     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4504     RExC_seen_evals = 0;
4505     RExC_extralen = 0;
4506
4507     /* First pass: determine size, legality. */
4508     RExC_parse = exp;
4509     RExC_start = exp;
4510     RExC_end = xend;
4511     RExC_naughty = 0;
4512     RExC_npar = 1;
4513     RExC_nestroot = 0;
4514     RExC_size = 0L;
4515     RExC_emit = &PL_regdummy;
4516     RExC_whilem_seen = 0;
4517     RExC_open_parens = NULL;
4518     RExC_close_parens = NULL;
4519     RExC_opend = NULL;
4520     RExC_paren_names = NULL;
4521 #ifdef DEBUGGING
4522     RExC_paren_name_list = NULL;
4523 #endif
4524     RExC_recurse = NULL;
4525     RExC_recurse_count = 0;
4526
4527 #if 0 /* REGC() is (currently) a NOP at the first pass.
4528        * Clever compilers notice this and complain. --jhi */
4529     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4530 #endif
4531     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4532     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4533         RExC_precomp = NULL;
4534         return(NULL);
4535     }
4536
4537     /* Here, finished first pass.  Get rid of any added setjmp */
4538     if (used_setjump) {
4539         JMPENV_POP;
4540     }
4541
4542     DEBUG_PARSE_r({
4543         PerlIO_printf(Perl_debug_log,
4544             "Required size %"IVdf" nodes\n"
4545             "Starting second pass (creation)\n",
4546             (IV)RExC_size);
4547         RExC_lastnum=0;
4548         RExC_lastparse=NULL;
4549     });
4550
4551     /* The first pass could have found things that force Unicode semantics */
4552     if ((RExC_utf8 || RExC_uni_semantics)
4553          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4554     {
4555         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4556     }
4557
4558     /* Small enough for pointer-storage convention?
4559        If extralen==0, this means that we will not need long jumps. */
4560     if (RExC_size >= 0x10000L && RExC_extralen)
4561         RExC_size += RExC_extralen;
4562     else
4563         RExC_extralen = 0;
4564     if (RExC_whilem_seen > 15)
4565         RExC_whilem_seen = 15;
4566
4567     /* Allocate space and zero-initialize. Note, the two step process
4568        of zeroing when in debug mode, thus anything assigned has to
4569        happen after that */
4570     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4571     r = (struct regexp*)SvANY(rx);
4572     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4573          char, regexp_internal);
4574     if ( r == NULL || ri == NULL )
4575         FAIL("Regexp out of space");
4576 #ifdef DEBUGGING
4577     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4578     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4579 #else
4580     /* bulk initialize base fields with 0. */
4581     Zero(ri, sizeof(regexp_internal), char);
4582 #endif
4583
4584     /* non-zero initialization begins here */
4585     RXi_SET( r, ri );
4586     r->engine= RE_ENGINE_PTR;
4587     r->extflags = pm_flags;
4588     {
4589         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4590         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4591
4592         /* The caret is output if there are any defaults: if not all the STD
4593          * flags are set, or if no character set specifier is needed */
4594         bool has_default =
4595                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4596                     || ! has_charset);
4597         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4598         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4599                             >> RXf_PMf_STD_PMMOD_SHIFT);
4600         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4601         char *p;
4602         /* Allocate for the worst case, which is all the std flags are turned
4603          * on.  If more precision is desired, we could do a population count of
4604          * the flags set.  This could be done with a small lookup table, or by
4605          * shifting, masking and adding, or even, when available, assembly
4606          * language for a machine-language population count.
4607          * We never output a minus, as all those are defaults, so are
4608          * covered by the caret */
4609         const STRLEN wraplen = plen + has_p + has_runon
4610             + has_default       /* If needs a caret */
4611
4612                 /* If needs a character set specifier */
4613             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4614             + (sizeof(STD_PAT_MODS) - 1)
4615             + (sizeof("(?:)") - 1);
4616
4617         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4618         SvPOK_on(rx);
4619         SvFLAGS(rx) |= SvUTF8(pattern);
4620         *p++='('; *p++='?';
4621
4622         /* If a default, cover it using the caret */
4623         if (has_default) {
4624             *p++= DEFAULT_PAT_MOD;
4625         }
4626         if (has_charset) {
4627             STRLEN len;
4628             const char* const name = get_regex_charset_name(r->extflags, &len);
4629             Copy(name, p, len, char);
4630             p += len;
4631         }
4632         if (has_p)
4633             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4634         {
4635             char ch;
4636             while((ch = *fptr++)) {
4637                 if(reganch & 1)
4638                     *p++ = ch;
4639                 reganch >>= 1;
4640             }
4641         }
4642
4643         *p++ = ':';
4644         Copy(RExC_precomp, p, plen, char);
4645         assert ((RX_WRAPPED(rx) - p) < 16);
4646         r->pre_prefix = p - RX_WRAPPED(rx);
4647         p += plen;
4648         if (has_runon)
4649             *p++ = '\n';
4650         *p++ = ')';
4651         *p = 0;
4652         SvCUR_set(rx, p - SvPVX_const(rx));
4653     }
4654
4655     r->intflags = 0;
4656     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4657
4658     if (RExC_seen & REG_SEEN_RECURSE) {
4659         Newxz(RExC_open_parens, RExC_npar,regnode *);
4660         SAVEFREEPV(RExC_open_parens);
4661         Newxz(RExC_close_parens,RExC_npar,regnode *);
4662         SAVEFREEPV(RExC_close_parens);
4663     }
4664
4665     /* Useful during FAIL. */
4666 #ifdef RE_TRACK_PATTERN_OFFSETS
4667     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4668     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4669                           "%s %"UVuf" bytes for offset annotations.\n",
4670                           ri->u.offsets ? "Got" : "Couldn't get",
4671                           (UV)((2*RExC_size+1) * sizeof(U32))));
4672 #endif
4673     SetProgLen(ri,RExC_size);
4674     RExC_rx_sv = rx;
4675     RExC_rx = r;
4676     RExC_rxi = ri;
4677
4678     /* Second pass: emit code. */
4679     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4680     RExC_parse = exp;
4681     RExC_end = xend;
4682     RExC_naughty = 0;
4683     RExC_npar = 1;
4684     RExC_emit_start = ri->program;
4685     RExC_emit = ri->program;
4686     RExC_emit_bound = ri->program + RExC_size + 1;
4687
4688     /* Store the count of eval-groups for security checks: */
4689     RExC_rx->seen_evals = RExC_seen_evals;
4690     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4691     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4692         ReREFCNT_dec(rx);
4693         return(NULL);
4694     }
4695     /* XXXX To minimize changes to RE engine we always allocate
4696        3-units-long substrs field. */
4697     Newx(r->substrs, 1, struct reg_substr_data);
4698     if (RExC_recurse_count) {
4699         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4700         SAVEFREEPV(RExC_recurse);
4701     }
4702
4703 reStudy:
4704     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4705     Zero(r->substrs, 1, struct reg_substr_data);
4706
4707 #ifdef TRIE_STUDY_OPT
4708     if (!restudied) {
4709         StructCopy(&zero_scan_data, &data, scan_data_t);
4710         copyRExC_state = RExC_state;
4711     } else {
4712         U32 seen=RExC_seen;
4713         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4714
4715         RExC_state = copyRExC_state;
4716         if (seen & REG_TOP_LEVEL_BRANCHES)
4717             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4718         else
4719             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4720         if (data.last_found) {
4721             SvREFCNT_dec(data.longest_fixed);
4722             SvREFCNT_dec(data.longest_float);
4723             SvREFCNT_dec(data.last_found);
4724         }
4725         StructCopy(&zero_scan_data, &data, scan_data_t);
4726     }
4727 #else
4728     StructCopy(&zero_scan_data, &data, scan_data_t);
4729 #endif
4730
4731     /* Dig out information for optimizations. */
4732     r->extflags = RExC_flags; /* was pm_op */
4733     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4734
4735     if (UTF)
4736         SvUTF8_on(rx);  /* Unicode in it? */
4737     ri->regstclass = NULL;
4738     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4739         r->intflags |= PREGf_NAUGHTY;
4740     scan = ri->program + 1;             /* First BRANCH. */
4741
4742     /* testing for BRANCH here tells us whether there is "must appear"
4743        data in the pattern. If there is then we can use it for optimisations */
4744     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4745         I32 fake;
4746         STRLEN longest_float_length, longest_fixed_length;
4747         struct regnode_charclass_class ch_class; /* pointed to by data */
4748         int stclass_flag;
4749         I32 last_close = 0; /* pointed to by data */
4750         regnode *first= scan;
4751         regnode *first_next= regnext(first);
4752         /*
4753          * Skip introductions and multiplicators >= 1
4754          * so that we can extract the 'meat' of the pattern that must
4755          * match in the large if() sequence following.
4756          * NOTE that EXACT is NOT covered here, as it is normally
4757          * picked up by the optimiser separately.
4758          *
4759          * This is unfortunate as the optimiser isnt handling lookahead
4760          * properly currently.
4761          *
4762          */
4763         while ((OP(first) == OPEN && (sawopen = 1)) ||
4764                /* An OR of *one* alternative - should not happen now. */
4765             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4766             /* for now we can't handle lookbehind IFMATCH*/
4767             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4768             (OP(first) == PLUS) ||
4769             (OP(first) == MINMOD) ||
4770                /* An {n,m} with n>0 */
4771             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4772             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4773         {
4774                 /*
4775                  * the only op that could be a regnode is PLUS, all the rest
4776                  * will be regnode_1 or regnode_2.
4777                  *
4778                  */
4779                 if (OP(first) == PLUS)
4780                     sawplus = 1;
4781                 else
4782                     first += regarglen[OP(first)];
4783
4784                 first = NEXTOPER(first);
4785                 first_next= regnext(first);
4786         }
4787
4788         /* Starting-point info. */
4789       again:
4790         DEBUG_PEEP("first:",first,0);
4791         /* Ignore EXACT as we deal with it later. */
4792         if (PL_regkind[OP(first)] == EXACT) {
4793             if (OP(first) == EXACT)
4794                 NOOP;   /* Empty, get anchored substr later. */
4795             else
4796                 ri->regstclass = first;
4797         }
4798 #ifdef TRIE_STCLASS
4799         else if (PL_regkind[OP(first)] == TRIE &&
4800                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4801         {
4802             regnode *trie_op;
4803             /* this can happen only on restudy */
4804             if ( OP(first) == TRIE ) {
4805                 struct regnode_1 *trieop = (struct regnode_1 *)
4806                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4807                 StructCopy(first,trieop,struct regnode_1);
4808                 trie_op=(regnode *)trieop;
4809             } else {
4810                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4811                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4812                 StructCopy(first,trieop,struct regnode_charclass);
4813                 trie_op=(regnode *)trieop;
4814             }
4815             OP(trie_op)+=2;
4816             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4817             ri->regstclass = trie_op;
4818         }
4819 #endif
4820         else if (REGNODE_SIMPLE(OP(first)))
4821             ri->regstclass = first;
4822         else if (PL_regkind[OP(first)] == BOUND ||
4823                  PL_regkind[OP(first)] == NBOUND)
4824             ri->regstclass = first;
4825         else if (PL_regkind[OP(first)] == BOL) {
4826             r->extflags |= (OP(first) == MBOL
4827                            ? RXf_ANCH_MBOL
4828                            : (OP(first) == SBOL
4829                               ? RXf_ANCH_SBOL
4830                               : RXf_ANCH_BOL));
4831             first = NEXTOPER(first);
4832             goto again;
4833         }
4834         else if (OP(first) == GPOS) {
4835             r->extflags |= RXf_ANCH_GPOS;
4836             first = NEXTOPER(first);
4837             goto again;
4838         }
4839         else if ((!sawopen || !RExC_sawback) &&
4840             (OP(first) == STAR &&
4841             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4842             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4843         {
4844             /* turn .* into ^.* with an implied $*=1 */
4845             const int type =
4846                 (OP(NEXTOPER(first)) == REG_ANY)
4847                     ? RXf_ANCH_MBOL
4848                     : RXf_ANCH_SBOL;
4849             r->extflags |= type;
4850             r->intflags |= PREGf_IMPLICIT;
4851             first = NEXTOPER(first);
4852             goto again;
4853         }
4854         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4855             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4856             /* x+ must match at the 1st pos of run of x's */
4857             r->intflags |= PREGf_SKIP;
4858
4859         /* Scan is after the zeroth branch, first is atomic matcher. */
4860 #ifdef TRIE_STUDY_OPT
4861         DEBUG_PARSE_r(
4862             if (!restudied)
4863                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4864                               (IV)(first - scan + 1))
4865         );
4866 #else
4867         DEBUG_PARSE_r(
4868             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4869                 (IV)(first - scan + 1))
4870         );
4871 #endif
4872
4873
4874         /*
4875         * If there's something expensive in the r.e., find the
4876         * longest literal string that must appear and make it the
4877         * regmust.  Resolve ties in favor of later strings, since
4878         * the regstart check works with the beginning of the r.e.
4879         * and avoiding duplication strengthens checking.  Not a
4880         * strong reason, but sufficient in the absence of others.
4881         * [Now we resolve ties in favor of the earlier string if
4882         * it happens that c_offset_min has been invalidated, since the
4883         * earlier string may buy us something the later one won't.]
4884         */
4885
4886         data.longest_fixed = newSVpvs("");
4887         data.longest_float = newSVpvs("");
4888         data.last_found = newSVpvs("");
4889         data.longest = &(data.longest_fixed);
4890         first = scan;
4891         if (!ri->regstclass) {
4892             cl_init(pRExC_state, &ch_class);
4893             data.start_class = &ch_class;
4894             stclass_flag = SCF_DO_STCLASS_AND;
4895         } else                          /* XXXX Check for BOUND? */
4896             stclass_flag = 0;
4897         data.last_closep = &last_close;
4898
4899         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4900             &data, -1, NULL, NULL,
4901             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4902
4903
4904         CHECK_RESTUDY_GOTO;
4905
4906
4907         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4908              && data.last_start_min == 0 && data.last_end > 0
4909              && !RExC_seen_zerolen
4910              && !(RExC_seen & REG_SEEN_VERBARG)
4911              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4912             r->extflags |= RXf_CHECK_ALL;
4913         scan_commit(pRExC_state, &data,&minlen,0);
4914         SvREFCNT_dec(data.last_found);
4915
4916         /* Note that code very similar to this but for anchored string
4917            follows immediately below, changes may need to be made to both.
4918            Be careful.
4919          */
4920         longest_float_length = CHR_SVLEN(data.longest_float);
4921         if (longest_float_length
4922             || (data.flags & SF_FL_BEFORE_EOL
4923                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4924                     || (RExC_flags & RXf_PMf_MULTILINE))))
4925         {
4926             I32 t,ml;
4927
4928             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4929                 && data.offset_fixed == data.offset_float_min
4930                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4931                     goto remove_float;          /* As in (a)+. */
4932
4933             /* copy the information about the longest float from the reg_scan_data
4934                over to the program. */
4935             if (SvUTF8(data.longest_float)) {
4936                 r->float_utf8 = data.longest_float;
4937                 r->float_substr = NULL;
4938             } else {
4939                 r->float_substr = data.longest_float;
4940                 r->float_utf8 = NULL;
4941             }
4942             /* float_end_shift is how many chars that must be matched that
4943                follow this item. We calculate it ahead of time as once the
4944                lookbehind offset is added in we lose the ability to correctly
4945                calculate it.*/
4946             ml = data.minlen_float ? *(data.minlen_float)
4947                                    : (I32)longest_float_length;
4948             r->float_end_shift = ml - data.offset_float_min
4949                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4950                 + data.lookbehind_float;
4951             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4952             r->float_max_offset = data.offset_float_max;
4953             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4954                 r->float_max_offset -= data.lookbehind_float;
4955
4956             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4957                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4958                            || (RExC_flags & RXf_PMf_MULTILINE)));
4959             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4960         }
4961         else {
4962           remove_float:
4963             r->float_substr = r->float_utf8 = NULL;
4964             SvREFCNT_dec(data.longest_float);
4965             longest_float_length = 0;
4966         }
4967
4968         /* Note that code very similar to this but for floating string
4969            is immediately above, changes may need to be made to both.
4970            Be careful.
4971          */
4972         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4973         if (longest_fixed_length
4974             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4975                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4976                     || (RExC_flags & RXf_PMf_MULTILINE))))
4977         {
4978             I32 t,ml;
4979
4980             /* copy the information about the longest fixed
4981                from the reg_scan_data over to the program. */
4982             if (SvUTF8(data.longest_fixed)) {
4983                 r->anchored_utf8 = data.longest_fixed;
4984                 r->anchored_substr = NULL;
4985             } else {
4986                 r->anchored_substr = data.longest_fixed;
4987                 r->anchored_utf8 = NULL;
4988             }
4989             /* fixed_end_shift is how many chars that must be matched that
4990                follow this item. We calculate it ahead of time as once the
4991                lookbehind offset is added in we lose the ability to correctly
4992                calculate it.*/
4993             ml = data.minlen_fixed ? *(data.minlen_fixed)
4994                                    : (I32)longest_fixed_length;
4995             r->anchored_end_shift = ml - data.offset_fixed
4996                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4997                 + data.lookbehind_fixed;
4998             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4999
5000             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5001                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5002                      || (RExC_flags & RXf_PMf_MULTILINE)));
5003             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5004         }
5005         else {
5006             r->anchored_substr = r->anchored_utf8 = NULL;
5007             SvREFCNT_dec(data.longest_fixed);
5008             longest_fixed_length = 0;
5009         }
5010         if (ri->regstclass
5011             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5012             ri->regstclass = NULL;
5013
5014         /* If the synthetic start class were to ever be used when EOS is set,
5015          * that bit would have to be cleared, as it is shared with another */
5016         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5017             && stclass_flag
5018             && !(data.start_class->flags & ANYOF_EOS)
5019             && !cl_is_anything(data.start_class))
5020         {
5021             const U32 n = add_data(pRExC_state, 1, "f");
5022
5023             Newx(RExC_rxi->data->data[n], 1,
5024                 struct regnode_charclass_class);
5025             StructCopy(data.start_class,
5026                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5027                        struct regnode_charclass_class);
5028             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5029             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5030             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5031                       regprop(r, sv, (regnode*)data.start_class);
5032                       PerlIO_printf(Perl_debug_log,
5033                                     "synthetic stclass \"%s\".\n",
5034                                     SvPVX_const(sv));});
5035         }
5036
5037         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5038         if (longest_fixed_length > longest_float_length) {
5039             r->check_end_shift = r->anchored_end_shift;
5040             r->check_substr = r->anchored_substr;
5041             r->check_utf8 = r->anchored_utf8;
5042             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5043             if (r->extflags & RXf_ANCH_SINGLE)
5044                 r->extflags |= RXf_NOSCAN;
5045         }
5046         else {
5047             r->check_end_shift = r->float_end_shift;
5048             r->check_substr = r->float_substr;
5049             r->check_utf8 = r->float_utf8;
5050             r->check_offset_min = r->float_min_offset;
5051             r->check_offset_max = r->float_max_offset;
5052         }
5053         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5054            This should be changed ASAP!  */
5055         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5056             r->extflags |= RXf_USE_INTUIT;
5057             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5058                 r->extflags |= RXf_INTUIT_TAIL;
5059         }
5060         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5061         if ( (STRLEN)minlen < longest_float_length )
5062             minlen= longest_float_length;
5063         if ( (STRLEN)minlen < longest_fixed_length )
5064             minlen= longest_fixed_length;
5065         */
5066     }
5067     else {
5068         /* Several toplevels. Best we can is to set minlen. */
5069         I32 fake;
5070         struct regnode_charclass_class ch_class;
5071         I32 last_close = 0;
5072
5073         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5074
5075         scan = ri->program + 1;
5076         cl_init(pRExC_state, &ch_class);
5077         data.start_class = &ch_class;
5078         data.last_closep = &last_close;
5079
5080
5081         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5082             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5083
5084         CHECK_RESTUDY_GOTO;
5085
5086         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5087                 = r->float_substr = r->float_utf8 = NULL;
5088
5089         /* If the synthetic start class were to ever be used when EOS is set,
5090          * that bit would have to be cleared, as it is shared with another */
5091         if (!(data.start_class->flags & ANYOF_EOS)
5092             && !cl_is_anything(data.start_class))
5093         {
5094             const U32 n = add_data(pRExC_state, 1, "f");
5095
5096             Newx(RExC_rxi->data->data[n], 1,
5097                 struct regnode_charclass_class);
5098             StructCopy(data.start_class,
5099                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5100                        struct regnode_charclass_class);
5101             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5102             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5103             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5104                       regprop(r, sv, (regnode*)data.start_class);
5105                       PerlIO_printf(Perl_debug_log,
5106                                     "synthetic stclass \"%s\".\n",
5107                                     SvPVX_const(sv));});
5108         }
5109     }
5110
5111     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5112        the "real" pattern. */
5113     DEBUG_OPTIMISE_r({
5114         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5115                       (IV)minlen, (IV)r->minlen);
5116     });
5117     r->minlenret = minlen;
5118     if (r->minlen < minlen)
5119         r->minlen = minlen;
5120
5121     if (RExC_seen & REG_SEEN_GPOS)
5122         r->extflags |= RXf_GPOS_SEEN;
5123     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5124         r->extflags |= RXf_LOOKBEHIND_SEEN;
5125     if (RExC_seen & REG_SEEN_EVAL)
5126         r->extflags |= RXf_EVAL_SEEN;
5127     if (RExC_seen & REG_SEEN_CANY)
5128         r->extflags |= RXf_CANY_SEEN;
5129     if (RExC_seen & REG_SEEN_VERBARG)
5130         r->intflags |= PREGf_VERBARG_SEEN;
5131     if (RExC_seen & REG_SEEN_CUTGROUP)
5132         r->intflags |= PREGf_CUTGROUP_SEEN;
5133     if (RExC_paren_names)
5134         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5135     else
5136         RXp_PAREN_NAMES(r) = NULL;
5137
5138 #ifdef STUPID_PATTERN_CHECKS
5139     if (RX_PRELEN(rx) == 0)
5140         r->extflags |= RXf_NULL;
5141     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5142         /* XXX: this should happen BEFORE we compile */
5143         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5144     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5145         r->extflags |= RXf_WHITE;
5146     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5147         r->extflags |= RXf_START_ONLY;
5148 #else
5149     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5150             /* XXX: this should happen BEFORE we compile */
5151             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5152     else {
5153         regnode *first = ri->program + 1;
5154         U8 fop = OP(first);
5155
5156         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5157             r->extflags |= RXf_NULL;
5158         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5159             r->extflags |= RXf_START_ONLY;
5160         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5161                              && OP(regnext(first)) == END)
5162             r->extflags |= RXf_WHITE;
5163     }
5164 #endif
5165 #ifdef DEBUGGING
5166     if (RExC_paren_names) {
5167         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5168         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5169     } else
5170 #endif
5171         ri->name_list_idx = 0;
5172
5173     if (RExC_recurse_count) {
5174         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5175             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5176             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5177         }
5178     }
5179     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5180     /* assume we don't need to swap parens around before we match */
5181
5182     DEBUG_DUMP_r({
5183         PerlIO_printf(Perl_debug_log,"Final program:\n");
5184         regdump(r);
5185     });
5186 #ifdef RE_TRACK_PATTERN_OFFSETS
5187     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5188         const U32 len = ri->u.offsets[0];
5189         U32 i;
5190         GET_RE_DEBUG_FLAGS_DECL;
5191         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5192         for (i = 1; i <= len; i++) {
5193             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5194                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5195                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5196             }
5197         PerlIO_printf(Perl_debug_log, "\n");
5198     });
5199 #endif
5200     return rx;
5201 }
5202
5203 #undef RE_ENGINE_PTR
5204
5205
5206 SV*
5207 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5208                     const U32 flags)
5209 {
5210     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5211
5212     PERL_UNUSED_ARG(value);
5213
5214     if (flags & RXapif_FETCH) {
5215         return reg_named_buff_fetch(rx, key, flags);
5216     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5217         Perl_croak_no_modify(aTHX);
5218         return NULL;
5219     } else if (flags & RXapif_EXISTS) {
5220         return reg_named_buff_exists(rx, key, flags)
5221             ? &PL_sv_yes
5222             : &PL_sv_no;
5223     } else if (flags & RXapif_REGNAMES) {
5224         return reg_named_buff_all(rx, flags);
5225     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5226         return reg_named_buff_scalar(rx, flags);
5227     } else {
5228         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5229         return NULL;
5230     }
5231 }
5232
5233 SV*
5234 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5235                          const U32 flags)
5236 {
5237     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5238     PERL_UNUSED_ARG(lastkey);
5239
5240     if (flags & RXapif_FIRSTKEY)
5241         return reg_named_buff_firstkey(rx, flags);
5242     else if (flags & RXapif_NEXTKEY)
5243         return reg_named_buff_nextkey(rx, flags);
5244     else {
5245         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5246         return NULL;
5247     }
5248 }
5249
5250 SV*
5251 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5252                           const U32 flags)
5253 {
5254     AV *retarray = NULL;
5255     SV *ret;
5256     struct regexp *const rx = (struct regexp *)SvANY(r);
5257
5258     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5259
5260     if (flags & RXapif_ALL)
5261         retarray=newAV();
5262
5263     if (rx && RXp_PAREN_NAMES(rx)) {
5264         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5265         if (he_str) {
5266             IV i;
5267             SV* sv_dat=HeVAL(he_str);
5268             I32 *nums=(I32*)SvPVX(sv_dat);
5269             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5270                 if ((I32)(rx->nparens) >= nums[i]
5271                     && rx->offs[nums[i]].start != -1
5272                     && rx->offs[nums[i]].end != -1)
5273                 {
5274                     ret = newSVpvs("");
5275                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5276                     if (!retarray)
5277                         return ret;
5278                 } else {
5279                     ret = newSVsv(&PL_sv_undef);
5280                 }
5281                 if (retarray)
5282                     av_push(retarray, ret);
5283             }
5284             if (retarray)
5285                 return newRV_noinc(MUTABLE_SV(retarray));
5286         }
5287     }
5288     return NULL;
5289 }
5290
5291 bool
5292 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5293                            const U32 flags)
5294 {
5295     struct regexp *const rx = (struct regexp *)SvANY(r);
5296
5297     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5298
5299     if (rx && RXp_PAREN_NAMES(rx)) {
5300         if (flags & RXapif_ALL) {
5301             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5302         } else {
5303             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5304             if (sv) {
5305                 SvREFCNT_dec(sv);
5306                 return TRUE;
5307             } else {
5308                 return FALSE;
5309             }
5310         }
5311     } else {
5312         return FALSE;
5313     }
5314 }
5315
5316 SV*
5317 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5318 {
5319     struct regexp *const rx = (struct regexp *)SvANY(r);
5320
5321     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5322
5323     if ( rx && RXp_PAREN_NAMES(rx) ) {
5324         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5325
5326         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5327     } else {
5328         return FALSE;
5329     }
5330 }
5331
5332 SV*
5333 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5334 {
5335     struct regexp *const rx = (struct regexp *)SvANY(r);
5336     GET_RE_DEBUG_FLAGS_DECL;
5337
5338     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5339
5340     if (rx && RXp_PAREN_NAMES(rx)) {
5341         HV *hv = RXp_PAREN_NAMES(rx);
5342         HE *temphe;
5343         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5344             IV i;
5345             IV parno = 0;
5346             SV* sv_dat = HeVAL(temphe);
5347             I32 *nums = (I32*)SvPVX(sv_dat);
5348             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5349                 if ((I32)(rx->lastparen) >= nums[i] &&
5350                     rx->offs[nums[i]].start != -1 &&
5351                     rx->offs[nums[i]].end != -1)
5352                 {
5353                     parno = nums[i];
5354                     break;
5355                 }
5356             }
5357             if (parno || flags & RXapif_ALL) {
5358                 return newSVhek(HeKEY_hek(temphe));
5359             }
5360         }
5361     }
5362     return NULL;
5363 }
5364
5365 SV*
5366 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5367 {
5368     SV *ret;
5369     AV *av;
5370     I32 length;
5371     struct regexp *const rx = (struct regexp *)SvANY(r);
5372
5373     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5374
5375     if (rx && RXp_PAREN_NAMES(rx)) {
5376         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5377             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5378         } else if (flags & RXapif_ONE) {
5379             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5380             av = MUTABLE_AV(SvRV(ret));
5381             length = av_len(av);
5382             SvREFCNT_dec(ret);
5383             return newSViv(length + 1);
5384         } else {
5385             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5386             return NULL;
5387         }
5388     }
5389     return &PL_sv_undef;
5390 }
5391
5392 SV*
5393 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5394 {
5395     struct regexp *const rx = (struct regexp *)SvANY(r);
5396     AV *av = newAV();
5397
5398     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5399
5400     if (rx && RXp_PAREN_NAMES(rx)) {
5401         HV *hv= RXp_PAREN_NAMES(rx);
5402         HE *temphe;
5403         (void)hv_iterinit(hv);
5404         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5405             IV i;
5406             IV parno = 0;
5407             SV* sv_dat = HeVAL(temphe);
5408             I32 *nums = (I32*)SvPVX(sv_dat);
5409             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5410                 if ((I32)(rx->lastparen) >= nums[i] &&
5411                     rx->offs[nums[i]].start != -1 &&
5412                     rx->offs[nums[i]].end != -1)
5413                 {
5414                     parno = nums[i];
5415                     break;
5416                 }
5417             }
5418             if (parno || flags & RXapif_ALL) {
5419                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5420             }
5421         }
5422     }
5423
5424     return newRV_noinc(MUTABLE_SV(av));
5425 }
5426
5427 void
5428 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5429                              SV * const sv)
5430 {
5431     struct regexp *const rx = (struct regexp *)SvANY(r);
5432     char *s = NULL;
5433     I32 i = 0;
5434     I32 s1, t1;
5435
5436     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5437
5438     if (!rx->subbeg) {
5439         sv_setsv(sv,&PL_sv_undef);
5440         return;
5441     }
5442     else
5443     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5444         /* $` */
5445         i = rx->offs[0].start;
5446         s = rx->subbeg;
5447     }
5448     else
5449     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5450         /* $' */
5451         s = rx->subbeg + rx->offs[0].end;
5452         i = rx->sublen - rx->offs[0].end;
5453     }
5454     else
5455     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5456         (s1 = rx->offs[paren].start) != -1 &&
5457         (t1 = rx->offs[paren].end) != -1)
5458     {
5459         /* $& $1 ... */
5460         i = t1 - s1;
5461         s = rx->subbeg + s1;
5462     } else {
5463         sv_setsv(sv,&PL_sv_undef);
5464         return;
5465     }
5466     assert(rx->sublen >= (s - rx->subbeg) + i );
5467     if (i >= 0) {
5468         const int oldtainted = PL_tainted;
5469         TAINT_NOT;
5470         sv_setpvn(sv, s, i);
5471         PL_tainted = oldtainted;
5472         if ( (rx->extflags & RXf_CANY_SEEN)
5473             ? (RXp_MATCH_UTF8(rx)
5474                         && (!i || is_utf8_string((U8*)s, i)))
5475             : (RXp_MATCH_UTF8(rx)) )
5476         {
5477             SvUTF8_on(sv);
5478         }
5479         else
5480             SvUTF8_off(sv);
5481         if (PL_tainting) {
5482             if (RXp_MATCH_TAINTED(rx)) {
5483                 if (SvTYPE(sv) >= SVt_PVMG) {
5484                     MAGIC* const mg = SvMAGIC(sv);
5485                     MAGIC* mgt;
5486                     PL_tainted = 1;
5487                     SvMAGIC_set(sv, mg->mg_moremagic);
5488                     SvTAINT(sv);
5489                     if ((mgt = SvMAGIC(sv))) {
5490                         mg->mg_moremagic = mgt;
5491                         SvMAGIC_set(sv, mg);
5492                     }
5493                 } else {
5494                     PL_tainted = 1;
5495                     SvTAINT(sv);
5496                 }
5497             } else
5498                 SvTAINTED_off(sv);
5499         }
5500     } else {
5501         sv_setsv(sv,&PL_sv_undef);
5502         return;
5503     }
5504 }
5505
5506 void
5507 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5508                                                          SV const * const value)
5509 {
5510     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5511
5512     PERL_UNUSED_ARG(rx);
5513     PERL_UNUSED_ARG(paren);
5514     PERL_UNUSED_ARG(value);
5515
5516     if (!PL_localizing)
5517         Perl_croak_no_modify(aTHX);
5518 }
5519
5520 I32
5521 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5522                               const I32 paren)
5523 {
5524     struct regexp *const rx = (struct regexp *)SvANY(r);
5525     I32 i;
5526     I32 s1, t1;
5527
5528     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5529
5530     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5531         switch (paren) {
5532       /* $` / ${^PREMATCH} */
5533       case RX_BUFF_IDX_PREMATCH:
5534         if (rx->offs[0].start != -1) {
5535                         i = rx->offs[0].start;
5536                         if (i > 0) {
5537                                 s1 = 0;
5538                                 t1 = i;
5539                                 goto getlen;
5540                         }
5541             }
5542         return 0;
5543       /* $' / ${^POSTMATCH} */
5544       case RX_BUFF_IDX_POSTMATCH:
5545             if (rx->offs[0].end != -1) {
5546                         i = rx->sublen - rx->offs[0].end;
5547                         if (i > 0) {
5548                                 s1 = rx->offs[0].end;
5549                                 t1 = rx->sublen;
5550                                 goto getlen;
5551                         }
5552             }
5553         return 0;
5554       /* $& / ${^MATCH}, $1, $2, ... */
5555       default:
5556             if (paren <= (I32)rx->nparens &&
5557             (s1 = rx->offs[paren].start) != -1 &&
5558             (t1 = rx->offs[paren].end) != -1)
5559             {
5560             i = t1 - s1;
5561             goto getlen;
5562         } else {
5563             if (ckWARN(WARN_UNINITIALIZED))
5564                 report_uninit((const SV *)sv);
5565             return 0;
5566         }
5567     }
5568   getlen:
5569     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5570         const char * const s = rx->subbeg + s1;
5571         const U8 *ep;
5572         STRLEN el;
5573
5574         i = t1 - s1;
5575         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5576                         i = el;
5577     }
5578     return i;
5579 }
5580
5581 SV*
5582 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5583 {
5584     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5585         PERL_UNUSED_ARG(rx);
5586         if (0)
5587             return NULL;
5588         else
5589             return newSVpvs("Regexp");
5590 }
5591
5592 /* Scans the name of a named buffer from the pattern.
5593  * If flags is REG_RSN_RETURN_NULL returns null.
5594  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5595  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5596  * to the parsed name as looked up in the RExC_paren_names hash.
5597  * If there is an error throws a vFAIL().. type exception.
5598  */
5599
5600 #define REG_RSN_RETURN_NULL    0
5601 #define REG_RSN_RETURN_NAME    1
5602 #define REG_RSN_RETURN_DATA    2
5603
5604 STATIC SV*
5605 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5606 {
5607     char *name_start = RExC_parse;
5608
5609     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5610
5611     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5612          /* skip IDFIRST by using do...while */
5613         if (UTF)
5614             do {
5615                 RExC_parse += UTF8SKIP(RExC_parse);
5616             } while (isALNUM_utf8((U8*)RExC_parse));
5617         else
5618             do {
5619                 RExC_parse++;
5620             } while (isALNUM(*RExC_parse));
5621     }
5622
5623     if ( flags ) {
5624         SV* sv_name
5625             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5626                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5627         if ( flags == REG_RSN_RETURN_NAME)
5628             return sv_name;
5629         else if (flags==REG_RSN_RETURN_DATA) {
5630             HE *he_str = NULL;
5631             SV *sv_dat = NULL;
5632             if ( ! sv_name )      /* should not happen*/
5633                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5634             if (RExC_paren_names)
5635                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5636             if ( he_str )
5637                 sv_dat = HeVAL(he_str);
5638             if ( ! sv_dat )
5639                 vFAIL("Reference to nonexistent named group");
5640             return sv_dat;
5641         }
5642         else {
5643             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5644         }
5645         /* NOT REACHED */
5646     }
5647     return NULL;
5648 }
5649
5650 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5651     int rem=(int)(RExC_end - RExC_parse);                       \
5652     int cut;                                                    \
5653     int num;                                                    \
5654     int iscut=0;                                                \
5655     if (rem>10) {                                               \
5656         rem=10;                                                 \
5657         iscut=1;                                                \
5658     }                                                           \
5659     cut=10-rem;                                                 \
5660     if (RExC_lastparse!=RExC_parse)                             \
5661         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5662             rem, RExC_parse,                                    \
5663             cut + 4,                                            \
5664             iscut ? "..." : "<"                                 \
5665         );                                                      \
5666     else                                                        \
5667         PerlIO_printf(Perl_debug_log,"%16s","");                \
5668                                                                 \
5669     if (SIZE_ONLY)                                              \
5670        num = RExC_size + 1;                                     \
5671     else                                                        \
5672        num=REG_NODE_NUM(RExC_emit);                             \
5673     if (RExC_lastnum!=num)                                      \
5674        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5675     else                                                        \
5676        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5677     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5678         (int)((depth*2)), "",                                   \
5679         (funcname)                                              \
5680     );                                                          \
5681     RExC_lastnum=num;                                           \
5682     RExC_lastparse=RExC_parse;                                  \
5683 })
5684
5685
5686
5687 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5688     DEBUG_PARSE_MSG((funcname));                            \
5689     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5690 })
5691 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5692     DEBUG_PARSE_MSG((funcname));                            \
5693     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5694 })
5695
5696 /* This section of code defines the inversion list object and its methods.  The
5697  * interfaces are highly subject to change, so as much as possible is static to
5698  * this file.  An inversion list is here implemented as a malloc'd C array with
5699  * some added info.  More will be coming when functionality is added later.
5700  *
5701  * Some of the methods should always be private to the implementation, and some
5702  * should eventually be made public */
5703
5704 #define INVLIST_INITIAL_LEN 10
5705 #define INVLIST_ARRAY_KEY "array"
5706 #define INVLIST_MAX_KEY "max"
5707 #define INVLIST_LEN_KEY "len"
5708
5709 PERL_STATIC_INLINE UV*
5710 S_invlist_array(pTHX_ HV* const invlist)
5711 {
5712     /* Returns the pointer to the inversion list's array.  Every time the
5713      * length changes, this needs to be called in case malloc or realloc moved
5714      * it */
5715
5716     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5717
5718     PERL_ARGS_ASSERT_INVLIST_ARRAY;
5719
5720     if (list_ptr == NULL) {
5721         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5722                                                             INVLIST_ARRAY_KEY);
5723     }
5724
5725     return INT2PTR(UV *, SvUV(*list_ptr));
5726 }
5727
5728 PERL_STATIC_INLINE void
5729 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5730 {
5731     PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5732
5733     /* Sets the array stored in the inversion list to the memory beginning with
5734      * the parameter */
5735
5736     if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5737         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5738                                                             INVLIST_ARRAY_KEY);
5739     }
5740 }
5741
5742 PERL_STATIC_INLINE UV
5743 S_invlist_len(pTHX_ HV* const invlist)
5744 {
5745     /* Returns the current number of elements in the inversion list's array */
5746
5747     SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5748
5749     PERL_ARGS_ASSERT_INVLIST_LEN;
5750
5751     if (len_ptr == NULL) {
5752         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5753                                                             INVLIST_LEN_KEY);
5754     }
5755
5756     return SvUV(*len_ptr);
5757 }
5758
5759 PERL_STATIC_INLINE UV
5760 S_invlist_max(pTHX_ HV* const invlist)
5761 {
5762     /* Returns the maximum number of elements storable in the inversion list's
5763      * array, without having to realloc() */
5764
5765     SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5766
5767     PERL_ARGS_ASSERT_INVLIST_MAX;
5768
5769     if (max_ptr == NULL) {
5770         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5771                                                             INVLIST_MAX_KEY);
5772     }
5773
5774     return SvUV(*max_ptr);
5775 }
5776
5777 PERL_STATIC_INLINE void
5778 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5779 {
5780     /* Sets the current number of elements stored in the inversion list */
5781
5782     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5783
5784     if (len != 0 && len > invlist_max(invlist)) {
5785         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5786     }
5787
5788     if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5789         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5790                                                             INVLIST_LEN_KEY);
5791     }
5792 }
5793
5794 PERL_STATIC_INLINE void
5795 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5796 {
5797
5798     /* Sets the maximum number of elements storable in the inversion list
5799      * without having to realloc() */
5800
5801     PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5802
5803     if (max < invlist_len(invlist)) {
5804         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5805     }
5806
5807     if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5808         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5809                                                             INVLIST_LEN_KEY);
5810     }
5811 }
5812
5813 #ifndef PERL_IN_XSUB_RE
5814 HV*
5815 Perl__new_invlist(pTHX_ IV initial_size)
5816 {
5817
5818     /* Return a pointer to a newly constructed inversion list, with enough
5819      * space to store 'initial_size' elements.  If that number is negative, a
5820      * system default is used instead */
5821
5822     HV* invlist = newHV();
5823     UV* list;
5824
5825     if (initial_size < 0) {
5826         initial_size = INVLIST_INITIAL_LEN;
5827     }
5828
5829     /* Allocate the initial space */
5830     Newx(list, initial_size, UV);
5831     invlist_set_array(invlist, list);
5832
5833     /* set_len has to come before set_max, as the latter inspects the len */
5834     invlist_set_len(invlist, 0);
5835     invlist_set_max(invlist, initial_size);
5836
5837     return invlist;
5838 }
5839 #endif
5840
5841 PERL_STATIC_INLINE void
5842 S_invlist_destroy(pTHX_ HV* const invlist)
5843 {
5844    /* Inversion list destructor */
5845
5846     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5847
5848     PERL_ARGS_ASSERT_INVLIST_DESTROY;
5849
5850     if (list_ptr != NULL) {
5851         UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5852         Safefree(list);
5853     }
5854 }
5855
5856 STATIC void
5857 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5858 {
5859     /* Change the maximum size of an inversion list (up or down) */
5860
5861     UV* orig_array;
5862     UV* array;
5863     const UV old_max = invlist_max(invlist);
5864
5865     PERL_ARGS_ASSERT_INVLIST_EXTEND;
5866
5867     if (old_max == new_max) {   /* If a no-op */
5868         return;
5869     }
5870
5871     array = orig_array = invlist_array(invlist);
5872     Renew(array, new_max, UV);
5873
5874     /* If the size change moved the list in memory, set the new one */
5875     if (array != orig_array) {
5876         invlist_set_array(invlist, array);
5877     }
5878
5879     invlist_set_max(invlist, new_max);
5880
5881 }
5882
5883 PERL_STATIC_INLINE void
5884 S_invlist_trim(pTHX_ HV* const invlist)
5885 {
5886     PERL_ARGS_ASSERT_INVLIST_TRIM;
5887
5888     /* Change the length of the inversion list to how many entries it currently
5889      * has */
5890
5891     invlist_extend(invlist, invlist_len(invlist));
5892 }
5893
5894 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
5895  * etc */
5896
5897 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
5898
5899 #ifndef PERL_IN_XSUB_RE
5900 void
5901 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
5902 {
5903    /* Subject to change or removal.  Append the range from 'start' to 'end' at
5904     * the end of the inversion list.  The range must be above any existing
5905     * ones. */
5906
5907     UV* array = invlist_array(invlist);
5908     UV max = invlist_max(invlist);
5909     UV len = invlist_len(invlist);
5910
5911     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
5912
5913     if (len > 0) {
5914
5915         /* Here, the existing list is non-empty. The current max entry in the
5916          * list is generally the first value not in the set, except when the
5917          * set extends to the end of permissible values, in which case it is
5918          * the first entry in that final set, and so this call is an attempt to
5919          * append out-of-order */
5920
5921         UV final_element = len - 1;
5922         if (array[final_element] > start
5923             || ELEMENT_IN_INVLIST_SET(final_element))
5924         {
5925             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
5926         }
5927
5928         /* Here, it is a legal append.  If the new range begins with the first
5929          * value not in the set, it is extending the set, so the new first
5930          * value not in the set is one greater than the newly extended range.
5931          * */
5932         if (array[final_element] == start) {
5933             if (end != UV_MAX) {
5934                 array[final_element] = end + 1;
5935             }
5936             else {
5937                 /* But if the end is the maximum representable on the machine,
5938                  * just let the range that this would extend have no end */
5939                 invlist_set_len(invlist, len - 1);
5940             }
5941             return;
5942         }
5943     }
5944
5945     /* Here the new range doesn't extend any existing set.  Add it */
5946
5947     len += 2;   /* Includes an element each for the start and end of range */
5948
5949     /* If overflows the existing space, extend, which may cause the array to be
5950      * moved */
5951     if (max < len) {
5952         invlist_extend(invlist, len);
5953         array = invlist_array(invlist);
5954     }
5955
5956     invlist_set_len(invlist, len);
5957
5958     /* The next item on the list starts the range, the one after that is
5959      * one past the new range.  */
5960     array[len - 2] = start;
5961     if (end != UV_MAX) {
5962         array[len - 1] = end + 1;
5963     }
5964     else {
5965         /* But if the end is the maximum representable on the machine, just let
5966          * the range have no end */
5967         invlist_set_len(invlist, len - 1);
5968     }
5969 }
5970 #endif
5971
5972 PERL_STATIC_INLINE HV*
5973 S_invlist_union(pTHX_ HV* const a, HV* const b)
5974 {
5975     /* Return a new inversion list which is the union of two inversion lists.
5976      * The basis for this comes from "Unicode Demystified" Chapter 13 by
5977      * Richard Gillam, published by Addison-Wesley, and explained at some
5978      * length there.  The preface says to incorporate its examples into your
5979      * code at your own risk.
5980      *
5981      * The algorithm is like a merge sort.
5982      *
5983      * XXX A potential performance improvement is to keep track as we go along
5984      * if only one of the inputs contributes to the result, meaning the other
5985      * is a subset of that one.  In that case, we can skip the final copy and
5986      * return the larger of the input lists */
5987
5988     UV* array_a = invlist_array(a);   /* a's array */
5989     UV* array_b = invlist_array(b);
5990     UV len_a = invlist_len(a);  /* length of a's array */
5991     UV len_b = invlist_len(b);
5992
5993     HV* u;                      /* the resulting union */
5994     UV* array_u;
5995     UV len_u;
5996
5997     UV i_a = 0;             /* current index into a's array */
5998     UV i_b = 0;
5999     UV i_u = 0;
6000
6001     /* running count, as explained in the algorithm source book; items are
6002      * stopped accumulating and are output when the count changes to/from 0.
6003      * The count is incremented when we start a range that's in the set, and
6004      * decremented when we start a range that's not in the set.  So its range
6005      * is 0 to 2.  Only when the count is zero is something not in the set.
6006      */
6007     UV count = 0;
6008
6009     PERL_ARGS_ASSERT_INVLIST_UNION;
6010
6011     /* Size the union for the worst case: that the sets are completely
6012      * disjoint */
6013     u = _new_invlist(len_a + len_b);
6014     array_u = invlist_array(u);
6015
6016     /* Go through each list item by item, stopping when exhausted one of
6017      * them */
6018     while (i_a < len_a && i_b < len_b) {
6019         UV cp;      /* The element to potentially add to the union's array */
6020         bool cp_in_set;   /* is it in the the input list's set or not */
6021
6022         /* We need to take one or the other of the two inputs for the union.
6023          * Since we are merging two sorted lists, we take the smaller of the
6024          * next items.  In case of a tie, we take the one that is in its set
6025          * first.  If we took one not in the set first, it would decrement the
6026          * count, possibly to 0 which would cause it to be output as ending the
6027          * range, and the next time through we would take the same number, and
6028          * output it again as beginning the next range.  By doing it the
6029          * opposite way, there is no possibility that the count will be
6030          * momentarily decremented to 0, and thus the two adjoining ranges will
6031          * be seamlessly merged.  (In a tie and both are in the set or both not
6032          * in the set, it doesn't matter which we take first.) */
6033         if (array_a[i_a] < array_b[i_b]
6034             || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6035         {
6036             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6037             cp= array_a[i_a++];
6038         }
6039         else {
6040             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6041             cp= array_b[i_b++];
6042         }
6043
6044         /* Here, have chosen which of the two inputs to look at.  Only output
6045          * if the running count changes to/from 0, which marks the
6046          * beginning/end of a range in that's in the set */
6047         if (cp_in_set) {
6048             if (count == 0) {
6049                 array_u[i_u++] = cp;
6050             }
6051             count++;
6052         }
6053         else {
6054             count--;
6055             if (count == 0) {
6056                 array_u[i_u++] = cp;
6057             }
6058         }
6059     }
6060
6061     /* Here, we are finished going through at least one of the lists, which
6062      * means there is something remaining in at most one.  We check if the list
6063      * that hasn't been exhausted is positioned such that we are in the middle
6064      * of a range in its set or not.  (We are in the set if the next item in
6065      * the array marks the beginning of something not in the set)   If in the
6066      * set, we decrement 'count'; if 0, there is potentially more to output.
6067      * There are four cases:
6068      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6069      *     in the union is entirely from the non-exhausted set.
6070      *  2) Both were in their sets, count is 2.  Nothing further should
6071      *     be output, as everything that remains will be in the exhausted
6072      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6073      *     that
6074      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6075      *     Nothing further should be output because the union includes
6076      *     everything from the exhausted set.  Not decrementing insures that.
6077      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6078      *     decrementing to 0 insures that we look at the remainder of the
6079      *     non-exhausted set */
6080     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6081         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6082     {
6083         count--;
6084     }
6085
6086     /* The final length is what we've output so far, plus what else is about to
6087      * be output.  (If 'count' is non-zero, then the input list we exhausted
6088      * has everything remaining up to the machine's limit in its set, and hence
6089      * in the union, so there will be no further output. */
6090     len_u = i_u;
6091     if (count == 0) {
6092         /* At most one of the subexpressions will be non-zero */
6093         len_u += (len_a - i_a) + (len_b - i_b);
6094     }
6095
6096     /* Set result to final length, which can change the pointer to array_u, so
6097      * re-find it */
6098     if (len_u != invlist_len(u)) {
6099         invlist_set_len(u, len_u);
6100         invlist_trim(u);
6101         array_u = invlist_array(u);
6102     }
6103
6104     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6105      * the other) ended with everything above it not in its set.  That means
6106      * that the remaining part of the union is precisely the same as the
6107      * non-exhausted list, so can just copy it unchanged.  (If both list were
6108      * exhausted at the same time, then the operations below will be both 0.)
6109      */
6110     if (count == 0) {
6111         IV copy_count; /* At most one will have a non-zero copy count */
6112         if ((copy_count = len_a - i_a) > 0) {
6113             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6114         }
6115         else if ((copy_count = len_b - i_b) > 0) {
6116             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6117         }
6118     }
6119
6120     return u;
6121 }
6122
6123 PERL_STATIC_INLINE HV*
6124 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6125 {
6126     /* Return the intersection of two inversion lists.  The basis for this
6127      * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6128      * by Addison-Wesley, and explained at some length there.  The preface says
6129      * to incorporate its examples into your code at your own risk.
6130      *
6131      * The algorithm is like a merge sort, and is essentially the same as the
6132      * union above
6133      */
6134
6135     UV* array_a = invlist_array(a);   /* a's array */
6136     UV* array_b = invlist_array(b);
6137     UV len_a = invlist_len(a);  /* length of a's array */
6138     UV len_b = invlist_len(b);
6139
6140     HV* r;                   /* the resulting intersection */
6141     UV* array_r;
6142     UV len_r;
6143
6144     UV i_a = 0;             /* current index into a's array */
6145     UV i_b = 0;
6146     UV i_r = 0;
6147
6148     /* running count, as explained in the algorithm source book; items are
6149      * stopped accumulating and are output when the count changes to/from 2.
6150      * The count is incremented when we start a range that's in the set, and
6151      * decremented when we start a range that's not in the set.  So its range
6152      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6153      */
6154     UV count = 0;
6155
6156     PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6157
6158     /* Size the intersection for the worst case: that the intersection ends up
6159      * fragmenting everything to be completely disjoint */
6160     r= _new_invlist(len_a + len_b);
6161     array_r = invlist_array(r);
6162
6163     /* Go through each list item by item, stopping when exhausted one of
6164      * them */
6165     while (i_a < len_a && i_b < len_b) {
6166         UV cp;      /* The element to potentially add to the intersection's
6167                        array */
6168         bool cp_in_set; /* Is it in the input list's set or not */
6169
6170         /* We need to take one or the other of the two inputs for the union.
6171          * Since we are merging two sorted lists, we take the smaller of the
6172          * next items.  In case of a tie, we take the one that is not in its
6173          * set first (a difference from the union algorithm).  If we took one
6174          * in the set first, it would increment the count, possibly to 2 which
6175          * would cause it to be output as starting a range in the intersection,
6176          * and the next time through we would take that same number, and output
6177          * it again as ending the set.  By doing it the opposite of this, we
6178          * there is no possibility that the count will be momentarily
6179          * incremented to 2.  (In a tie and both are in the set or both not in
6180          * the set, it doesn't matter which we take first.) */
6181         if (array_a[i_a] < array_b[i_b]
6182             || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6183         {
6184             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6185             cp= array_a[i_a++];
6186         }
6187         else {
6188             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6189             cp= array_b[i_b++];
6190         }
6191
6192         /* Here, have chosen which of the two inputs to look at.  Only output
6193          * if the running count changes to/from 2, which marks the
6194          * beginning/end of a range that's in the intersection */
6195         if (cp_in_set) {
6196             count++;
6197             if (count == 2) {
6198                 array_r[i_r++] = cp;
6199             }
6200         }
6201         else {
6202             if (count == 2) {
6203                 array_r[i_r++] = cp;
6204             }
6205             count--;
6206         }
6207     }
6208
6209     /* Here, we are finished going through at least one of the sets, which
6210      * means there is something remaining in at most one.  See the comments in
6211      * the union code */
6212     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6213         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6214     {
6215         count--;
6216     }
6217
6218     /* The final length is what we've output so far plus what else is in the
6219      * intersection.  Only one of the subexpressions below will be non-zero */
6220     len_r = i_r;
6221     if (count == 2) {
6222         len_r += (len_a - i_a) + (len_b - i_b);
6223     }
6224
6225     /* Set result to final length, which can change the pointer to array_r, so
6226      * re-find it */
6227     if (len_r != invlist_len(r)) {
6228         invlist_set_len(r, len_r);
6229         invlist_trim(r);
6230         array_r = invlist_array(r);
6231     }
6232
6233     /* Finish outputting any remaining */
6234     if (count == 2) { /* Only one of will have a non-zero copy count */
6235         IV copy_count;
6236         if ((copy_count = len_a - i_a) > 0) {
6237             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6238         }
6239         else if ((copy_count = len_b - i_b) > 0) {
6240             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6241         }
6242     }
6243
6244     return r;
6245 }
6246
6247 STATIC HV*
6248 S_add_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6249 {
6250     /* Add the range from 'start' to 'end' inclusive to the inversion list's
6251      * set.  A pointer to the inversion list is returned.  This may actually be
6252      * a new list, in which case the passed in one has been destroyed */
6253
6254     HV* range_invlist;
6255     HV* added_invlist;
6256
6257     UV len = invlist_len(invlist);
6258
6259     PERL_ARGS_ASSERT_ADD_RANGE_TO_INVLIST;
6260
6261     /* If comes after the final entry, can just append it to the end */
6262     if (len == 0
6263         || start >= invlist_array(invlist)
6264                                     [invlist_len(invlist) - 1])
6265     {
6266         _append_range_to_invlist(invlist, start, end);
6267         return invlist;
6268     }
6269
6270     /* Here, can't just append things, create and return a new inversion list
6271      * which is the union of this range and the existing inversion list */
6272     range_invlist = _new_invlist(2);
6273     _append_range_to_invlist(range_invlist, start, end);
6274
6275     added_invlist = invlist_union(invlist, range_invlist);
6276
6277     /* The passed in list can be freed, as well as our temporary */
6278     invlist_destroy(range_invlist);
6279     if (invlist != added_invlist) {
6280         invlist_destroy(invlist);
6281     }
6282
6283     return added_invlist;
6284 }
6285
6286 /* End of inversion list object */
6287
6288 /*
6289  - reg - regular expression, i.e. main body or parenthesized thing
6290  *
6291  * Caller must absorb opening parenthesis.
6292  *
6293  * Combining parenthesis handling with the base level of regular expression
6294  * is a trifle forced, but the need to tie the tails of the branches to what
6295  * follows makes it hard to avoid.
6296  */
6297 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6298 #ifdef DEBUGGING
6299 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6300 #else
6301 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6302 #endif
6303
6304 STATIC regnode *
6305 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6306     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6307 {
6308     dVAR;
6309     register regnode *ret;              /* Will be the head of the group. */
6310     register regnode *br;
6311     register regnode *lastbr;
6312     register regnode *ender = NULL;
6313     register I32 parno = 0;
6314     I32 flags;
6315     U32 oregflags = RExC_flags;
6316     bool have_branch = 0;
6317     bool is_open = 0;
6318     I32 freeze_paren = 0;
6319     I32 after_freeze = 0;
6320
6321     /* for (?g), (?gc), and (?o) warnings; warning
6322        about (?c) will warn about (?g) -- japhy    */
6323
6324 #define WASTED_O  0x01
6325 #define WASTED_G  0x02
6326 #define WASTED_C  0x04
6327 #define WASTED_GC (0x02|0x04)
6328     I32 wastedflags = 0x00;
6329
6330     char * parse_start = RExC_parse; /* MJD */
6331     char * const oregcomp_parse = RExC_parse;
6332
6333     GET_RE_DEBUG_FLAGS_DECL;
6334
6335     PERL_ARGS_ASSERT_REG;
6336     DEBUG_PARSE("reg ");
6337
6338     *flagp = 0;                         /* Tentatively. */
6339
6340
6341     /* Make an OPEN node, if parenthesized. */
6342     if (paren) {
6343         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6344             char *start_verb = RExC_parse;
6345             STRLEN verb_len = 0;
6346             char *start_arg = NULL;
6347             unsigned char op = 0;
6348             int argok = 1;
6349             int internal_argval = 0; /* internal_argval is only useful if !argok */
6350             while ( *RExC_parse && *RExC_parse != ')' ) {
6351                 if ( *RExC_parse == ':' ) {
6352                     start_arg = RExC_parse + 1;
6353                     break;
6354                 }
6355                 RExC_parse++;
6356             }
6357             ++start_verb;
6358             verb_len = RExC_parse - start_verb;
6359             if ( start_arg ) {
6360                 RExC_parse++;
6361                 while ( *RExC_parse && *RExC_parse != ')' )
6362                     RExC_parse++;
6363                 if ( *RExC_parse != ')' )
6364                     vFAIL("Unterminated verb pattern argument");
6365                 if ( RExC_parse == start_arg )
6366                     start_arg = NULL;
6367             } else {
6368                 if ( *RExC_parse != ')' )
6369                     vFAIL("Unterminated verb pattern");
6370             }
6371
6372             switch ( *start_verb ) {
6373             case 'A':  /* (*ACCEPT) */
6374                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6375                     op = ACCEPT;
6376                     internal_argval = RExC_nestroot;
6377                 }
6378                 break;
6379             case 'C':  /* (*COMMIT) */
6380                 if ( memEQs(start_verb,verb_len,"COMMIT") )
6381                     op = COMMIT;
6382                 break;
6383             case 'F':  /* (*FAIL) */
6384                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6385                     op = OPFAIL;
6386                     argok = 0;
6387                 }
6388                 break;
6389             case ':':  /* (*:NAME) */
6390             case 'M':  /* (*MARK:NAME) */
6391                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6392                     op = MARKPOINT;
6393                     argok = -1;
6394                 }
6395                 break;
6396             case 'P':  /* (*PRUNE) */
6397                 if ( memEQs(start_verb,verb_len,"PRUNE") )
6398                     op = PRUNE;
6399                 break;
6400             case 'S':   /* (*SKIP) */
6401                 if ( memEQs(start_verb,verb_len,"SKIP") )
6402                     op = SKIP;
6403                 break;
6404             case 'T':  /* (*THEN) */
6405                 /* [19:06] <TimToady> :: is then */
6406                 if ( memEQs(start_verb,verb_len,"THEN") ) {
6407                     op = CUTGROUP;
6408                     RExC_seen |= REG_SEEN_CUTGROUP;
6409                 }
6410                 break;
6411             }
6412             if ( ! op ) {
6413                 RExC_parse++;
6414                 vFAIL3("Unknown verb pattern '%.*s'",
6415                     verb_len, start_verb);
6416             }
6417             if ( argok ) {
6418                 if ( start_arg && internal_argval ) {
6419                     vFAIL3("Verb pattern '%.*s' may not have an argument",
6420                         verb_len, start_verb);
6421                 } else if ( argok < 0 && !start_arg ) {
6422                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6423                         verb_len, start_verb);
6424                 } else {
6425                     ret = reganode(pRExC_state, op, internal_argval);
6426                     if ( ! internal_argval && ! SIZE_ONLY ) {
6427                         if (start_arg) {
6428                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6429                             ARG(ret) = add_data( pRExC_state, 1, "S" );
6430                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6431                             ret->flags = 0;
6432                         } else {
6433                             ret->flags = 1;
6434                         }
6435                     }
6436                 }
6437                 if (!internal_argval)
6438                     RExC_seen |= REG_SEEN_VERBARG;
6439             } else if ( start_arg ) {
6440                 vFAIL3("Verb pattern '%.*s' may not have an argument",
6441                         verb_len, start_verb);
6442             } else {
6443                 ret = reg_node(pRExC_state, op);
6444             }
6445             nextchar(pRExC_state);
6446             return ret;
6447         } else
6448         if (*RExC_parse == '?') { /* (?...) */
6449             bool is_logical = 0;
6450             const char * const seqstart = RExC_parse;
6451             bool has_use_defaults = FALSE;
6452
6453             RExC_parse++;
6454             paren = *RExC_parse++;
6455             ret = NULL;                 /* For look-ahead/behind. */
6456             switch (paren) {
6457
6458             case 'P':   /* (?P...) variants for those used to PCRE/Python */
6459                 paren = *RExC_parse++;
6460                 if ( paren == '<')         /* (?P<...>) named capture */
6461                     goto named_capture;
6462                 else if (paren == '>') {   /* (?P>name) named recursion */
6463                     goto named_recursion;
6464                 }
6465                 else if (paren == '=') {   /* (?P=...)  named backref */
6466                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
6467                        you change this make sure you change that */
6468                     char* name_start = RExC_parse;
6469                     U32 num = 0;
6470                     SV *sv_dat = reg_scan_name(pRExC_state,
6471                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6472                     if (RExC_parse == name_start || *RExC_parse != ')')
6473                         vFAIL2("Sequence %.3s... not terminated",parse_start);
6474
6475                     if (!SIZE_ONLY) {
6476                         num = add_data( pRExC_state, 1, "S" );
6477                         RExC_rxi->data->data[num]=(void*)sv_dat;
6478                         SvREFCNT_inc_simple_void(sv_dat);
6479                     }
6480                     RExC_sawback = 1;
6481                     ret = reganode(pRExC_state,
6482                                    ((! FOLD)
6483                                      ? NREF
6484                                      : (MORE_ASCII_RESTRICTED)
6485                                        ? NREFFA
6486                                        : (AT_LEAST_UNI_SEMANTICS)
6487                                          ? NREFFU
6488                                          : (LOC)
6489                                            ? NREFFL
6490                                            : NREFF),
6491                                     num);
6492                     *flagp |= HASWIDTH;
6493
6494                     Set_Node_Offset(ret, parse_start+1);
6495                     Set_Node_Cur_Length(ret); /* MJD */
6496
6497                     nextchar(pRExC_state);
6498                     return ret;
6499                 }
6500                 RExC_parse++;
6501                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6502                 /*NOTREACHED*/
6503             case '<':           /* (?<...) */
6504                 if (*RExC_parse == '!')
6505                     paren = ',';
6506                 else if (*RExC_parse != '=')
6507               named_capture:
6508                 {               /* (?<...>) */
6509                     char *name_start;
6510                     SV *svname;
6511                     paren= '>';
6512             case '\'':          /* (?'...') */
6513                     name_start= RExC_parse;
6514                     svname = reg_scan_name(pRExC_state,
6515                         SIZE_ONLY ?  /* reverse test from the others */
6516                         REG_RSN_RETURN_NAME :
6517                         REG_RSN_RETURN_NULL);
6518                     if (RExC_parse == name_start) {
6519                         RExC_parse++;
6520                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6521                         /*NOTREACHED*/
6522                     }
6523                     if (*RExC_parse != paren)
6524                         vFAIL2("Sequence (?%c... not terminated",
6525                             paren=='>' ? '<' : paren);
6526                     if (SIZE_ONLY) {
6527                         HE *he_str;
6528                         SV *sv_dat = NULL;
6529                         if (!svname) /* shouldn't happen */
6530                             Perl_croak(aTHX_
6531                                 "panic: reg_scan_name returned NULL");
6532                         if (!RExC_paren_names) {
6533                             RExC_paren_names= newHV();
6534                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
6535 #ifdef DEBUGGING
6536                             RExC_paren_name_list= newAV();
6537                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6538 #endif
6539                         }
6540                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6541                         if ( he_str )
6542                             sv_dat = HeVAL(he_str);
6543                         if ( ! sv_dat ) {
6544                             /* croak baby croak */
6545                             Perl_croak(aTHX_
6546                                 "panic: paren_name hash element allocation failed");
6547                         } else if ( SvPOK(sv_dat) ) {
6548                             /* (?|...) can mean we have dupes so scan to check
6549                                its already been stored. Maybe a flag indicating
6550                                we are inside such a construct would be useful,
6551                                but the arrays are likely to be quite small, so
6552                                for now we punt -- dmq */
6553                             IV count = SvIV(sv_dat);
6554                             I32 *pv = (I32*)SvPVX(sv_dat);
6555                             IV i;
6556                             for ( i = 0 ; i < count ; i++ ) {
6557                                 if ( pv[i] == RExC_npar ) {
6558                                     count = 0;
6559                                     break;
6560                                 }
6561                             }
6562                             if ( count ) {
6563                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6564                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6565                                 pv[count] = RExC_npar;
6566                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6567                             }
6568                         } else {
6569                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
6570                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6571                             SvIOK_on(sv_dat);
6572                             SvIV_set(sv_dat, 1);
6573                         }
6574 #ifdef DEBUGGING
6575                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6576                             SvREFCNT_dec(svname);
6577 #endif
6578
6579                         /*sv_dump(sv_dat);*/
6580                     }
6581                     nextchar(pRExC_state);
6582                     paren = 1;
6583                     goto capturing_parens;
6584                 }
6585                 RExC_seen |= REG_SEEN_LOOKBEHIND;
6586                 RExC_in_lookbehind++;
6587                 RExC_parse++;
6588             case '=':           /* (?=...) */
6589                 RExC_seen_zerolen++;
6590                 break;
6591             case '!':           /* (?!...) */
6592                 RExC_seen_zerolen++;
6593                 if (*RExC_parse == ')') {
6594                     ret=reg_node(pRExC_state, OPFAIL);
6595                     nextchar(pRExC_state);
6596                     return ret;
6597                 }
6598                 break;
6599             case '|':           /* (?|...) */
6600                 /* branch reset, behave like a (?:...) except that
6601                    buffers in alternations share the same numbers */
6602                 paren = ':';
6603                 after_freeze = freeze_paren = RExC_npar;
6604                 break;
6605             case ':':           /* (?:...) */
6606             case '>':           /* (?>...) */
6607                 break;
6608             case '$':           /* (?$...) */
6609             case '@':           /* (?@...) */
6610                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6611                 break;
6612             case '#':           /* (?#...) */
6613                 while (*RExC_parse && *RExC_parse != ')')
6614                     RExC_parse++;
6615                 if (*RExC_parse != ')')
6616                     FAIL("Sequence (?#... not terminated");
6617                 nextchar(pRExC_state);
6618                 *flagp = TRYAGAIN;
6619                 return NULL;
6620             case '0' :           /* (?0) */
6621             case 'R' :           /* (?R) */
6622                 if (*RExC_parse != ')')
6623                     FAIL("Sequence (?R) not terminated");
6624                 ret = reg_node(pRExC_state, GOSTART);
6625                 *flagp |= POSTPONED;
6626                 nextchar(pRExC_state);
6627                 return ret;
6628                 /*notreached*/
6629             { /* named and numeric backreferences */
6630                 I32 num;
6631             case '&':            /* (?&NAME) */
6632                 parse_start = RExC_parse - 1;
6633               named_recursion:
6634                 {
6635                     SV *sv_dat = reg_scan_name(pRExC_state,
6636                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6637                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6638                 }
6639                 goto gen_recurse_regop;
6640                 /* NOT REACHED */
6641             case '+':
6642                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6643                     RExC_parse++;
6644                     vFAIL("Illegal pattern");
6645                 }
6646                 goto parse_recursion;
6647                 /* NOT REACHED*/
6648             case '-': /* (?-1) */
6649                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6650                     RExC_parse--; /* rewind to let it be handled later */
6651                     goto parse_flags;
6652                 }
6653                 /*FALLTHROUGH */
6654             case '1': case '2': case '3': case '4': /* (?1) */
6655             case '5': case '6': case '7': case '8': case '9':
6656                 RExC_parse--;
6657               parse_recursion:
6658                 num = atoi(RExC_parse);
6659                 parse_start = RExC_parse - 1; /* MJD */
6660                 if (*RExC_parse == '-')
6661                     RExC_parse++;
6662                 while (isDIGIT(*RExC_parse))
6663                         RExC_parse++;
6664                 if (*RExC_parse!=')')
6665                     vFAIL("Expecting close bracket");
6666
6667               gen_recurse_regop:
6668                 if ( paren == '-' ) {
6669                     /*
6670                     Diagram of capture buffer numbering.
6671                     Top line is the normal capture buffer numbers
6672                     Bottom line is the negative indexing as from
6673                     the X (the (?-2))
6674
6675                     +   1 2    3 4 5 X          6 7
6676                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6677                     -   5 4    3 2 1 X          x x
6678
6679                     */
6680                     num = RExC_npar + num;
6681                     if (num < 1)  {
6682                         RExC_parse++;
6683                         vFAIL("Reference to nonexistent group");
6684                     }
6685                 } else if ( paren == '+' ) {
6686                     num = RExC_npar + num - 1;
6687                 }
6688
6689                 ret = reganode(pRExC_state, GOSUB, num);
6690                 if (!SIZE_ONLY) {
6691                     if (num > (I32)RExC_rx->nparens) {
6692                         RExC_parse++;
6693                         vFAIL("Reference to nonexistent group");
6694                     }
6695                     ARG2L_SET( ret, RExC_recurse_count++);
6696                     RExC_emit++;
6697                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6698                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6699                 } else {
6700                     RExC_size++;
6701                 }
6702                 RExC_seen |= REG_SEEN_RECURSE;
6703                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6704                 Set_Node_Offset(ret, parse_start); /* MJD */
6705
6706                 *flagp |= POSTPONED;
6707                 nextchar(pRExC_state);
6708                 return ret;
6709             } /* named and numeric backreferences */
6710             /* NOT REACHED */
6711
6712             case '?':           /* (??...) */
6713                 is_logical = 1;
6714                 if (*RExC_parse != '{') {
6715                     RExC_parse++;
6716                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6717                     /*NOTREACHED*/
6718                 }
6719                 *flagp |= POSTPONED;
6720                 paren = *RExC_parse++;
6721                 /* FALL THROUGH */
6722             case '{':           /* (?{...}) */
6723             {
6724                 I32 count = 1;
6725                 U32 n = 0;
6726                 char c;
6727                 char *s = RExC_parse;
6728
6729                 RExC_seen_zerolen++;
6730                 RExC_seen |= REG_SEEN_EVAL;
6731                 while (count && (c = *RExC_parse)) {
6732                     if (c == '\\') {
6733                         if (RExC_parse[1])
6734                             RExC_parse++;
6735                     }
6736                     else if (c == '{')
6737                         count++;
6738                     else if (c == '}')
6739                         count--;
6740                     RExC_parse++;
6741                 }
6742                 if (*RExC_parse != ')') {
6743                     RExC_parse = s;
6744                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6745                 }
6746                 if (!SIZE_ONLY) {
6747                     PAD *pad;
6748                     OP_4tree *sop, *rop;
6749                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6750
6751                     ENTER;
6752                     Perl_save_re_context(aTHX);
6753                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6754                     sop->op_private |= OPpREFCOUNTED;
6755                     /* re_dup will OpREFCNT_inc */
6756                     OpREFCNT_set(sop, 1);
6757                     LEAVE;
6758
6759                     n = add_data(pRExC_state, 3, "nop");
6760                     RExC_rxi->data->data[n] = (void*)rop;
6761                     RExC_rxi->data->data[n+1] = (void*)sop;
6762                     RExC_rxi->data->data[n+2] = (void*)pad;
6763                     SvREFCNT_dec(sv);
6764                 }
6765                 else {                                          /* First pass */
6766                     if (PL_reginterp_cnt < ++RExC_seen_evals
6767                         && IN_PERL_RUNTIME)
6768                         /* No compiled RE interpolated, has runtime
6769                            components ===> unsafe.  */
6770                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6771                     if (PL_tainting && PL_tainted)
6772                         FAIL("Eval-group in insecure regular expression");
6773 #if PERL_VERSION > 8
6774                     if (IN_PERL_COMPILETIME)
6775                         PL_cv_has_eval = 1;
6776 #endif
6777                 }
6778
6779                 nextchar(pRExC_state);
6780                 if (is_logical) {
6781                     ret = reg_node(pRExC_state, LOGICAL);
6782                     if (!SIZE_ONLY)
6783                         ret->flags = 2;
6784                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6785                     /* deal with the length of this later - MJD */
6786                     return ret;
6787                 }
6788                 ret = reganode(pRExC_state, EVAL, n);
6789                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6790                 Set_Node_Offset(ret, parse_start);
6791                 return ret;
6792             }
6793             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6794             {
6795                 int is_define= 0;
6796                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6797                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6798                         || RExC_parse[1] == '<'
6799                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6800                         I32 flag;
6801
6802                         ret = reg_node(pRExC_state, LOGICAL);
6803                         if (!SIZE_ONLY)
6804                             ret->flags = 1;
6805                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6806                         goto insert_if;
6807                     }
6808                 }
6809                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6810                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6811                 {
6812                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6813                     char *name_start= RExC_parse++;
6814                     U32 num = 0;
6815                     SV *sv_dat=reg_scan_name(pRExC_state,
6816                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6817                     if (RExC_parse == name_start || *RExC_parse != ch)
6818                         vFAIL2("Sequence (?(%c... not terminated",
6819                             (ch == '>' ? '<' : ch));
6820                     RExC_parse++;
6821                     if (!SIZE_ONLY) {
6822                         num = add_data( pRExC_state, 1, "S" );
6823                         RExC_rxi->data->data[num]=(void*)sv_dat;
6824                         SvREFCNT_inc_simple_void(sv_dat);
6825                     }
6826                     ret = reganode(pRExC_state,NGROUPP,num);
6827                     goto insert_if_check_paren;
6828                 }
6829                 else if (RExC_parse[0] == 'D' &&
6830                          RExC_parse[1] == 'E' &&
6831                          RExC_parse[2] == 'F' &&
6832                          RExC_parse[3] == 'I' &&
6833                          RExC_parse[4] == 'N' &&
6834                          RExC_parse[5] == 'E')
6835                 {
6836                     ret = reganode(pRExC_state,DEFINEP,0);
6837                     RExC_parse +=6 ;
6838                     is_define = 1;
6839                     goto insert_if_check_paren;
6840                 }
6841                 else if (RExC_parse[0] == 'R') {
6842                     RExC_parse++;
6843                     parno = 0;
6844                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6845                         parno = atoi(RExC_parse++);
6846                         while (isDIGIT(*RExC_parse))
6847                             RExC_parse++;
6848                     } else if (RExC_parse[0] == '&') {
6849                         SV *sv_dat;
6850                         RExC_parse++;
6851                         sv_dat = reg_scan_name(pRExC_state,
6852                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6853                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6854                     }
6855                     ret = reganode(pRExC_state,INSUBP,parno);
6856                     goto insert_if_check_paren;
6857                 }
6858                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6859                     /* (?(1)...) */
6860                     char c;
6861                     parno = atoi(RExC_parse++);
6862
6863                     while (isDIGIT(*RExC_parse))
6864                         RExC_parse++;
6865                     ret = reganode(pRExC_state, GROUPP, parno);
6866
6867                  insert_if_check_paren:
6868                     if ((c = *nextchar(pRExC_state)) != ')')
6869                         vFAIL("Switch condition not recognized");
6870                   insert_if:
6871                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6872                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6873                     if (br == NULL)
6874                         br = reganode(pRExC_state, LONGJMP, 0);
6875                     else
6876                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6877                     c = *nextchar(pRExC_state);
6878                     if (flags&HASWIDTH)
6879                         *flagp |= HASWIDTH;
6880                     if (c == '|') {
6881                         if (is_define)
6882                             vFAIL("(?(DEFINE)....) does not allow branches");
6883                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6884                         regbranch(pRExC_state, &flags, 1,depth+1);
6885                         REGTAIL(pRExC_state, ret, lastbr);
6886                         if (flags&HASWIDTH)
6887                             *flagp |= HASWIDTH;
6888                         c = *nextchar(pRExC_state);
6889                     }
6890                     else
6891                         lastbr = NULL;
6892                     if (c != ')')
6893                         vFAIL("Switch (?(condition)... contains too many branches");
6894                     ender = reg_node(pRExC_state, TAIL);
6895                     REGTAIL(pRExC_state, br, ender);
6896                     if (lastbr) {
6897                         REGTAIL(pRExC_state, lastbr, ender);
6898                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6899                     }
6900                     else
6901                         REGTAIL(pRExC_state, ret, ender);
6902                     RExC_size++; /* XXX WHY do we need this?!!
6903                                     For large programs it seems to be required
6904                                     but I can't figure out why. -- dmq*/
6905                     return ret;
6906                 }
6907                 else {
6908                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6909                 }
6910             }
6911             case 0:
6912                 RExC_parse--; /* for vFAIL to print correctly */
6913                 vFAIL("Sequence (? incomplete");
6914                 break;
6915             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6916                                        that follow */
6917                 has_use_defaults = TRUE;
6918                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6919                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
6920                                                 ? REGEX_UNICODE_CHARSET
6921                                                 : REGEX_DEPENDS_CHARSET);
6922                 goto parse_flags;
6923             default:
6924                 --RExC_parse;
6925                 parse_flags:      /* (?i) */
6926             {
6927                 U32 posflags = 0, negflags = 0;
6928                 U32 *flagsp = &posflags;
6929                 bool has_charset_modifier = 0;
6930                 regex_charset cs = REGEX_DEPENDS_CHARSET;
6931
6932                 while (*RExC_parse) {
6933                     /* && strchr("iogcmsx", *RExC_parse) */
6934                     /* (?g), (?gc) and (?o) are useless here
6935                        and must be globally applied -- japhy */
6936                     switch (*RExC_parse) {
6937                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6938                     case LOCALE_PAT_MOD:
6939                         if (has_charset_modifier || flagsp == &negflags) {
6940                             goto fail_modifiers;
6941                         }
6942                         cs = REGEX_LOCALE_CHARSET;
6943                         has_charset_modifier = 1;
6944                         break;
6945                     case UNICODE_PAT_MOD:
6946                         if (has_charset_modifier || flagsp == &negflags) {
6947                             goto fail_modifiers;
6948                         }
6949                         cs = REGEX_UNICODE_CHARSET;
6950                         has_charset_modifier = 1;
6951                         break;
6952                     case ASCII_RESTRICT_PAT_MOD:
6953                         if (has_charset_modifier || flagsp == &negflags) {
6954                             goto fail_modifiers;
6955                         }
6956                         if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
6957                             /* Doubled modifier implies more restricted */
6958                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
6959                             RExC_parse++;
6960                         }
6961                         else {
6962                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
6963                         }
6964                         has_charset_modifier = 1;
6965                         break;
6966                     case DEPENDS_PAT_MOD:
6967                         if (has_use_defaults
6968                             || has_charset_modifier
6969                             || flagsp == &negflags)
6970                         {
6971                             goto fail_modifiers;
6972                         }
6973
6974                         /* The dual charset means unicode semantics if the
6975                          * pattern (or target, not known until runtime) are
6976                          * utf8, or something in the pattern indicates unicode
6977                          * semantics */
6978                         cs = (RExC_utf8 || RExC_uni_semantics)
6979                              ? REGEX_UNICODE_CHARSET
6980                              : REGEX_DEPENDS_CHARSET;
6981                         has_charset_modifier = 1;
6982                         break;
6983                     case ONCE_PAT_MOD: /* 'o' */
6984                     case GLOBAL_PAT_MOD: /* 'g' */
6985                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6986                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6987                             if (! (wastedflags & wflagbit) ) {
6988                                 wastedflags |= wflagbit;
6989                                 vWARN5(
6990                                     RExC_parse + 1,
6991                                     "Useless (%s%c) - %suse /%c modifier",
6992                                     flagsp == &negflags ? "?-" : "?",
6993                                     *RExC_parse,
6994                                     flagsp == &negflags ? "don't " : "",
6995                                     *RExC_parse
6996                                 );
6997                             }
6998                         }
6999                         break;
7000
7001                     case CONTINUE_PAT_MOD: /* 'c' */
7002                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7003                             if (! (wastedflags & WASTED_C) ) {
7004                                 wastedflags |= WASTED_GC;
7005                                 vWARN3(
7006                                     RExC_parse + 1,
7007                                     "Useless (%sc) - %suse /gc modifier",
7008                                     flagsp == &negflags ? "?-" : "?",
7009                                     flagsp == &negflags ? "don't " : ""
7010                                 );
7011                             }
7012                         }
7013                         break;
7014                     case KEEPCOPY_PAT_MOD: /* 'p' */
7015                         if (flagsp == &negflags) {
7016                             if (SIZE_ONLY)
7017                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7018                         } else {
7019                             *flagsp |= RXf_PMf_KEEPCOPY;
7020                         }
7021                         break;
7022                     case '-':
7023                         /* A flag is a default iff it is following a minus, so
7024                          * if there is a minus, it means will be trying to
7025                          * re-specify a default which is an error */
7026                         if (has_use_defaults || flagsp == &negflags) {
7027             fail_modifiers:
7028                             RExC_parse++;
7029                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7030                             /*NOTREACHED*/
7031                         }
7032                         flagsp = &negflags;
7033                         wastedflags = 0;  /* reset so (?g-c) warns twice */
7034                         break;
7035                     case ':':
7036                         paren = ':';
7037                         /*FALLTHROUGH*/
7038                     case ')':
7039                         RExC_flags |= posflags;
7040                         RExC_flags &= ~negflags;
7041                         set_regex_charset(&RExC_flags, cs);
7042                         if (paren != ':') {
7043                             oregflags |= posflags;
7044                             oregflags &= ~negflags;
7045                             set_regex_charset(&oregflags, cs);
7046                         }
7047                         nextchar(pRExC_state);
7048                         if (paren != ':') {
7049                             *flagp = TRYAGAIN;
7050                             return NULL;
7051                         } else {
7052                             ret = NULL;
7053                             goto parse_rest;
7054                         }
7055                         /*NOTREACHED*/
7056                     default:
7057                         RExC_parse++;
7058                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7059                         /*NOTREACHED*/
7060                     }
7061                     ++RExC_parse;
7062                 }
7063             }} /* one for the default block, one for the switch */
7064         }
7065         else {                  /* (...) */
7066           capturing_parens:
7067             parno = RExC_npar;
7068             RExC_npar++;
7069
7070             ret = reganode(pRExC_state, OPEN, parno);
7071             if (!SIZE_ONLY ){
7072                 if (!RExC_nestroot)
7073                     RExC_nestroot = parno;
7074                 if (RExC_seen & REG_SEEN_RECURSE
7075                     && !RExC_open_parens[parno-1])
7076                 {
7077                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7078                         "Setting open paren #%"IVdf" to %d\n",
7079                         (IV)parno, REG_NODE_NUM(ret)));
7080                     RExC_open_parens[parno-1]= ret;
7081                 }
7082             }
7083             Set_Node_Length(ret, 1); /* MJD */
7084             Set_Node_Offset(ret, RExC_parse); /* MJD */
7085             is_open = 1;
7086         }
7087     }
7088     else                        /* ! paren */
7089         ret = NULL;
7090
7091    parse_rest:
7092     /* Pick up the branches, linking them together. */
7093     parse_start = RExC_parse;   /* MJD */
7094     br = regbranch(pRExC_state, &flags, 1,depth+1);
7095
7096     if (freeze_paren) {
7097         if (RExC_npar > after_freeze)
7098             after_freeze = RExC_npar;
7099         RExC_npar = freeze_paren;
7100     }
7101
7102     /*     branch_len = (paren != 0); */
7103
7104     if (br == NULL)
7105         return(NULL);
7106     if (*RExC_parse == '|') {
7107         if (!SIZE_ONLY && RExC_extralen) {
7108             reginsert(pRExC_state, BRANCHJ, br, depth+1);
7109         }
7110         else {                  /* MJD */
7111             reginsert(pRExC_state, BRANCH, br, depth+1);
7112             Set_Node_Length(br, paren != 0);
7113             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7114         }
7115         have_branch = 1;
7116         if (SIZE_ONLY)
7117             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
7118     }
7119     else if (paren == ':') {
7120         *flagp |= flags&SIMPLE;
7121     }
7122     if (is_open) {                              /* Starts with OPEN. */
7123         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
7124     }
7125     else if (paren != '?')              /* Not Conditional */
7126         ret = br;
7127     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7128     lastbr = br;
7129     while (*RExC_parse == '|') {
7130         if (!SIZE_ONLY && RExC_extralen) {
7131             ender = reganode(pRExC_state, LONGJMP,0);
7132             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7133         }
7134         if (SIZE_ONLY)
7135             RExC_extralen += 2;         /* Account for LONGJMP. */
7136         nextchar(pRExC_state);
7137         if (freeze_paren) {
7138             if (RExC_npar > after_freeze)
7139                 after_freeze = RExC_npar;
7140             RExC_npar = freeze_paren;
7141         }
7142         br = regbranch(pRExC_state, &flags, 0, depth+1);
7143
7144         if (br == NULL)
7145             return(NULL);
7146         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
7147         lastbr = br;
7148         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7149     }
7150
7151     if (have_branch || paren != ':') {
7152         /* Make a closing node, and hook it on the end. */
7153         switch (paren) {
7154         case ':':
7155             ender = reg_node(pRExC_state, TAIL);
7156             break;
7157         case 1:
7158             ender = reganode(pRExC_state, CLOSE, parno);
7159             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7160                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7161                         "Setting close paren #%"IVdf" to %d\n",
7162                         (IV)parno, REG_NODE_NUM(ender)));
7163                 RExC_close_parens[parno-1]= ender;
7164                 if (RExC_nestroot == parno)
7165                     RExC_nestroot = 0;
7166             }
7167             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7168             Set_Node_Length(ender,1); /* MJD */
7169             break;
7170         case '<':
7171         case ',':
7172         case '=':
7173         case '!':
7174             *flagp &= ~HASWIDTH;
7175             /* FALL THROUGH */
7176         case '>':
7177             ender = reg_node(pRExC_state, SUCCEED);
7178             break;
7179         case 0:
7180             ender = reg_node(pRExC_state, END);
7181             if (!SIZE_ONLY) {
7182                 assert(!RExC_opend); /* there can only be one! */
7183                 RExC_opend = ender;
7184             }
7185             break;
7186         }
7187         REGTAIL(pRExC_state, lastbr, ender);
7188
7189         if (have_branch && !SIZE_ONLY) {
7190             if (depth==1)
7191                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7192
7193             /* Hook the tails of the branches to the closing node. */
7194             for (br = ret; br; br = regnext(br)) {
7195                 const U8 op = PL_regkind[OP(br)];
7196                 if (op == BRANCH) {
7197                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7198                 }
7199                 else if (op == BRANCHJ) {
7200                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7201                 }
7202             }
7203         }
7204     }
7205
7206     {
7207         const char *p;
7208         static const char parens[] = "=!<,>";
7209
7210         if (paren && (p = strchr(parens, paren))) {
7211             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7212             int flag = (p - parens) > 1;
7213
7214             if (paren == '>')
7215                 node = SUSPEND, flag = 0;
7216             reginsert(pRExC_state, node,ret, depth+1);
7217             Set_Node_Cur_Length(ret);
7218             Set_Node_Offset(ret, parse_start + 1);
7219             ret->flags = flag;
7220             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7221         }
7222     }
7223
7224     /* Check for proper termination. */
7225     if (paren) {
7226         RExC_flags = oregflags;
7227         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7228             RExC_parse = oregcomp_parse;
7229             vFAIL("Unmatched (");
7230         }
7231     }
7232     else if (!paren && RExC_parse < RExC_end) {
7233         if (*RExC_parse == ')') {
7234             RExC_parse++;
7235             vFAIL("Unmatched )");
7236         }
7237         else
7238             FAIL("Junk on end of regexp");      /* "Can't happen". */
7239         /* NOTREACHED */
7240     }
7241
7242     if (RExC_in_lookbehind) {
7243         RExC_in_lookbehind--;
7244     }
7245     if (after_freeze)
7246         RExC_npar = after_freeze;
7247     return(ret);
7248 }
7249
7250 /*
7251  - regbranch - one alternative of an | operator
7252  *
7253  * Implements the concatenation operator.
7254  */
7255 STATIC regnode *
7256 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7257 {
7258     dVAR;
7259     register regnode *ret;
7260     register regnode *chain = NULL;
7261     register regnode *latest;
7262     I32 flags = 0, c = 0;
7263     GET_RE_DEBUG_FLAGS_DECL;
7264
7265     PERL_ARGS_ASSERT_REGBRANCH;
7266
7267     DEBUG_PARSE("brnc");
7268
7269     if (first)
7270         ret = NULL;
7271     else {
7272         if (!SIZE_ONLY && RExC_extralen)
7273             ret = reganode(pRExC_state, BRANCHJ,0);
7274         else {
7275             ret = reg_node(pRExC_state, BRANCH);
7276             Set_Node_Length(ret, 1);
7277         }
7278     }
7279
7280     if (!first && SIZE_ONLY)
7281         RExC_extralen += 1;                     /* BRANCHJ */
7282
7283     *flagp = WORST;                     /* Tentatively. */
7284
7285     RExC_parse--;
7286     nextchar(pRExC_state);
7287     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7288         flags &= ~TRYAGAIN;
7289         latest = regpiece(pRExC_state, &flags,depth+1);
7290         if (latest == NULL) {
7291             if (flags & TRYAGAIN)
7292                 continue;
7293             return(NULL);
7294         }
7295         else if (ret == NULL)
7296             ret = latest;
7297         *flagp |= flags&(HASWIDTH|POSTPONED);
7298         if (chain == NULL)      /* First piece. */
7299             *flagp |= flags&SPSTART;
7300         else {
7301             RExC_naughty++;
7302             REGTAIL(pRExC_state, chain, latest);
7303         }
7304         chain = latest;
7305         c++;
7306     }
7307     if (chain == NULL) {        /* Loop ran zero times. */
7308         chain = reg_node(pRExC_state, NOTHING);
7309         if (ret == NULL)
7310             ret = chain;
7311     }
7312     if (c == 1) {
7313         *flagp |= flags&SIMPLE;
7314     }
7315
7316     return ret;
7317 }
7318
7319 /*
7320  - regpiece - something followed by possible [*+?]
7321  *
7322  * Note that the branching code sequences used for ? and the general cases
7323  * of * and + are somewhat optimized:  they use the same NOTHING node as
7324  * both the endmarker for their branch list and the body of the last branch.
7325  * It might seem that this node could be dispensed with entirely, but the
7326  * endmarker role is not redundant.
7327  */
7328 STATIC regnode *
7329 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7330 {
7331     dVAR;
7332     register regnode *ret;
7333     register char op;
7334     register char *next;
7335     I32 flags;
7336     const char * const origparse = RExC_parse;
7337     I32 min;
7338     I32 max = REG_INFTY;
7339     char *parse_start;
7340     const char *maxpos = NULL;
7341     GET_RE_DEBUG_FLAGS_DECL;
7342
7343     PERL_ARGS_ASSERT_REGPIECE;
7344
7345     DEBUG_PARSE("piec");
7346
7347     ret = regatom(pRExC_state, &flags,depth+1);
7348     if (ret == NULL) {
7349         if (flags & TRYAGAIN)
7350             *flagp |= TRYAGAIN;
7351         return(NULL);
7352     }
7353
7354     op = *RExC_parse;
7355
7356     if (op == '{' && regcurly(RExC_parse)) {
7357         maxpos = NULL;
7358         parse_start = RExC_parse; /* MJD */
7359         next = RExC_parse + 1;
7360         while (isDIGIT(*next) || *next == ',') {
7361             if (*next == ',') {
7362                 if (maxpos)
7363                     break;
7364                 else
7365                     maxpos = next;
7366             }
7367             next++;
7368         }
7369         if (*next == '}') {             /* got one */
7370             if (!maxpos)
7371                 maxpos = next;
7372             RExC_parse++;
7373             min = atoi(RExC_parse);
7374             if (*maxpos == ',')
7375                 maxpos++;
7376             else
7377                 maxpos = RExC_parse;
7378             max = atoi(maxpos);
7379             if (!max && *maxpos != '0')
7380                 max = REG_INFTY;                /* meaning "infinity" */
7381             else if (max >= REG_INFTY)
7382                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7383             RExC_parse = next;
7384             nextchar(pRExC_state);
7385
7386         do_curly:
7387             if ((flags&SIMPLE)) {
7388                 RExC_naughty += 2 + RExC_naughty / 2;
7389                 reginsert(pRExC_state, CURLY, ret, depth+1);
7390                 Set_Node_Offset(ret, parse_start+1); /* MJD */
7391                 Set_Node_Cur_Length(ret);
7392             }
7393             else {
7394                 regnode * const w = reg_node(pRExC_state, WHILEM);
7395
7396                 w->flags = 0;
7397                 REGTAIL(pRExC_state, ret, w);
7398                 if (!SIZE_ONLY && RExC_extralen) {
7399                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
7400                     reginsert(pRExC_state, NOTHING,ret, depth+1);
7401                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
7402                 }
7403                 reginsert(pRExC_state, CURLYX,ret, depth+1);
7404                                 /* MJD hk */
7405                 Set_Node_Offset(ret, parse_start+1);
7406                 Set_Node_Length(ret,
7407                                 op == '{' ? (RExC_parse - parse_start) : 1);
7408
7409                 if (!SIZE_ONLY && RExC_extralen)
7410                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
7411                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7412                 if (SIZE_ONLY)
7413                     RExC_whilem_seen++, RExC_extralen += 3;
7414                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
7415             }
7416             ret->flags = 0;
7417
7418             if (min > 0)
7419                 *flagp = WORST;
7420             if (max > 0)
7421                 *flagp |= HASWIDTH;
7422             if (max < min)
7423                 vFAIL("Can't do {n,m} with n > m");
7424             if (!SIZE_ONLY) {
7425                 ARG1_SET(ret, (U16)min);
7426                 ARG2_SET(ret, (U16)max);
7427             }
7428
7429             goto nest_check;
7430         }
7431     }
7432
7433     if (!ISMULT1(op)) {
7434         *flagp = flags;
7435         return(ret);
7436     }
7437
7438 #if 0                           /* Now runtime fix should be reliable. */
7439
7440     /* if this is reinstated, don't forget to put this back into perldiag:
7441
7442             =item Regexp *+ operand could be empty at {#} in regex m/%s/
7443
7444            (F) The part of the regexp subject to either the * or + quantifier
7445            could match an empty string. The {#} shows in the regular
7446            expression about where the problem was discovered.
7447
7448     */
7449
7450     if (!(flags&HASWIDTH) && op != '?')
7451       vFAIL("Regexp *+ operand could be empty");
7452 #endif
7453
7454     parse_start = RExC_parse;
7455     nextchar(pRExC_state);
7456
7457     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7458
7459     if (op == '*' && (flags&SIMPLE)) {
7460         reginsert(pRExC_state, STAR, ret, depth+1);
7461         ret->flags = 0;
7462         RExC_naughty += 4;
7463     }
7464     else if (op == '*') {
7465         min = 0;
7466         goto do_curly;
7467     }
7468     else if (op == '+' && (flags&SIMPLE)) {
7469         reginsert(pRExC_state, PLUS, ret, depth+1);
7470         ret->flags = 0;
7471         RExC_naughty += 3;
7472     }
7473     else if (op == '+') {
7474         min = 1;
7475         goto do_curly;
7476     }
7477     else if (op == '?') {
7478         min = 0; max = 1;
7479         goto do_curly;
7480     }
7481   nest_check:
7482     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7483         ckWARN3reg(RExC_parse,
7484                    "%.*s matches null string many times",
7485                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7486                    origparse);
7487     }
7488
7489     if (RExC_parse < RExC_end && *RExC_parse == '?') {
7490         nextchar(pRExC_state);
7491         reginsert(pRExC_state, MINMOD, ret, depth+1);
7492         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7493     }
7494 #ifndef REG_ALLOW_MINMOD_SUSPEND
7495     else
7496 #endif
7497     if (RExC_parse < RExC_end && *RExC_parse == '+') {
7498         regnode *ender;
7499         nextchar(pRExC_state);
7500         ender = reg_node(pRExC_state, SUCCEED);
7501         REGTAIL(pRExC_state, ret, ender);
7502         reginsert(pRExC_state, SUSPEND, ret, depth+1);
7503         ret->flags = 0;
7504         ender = reg_node(pRExC_state, TAIL);
7505         REGTAIL(pRExC_state, ret, ender);
7506         /*ret= ender;*/
7507     }
7508
7509     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7510         RExC_parse++;
7511         vFAIL("Nested quantifiers");
7512     }
7513
7514     return(ret);
7515 }
7516
7517
7518 /* reg_namedseq(pRExC_state,UVp)
7519
7520    This is expected to be called by a parser routine that has
7521    recognized '\N' and needs to handle the rest. RExC_parse is
7522    expected to point at the first char following the N at the time
7523    of the call.
7524
7525    The \N may be inside (indicated by valuep not being NULL) or outside a
7526    character class.
7527
7528    \N may begin either a named sequence, or if outside a character class, mean
7529    to match a non-newline.  For non single-quoted regexes, the tokenizer has
7530    attempted to decide which, and in the case of a named sequence converted it
7531    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7532    where c1... are the characters in the sequence.  For single-quoted regexes,
7533    the tokenizer passes the \N sequence through unchanged; this code will not
7534    attempt to determine this nor expand those.  The net effect is that if the
7535    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7536    signals that this \N occurrence means to match a non-newline.
7537
7538    Only the \N{U+...} form should occur in a character class, for the same
7539    reason that '.' inside a character class means to just match a period: it
7540    just doesn't make sense.
7541
7542    If valuep is non-null then it is assumed that we are parsing inside
7543    of a charclass definition and the first codepoint in the resolved
7544    string is returned via *valuep and the routine will return NULL.
7545    In this mode if a multichar string is returned from the charnames
7546    handler, a warning will be issued, and only the first char in the
7547    sequence will be examined. If the string returned is zero length
7548    then the value of *valuep is undefined and NON-NULL will
7549    be returned to indicate failure. (This will NOT be a valid pointer
7550    to a regnode.)
7551
7552    If valuep is null then it is assumed that we are parsing normal text and a
7553    new EXACT node is inserted into the program containing the resolved string,
7554    and a pointer to the new node is returned.  But if the string is zero length
7555    a NOTHING node is emitted instead.
7556
7557    On success RExC_parse is set to the char following the endbrace.
7558    Parsing failures will generate a fatal error via vFAIL(...)
7559  */
7560 STATIC regnode *
7561 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7562 {
7563     char * endbrace;    /* '}' following the name */
7564     regnode *ret = NULL;
7565 #ifdef DEBUGGING
7566     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
7567 #endif
7568     char* p;
7569
7570     GET_RE_DEBUG_FLAGS_DECL;
7571
7572     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7573
7574     GET_RE_DEBUG_FLAGS;
7575
7576     /* The [^\n] meaning of \N ignores spaces and comments under the /x
7577      * modifier.  The other meaning does not */
7578     p = (RExC_flags & RXf_PMf_EXTENDED)
7579         ? regwhite( pRExC_state, RExC_parse )
7580         : RExC_parse;
7581
7582     /* Disambiguate between \N meaning a named character versus \N meaning
7583      * [^\n].  The former is assumed when it can't be the latter. */
7584     if (*p != '{' || regcurly(p)) {
7585         RExC_parse = p;
7586         if (valuep) {
7587             /* no bare \N in a charclass */
7588             vFAIL("\\N in a character class must be a named character: \\N{...}");
7589         }
7590         nextchar(pRExC_state);
7591         ret = reg_node(pRExC_state, REG_ANY);
7592         *flagp |= HASWIDTH|SIMPLE;
7593         RExC_naughty++;
7594         RExC_parse--;
7595         Set_Node_Length(ret, 1); /* MJD */
7596         return ret;
7597     }
7598
7599     /* Here, we have decided it should be a named sequence */
7600
7601     /* The test above made sure that the next real character is a '{', but
7602      * under the /x modifier, it could be separated by space (or a comment and
7603      * \n) and this is not allowed (for consistency with \x{...} and the
7604      * tokenizer handling of \N{NAME}). */
7605     if (*RExC_parse != '{') {
7606         vFAIL("Missing braces on \\N{}");
7607     }
7608
7609     RExC_parse++;       /* Skip past the '{' */
7610
7611     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7612         || ! (endbrace == RExC_parse            /* nothing between the {} */
7613               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
7614                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7615     {
7616         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
7617         vFAIL("\\N{NAME} must be resolved by the lexer");
7618     }
7619
7620     if (endbrace == RExC_parse) {   /* empty: \N{} */
7621         if (! valuep) {
7622             RExC_parse = endbrace + 1;
7623             return reg_node(pRExC_state,NOTHING);
7624         }
7625
7626         if (SIZE_ONLY) {
7627             ckWARNreg(RExC_parse,
7628                     "Ignoring zero length \\N{} in character class"
7629             );
7630             RExC_parse = endbrace + 1;
7631         }
7632         *valuep = 0;
7633         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7634     }
7635
7636     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
7637     RExC_parse += 2;    /* Skip past the 'U+' */
7638
7639     if (valuep) {   /* In a bracketed char class */
7640         /* We only pay attention to the first char of
7641         multichar strings being returned. I kinda wonder
7642         if this makes sense as it does change the behaviour
7643         from earlier versions, OTOH that behaviour was broken
7644         as well. XXX Solution is to recharacterize as
7645         [rest-of-class]|multi1|multi2... */
7646
7647         STRLEN length_of_hex;
7648         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7649             | PERL_SCAN_DISALLOW_PREFIX
7650             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7651
7652         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7653         if (endchar < endbrace) {
7654             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7655         }
7656
7657         length_of_hex = (STRLEN)(endchar - RExC_parse);
7658         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7659
7660         /* The tokenizer should have guaranteed validity, but it's possible to
7661          * bypass it by using single quoting, so check */
7662         if (length_of_hex == 0
7663             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7664         {
7665             RExC_parse += length_of_hex;        /* Includes all the valid */
7666             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7667                             ? UTF8SKIP(RExC_parse)
7668                             : 1;
7669             /* Guard against malformed utf8 */
7670             if (RExC_parse >= endchar) RExC_parse = endchar;
7671             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7672         }
7673
7674         RExC_parse = endbrace + 1;
7675         if (endchar == endbrace) return NULL;
7676
7677         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7678     }
7679     else {      /* Not a char class */
7680         char *s;            /* String to put in generated EXACT node */
7681         STRLEN len = 0;     /* Its current byte length */
7682         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7683                                stream */
7684         ret = reg_node(pRExC_state,
7685                            (U8) ((! FOLD) ? EXACT
7686                                           : (LOC)
7687                                              ? EXACTFL
7688                                              : (MORE_ASCII_RESTRICTED)
7689                                                ? EXACTFA
7690                                                : (AT_LEAST_UNI_SEMANTICS)
7691                                                  ? EXACTFU
7692                                                  : EXACTF));
7693         s= STRING(ret);
7694
7695         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7696          * the input which is of the form now 'c1.c2.c3...}' until find the
7697          * ending brace or exceed length 255.  The characters that exceed this
7698          * limit are dropped.  The limit could be relaxed should it become
7699          * desirable by reparsing this as (?:\N{NAME}), so could generate
7700          * multiple EXACT nodes, as is done for just regular input.  But this
7701          * is primarily a named character, and not intended to be a huge long
7702          * string, so 255 bytes should be good enough */
7703         while (1) {
7704             STRLEN length_of_hex;
7705             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7706                             | PERL_SCAN_DISALLOW_PREFIX
7707                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7708             UV cp;  /* Ord of current character */
7709             bool use_this_char_fold = FOLD;
7710
7711             /* Code points are separated by dots.  If none, there is only one
7712              * code point, and is terminated by the brace */
7713             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7714
7715             /* The values are Unicode even on EBCDIC machines */
7716             length_of_hex = (STRLEN)(endchar - RExC_parse);
7717             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7718             if ( length_of_hex == 0
7719                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7720             {
7721                 RExC_parse += length_of_hex;        /* Includes all the valid */
7722                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7723                                 ? UTF8SKIP(RExC_parse)
7724                                 : 1;
7725                 /* Guard against malformed utf8 */
7726                 if (RExC_parse >= endchar) RExC_parse = endchar;
7727                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7728             }
7729
7730             /* XXX ? Change to ANYOF node
7731             if (FOLD
7732                 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
7733                 && is_TRICKYFOLD_cp(cp))
7734             {
7735             }
7736             */
7737
7738             /* Under /aa, we can't mix ASCII with non- in a fold.  If we are
7739              * folding, and the source isn't ASCII, look through all the
7740              * characters it folds to.  If any one of them is ASCII, forbid
7741              * this fold.  (cp is uni, so the 127 below is correct even for
7742              * EBCDIC).  Similarly under locale rules, we don't mix under 256
7743              * with above 255.  XXX It really doesn't make sense to have \N{}
7744              * which means a Unicode rules under locale.  I (khw) think this
7745              * should be warned about, but the counter argument is that people
7746              * who have programmed around Perl's earlier lack of specifying the
7747              * rules and used \N{} to force Unicode things in a local
7748              * environment shouldn't get suddenly a warning */
7749             if (use_this_char_fold) {
7750                 if (LOC && cp < 256) {  /* Fold not known until run-time */
7751                     use_this_char_fold = FALSE;
7752                 }
7753                 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7754                          || (cp > 255 && LOC))
7755                 {
7756                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7757                 U8* s = tmpbuf;
7758                 U8* e;
7759                 STRLEN foldlen;
7760
7761                 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7762                 e = s + foldlen;
7763
7764                 while (s < e) {
7765                     if (isASCII(*s)
7766                         || (LOC && (UTF8_IS_INVARIANT(*s)
7767                                     || UTF8_IS_DOWNGRADEABLE_START(*s))))
7768                     {
7769                         use_this_char_fold = FALSE;
7770                         break;
7771                     }
7772                     s += UTF8SKIP(s);
7773                 }
7774                 }
7775             }
7776
7777             if (! use_this_char_fold) { /* Not folding, just append to the
7778                                            string */
7779                 STRLEN unilen;
7780
7781                 /* Quit before adding this character if would exceed limit */
7782                 if (len + UNISKIP(cp) > U8_MAX) break;
7783
7784                 unilen = reguni(pRExC_state, cp, s);
7785                 if (unilen > 0) {
7786                     s   += unilen;
7787                     len += unilen;
7788                 }
7789             } else {    /* Folding, output the folded equivalent */
7790                 STRLEN foldlen,numlen;
7791                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7792                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7793
7794                 /* Quit before exceeding size limit */
7795                 if (len + foldlen > U8_MAX) break;
7796
7797                 for (foldbuf = tmpbuf;
7798                     foldlen;
7799                     foldlen -= numlen)
7800                 {
7801                     cp = utf8_to_uvchr(foldbuf, &numlen);
7802                     if (numlen > 0) {
7803                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7804                         s       += unilen;
7805                         len     += unilen;
7806                         /* In EBCDIC the numlen and unilen can differ. */
7807                         foldbuf += numlen;
7808                         if (numlen >= foldlen)
7809                             break;
7810                     }
7811                     else
7812                         break; /* "Can't happen." */
7813                 }
7814             }
7815
7816             /* Point to the beginning of the next character in the sequence. */
7817             RExC_parse = endchar + 1;
7818
7819             /* Quit if no more characters */
7820             if (RExC_parse >= endbrace) break;
7821         }
7822
7823
7824         if (SIZE_ONLY) {
7825             if (RExC_parse < endbrace) {
7826                 ckWARNreg(RExC_parse - 1,
7827                           "Using just the first characters returned by \\N{}");
7828             }
7829
7830             RExC_size += STR_SZ(len);
7831         } else {
7832             STR_LEN(ret) = len;
7833             RExC_emit += STR_SZ(len);
7834         }
7835
7836         RExC_parse = endbrace + 1;
7837
7838         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7839                                with malformed in t/re/pat_advanced.t */
7840         RExC_parse --;
7841         Set_Node_Cur_Length(ret); /* MJD */
7842         nextchar(pRExC_state);
7843     }
7844
7845     return ret;
7846 }
7847
7848
7849 /*
7850  * reg_recode
7851  *
7852  * It returns the code point in utf8 for the value in *encp.
7853  *    value: a code value in the source encoding
7854  *    encp:  a pointer to an Encode object
7855  *
7856  * If the result from Encode is not a single character,
7857  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7858  */
7859 STATIC UV
7860 S_reg_recode(pTHX_ const char value, SV **encp)
7861 {
7862     STRLEN numlen = 1;
7863     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7864     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7865     const STRLEN newlen = SvCUR(sv);
7866     UV uv = UNICODE_REPLACEMENT;
7867
7868     PERL_ARGS_ASSERT_REG_RECODE;
7869
7870     if (newlen)
7871         uv = SvUTF8(sv)
7872              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7873              : *(U8*)s;
7874
7875     if (!newlen || numlen != newlen) {
7876         uv = UNICODE_REPLACEMENT;
7877         *encp = NULL;
7878     }
7879     return uv;
7880 }
7881
7882
7883 /*
7884  - regatom - the lowest level
7885
7886    Try to identify anything special at the start of the pattern. If there
7887    is, then handle it as required. This may involve generating a single regop,
7888    such as for an assertion; or it may involve recursing, such as to
7889    handle a () structure.
7890
7891    If the string doesn't start with something special then we gobble up
7892    as much literal text as we can.
7893
7894    Once we have been able to handle whatever type of thing started the
7895    sequence, we return.
7896
7897    Note: we have to be careful with escapes, as they can be both literal
7898    and special, and in the case of \10 and friends can either, depending
7899    on context. Specifically there are two separate switches for handling
7900    escape sequences, with the one for handling literal escapes requiring
7901    a dummy entry for all of the special escapes that are actually handled
7902    by the other.
7903 */
7904
7905 STATIC regnode *
7906 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7907 {
7908     dVAR;
7909     register regnode *ret = NULL;
7910     I32 flags;
7911     char *parse_start = RExC_parse;
7912     U8 op;
7913     GET_RE_DEBUG_FLAGS_DECL;
7914     DEBUG_PARSE("atom");
7915     *flagp = WORST;             /* Tentatively. */
7916
7917     PERL_ARGS_ASSERT_REGATOM;
7918
7919 tryagain:
7920     switch ((U8)*RExC_parse) {
7921     case '^':
7922         RExC_seen_zerolen++;
7923         nextchar(pRExC_state);
7924         if (RExC_flags & RXf_PMf_MULTILINE)
7925             ret = reg_node(pRExC_state, MBOL);
7926         else if (RExC_flags & RXf_PMf_SINGLELINE)
7927             ret = reg_node(pRExC_state, SBOL);
7928         else
7929             ret = reg_node(pRExC_state, BOL);
7930         Set_Node_Length(ret, 1); /* MJD */
7931         break;
7932     case '$':
7933         nextchar(pRExC_state);
7934         if (*RExC_parse)
7935             RExC_seen_zerolen++;
7936         if (RExC_flags & RXf_PMf_MULTILINE)
7937             ret = reg_node(pRExC_state, MEOL);
7938         else if (RExC_flags & RXf_PMf_SINGLELINE)
7939             ret = reg_node(pRExC_state, SEOL);
7940         else
7941             ret = reg_node(pRExC_state, EOL);
7942         Set_Node_Length(ret, 1); /* MJD */
7943         break;
7944     case '.':
7945         nextchar(pRExC_state);
7946         if (RExC_flags & RXf_PMf_SINGLELINE)
7947             ret = reg_node(pRExC_state, SANY);
7948         else
7949             ret = reg_node(pRExC_state, REG_ANY);
7950         *flagp |= HASWIDTH|SIMPLE;
7951         RExC_naughty++;
7952         Set_Node_Length(ret, 1); /* MJD */
7953         break;
7954     case '[':
7955     {
7956         char * const oregcomp_parse = ++RExC_parse;
7957         ret = regclass(pRExC_state,depth+1);
7958         if (*RExC_parse != ']') {
7959             RExC_parse = oregcomp_parse;
7960             vFAIL("Unmatched [");
7961         }
7962         nextchar(pRExC_state);
7963         *flagp |= HASWIDTH|SIMPLE;
7964         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7965         break;
7966     }
7967     case '(':
7968         nextchar(pRExC_state);
7969         ret = reg(pRExC_state, 1, &flags,depth+1);
7970         if (ret == NULL) {
7971                 if (flags & TRYAGAIN) {
7972                     if (RExC_parse == RExC_end) {
7973                          /* Make parent create an empty node if needed. */
7974                         *flagp |= TRYAGAIN;
7975                         return(NULL);
7976                     }
7977                     goto tryagain;
7978                 }
7979                 return(NULL);
7980         }
7981         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7982         break;
7983     case '|':
7984     case ')':
7985         if (flags & TRYAGAIN) {
7986             *flagp |= TRYAGAIN;
7987             return NULL;
7988         }
7989         vFAIL("Internal urp");
7990                                 /* Supposed to be caught earlier. */
7991         break;
7992     case '{':
7993         if (!regcurly(RExC_parse)) {
7994             RExC_parse++;
7995             goto defchar;
7996         }
7997         /* FALL THROUGH */
7998     case '?':
7999     case '+':
8000     case '*':
8001         RExC_parse++;
8002         vFAIL("Quantifier follows nothing");
8003         break;
8004     case LATIN_SMALL_LETTER_SHARP_S:
8005     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8006     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8007 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8008 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
8009     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8010 #endif
8011         do_foldchar:
8012         if (!LOC && FOLD) {
8013             U32 len,cp;
8014             len=0; /* silence a spurious compiler warning */
8015             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
8016                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8017                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8018                 ret = reganode(pRExC_state, FOLDCHAR, cp);
8019                 Set_Node_Length(ret, 1); /* MJD */
8020                 nextchar(pRExC_state); /* kill whitespace under /x */
8021                 return ret;
8022             }
8023         }
8024         goto outer_default;
8025     case '\\':
8026         /* Special Escapes
8027
8028            This switch handles escape sequences that resolve to some kind
8029            of special regop and not to literal text. Escape sequnces that
8030            resolve to literal text are handled below in the switch marked
8031            "Literal Escapes".
8032
8033            Every entry in this switch *must* have a corresponding entry
8034            in the literal escape switch. However, the opposite is not
8035            required, as the default for this switch is to jump to the
8036            literal text handling code.
8037         */
8038         switch ((U8)*++RExC_parse) {
8039         case LATIN_SMALL_LETTER_SHARP_S:
8040         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8041         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8042                    goto do_foldchar;
8043         /* Special Escapes */
8044         case 'A':
8045             RExC_seen_zerolen++;
8046             ret = reg_node(pRExC_state, SBOL);
8047             *flagp |= SIMPLE;
8048             goto finish_meta_pat;
8049         case 'G':
8050             ret = reg_node(pRExC_state, GPOS);
8051             RExC_seen |= REG_SEEN_GPOS;
8052             *flagp |= SIMPLE;
8053             goto finish_meta_pat;
8054         case 'K':
8055             RExC_seen_zerolen++;
8056             ret = reg_node(pRExC_state, KEEPS);
8057             *flagp |= SIMPLE;
8058             /* XXX:dmq : disabling in-place substitution seems to
8059              * be necessary here to avoid cases of memory corruption, as
8060              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8061              */
8062             RExC_seen |= REG_SEEN_LOOKBEHIND;
8063             goto finish_meta_pat;
8064         case 'Z':
8065             ret = reg_node(pRExC_state, SEOL);
8066             *flagp |= SIMPLE;
8067             RExC_seen_zerolen++;                /* Do not optimize RE away */
8068             goto finish_meta_pat;
8069         case 'z':
8070             ret = reg_node(pRExC_state, EOS);
8071             *flagp |= SIMPLE;
8072             RExC_seen_zerolen++;                /* Do not optimize RE away */
8073             goto finish_meta_pat;
8074         case 'C':
8075             ret = reg_node(pRExC_state, CANY);
8076             RExC_seen |= REG_SEEN_CANY;
8077             *flagp |= HASWIDTH|SIMPLE;
8078             goto finish_meta_pat;
8079         case 'X':
8080             ret = reg_node(pRExC_state, CLUMP);
8081             *flagp |= HASWIDTH;
8082             goto finish_meta_pat;
8083         case 'w':
8084             switch (get_regex_charset(RExC_flags)) {
8085                 case REGEX_LOCALE_CHARSET:
8086                     op = ALNUML;
8087                     break;
8088                 case REGEX_UNICODE_CHARSET:
8089                     op = ALNUMU;
8090                     break;
8091                 case REGEX_ASCII_RESTRICTED_CHARSET:
8092                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8093                     op = ALNUMA;
8094                     break;
8095                 case REGEX_DEPENDS_CHARSET:
8096                     op = ALNUM;
8097                     break;
8098                 default:
8099                     goto bad_charset;
8100             }
8101             ret = reg_node(pRExC_state, op);
8102             *flagp |= HASWIDTH|SIMPLE;
8103             goto finish_meta_pat;
8104         case 'W':
8105             switch (get_regex_charset(RExC_flags)) {
8106                 case REGEX_LOCALE_CHARSET:
8107                     op = NALNUML;
8108                     break;
8109                 case REGEX_UNICODE_CHARSET:
8110                     op = NALNUMU;
8111                     break;
8112                 case REGEX_ASCII_RESTRICTED_CHARSET:
8113                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8114                     op = NALNUMA;
8115                     break;
8116                 case REGEX_DEPENDS_CHARSET:
8117                     op = NALNUM;
8118                     break;
8119                 default:
8120                     goto bad_charset;
8121             }
8122             ret = reg_node(pRExC_state, op);
8123             *flagp |= HASWIDTH|SIMPLE;
8124             goto finish_meta_pat;
8125         case 'b':
8126             RExC_seen_zerolen++;
8127             RExC_seen |= REG_SEEN_LOOKBEHIND;
8128             switch (get_regex_charset(RExC_flags)) {
8129                 case REGEX_LOCALE_CHARSET:
8130                     op = BOUNDL;
8131                     break;
8132                 case REGEX_UNICODE_CHARSET:
8133                     op = BOUNDU;
8134                     break;
8135                 case REGEX_ASCII_RESTRICTED_CHARSET:
8136                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8137                     op = BOUNDA;
8138                     break;
8139                 case REGEX_DEPENDS_CHARSET:
8140                     op = BOUND;
8141                     break;
8142                 default:
8143                     goto bad_charset;
8144             }
8145             ret = reg_node(pRExC_state, op);
8146             FLAGS(ret) = get_regex_charset(RExC_flags);
8147             *flagp |= SIMPLE;
8148             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8149                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8150             }
8151             goto finish_meta_pat;
8152         case 'B':
8153             RExC_seen_zerolen++;
8154             RExC_seen |= REG_SEEN_LOOKBEHIND;
8155             switch (get_regex_charset(RExC_flags)) {
8156                 case REGEX_LOCALE_CHARSET:
8157                     op = NBOUNDL;
8158                     break;
8159                 case REGEX_UNICODE_CHARSET:
8160                     op = NBOUNDU;
8161                     break;
8162                 case REGEX_ASCII_RESTRICTED_CHARSET:
8163                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8164                     op = NBOUNDA;
8165                     break;
8166                 case REGEX_DEPENDS_CHARSET:
8167                     op = NBOUND;
8168                     break;
8169                 default:
8170                     goto bad_charset;
8171             }
8172             ret = reg_node(pRExC_state, op);
8173             FLAGS(ret) = get_regex_charset(RExC_flags);
8174             *flagp |= SIMPLE;
8175             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8176                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8177             }
8178             goto finish_meta_pat;
8179         case 's':
8180             switch (get_regex_charset(RExC_flags)) {
8181                 case REGEX_LOCALE_CHARSET:
8182                     op = SPACEL;
8183                     break;
8184                 case REGEX_UNICODE_CHARSET:
8185                     op = SPACEU;
8186                     break;
8187                 case REGEX_ASCII_RESTRICTED_CHARSET:
8188                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8189                     op = SPACEA;
8190                     break;
8191                 case REGEX_DEPENDS_CHARSET:
8192                     op = SPACE;
8193                     break;
8194                 default:
8195                     goto bad_charset;
8196             }
8197             ret = reg_node(pRExC_state, op);
8198             *flagp |= HASWIDTH|SIMPLE;
8199             goto finish_meta_pat;
8200         case 'S':
8201             switch (get_regex_charset(RExC_flags)) {
8202                 case REGEX_LOCALE_CHARSET:
8203                     op = NSPACEL;
8204                     break;
8205                 case REGEX_UNICODE_CHARSET:
8206                     op = NSPACEU;
8207                     break;
8208                 case REGEX_ASCII_RESTRICTED_CHARSET:
8209                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8210                     op = NSPACEA;
8211                     break;
8212                 case REGEX_DEPENDS_CHARSET:
8213                     op = NSPACE;
8214                     break;
8215                 default:
8216                     goto bad_charset;
8217             }
8218             ret = reg_node(pRExC_state, op);
8219             *flagp |= HASWIDTH|SIMPLE;
8220             goto finish_meta_pat;
8221         case 'd':
8222             switch (get_regex_charset(RExC_flags)) {
8223                 case REGEX_LOCALE_CHARSET:
8224                     op = DIGITL;
8225                     break;
8226                 case REGEX_ASCII_RESTRICTED_CHARSET:
8227                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8228                     op = DIGITA;
8229                     break;
8230                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8231                 case REGEX_UNICODE_CHARSET:
8232                     op = DIGIT;
8233                     break;
8234                 default:
8235                     goto bad_charset;
8236             }
8237             ret = reg_node(pRExC_state, op);
8238             *flagp |= HASWIDTH|SIMPLE;
8239             goto finish_meta_pat;
8240         case 'D':
8241             switch (get_regex_charset(RExC_flags)) {
8242                 case REGEX_LOCALE_CHARSET:
8243                     op = NDIGITL;
8244                     break;
8245                 case REGEX_ASCII_RESTRICTED_CHARSET:
8246                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8247                     op = NDIGITA;
8248                     break;
8249                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8250                 case REGEX_UNICODE_CHARSET:
8251                     op = NDIGIT;
8252                     break;
8253                 default:
8254                     goto bad_charset;
8255             }
8256             ret = reg_node(pRExC_state, op);
8257             *flagp |= HASWIDTH|SIMPLE;
8258             goto finish_meta_pat;
8259         case 'R':
8260             ret = reg_node(pRExC_state, LNBREAK);
8261             *flagp |= HASWIDTH|SIMPLE;
8262             goto finish_meta_pat;
8263         case 'h':
8264             ret = reg_node(pRExC_state, HORIZWS);
8265             *flagp |= HASWIDTH|SIMPLE;
8266             goto finish_meta_pat;
8267         case 'H':
8268             ret = reg_node(pRExC_state, NHORIZWS);
8269             *flagp |= HASWIDTH|SIMPLE;
8270             goto finish_meta_pat;
8271         case 'v':
8272             ret = reg_node(pRExC_state, VERTWS);
8273             *flagp |= HASWIDTH|SIMPLE;
8274             goto finish_meta_pat;
8275         case 'V':
8276             ret = reg_node(pRExC_state, NVERTWS);
8277             *flagp |= HASWIDTH|SIMPLE;
8278          finish_meta_pat:
8279             nextchar(pRExC_state);
8280             Set_Node_Length(ret, 2); /* MJD */
8281             break;
8282         case 'p':
8283         case 'P':
8284             {
8285                 char* const oldregxend = RExC_end;
8286 #ifdef DEBUGGING
8287                 char* parse_start = RExC_parse - 2;
8288 #endif
8289
8290                 if (RExC_parse[1] == '{') {
8291                   /* a lovely hack--pretend we saw [\pX] instead */
8292                     RExC_end = strchr(RExC_parse, '}');
8293                     if (!RExC_end) {
8294                         const U8 c = (U8)*RExC_parse;
8295                         RExC_parse += 2;
8296                         RExC_end = oldregxend;
8297                         vFAIL2("Missing right brace on \\%c{}", c);
8298                     }
8299                     RExC_end++;
8300                 }
8301                 else {
8302                     RExC_end = RExC_parse + 2;
8303                     if (RExC_end > oldregxend)
8304                         RExC_end = oldregxend;
8305                 }
8306                 RExC_parse--;
8307
8308                 ret = regclass(pRExC_state,depth+1);
8309
8310                 RExC_end = oldregxend;
8311                 RExC_parse--;
8312
8313                 Set_Node_Offset(ret, parse_start + 2);
8314                 Set_Node_Cur_Length(ret);
8315                 nextchar(pRExC_state);
8316                 *flagp |= HASWIDTH|SIMPLE;
8317             }
8318             break;
8319         case 'N':
8320             /* Handle \N and \N{NAME} here and not below because it can be
8321             multicharacter. join_exact() will join them up later on.
8322             Also this makes sure that things like /\N{BLAH}+/ and
8323             \N{BLAH} being multi char Just Happen. dmq*/
8324             ++RExC_parse;
8325             ret= reg_namedseq(pRExC_state, NULL, flagp);
8326             break;
8327         case 'k':    /* Handle \k<NAME> and \k'NAME' */
8328         parse_named_seq:
8329         {
8330             char ch= RExC_parse[1];
8331             if (ch != '<' && ch != '\'' && ch != '{') {
8332                 RExC_parse++;
8333                 vFAIL2("Sequence %.2s... not terminated",parse_start);
8334             } else {
8335                 /* this pretty much dupes the code for (?P=...) in reg(), if
8336                    you change this make sure you change that */
8337                 char* name_start = (RExC_parse += 2);
8338                 U32 num = 0;
8339                 SV *sv_dat = reg_scan_name(pRExC_state,
8340                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8341                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8342                 if (RExC_parse == name_start || *RExC_parse != ch)
8343                     vFAIL2("Sequence %.3s... not terminated",parse_start);
8344
8345                 if (!SIZE_ONLY) {
8346                     num = add_data( pRExC_state, 1, "S" );
8347                     RExC_rxi->data->data[num]=(void*)sv_dat;
8348                     SvREFCNT_inc_simple_void(sv_dat);
8349                 }
8350
8351                 RExC_sawback = 1;
8352                 ret = reganode(pRExC_state,
8353                                ((! FOLD)
8354                                  ? NREF
8355                                  : (MORE_ASCII_RESTRICTED)
8356                                    ? NREFFA
8357                                    : (AT_LEAST_UNI_SEMANTICS)
8358                                      ? NREFFU
8359                                      : (LOC)
8360                                        ? NREFFL
8361                                        : NREFF),
8362                                 num);
8363                 *flagp |= HASWIDTH;
8364
8365                 /* override incorrect value set in reganode MJD */
8366                 Set_Node_Offset(ret, parse_start+1);
8367                 Set_Node_Cur_Length(ret); /* MJD */
8368                 nextchar(pRExC_state);
8369
8370             }
8371             break;
8372         }
8373         case 'g':
8374         case '1': case '2': case '3': case '4':
8375         case '5': case '6': case '7': case '8': case '9':
8376             {
8377                 I32 num;
8378                 bool isg = *RExC_parse == 'g';
8379                 bool isrel = 0;
8380                 bool hasbrace = 0;
8381                 if (isg) {
8382                     RExC_parse++;
8383                     if (*RExC_parse == '{') {
8384                         RExC_parse++;
8385                         hasbrace = 1;
8386                     }
8387                     if (*RExC_parse == '-') {
8388                         RExC_parse++;
8389                         isrel = 1;
8390                     }
8391                     if (hasbrace && !isDIGIT(*RExC_parse)) {
8392                         if (isrel) RExC_parse--;
8393                         RExC_parse -= 2;
8394                         goto parse_named_seq;
8395                 }   }
8396                 num = atoi(RExC_parse);
8397                 if (isg && num == 0)
8398                     vFAIL("Reference to invalid group 0");
8399                 if (isrel) {
8400                     num = RExC_npar - num;
8401                     if (num < 1)
8402                         vFAIL("Reference to nonexistent or unclosed group");
8403                 }
8404                 if (!isg && num > 9 && num >= RExC_npar)
8405                     goto defchar;
8406                 else {
8407                     char * const parse_start = RExC_parse - 1; /* MJD */
8408                     while (isDIGIT(*RExC_parse))
8409                         RExC_parse++;
8410                     if (parse_start == RExC_parse - 1)
8411                         vFAIL("Unterminated \\g... pattern");
8412                     if (hasbrace) {
8413                         if (*RExC_parse != '}')
8414                             vFAIL("Unterminated \\g{...} pattern");
8415                         RExC_parse++;
8416                     }
8417                     if (!SIZE_ONLY) {
8418                         if (num > (I32)RExC_rx->nparens)
8419                             vFAIL("Reference to nonexistent group");
8420                     }
8421                     RExC_sawback = 1;
8422                     ret = reganode(pRExC_state,
8423                                    ((! FOLD)
8424                                      ? REF
8425                                      : (MORE_ASCII_RESTRICTED)
8426                                        ? REFFA
8427                                        : (AT_LEAST_UNI_SEMANTICS)
8428                                          ? REFFU
8429                                          : (LOC)
8430                                            ? REFFL
8431                                            : REFF),
8432                                     num);
8433                     *flagp |= HASWIDTH;
8434
8435                     /* override incorrect value set in reganode MJD */
8436                     Set_Node_Offset(ret, parse_start+1);
8437                     Set_Node_Cur_Length(ret); /* MJD */
8438                     RExC_parse--;
8439                     nextchar(pRExC_state);
8440                 }
8441             }
8442             break;
8443         case '\0':
8444             if (RExC_parse >= RExC_end)
8445                 FAIL("Trailing \\");
8446             /* FALL THROUGH */
8447         default:
8448             /* Do not generate "unrecognized" warnings here, we fall
8449                back into the quick-grab loop below */
8450             parse_start--;
8451             goto defchar;
8452         }
8453         break;
8454
8455     case '#':
8456         if (RExC_flags & RXf_PMf_EXTENDED) {
8457             if ( reg_skipcomment( pRExC_state ) )
8458                 goto tryagain;
8459         }
8460         /* FALL THROUGH */
8461
8462     default:
8463         outer_default:{
8464             register STRLEN len;
8465             register UV ender;
8466             register char *p;
8467             char *s;
8468             STRLEN foldlen;
8469             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8470             regnode * orig_emit;
8471
8472             parse_start = RExC_parse - 1;
8473
8474             RExC_parse++;
8475
8476         defchar:
8477             ender = 0;
8478             orig_emit = RExC_emit; /* Save the original output node position in
8479                                       case we need to output a different node
8480                                       type */
8481             ret = reg_node(pRExC_state,
8482                            (U8) ((! FOLD) ? EXACT
8483                                           : (LOC)
8484                                              ? EXACTFL
8485                                              : (MORE_ASCII_RESTRICTED)
8486                                                ? EXACTFA
8487                                                : (AT_LEAST_UNI_SEMANTICS)
8488                                                  ? EXACTFU
8489                                                  : EXACTF)
8490                     );
8491             s = STRING(ret);
8492             for (len = 0, p = RExC_parse - 1;
8493               len < 127 && p < RExC_end;
8494               len++)
8495             {
8496                 char * const oldp = p;
8497
8498                 if (RExC_flags & RXf_PMf_EXTENDED)
8499                     p = regwhite( pRExC_state, p );
8500                 switch ((U8)*p) {
8501                 case LATIN_SMALL_LETTER_SHARP_S:
8502                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8503                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8504                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8505                                 goto normal_default;
8506                 case '^':
8507                 case '$':
8508                 case '.':
8509                 case '[':
8510                 case '(':
8511                 case ')':
8512                 case '|':
8513                     goto loopdone;
8514                 case '\\':
8515                     /* Literal Escapes Switch
8516
8517                        This switch is meant to handle escape sequences that
8518                        resolve to a literal character.
8519
8520                        Every escape sequence that represents something
8521                        else, like an assertion or a char class, is handled
8522                        in the switch marked 'Special Escapes' above in this
8523                        routine, but also has an entry here as anything that
8524                        isn't explicitly mentioned here will be treated as
8525                        an unescaped equivalent literal.
8526                     */
8527
8528                     switch ((U8)*++p) {
8529                     /* These are all the special escapes. */
8530                     case LATIN_SMALL_LETTER_SHARP_S:
8531                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8532                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8533                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8534                                 goto normal_default;
8535                     case 'A':             /* Start assertion */
8536                     case 'b': case 'B':   /* Word-boundary assertion*/
8537                     case 'C':             /* Single char !DANGEROUS! */
8538                     case 'd': case 'D':   /* digit class */
8539                     case 'g': case 'G':   /* generic-backref, pos assertion */
8540                     case 'h': case 'H':   /* HORIZWS */
8541                     case 'k': case 'K':   /* named backref, keep marker */
8542                     case 'N':             /* named char sequence */
8543                     case 'p': case 'P':   /* Unicode property */
8544                               case 'R':   /* LNBREAK */
8545                     case 's': case 'S':   /* space class */
8546                     case 'v': case 'V':   /* VERTWS */
8547                     case 'w': case 'W':   /* word class */
8548                     case 'X':             /* eXtended Unicode "combining character sequence" */
8549                     case 'z': case 'Z':   /* End of line/string assertion */
8550                         --p;
8551                         goto loopdone;
8552
8553                     /* Anything after here is an escape that resolves to a
8554                        literal. (Except digits, which may or may not)
8555                      */
8556                     case 'n':
8557                         ender = '\n';
8558                         p++;
8559                         break;
8560                     case 'r':
8561                         ender = '\r';
8562                         p++;
8563                         break;
8564                     case 't':
8565                         ender = '\t';
8566                         p++;
8567                         break;
8568                     case 'f':
8569                         ender = '\f';
8570                         p++;
8571                         break;
8572                     case 'e':
8573                           ender = ASCII_TO_NATIVE('\033');
8574                         p++;
8575                         break;
8576                     case 'a':
8577                           ender = ASCII_TO_NATIVE('\007');
8578                         p++;
8579                         break;
8580                     case 'o':
8581                         {
8582                             STRLEN brace_len = len;
8583                             UV result;
8584                             const char* error_msg;
8585
8586                             bool valid = grok_bslash_o(p,
8587                                                        &result,
8588                                                        &brace_len,
8589                                                        &error_msg,
8590                                                        1);
8591                             p += brace_len;
8592                             if (! valid) {
8593                                 RExC_parse = p; /* going to die anyway; point
8594                                                    to exact spot of failure */
8595                                 vFAIL(error_msg);
8596                             }
8597                             else
8598                             {
8599                                 ender = result;
8600                             }
8601                             if (PL_encoding && ender < 0x100) {
8602                                 goto recode_encoding;
8603                             }
8604                             if (ender > 0xff) {
8605                                 REQUIRE_UTF8;
8606                             }
8607                             break;
8608                         }
8609                     case 'x':
8610                         if (*++p == '{') {
8611                             char* const e = strchr(p, '}');
8612
8613                             if (!e) {
8614                                 RExC_parse = p + 1;
8615                                 vFAIL("Missing right brace on \\x{}");
8616                             }
8617                             else {
8618                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8619                                     | PERL_SCAN_DISALLOW_PREFIX;
8620                                 STRLEN numlen = e - p - 1;
8621                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8622                                 if (ender > 0xff)
8623                                     REQUIRE_UTF8;
8624                                 p = e + 1;
8625                             }
8626                         }
8627                         else {
8628                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8629                             STRLEN numlen = 2;
8630                             ender = grok_hex(p, &numlen, &flags, NULL);
8631                             p += numlen;
8632                         }
8633                         if (PL_encoding && ender < 0x100)
8634                             goto recode_encoding;
8635                         break;
8636                     case 'c':
8637                         p++;
8638                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8639                         break;
8640                     case '0': case '1': case '2': case '3':case '4':
8641                     case '5': case '6': case '7': case '8':case '9':
8642                         if (*p == '0' ||
8643                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8644                         {
8645                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8646                             STRLEN numlen = 3;
8647                             ender = grok_oct(p, &numlen, &flags, NULL);
8648                             if (ender > 0xff) {
8649                                 REQUIRE_UTF8;
8650                             }
8651                             p += numlen;
8652                         }
8653                         else {
8654                             --p;
8655                             goto loopdone;
8656                         }
8657                         if (PL_encoding && ender < 0x100)
8658                             goto recode_encoding;
8659                         break;
8660                     recode_encoding:
8661                         {
8662                             SV* enc = PL_encoding;
8663                             ender = reg_recode((const char)(U8)ender, &enc);
8664                             if (!enc && SIZE_ONLY)
8665                                 ckWARNreg(p, "Invalid escape in the specified encoding");
8666                             REQUIRE_UTF8;
8667                         }
8668                         break;
8669                     case '\0':
8670                         if (p >= RExC_end)
8671                             FAIL("Trailing \\");
8672                         /* FALL THROUGH */
8673                     default:
8674                         if (!SIZE_ONLY&& isALPHA(*p)) {
8675                             /* Include any { following the alpha to emphasize
8676                              * that it could be part of an escape at some point
8677                              * in the future */
8678                             int len = (*(p + 1) == '{') ? 2 : 1;
8679                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8680                         }
8681                         goto normal_default;
8682                     }
8683                     break;
8684                 default:
8685                   normal_default:
8686                     if (UTF8_IS_START(*p) && UTF) {
8687                         STRLEN numlen;
8688                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8689                                                &numlen, UTF8_ALLOW_DEFAULT);
8690                         p += numlen;
8691                     }
8692                     else
8693                         ender = (U8) *p++;
8694                     break;
8695                 } /* End of switch on the literal */
8696
8697                 /* Certain characters are problematic because their folded
8698                  * length is so different from their original length that it
8699                  * isn't handleable by the optimizer.  They are therefore not
8700                  * placed in an EXACTish node; and are here handled specially.
8701                  * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8702                  * putting it in a special node keeps regexec from having to
8703                  * deal with a non-utf8 multi-char fold */
8704                 if (FOLD
8705                     && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
8706                     && is_TRICKYFOLD_cp(ender))
8707                 {
8708                     /* If is in middle of outputting characters into an
8709                      * EXACTish node, go output what we have so far, and
8710                      * position the parse so that this will be called again
8711                      * immediately */
8712                     if (len) {
8713                         p  = RExC_parse + len - 1;
8714                         goto loopdone;
8715                     }
8716                     else {
8717
8718                         /* Here we are ready to output our tricky fold
8719                          * character.  What's done is to pretend it's in a
8720                          * [bracketed] class, and let the code that deals with
8721                          * those handle it, as that code has all the
8722                          * intelligence necessary.  First save the current
8723                          * parse state, get rid of the already allocated EXACT
8724                          * node that the ANYOFV node will replace, and point
8725                          * the parse to a buffer which we fill with the
8726                          * character we want the regclass code to think is
8727                          * being parsed */
8728                         char* const oldregxend = RExC_end;
8729                         char tmpbuf[2];
8730                         RExC_emit = orig_emit;
8731                         RExC_parse = tmpbuf;
8732                         if (UTF) {
8733                             tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8734                             tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8735                             RExC_end = RExC_parse + 2;
8736                         }
8737                         else {
8738                             tmpbuf[0] = (char) ender;
8739                             RExC_end = RExC_parse + 1;
8740                         }
8741
8742                         ret = regclass(pRExC_state,depth+1);
8743
8744                         /* Here, have parsed the buffer.  Reset the parse to
8745                          * the actual input, and return */
8746                         RExC_end = oldregxend;
8747                         RExC_parse = p - 1;
8748
8749                         Set_Node_Offset(ret, RExC_parse);
8750                         Set_Node_Cur_Length(ret);
8751                         nextchar(pRExC_state);
8752                         *flagp |= HASWIDTH|SIMPLE;
8753                         return ret;
8754                     }
8755                 }
8756
8757                 if ( RExC_flags & RXf_PMf_EXTENDED)
8758                     p = regwhite( pRExC_state, p );
8759                 if (UTF && FOLD) {
8760                     /* Prime the casefolded buffer.  Locale rules, which apply
8761                      * only to code points < 256, aren't known until execution,
8762                      * so for them, just output the original character using
8763                      * utf8 */
8764                     if (LOC && ender < 256) {
8765                         if (UNI_IS_INVARIANT(ender)) {
8766                             *tmpbuf = (U8) ender;
8767                             foldlen = 1;
8768                         } else {
8769                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8770                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8771                             foldlen = 2;
8772                         }
8773                     }
8774                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
8775                                                  */
8776                         ender = toLOWER(ender);
8777                         *tmpbuf = (U8) ender;
8778                         foldlen = 1;
8779                     }
8780                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8781
8782                         /* Locale and /aa require more selectivity about the
8783                          * fold, so are handled below.  Otherwise, here, just
8784                          * use the fold */
8785                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8786                     }
8787                     else {
8788                         /* Under locale rules or /aa we are not to mix,
8789                          * respectively, ords < 256 or ASCII with non-.  So
8790                          * reject folds that mix them, using only the
8791                          * non-folded code point.  So do the fold to a
8792                          * temporary, and inspect each character in it. */
8793                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8794                         U8* s = trialbuf;
8795                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8796                         U8* e = s + foldlen;
8797                         bool fold_ok = TRUE;
8798
8799                         while (s < e) {
8800                             if (isASCII(*s)
8801                                 || (LOC && (UTF8_IS_INVARIANT(*s)
8802                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
8803                             {
8804                                 fold_ok = FALSE;
8805                                 break;
8806                             }
8807                             s += UTF8SKIP(s);
8808                         }
8809                         if (fold_ok) {
8810                             Copy(trialbuf, tmpbuf, foldlen, U8);
8811                             ender = tmpender;
8812                         }
8813                         else {
8814                             uvuni_to_utf8(tmpbuf, ender);
8815                             foldlen = UNISKIP(ender);
8816                         }
8817                     }
8818                 }
8819                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8820                     if (len)
8821                         p = oldp;
8822                     else if (UTF) {
8823                          if (FOLD) {
8824                               /* Emit all the Unicode characters. */
8825                               STRLEN numlen;
8826                               for (foldbuf = tmpbuf;
8827                                    foldlen;
8828                                    foldlen -= numlen) {
8829                                    ender = utf8_to_uvchr(foldbuf, &numlen);
8830                                    if (numlen > 0) {
8831                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
8832                                         s       += unilen;
8833                                         len     += unilen;
8834                                         /* In EBCDIC the numlen
8835                                          * and unilen can differ. */
8836                                         foldbuf += numlen;
8837                                         if (numlen >= foldlen)
8838                                              break;
8839                                    }
8840                                    else
8841                                         break; /* "Can't happen." */
8842                               }
8843                          }
8844                          else {
8845                               const STRLEN unilen = reguni(pRExC_state, ender, s);
8846                               if (unilen > 0) {
8847                                    s   += unilen;
8848                                    len += unilen;
8849                               }
8850                          }
8851                     }
8852                     else {
8853                         len++;
8854                         REGC((char)ender, s++);
8855                     }
8856                     break;
8857                 }
8858                 if (UTF) {
8859                      if (FOLD) {
8860                           /* Emit all the Unicode characters. */
8861                           STRLEN numlen;
8862                           for (foldbuf = tmpbuf;
8863                                foldlen;
8864                                foldlen -= numlen) {
8865                                ender = utf8_to_uvchr(foldbuf, &numlen);
8866                                if (numlen > 0) {
8867                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
8868                                     len     += unilen;
8869                                     s       += unilen;
8870                                     /* In EBCDIC the numlen
8871                                      * and unilen can differ. */
8872                                     foldbuf += numlen;
8873                                     if (numlen >= foldlen)
8874                                          break;
8875                                }
8876                                else
8877                                     break;
8878                           }
8879                      }
8880                      else {
8881                           const STRLEN unilen = reguni(pRExC_state, ender, s);
8882                           if (unilen > 0) {
8883                                s   += unilen;
8884                                len += unilen;
8885                           }
8886                      }
8887                      len--;
8888                 }
8889                 else
8890                     REGC((char)ender, s++);
8891             }
8892         loopdone:   /* Jumped to when encounters something that shouldn't be in
8893                        the node */
8894             RExC_parse = p - 1;
8895             Set_Node_Cur_Length(ret); /* MJD */
8896             nextchar(pRExC_state);
8897             {
8898                 /* len is STRLEN which is unsigned, need to copy to signed */
8899                 IV iv = len;
8900                 if (iv < 0)
8901                     vFAIL("Internal disaster");
8902             }
8903             if (len > 0)
8904                 *flagp |= HASWIDTH;
8905             if (len == 1 && UNI_IS_INVARIANT(ender))
8906                 *flagp |= SIMPLE;
8907
8908             if (SIZE_ONLY)
8909                 RExC_size += STR_SZ(len);
8910             else {
8911                 STR_LEN(ret) = len;
8912                 RExC_emit += STR_SZ(len);
8913             }
8914         }
8915         break;
8916     }
8917
8918     return(ret);
8919
8920 /* Jumped to when an unrecognized character set is encountered */
8921 bad_charset:
8922     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8923     return(NULL);
8924 }
8925
8926 STATIC char *
8927 S_regwhite( RExC_state_t *pRExC_state, char *p )
8928 {
8929     const char *e = RExC_end;
8930
8931     PERL_ARGS_ASSERT_REGWHITE;
8932
8933     while (p < e) {
8934         if (isSPACE(*p))
8935             ++p;
8936         else if (*p == '#') {
8937             bool ended = 0;
8938             do {
8939                 if (*p++ == '\n') {
8940                     ended = 1;
8941                     break;
8942                 }
8943             } while (p < e);
8944             if (!ended)
8945                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8946         }
8947         else
8948             break;
8949     }
8950     return p;
8951 }
8952
8953 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8954    Character classes ([:foo:]) can also be negated ([:^foo:]).
8955    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8956    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8957    but trigger failures because they are currently unimplemented. */
8958
8959 #define POSIXCC_DONE(c)   ((c) == ':')
8960 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8961 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8962
8963 STATIC I32
8964 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8965 {
8966     dVAR;
8967     I32 namedclass = OOB_NAMEDCLASS;
8968
8969     PERL_ARGS_ASSERT_REGPPOSIXCC;
8970
8971     if (value == '[' && RExC_parse + 1 < RExC_end &&
8972         /* I smell either [: or [= or [. -- POSIX has been here, right? */
8973         POSIXCC(UCHARAT(RExC_parse))) {
8974         const char c = UCHARAT(RExC_parse);
8975         char* const s = RExC_parse++;
8976
8977         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8978             RExC_parse++;
8979         if (RExC_parse == RExC_end)
8980             /* Grandfather lone [:, [=, [. */
8981             RExC_parse = s;
8982         else {
8983             const char* const t = RExC_parse++; /* skip over the c */
8984             assert(*t == c);
8985
8986             if (UCHARAT(RExC_parse) == ']') {
8987                 const char *posixcc = s + 1;
8988                 RExC_parse++; /* skip over the ending ] */
8989
8990                 if (*s == ':') {
8991                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8992                     const I32 skip = t - posixcc;
8993
8994                     /* Initially switch on the length of the name.  */
8995                     switch (skip) {
8996                     case 4:
8997                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8998                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8999                         break;
9000                     case 5:
9001                         /* Names all of length 5.  */
9002                         /* alnum alpha ascii blank cntrl digit graph lower
9003                            print punct space upper  */
9004                         /* Offset 4 gives the best switch position.  */
9005                         switch (posixcc[4]) {
9006                         case 'a':
9007                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9008                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9009                             break;
9010                         case 'e':
9011                             if (memEQ(posixcc, "spac", 4)) /* space */
9012                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9013                             break;
9014                         case 'h':
9015                             if (memEQ(posixcc, "grap", 4)) /* graph */
9016                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9017                             break;
9018                         case 'i':
9019                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9020                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9021                             break;
9022                         case 'k':
9023                             if (memEQ(posixcc, "blan", 4)) /* blank */
9024                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9025                             break;
9026                         case 'l':
9027                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9028                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9029                             break;
9030                         case 'm':
9031                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9032                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9033                             break;
9034                         case 'r':
9035                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9036                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9037                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9038                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9039                             break;
9040                         case 't':
9041                             if (memEQ(posixcc, "digi", 4)) /* digit */
9042                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9043                             else if (memEQ(posixcc, "prin", 4)) /* print */
9044                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9045                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9046                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9047                             break;
9048                         }
9049                         break;
9050                     case 6:
9051                         if (memEQ(posixcc, "xdigit", 6))
9052                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9053                         break;
9054                     }
9055
9056                     if (namedclass == OOB_NAMEDCLASS)
9057                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9058                                       t - s - 1, s + 1);
9059                     assert (posixcc[skip] == ':');
9060                     assert (posixcc[skip+1] == ']');
9061                 } else if (!SIZE_ONLY) {
9062                     /* [[=foo=]] and [[.foo.]] are still future. */
9063
9064                     /* adjust RExC_parse so the warning shows after
9065                        the class closes */
9066                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9067                         RExC_parse++;
9068                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9069                 }
9070             } else {
9071                 /* Maternal grandfather:
9072                  * "[:" ending in ":" but not in ":]" */
9073                 RExC_parse = s;
9074             }
9075         }
9076     }
9077
9078     return namedclass;
9079 }
9080
9081 STATIC void
9082 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9083 {
9084     dVAR;
9085
9086     PERL_ARGS_ASSERT_CHECKPOSIXCC;
9087
9088     if (POSIXCC(UCHARAT(RExC_parse))) {
9089         const char *s = RExC_parse;
9090         const char  c = *s++;
9091
9092         while (isALNUM(*s))
9093             s++;
9094         if (*s && c == *s && s[1] == ']') {
9095             ckWARN3reg(s+2,
9096                        "POSIX syntax [%c %c] belongs inside character classes",
9097                        c, c);
9098
9099             /* [[=foo=]] and [[.foo.]] are still future. */
9100             if (POSIXCC_NOTYET(c)) {
9101                 /* adjust RExC_parse so the error shows after
9102                    the class closes */
9103                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9104                     NOOP;
9105                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9106             }
9107         }
9108     }
9109 }
9110
9111 /* No locale test, and always Unicode semantics */
9112 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
9113 ANYOF_##NAME:                                                                  \
9114         for (value = 0; value < 256; value++)                                  \
9115             if (TEST)                                                          \
9116             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);  \
9117     yesno = '+';                                                               \
9118     what = WORD;                                                               \
9119     break;                                                                     \
9120 case ANYOF_N##NAME:                                                            \
9121         for (value = 0; value < 256; value++)                                  \
9122             if (!TEST)                                                         \
9123             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);  \
9124     yesno = '!';                                                               \
9125     what = WORD;                                                               \
9126     break
9127
9128 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9129  * there are two tests passed in, to use depending on that. There aren't any
9130  * cases where the label is different from the name, so no need for that
9131  * parameter */
9132 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
9133 ANYOF_##NAME:                                                                  \
9134     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
9135     else if (UNI_SEMANTICS) {                                                  \
9136         for (value = 0; value < 256; value++) {                                \
9137             if (TEST_8(value)) stored +=                                       \
9138                       set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);  \
9139         }                                                                      \
9140     }                                                                          \
9141     else {                                                                     \
9142         for (value = 0; value < 128; value++) {                                \
9143             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
9144                 set_regclass_bit(pRExC_state, ret,                     \
9145                                    (U8) UNI_TO_NATIVE(value), &nonbitmap);                 \
9146         }                                                                      \
9147     }                                                                          \
9148     yesno = '+';                                                               \
9149     what = WORD;                                                               \
9150     break;                                                                     \
9151 case ANYOF_N##NAME:                                                            \
9152     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
9153     else if (UNI_SEMANTICS) {                                                  \
9154         for (value = 0; value < 256; value++) {                                \
9155             if (! TEST_8(value)) stored +=                                     \
9156                     set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);    \
9157         }                                                                      \
9158     }                                                                          \
9159     else {                                                                     \
9160         for (value = 0; value < 128; value++) {                                \
9161             if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
9162                         pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap);    \
9163         }                                                                      \
9164         if (AT_LEAST_ASCII_RESTRICTED) {                                       \
9165             for (value = 128; value < 256; value++) {                          \
9166              stored += set_regclass_bit(                                     \
9167                            pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
9168             }                                                                  \
9169             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8;                  \
9170         }                                                                      \
9171         else {                                                                 \
9172             /* For a non-ut8 target string with DEPENDS semantics, all above   \
9173              * ASCII Latin1 code points match the complement of any of the     \
9174              * classes.  But in utf8, they have their Unicode semantics, so    \
9175              * can't just set them in the bitmap, or else regexec.c will think \
9176              * they matched when they shouldn't. */                            \
9177             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;          \
9178         }                                                                      \
9179     }                                                                          \
9180     yesno = '!';                                                               \
9181     what = WORD;                                                               \
9182     break
9183
9184 /*
9185    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
9186    so that it is possible to override the option here without having to
9187    rebuild the entire core. as we are required to do if we change regcomp.h
9188    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
9189 */
9190 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
9191 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
9192 #endif
9193
9194 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
9195 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
9196 #else
9197 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
9198 #endif
9199
9200 STATIC U8
9201 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
9202 {
9203
9204     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9205      * Locale folding is done at run-time, so this function should not be
9206      * called for nodes that are for locales.
9207      *
9208      * This function simply sets the bit corresponding to the fold of the input
9209      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
9210      * 'F' is 'f'.
9211      *
9212      * It also sets any necessary flags, and returns the number of bits that
9213      * actually changed from 0 to 1 */
9214
9215     U8 stored = 0;
9216     U8 fold;
9217
9218     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9219
9220     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9221                                     : PL_fold[value];
9222
9223     /* It assumes the bit for 'value' has already been set */
9224     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9225         ANYOF_BITMAP_SET(node, fold);
9226         stored++;
9227     }
9228     if ((_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED))
9229         || (! UNI_SEMANTICS
9230             && ! isASCII(value)
9231             && PL_fold_latin1[value] != value))
9232     {   /* A character that has a fold outside of Latin1 matches outside the
9233            bitmap, but only when the target string is utf8.  Similarly when we
9234            don't have unicode semantics for the above ASCII Latin-1 characters,
9235            and they have a fold, they should match if the target is utf8, and
9236            not otherwise */
9237         if (! *nonbitmap_ptr) {
9238             *nonbitmap_ptr = _new_invlist(2);
9239         }
9240         *nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value);
9241         ANYOF_FLAGS(node) |= ANYOF_UTF8;
9242     }
9243
9244     return stored;
9245 }
9246
9247
9248 PERL_STATIC_INLINE U8
9249 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
9250 {
9251     /* This inline function sets a bit in the bitmap if not already set, and if
9252      * appropriate, its fold, returning the number of bits that actually
9253      * changed from 0 to 1 */
9254
9255     U8 stored;
9256
9257     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9258
9259     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
9260         return 0;
9261     }
9262
9263     ANYOF_BITMAP_SET(node, value);
9264     stored = 1;
9265
9266     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
9267         stored += set_regclass_bit_fold(pRExC_state, node, value, nonbitmap_ptr);
9268     }
9269
9270     return stored;
9271 }
9272
9273 /*
9274    parse a class specification and produce either an ANYOF node that
9275    matches the pattern or perhaps will be optimized into an EXACTish node
9276    instead. The node contains a bit map for the first 256 characters, with the
9277    corresponding bit set if that character is in the list.  For characters
9278    above 255, a range list is used */
9279
9280 STATIC regnode *
9281 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9282 {
9283     dVAR;
9284     register UV nextvalue;
9285     register IV prevvalue = OOB_UNICODE;
9286     register IV range = 0;
9287     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9288     register regnode *ret;
9289     STRLEN numlen;
9290     IV namedclass;
9291     char *rangebegin = NULL;
9292     bool need_class = 0;
9293     SV *listsv = NULL;
9294     UV n;
9295     HV* nonbitmap = NULL;
9296     AV* unicode_alternate  = NULL;
9297 #ifdef EBCDIC
9298     UV literal_endpoint = 0;
9299 #endif
9300     UV stored = 0;  /* how many chars stored in the bitmap */
9301
9302     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9303         case we need to change the emitted regop to an EXACT. */
9304     const char * orig_parse = RExC_parse;
9305     GET_RE_DEBUG_FLAGS_DECL;
9306
9307     PERL_ARGS_ASSERT_REGCLASS;
9308 #ifndef DEBUGGING
9309     PERL_UNUSED_ARG(depth);
9310 #endif
9311
9312     DEBUG_PARSE("clas");
9313
9314     /* Assume we are going to generate an ANYOF node. */
9315     ret = reganode(pRExC_state, ANYOF, 0);
9316
9317
9318     if (!SIZE_ONLY) {
9319         ANYOF_FLAGS(ret) = 0;
9320     }
9321
9322     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
9323         RExC_naughty++;
9324         RExC_parse++;
9325         if (!SIZE_ONLY)
9326             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9327     }
9328
9329     if (SIZE_ONLY) {
9330         RExC_size += ANYOF_SKIP;
9331 #ifdef ANYOF_ADD_LOC_SKIP
9332         if (LOC) {
9333             RExC_size += ANYOF_ADD_LOC_SKIP;
9334         }
9335 #endif
9336         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9337     }
9338     else {
9339         RExC_emit += ANYOF_SKIP;
9340         if (LOC) {
9341             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9342 #ifdef ANYOF_ADD_LOC_SKIP
9343             RExC_emit += ANYOF_ADD_LOC_SKIP;
9344 #endif
9345         }
9346         ANYOF_BITMAP_ZERO(ret);
9347         listsv = newSVpvs("# comment\n");
9348     }
9349
9350     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9351
9352     if (!SIZE_ONLY && POSIXCC(nextvalue))
9353         checkposixcc(pRExC_state);
9354
9355     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9356     if (UCHARAT(RExC_parse) == ']')
9357         goto charclassloop;
9358
9359 parseit:
9360     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9361
9362     charclassloop:
9363
9364         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9365
9366         if (!range)
9367             rangebegin = RExC_parse;
9368         if (UTF) {
9369             value = utf8n_to_uvchr((U8*)RExC_parse,
9370                                    RExC_end - RExC_parse,
9371                                    &numlen, UTF8_ALLOW_DEFAULT);
9372             RExC_parse += numlen;
9373         }
9374         else
9375             value = UCHARAT(RExC_parse++);
9376
9377         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9378         if (value == '[' && POSIXCC(nextvalue))
9379             namedclass = regpposixcc(pRExC_state, value);
9380         else if (value == '\\') {
9381             if (UTF) {
9382                 value = utf8n_to_uvchr((U8*)RExC_parse,
9383                                    RExC_end - RExC_parse,
9384                                    &numlen, UTF8_ALLOW_DEFAULT);
9385                 RExC_parse += numlen;
9386             }
9387             else
9388                 value = UCHARAT(RExC_parse++);
9389             /* Some compilers cannot handle switching on 64-bit integer
9390              * values, therefore value cannot be an UV.  Yes, this will
9391              * be a problem later if we want switch on Unicode.
9392              * A similar issue a little bit later when switching on
9393              * namedclass. --jhi */
9394             switch ((I32)value) {
9395             case 'w':   namedclass = ANYOF_ALNUM;       break;
9396             case 'W':   namedclass = ANYOF_NALNUM;      break;
9397             case 's':   namedclass = ANYOF_SPACE;       break;
9398             case 'S':   namedclass = ANYOF_NSPACE;      break;
9399             case 'd':   namedclass = ANYOF_DIGIT;       break;
9400             case 'D':   namedclass = ANYOF_NDIGIT;      break;
9401             case 'v':   namedclass = ANYOF_VERTWS;      break;
9402             case 'V':   namedclass = ANYOF_NVERTWS;     break;
9403             case 'h':   namedclass = ANYOF_HORIZWS;     break;
9404             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
9405             case 'N':  /* Handle \N{NAME} in class */
9406                 {
9407                     /* We only pay attention to the first char of
9408                     multichar strings being returned. I kinda wonder
9409                     if this makes sense as it does change the behaviour
9410                     from earlier versions, OTOH that behaviour was broken
9411                     as well. */
9412                     UV v; /* value is register so we cant & it /grrr */
9413                     if (reg_namedseq(pRExC_state, &v, NULL)) {
9414                         goto parseit;
9415                     }
9416                     value= v;
9417                 }
9418                 break;
9419             case 'p':
9420             case 'P':
9421                 {
9422                 char *e;
9423                 if (RExC_parse >= RExC_end)
9424                     vFAIL2("Empty \\%c{}", (U8)value);
9425                 if (*RExC_parse == '{') {
9426                     const U8 c = (U8)value;
9427                     e = strchr(RExC_parse++, '}');
9428                     if (!e)
9429                         vFAIL2("Missing right brace on \\%c{}", c);
9430                     while (isSPACE(UCHARAT(RExC_parse)))
9431                         RExC_parse++;
9432                     if (e == RExC_parse)
9433                         vFAIL2("Empty \\%c{}", c);
9434                     n = e - RExC_parse;
9435                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9436                         n--;
9437                 }
9438                 else {
9439                     e = RExC_parse;
9440                     n = 1;
9441                 }
9442                 if (!SIZE_ONLY) {
9443                     if (UCHARAT(RExC_parse) == '^') {
9444                          RExC_parse++;
9445                          n--;
9446                          value = value == 'p' ? 'P' : 'p'; /* toggle */
9447                          while (isSPACE(UCHARAT(RExC_parse))) {
9448                               RExC_parse++;
9449                               n--;
9450                          }
9451                     }
9452
9453                     /* Add the property name to the list.  If /i matching, give
9454                      * a different name which consists of the normal name
9455                      * sandwiched between two underscores and '_i'.  The design
9456                      * is discussed in the commit message for this. */
9457                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9458                                         (value=='p' ? '+' : '!'),
9459                                         (FOLD) ? "__" : "",
9460                                         (int)n,
9461                                         RExC_parse,
9462                                         (FOLD) ? "_i" : ""
9463                                     );
9464                 }
9465                 RExC_parse = e + 1;
9466
9467                 /* The \p could match something in the Latin1 range, hence
9468                  * something that isn't utf8 */
9469                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
9470                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
9471
9472                 /* \p means they want Unicode semantics */
9473                 RExC_uni_semantics = 1;
9474                 }
9475                 break;
9476             case 'n':   value = '\n';                   break;
9477             case 'r':   value = '\r';                   break;
9478             case 't':   value = '\t';                   break;
9479             case 'f':   value = '\f';                   break;
9480             case 'b':   value = '\b';                   break;
9481             case 'e':   value = ASCII_TO_NATIVE('\033');break;
9482             case 'a':   value = ASCII_TO_NATIVE('\007');break;
9483             case 'o':
9484                 RExC_parse--;   /* function expects to be pointed at the 'o' */
9485                 {
9486                     const char* error_msg;
9487                     bool valid = grok_bslash_o(RExC_parse,
9488                                                &value,
9489                                                &numlen,
9490                                                &error_msg,
9491                                                SIZE_ONLY);
9492                     RExC_parse += numlen;
9493                     if (! valid) {
9494                         vFAIL(error_msg);
9495                     }
9496                 }
9497                 if (PL_encoding && value < 0x100) {
9498                     goto recode_encoding;
9499                 }
9500                 break;
9501             case 'x':
9502                 if (*RExC_parse == '{') {
9503                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9504                         | PERL_SCAN_DISALLOW_PREFIX;
9505                     char * const e = strchr(RExC_parse++, '}');
9506                     if (!e)
9507                         vFAIL("Missing right brace on \\x{}");
9508
9509                     numlen = e - RExC_parse;
9510                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9511                     RExC_parse = e + 1;
9512                 }
9513                 else {
9514                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9515                     numlen = 2;
9516                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9517                     RExC_parse += numlen;
9518                 }
9519                 if (PL_encoding && value < 0x100)
9520                     goto recode_encoding;
9521                 break;
9522             case 'c':
9523                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9524                 break;
9525             case '0': case '1': case '2': case '3': case '4':
9526             case '5': case '6': case '7':
9527                 {
9528                     /* Take 1-3 octal digits */
9529                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9530                     numlen = 3;
9531                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9532                     RExC_parse += numlen;
9533                     if (PL_encoding && value < 0x100)
9534                         goto recode_encoding;
9535                     break;
9536                 }
9537             recode_encoding:
9538                 {
9539                     SV* enc = PL_encoding;
9540                     value = reg_recode((const char)(U8)value, &enc);
9541                     if (!enc && SIZE_ONLY)
9542                         ckWARNreg(RExC_parse,
9543                                   "Invalid escape in the specified encoding");
9544                     break;
9545                 }
9546             default:
9547                 /* Allow \_ to not give an error */
9548                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9549                     ckWARN2reg(RExC_parse,
9550                                "Unrecognized escape \\%c in character class passed through",
9551                                (int)value);
9552                 }
9553                 break;
9554             }
9555         } /* end of \blah */
9556 #ifdef EBCDIC
9557         else
9558             literal_endpoint++;
9559 #endif
9560
9561         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9562
9563             /* What matches in a locale is not known until runtime, so need to
9564              * (one time per class) allocate extra space to pass to regexec.
9565              * The space will contain a bit for each named class that is to be
9566              * matched against.  This isn't needed for \p{} and pseudo-classes,
9567              * as they are not affected by locale, and hence are dealt with
9568              * separately */
9569             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9570                 need_class = 1;
9571                 if (SIZE_ONLY) {
9572 #ifdef ANYOF_CLASS_ADD_SKIP
9573                     RExC_size += ANYOF_CLASS_ADD_SKIP;
9574 #endif
9575                 }
9576                 else {
9577 #ifdef ANYOF_CLASS_ADD_SKIP
9578                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
9579 #endif
9580                     ANYOF_CLASS_ZERO(ret);
9581                 }
9582                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9583             }
9584
9585             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
9586              * literal */
9587             if (range) {
9588                 if (!SIZE_ONLY) {
9589                     const int w =
9590                         RExC_parse >= rangebegin ?
9591                         RExC_parse - rangebegin : 0;
9592                     ckWARN4reg(RExC_parse,
9593                                "False [] range \"%*.*s\"",
9594                                w, w, rangebegin);
9595
9596                     if (prevvalue < 256) {
9597                         stored +=
9598                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &nonbitmap);
9599                         stored +=
9600                          set_regclass_bit(pRExC_state, ret, '-', &nonbitmap);
9601                     }
9602                     else {
9603                         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9604                         Perl_sv_catpvf(aTHX_ listsv,
9605                            "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
9606                     }
9607                 }
9608
9609                 range = 0; /* this was not a true range */
9610             }
9611
9612
9613
9614             if (!SIZE_ONLY) {
9615                 const char *what = NULL;
9616                 char yesno = 0;
9617
9618                 /* Possible truncation here but in some 64-bit environments
9619                  * the compiler gets heartburn about switch on 64-bit values.
9620                  * A similar issue a little earlier when switching on value.
9621                  * --jhi */
9622                 switch ((I32)namedclass) {
9623
9624                 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9625                 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9626                 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9627                 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9628                 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9629                 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9630                 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9631                 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9632                 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9633                 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9634 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
9635                 /* \s, \w match all unicode if utf8. */
9636                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9637                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9638 #else
9639                 /* \s, \w match ascii and locale only */
9640                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace");
9641                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord");
9642 #endif
9643                 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9644                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9645                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9646                 case ANYOF_ASCII:
9647                     if (LOC)
9648                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9649                     else {
9650                         for (value = 0; value < 128; value++)
9651                             stored +=
9652                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
9653                     }
9654                     yesno = '+';
9655                     what = NULL;        /* Doesn't match outside ascii, so
9656                                            don't want to add +utf8:: */
9657                     break;
9658                 case ANYOF_NASCII:
9659                     if (LOC)
9660                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9661                     else {
9662                         for (value = 128; value < 256; value++)
9663                             stored +=
9664                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
9665                     }
9666                     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9667                     yesno = '!';
9668                     what = "ASCII";
9669                     break;
9670                 case ANYOF_DIGIT:
9671                     if (LOC)
9672                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9673                     else {
9674                         /* consecutive digits assumed */
9675                         for (value = '0'; value <= '9'; value++)
9676                             stored +=
9677                               set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
9678                     }
9679                     yesno = '+';
9680                     what = POSIX_CC_UNI_NAME("Digit");
9681                     break;
9682                 case ANYOF_NDIGIT:
9683                     if (LOC)
9684                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9685                     else {
9686                         /* consecutive digits assumed */
9687                         for (value = 0; value < '0'; value++)
9688                             stored +=
9689                               set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
9690                         for (value = '9' + 1; value < 256; value++)
9691                             stored +=
9692                               set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
9693                     }
9694                     yesno = '!';
9695                     what = POSIX_CC_UNI_NAME("Digit");
9696                     if (AT_LEAST_ASCII_RESTRICTED ) {
9697                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9698                     }
9699                     break;
9700                 case ANYOF_MAX:
9701                     /* this is to handle \p and \P */
9702                     break;
9703                 default:
9704                     vFAIL("Invalid [::] class");
9705                     break;
9706                 }
9707                 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9708                     /* Strings such as "+utf8::isWord\n" */
9709                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9710                     ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9711                 }
9712
9713                 continue;
9714             }
9715         } /* end of namedclass \blah */
9716
9717         if (range) {
9718             if (prevvalue > (IV)value) /* b-a */ {
9719                 const int w = RExC_parse - rangebegin;
9720                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9721                 range = 0; /* not a valid range */
9722             }
9723         }
9724         else {
9725             prevvalue = value; /* save the beginning of the range */
9726             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
9727                 RExC_parse[1] != ']') {
9728                 RExC_parse++;
9729
9730                 /* a bad range like \w-, [:word:]- ? */
9731                 if (namedclass > OOB_NAMEDCLASS) {
9732                     if (ckWARN(WARN_REGEXP)) {
9733                         const int w =
9734                             RExC_parse >= rangebegin ?
9735                             RExC_parse - rangebegin : 0;
9736                         vWARN4(RExC_parse,
9737                                "False [] range \"%*.*s\"",
9738                                w, w, rangebegin);
9739                     }
9740                     if (!SIZE_ONLY)
9741                         stored +=
9742                             set_regclass_bit(pRExC_state, ret, '-', &nonbitmap);
9743                 } else
9744                     range = 1;  /* yeah, it's a range! */
9745                 continue;       /* but do it the next time */
9746             }
9747         }
9748
9749         /* non-Latin1 code point implies unicode semantics.  Must be set in
9750          * pass1 so is there for the whole of pass 2 */
9751         if (value > 255) {
9752             RExC_uni_semantics = 1;
9753         }
9754
9755         /* now is the next time */
9756         if (!SIZE_ONLY) {
9757             if (prevvalue < 256) {
9758                 const IV ceilvalue = value < 256 ? value : 255;
9759                 IV i;
9760 #ifdef EBCDIC
9761                 /* In EBCDIC [\x89-\x91] should include
9762                  * the \x8e but [i-j] should not. */
9763                 if (literal_endpoint == 2 &&
9764                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9765                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9766                 {
9767                     if (isLOWER(prevvalue)) {
9768                         for (i = prevvalue; i <= ceilvalue; i++)
9769                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9770                                 stored +=
9771                                   set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
9772                             }
9773                     } else {
9774                         for (i = prevvalue; i <= ceilvalue; i++)
9775                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9776                                 stored +=
9777                                   set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
9778                             }
9779                     }
9780                 }
9781                 else
9782 #endif
9783                       for (i = prevvalue; i <= ceilvalue; i++) {
9784                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
9785                       }
9786           }
9787           if (value > 255) {
9788             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
9789             const UV natvalue      = NATIVE_TO_UNI(value);
9790             if (! nonbitmap) {
9791                 nonbitmap = _new_invlist(2);
9792             }
9793             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
9794             ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9795         }
9796 #if 0
9797
9798                 /* If the code point requires utf8 to represent, and we are not
9799                  * folding, it can't match unless the target is in utf8.  Only
9800                  * a few code points above 255 fold to below it, so XXX an
9801                  * optimization would be to know which ones and set the flag
9802                  * appropriately. */
9803                 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
9804                                     ? ANYOF_NONBITMAP
9805                                     : ANYOF_UTF8;
9806                 if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
9807
9808                     /* The \t sets the whole range */
9809                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
9810                                    prevnatvalue, natvalue);
9811
9812                     /* Currently, we don't look at every value in the range.
9813                      * Therefore we have to assume the worst case: that if
9814                      * folding, it will match more than one character.  But in
9815                      * lookbehind patterns, can only be single character
9816                      * length, so disallow those folds */
9817                     if (FOLD && ! RExC_in_lookbehind) {
9818                       OP(ret) = ANYOFV;
9819                     }
9820                 }
9821                 else if (prevnatvalue == natvalue) {
9822                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
9823                     if (FOLD) {
9824                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
9825                          STRLEN foldlen;
9826                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
9827
9828 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
9829                          if (RExC_precomp[0] == ':' &&
9830                              RExC_precomp[1] == '[' &&
9831                              (f == 0xDF || f == 0x92)) {
9832                              f = NATIVE_TO_UNI(f);
9833                         }
9834 #endif
9835                          /* If folding and foldable and a single
9836                           * character, insert also the folded version
9837                           * to the charclass. */
9838                          if (f != value) {
9839 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
9840                              if ((RExC_precomp[0] == ':' &&
9841                                   RExC_precomp[1] == '[' &&
9842                                   (f == 0xA2 &&
9843                                    (value == 0xFB05 || value == 0xFB06))) ?
9844                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
9845                                  foldlen == (STRLEN)UNISKIP(f) )
9846 #else
9847                               if (foldlen == (STRLEN)UNISKIP(f))
9848 #endif
9849                                   Perl_sv_catpvf(aTHX_ listsv,
9850                                                  "%04"UVxf"\n", f);
9851                               else if (! RExC_in_lookbehind) {
9852                                   /* Any multicharacter foldings
9853                                    * (disallowed in lookbehind patterns)
9854                                    * require the following transform:
9855                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
9856                                    * where E folds into "pq" and F folds
9857                                    * into "rst", all other characters
9858                                    * fold to single characters.  We save
9859                                    * away these multicharacter foldings,
9860                                    * to be later saved as part of the
9861                                    * additional "s" data. */
9862                                   SV *sv;
9863
9864                                   if (!unicode_alternate)
9865                                       unicode_alternate = newAV();
9866                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
9867                                                      TRUE);
9868                                   av_push(unicode_alternate, sv);
9869                                   OP(ret) = ANYOFV;
9870                               }
9871                          }
9872
9873                          /* If folding and the value is one of the Greek
9874                           * sigmas insert a few more sigmas to make the
9875                           * folding rules of the sigmas to work right.
9876                           * Note that not all the possible combinations
9877                           * are handled here: some of them are handled
9878                           * by the standard folding rules, and some of
9879                           * them (literal or EXACTF cases) are handled
9880                           * during runtime in regexec.c:S_find_byclass(). */
9881                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
9882                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9883                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
9884                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9885                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
9886                          }
9887                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
9888                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9889                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
9890                     }
9891                 }
9892             }
9893 #endif
9894 #ifdef EBCDIC
9895             literal_endpoint = 0;
9896 #endif
9897         }
9898
9899         range = 0; /* this range (if it was one) is done now */
9900     }
9901
9902
9903
9904     if (SIZE_ONLY)
9905         return ret;
9906     /****** !SIZE_ONLY AFTER HERE *********/
9907
9908     /* Finish up the non-bitmap entries */
9909     if (nonbitmap) {
9910         UV* nonbitmap_array;
9911         UV i;
9912
9913         /* If folding, we add to the list all characters that could fold to or
9914          * from the ones already on the list */
9915         if (FOLD) {
9916             HV* fold_intersection;
9917             UV* fold_list;
9918
9919             /* This is a list of all the characters that participate in folds
9920              * (except marks, etc in multi-char folds */
9921             if (! PL_utf8_foldable) {
9922                 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
9923                 PL_utf8_foldable = _swash_to_invlist(swash);
9924             }
9925
9926             /* This is a hash that for a particular fold gives all characters
9927              * that are involved in it */
9928             if (! PL_utf8_foldclosures) {
9929
9930                 /* If we were unable to find any folds, then we likely won't be
9931                  * able to find the closures.  So just create an empty list.
9932                  * Folding will effectively be restricted to the non-Unicode
9933                  * rules hard-coded into Perl.  (This case happens legitimately
9934                  * during compilation of Perl itself before the Unicode tables
9935                  * are generated) */
9936                 if (invlist_len(PL_utf8_foldable) == 0) {
9937                     PL_utf8_foldclosures = _new_invlist(0);
9938                 } else {
9939                     /* If the folds haven't been read in, call a fold function
9940                      * to force that */
9941                     if (! PL_utf8_tofold) {
9942                         U8 dummy[UTF8_MAXBYTES+1];
9943                         STRLEN dummy_len;
9944                         to_utf8_fold((U8*) "A", dummy, &dummy_len);
9945                     }
9946                     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9947                 }
9948             }
9949
9950             /* Only the characters in this class that participate in folds need
9951              * be checked.  Get the intersection of this class and all the
9952              * possible characters that are foldable.  This can quickly narrow
9953              * down a large class */
9954             fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
9955
9956             /* Now look at the foldable characters in this class individually */
9957             fold_list = invlist_array(fold_intersection);
9958             for (i = 0; i < invlist_len(fold_intersection); i++) {
9959                 UV j;
9960
9961                 /* The next entry is the beginning of the range that is in the
9962                  * class */
9963                 UV start = fold_list[i++];
9964
9965
9966                 /* The next entry is the beginning of the next range, which
9967                  * isn't in the class, so the end of the current range is one
9968                  * less than that */
9969                 UV end = fold_list[i] - 1;
9970
9971                 /* Look at every character in the range */
9972                 for (j = start; j <= end; j++) {
9973
9974                     /* Get its fold */
9975                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
9976                     STRLEN foldlen;
9977                     const UV f = to_uni_fold(j, foldbuf, &foldlen);
9978
9979                     if (foldlen > (STRLEN)UNISKIP(f)) {
9980
9981                         /* Any multicharacter foldings (disallowed in
9982                          * lookbehind patterns) require the following
9983                          * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
9984                          * E folds into "pq" and F folds into "rst", all other
9985                          * characters fold to single characters.  We save away
9986                          * these multicharacter foldings, to be later saved as
9987                          * part of the additional "s" data. */
9988                         if (! RExC_in_lookbehind) {
9989                             SV *sv;
9990                             U8* loc = foldbuf;
9991                             U8* e = foldbuf + foldlen;
9992
9993                             /* If any of the folded characters of this are in
9994                              * the Latin1 range, tell the regex engine that
9995                              * this can match a non-utf8 target string.  The
9996                              * only multi-byte fold whose source is in the
9997                              * Latin1 range (U+00DF) applies only when the
9998                              * target string is utf8, or under unicode rules */
9999                             if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10000                                 while (loc < e) {
10001
10002                                     /* Can't mix ascii with non- under /aa */
10003                                     if (MORE_ASCII_RESTRICTED
10004                                         && (isASCII(*loc) != isASCII(j)))
10005                                     {
10006                                         goto end_multi_fold;
10007                                     }
10008                                     if (UTF8_IS_INVARIANT(*loc)
10009                                         || UTF8_IS_DOWNGRADEABLE_START(*loc))
10010                                     {
10011                                         /* Can't mix above and below 256 under
10012                                          * LOC */
10013                                         if (LOC) {
10014                                             goto end_multi_fold;
10015                                         }
10016                                         ANYOF_FLAGS(ret)
10017                                                 |= ANYOF_NONBITMAP_NON_UTF8;
10018                                         break;
10019                                     }
10020                                     loc += UTF8SKIP(loc);
10021                                 }
10022                             }
10023                             ANYOF_FLAGS(ret) |= ANYOF_UTF8;
10024
10025                             if (!unicode_alternate) {
10026                                 unicode_alternate = newAV();
10027                             }
10028                             sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE);
10029                             av_push(unicode_alternate, sv);
10030
10031                             /* This node is variable length */
10032                             OP(ret) = ANYOFV;
10033                         end_multi_fold: ;
10034                         }
10035                     }
10036                     else {
10037                         /* Single character fold.  Add everything in its fold
10038                          * closure to the list that this node should match */
10039                         SV** listp;
10040
10041                         /* The fold closures data structure is a hash with the
10042                          * keys being every character that is folded to, like
10043                          * 'k', and the values each an array of everything that
10044                          * folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
10045                         if ((listp = hv_fetch(PL_utf8_foldclosures,
10046                                       (char *) foldbuf, foldlen, FALSE)))
10047                         {
10048                             AV* list = (AV*) *listp;
10049                             IV k;
10050                             for (k = 0; k <= av_len(list); k++) {
10051                                 SV** c_p = av_fetch(list, k, FALSE);
10052                                 UV c;
10053                                 if (c_p == NULL) {
10054                                     Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10055                                 }
10056                                 c = SvUV(*c_p);
10057
10058                                 /* /aa doesn't allow folds between ASCII and
10059                                  * non-; /l doesn't allow them between above
10060                                  * and below 256 */
10061                                 if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j)))
10062                                      || (LOC && ((c < 256) != (j < 256))))
10063                                 {
10064                                     continue;
10065                                 }
10066
10067                                 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10068                                     stored += set_regclass_bit(pRExC_state, ret, (U8) c, &nonbitmap);
10069                                 }
10070                                     /* It may be that the code point is already
10071                                      * in this range or already in the bitmap,
10072                                      * in which case we need do nothing */
10073                                 else if ((c < start || c > end)
10074                                          && (c > 255
10075                                              || ! ANYOF_BITMAP_TEST(ret, c)))
10076                                 {
10077                                     nonbitmap = add_range_to_invlist(nonbitmap, c, c);
10078                                 }
10079                             }
10080                         }
10081                     }
10082                 }
10083             }
10084             invlist_destroy(fold_intersection);
10085         } /* End of processing all the folds */
10086
10087         /*  Here have the full list of items to match that aren't in the
10088          *  bitmap.  Convert to the structure that the rest of the code is
10089          *  expecting.   XXX That rest of the code should convert to this
10090          *  structure */
10091         nonbitmap_array = invlist_array(nonbitmap);
10092         for (i = 0; i < invlist_len(nonbitmap); i++) {
10093
10094             /* The next entry is the beginning of the range that is in the
10095              * class */
10096             UV start = nonbitmap_array[i++];
10097
10098             /* The next entry is the beginning of the next range, which isn't
10099              * in the class, so the end of the current range is one less than
10100              * that */
10101             UV end = nonbitmap_array[i] - 1;
10102
10103             if (start == end) {
10104                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10105             }
10106             else {
10107                 /* The \t sets the whole range */
10108                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10109                         /* XXX EBCDIC */
10110                                    start, end);
10111             }
10112         }
10113         invlist_destroy(nonbitmap);
10114     }
10115
10116     /* Here, we have calculated what code points should be in the character
10117      * class.   Now we can see about various optimizations.  Fold calculation
10118      * needs to take place before inversion.  Otherwise /[^k]/i would invert to
10119      * include K, which under /i would match k. */
10120
10121     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
10122      * set the FOLD flag yet, so this this does optimize those.  It doesn't
10123      * optimize locale.  Doing so perhaps could be done as long as there is
10124      * nothing like \w in it; some thought also would have to be given to the
10125      * interaction with above 0x100 chars */
10126     if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
10127         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10128             ANYOF_BITMAP(ret)[value] ^= 0xFF;
10129         stored = 256 - stored;
10130
10131         /* The inversion means that everything above 255 is matched; and at the
10132          * same time we clear the invert flag */
10133         ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
10134     }
10135
10136     /* Folding in the bitmap is taken care of above, but not for locale (for
10137      * which we have to wait to see what folding is in effect at runtime), and
10138      * for things not in the bitmap.  Set run-time fold flag for these */
10139     if (FOLD && (LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
10140         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10141     }
10142
10143     /* A single character class can be "optimized" into an EXACTish node.
10144      * Note that since we don't currently count how many characters there are
10145      * outside the bitmap, we are XXX missing optimization possibilities for
10146      * them.  This optimization can't happen unless this is a truly single
10147      * character class, which means that it can't be an inversion into a
10148      * many-character class, and there must be no possibility of there being
10149      * things outside the bitmap.  'stored' (only) for locales doesn't include
10150      * \w, etc, so have to make a special test that they aren't present
10151      *
10152      * Similarly A 2-character class of the very special form like [bB] can be
10153      * optimized into an EXACTFish node, but only for non-locales, and for
10154      * characters which only have the two folds; so things like 'fF' and 'Ii'
10155      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10156      * FI'. */
10157     if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
10158         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10159                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10160             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10161                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10162                                  /* If the latest code point has a fold whose
10163                                   * bit is set, it must be the only other one */
10164                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10165                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10166     {
10167         /* Note that the information needed to decide to do this optimization
10168          * is not currently available until the 2nd pass, and that the actually
10169          * used EXACTish node takes less space than the calculated ANYOF node,
10170          * and hence the amount of space calculated in the first pass is larger
10171          * than actually used, so this optimization doesn't gain us any space.
10172          * But an EXACT node is faster than an ANYOF node, and can be combined
10173          * with any adjacent EXACT nodes later by the optimizer for further
10174          * gains.  The speed of executing an EXACTF is similar to an ANYOF
10175          * node, so the optimization advantage comes from the ability to join
10176          * it to adjacent EXACT nodes */
10177
10178         const char * cur_parse= RExC_parse;
10179         U8 op;
10180         RExC_emit = (regnode *)orig_emit;
10181         RExC_parse = (char *)orig_parse;
10182
10183         if (stored == 1) {
10184
10185             /* A locale node with one point can be folded; all the other cases
10186              * with folding will have two points, since we calculate them above
10187              */
10188             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10189                  op = EXACTFL;
10190             }
10191             else {
10192                 op = EXACT;
10193             }
10194         }   /* else 2 chars in the bit map: the folds of each other */
10195         else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10196
10197             /* To join adjacent nodes, they must be the exact EXACTish type.
10198              * Try to use the most likely type, by using EXACTFU if the regex
10199              * calls for them, or is required because the character is
10200              * non-ASCII */
10201             op = EXACTFU;
10202         }
10203         else {    /* Otherwise, more likely to be EXACTF type */
10204             op = EXACTF;
10205         }
10206
10207         ret = reg_node(pRExC_state, op);
10208         RExC_parse = (char *)cur_parse;
10209         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10210             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10211             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10212             STR_LEN(ret)= 2;
10213             RExC_emit += STR_SZ(2);
10214         }
10215         else {
10216             *STRING(ret)= (char)value;
10217             STR_LEN(ret)= 1;
10218             RExC_emit += STR_SZ(1);
10219         }
10220         SvREFCNT_dec(listsv);
10221         return ret;
10222     }
10223
10224     {
10225         AV * const av = newAV();
10226         SV *rv;
10227         /* The 0th element stores the character class description
10228          * in its textual form: used later (regexec.c:Perl_regclass_swash())
10229          * to initialize the appropriate swash (which gets stored in
10230          * the 1st element), and also useful for dumping the regnode.
10231          * The 2nd element stores the multicharacter foldings,
10232          * used later (regexec.c:S_reginclass()). */
10233         av_store(av, 0, listsv);
10234         av_store(av, 1, NULL);
10235         av_store(av, 2, MUTABLE_SV(unicode_alternate));
10236         rv = newRV_noinc(MUTABLE_SV(av));
10237         n = add_data(pRExC_state, 1, "s");
10238         RExC_rxi->data->data[n] = (void*)rv;
10239         ARG_SET(ret, n);
10240     }
10241     return ret;
10242 }
10243 #undef _C_C_T_
10244
10245
10246 /* reg_skipcomment()
10247
10248    Absorbs an /x style # comments from the input stream.
10249    Returns true if there is more text remaining in the stream.
10250    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10251    terminates the pattern without including a newline.
10252
10253    Note its the callers responsibility to ensure that we are
10254    actually in /x mode
10255
10256 */
10257
10258 STATIC bool
10259 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10260 {
10261     bool ended = 0;
10262
10263     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10264
10265     while (RExC_parse < RExC_end)
10266         if (*RExC_parse++ == '\n') {
10267             ended = 1;
10268             break;
10269         }
10270     if (!ended) {
10271         /* we ran off the end of the pattern without ending
10272            the comment, so we have to add an \n when wrapping */
10273         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10274         return 0;
10275     } else
10276         return 1;
10277 }
10278
10279 /* nextchar()
10280
10281    Advances the parse position, and optionally absorbs
10282    "whitespace" from the inputstream.
10283
10284    Without /x "whitespace" means (?#...) style comments only,
10285    with /x this means (?#...) and # comments and whitespace proper.
10286
10287    Returns the RExC_parse point from BEFORE the scan occurs.
10288
10289    This is the /x friendly way of saying RExC_parse++.
10290 */
10291
10292 STATIC char*
10293 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10294 {
10295     char* const retval = RExC_parse++;
10296
10297     PERL_ARGS_ASSERT_NEXTCHAR;
10298
10299     for (;;) {
10300         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10301                 RExC_parse[2] == '#') {
10302             while (*RExC_parse != ')') {
10303                 if (RExC_parse == RExC_end)
10304                     FAIL("Sequence (?#... not terminated");
10305                 RExC_parse++;
10306             }
10307             RExC_parse++;
10308             continue;
10309         }
10310         if (RExC_flags & RXf_PMf_EXTENDED) {
10311             if (isSPACE(*RExC_parse)) {
10312                 RExC_parse++;
10313                 continue;
10314             }
10315             else if (*RExC_parse == '#') {
10316                 if ( reg_skipcomment( pRExC_state ) )
10317                     continue;
10318             }
10319         }
10320         return retval;
10321     }
10322 }
10323
10324 /*
10325 - reg_node - emit a node
10326 */
10327 STATIC regnode *                        /* Location. */
10328 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10329 {
10330     dVAR;
10331     register regnode *ptr;
10332     regnode * const ret = RExC_emit;
10333     GET_RE_DEBUG_FLAGS_DECL;
10334
10335     PERL_ARGS_ASSERT_REG_NODE;
10336
10337     if (SIZE_ONLY) {
10338         SIZE_ALIGN(RExC_size);
10339         RExC_size += 1;
10340         return(ret);
10341     }
10342     if (RExC_emit >= RExC_emit_bound)
10343         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10344
10345     NODE_ALIGN_FILL(ret);
10346     ptr = ret;
10347     FILL_ADVANCE_NODE(ptr, op);
10348     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 1);
10349 #ifdef RE_TRACK_PATTERN_OFFSETS
10350     if (RExC_offsets) {         /* MJD */
10351         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10352               "reg_node", __LINE__,
10353               PL_reg_name[op],
10354               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10355                 ? "Overwriting end of array!\n" : "OK",
10356               (UV)(RExC_emit - RExC_emit_start),
10357               (UV)(RExC_parse - RExC_start),
10358               (UV)RExC_offsets[0]));
10359         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10360     }
10361 #endif
10362     RExC_emit = ptr;
10363     return(ret);
10364 }
10365
10366 /*
10367 - reganode - emit a node with an argument
10368 */
10369 STATIC regnode *                        /* Location. */
10370 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10371 {
10372     dVAR;
10373     register regnode *ptr;
10374     regnode * const ret = RExC_emit;
10375     GET_RE_DEBUG_FLAGS_DECL;
10376
10377     PERL_ARGS_ASSERT_REGANODE;
10378
10379     if (SIZE_ONLY) {
10380         SIZE_ALIGN(RExC_size);
10381         RExC_size += 2;
10382         /*
10383            We can't do this:
10384
10385            assert(2==regarglen[op]+1);
10386
10387            Anything larger than this has to allocate the extra amount.
10388            If we changed this to be:
10389
10390            RExC_size += (1 + regarglen[op]);
10391
10392            then it wouldn't matter. Its not clear what side effect
10393            might come from that so its not done so far.
10394            -- dmq
10395         */
10396         return(ret);
10397     }
10398     if (RExC_emit >= RExC_emit_bound)
10399         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10400
10401     NODE_ALIGN_FILL(ret);
10402     ptr = ret;
10403     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10404     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 2);
10405 #ifdef RE_TRACK_PATTERN_OFFSETS
10406     if (RExC_offsets) {         /* MJD */
10407         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10408               "reganode",
10409               __LINE__,
10410               PL_reg_name[op],
10411               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10412               "Overwriting end of array!\n" : "OK",
10413               (UV)(RExC_emit - RExC_emit_start),
10414               (UV)(RExC_parse - RExC_start),
10415               (UV)RExC_offsets[0]));
10416         Set_Cur_Node_Offset;
10417     }
10418 #endif
10419     RExC_emit = ptr;
10420     return(ret);
10421 }
10422
10423 /*
10424 - reguni - emit (if appropriate) a Unicode character
10425 */
10426 STATIC STRLEN
10427 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10428 {
10429     dVAR;
10430
10431     PERL_ARGS_ASSERT_REGUNI;
10432
10433     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10434 }
10435
10436 /*
10437 - reginsert - insert an operator in front of already-emitted operand
10438 *
10439 * Means relocating the operand.
10440 */
10441 STATIC void
10442 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10443 {
10444     dVAR;
10445     register regnode *src;
10446     register regnode *dst;
10447     register regnode *place;
10448     const int offset = regarglen[(U8)op];
10449     const int size = NODE_STEP_REGNODE + offset;
10450     GET_RE_DEBUG_FLAGS_DECL;
10451
10452     PERL_ARGS_ASSERT_REGINSERT;
10453     PERL_UNUSED_ARG(depth);
10454 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10455     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10456     if (SIZE_ONLY) {
10457         RExC_size += size;
10458         return;
10459     }
10460
10461     src = RExC_emit;
10462     RExC_emit += size;
10463     dst = RExC_emit;
10464     if (RExC_open_parens) {
10465         int paren;
10466         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10467         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10468             if ( RExC_open_parens[paren] >= opnd ) {
10469                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10470                 RExC_open_parens[paren] += size;
10471             } else {
10472                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10473             }
10474             if ( RExC_close_parens[paren] >= opnd ) {
10475                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10476                 RExC_close_parens[paren] += size;
10477             } else {
10478                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10479             }
10480         }
10481     }
10482
10483     while (src > opnd) {
10484         StructCopy(--src, --dst, regnode);
10485 #ifdef RE_TRACK_PATTERN_OFFSETS
10486         if (RExC_offsets) {     /* MJD 20010112 */
10487             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10488                   "reg_insert",
10489                   __LINE__,
10490                   PL_reg_name[op],
10491                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10492                     ? "Overwriting end of array!\n" : "OK",
10493                   (UV)(src - RExC_emit_start),
10494                   (UV)(dst - RExC_emit_start),
10495                   (UV)RExC_offsets[0]));
10496             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10497             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10498         }
10499 #endif
10500     }
10501
10502
10503     place = opnd;               /* Op node, where operand used to be. */
10504 #ifdef RE_TRACK_PATTERN_OFFSETS
10505     if (RExC_offsets) {         /* MJD */
10506         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10507               "reginsert",
10508               __LINE__,
10509               PL_reg_name[op],
10510               (UV)(place - RExC_emit_start) > RExC_offsets[0]
10511               ? "Overwriting end of array!\n" : "OK",
10512               (UV)(place - RExC_emit_start),
10513               (UV)(RExC_parse - RExC_start),
10514               (UV)RExC_offsets[0]));
10515         Set_Node_Offset(place, RExC_parse);
10516         Set_Node_Length(place, 1);
10517     }
10518 #endif
10519     src = NEXTOPER(place);
10520     FILL_ADVANCE_NODE(place, op);
10521     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (place) - 1);
10522     Zero(src, offset, regnode);
10523 }
10524
10525 /*
10526 - regtail - set the next-pointer at the end of a node chain of p to val.
10527 - SEE ALSO: regtail_study
10528 */
10529 /* TODO: All three parms should be const */
10530 STATIC void
10531 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10532 {
10533     dVAR;
10534     register regnode *scan;
10535     GET_RE_DEBUG_FLAGS_DECL;
10536
10537     PERL_ARGS_ASSERT_REGTAIL;
10538 #ifndef DEBUGGING
10539     PERL_UNUSED_ARG(depth);
10540 #endif
10541
10542     if (SIZE_ONLY)
10543         return;
10544
10545     /* Find last node. */
10546     scan = p;
10547     for (;;) {
10548         regnode * const temp = regnext(scan);
10549         DEBUG_PARSE_r({
10550             SV * const mysv=sv_newmortal();
10551             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10552             regprop(RExC_rx, mysv, scan);
10553             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10554                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10555                     (temp == NULL ? "->" : ""),
10556                     (temp == NULL ? PL_reg_name[OP(val)] : "")
10557             );
10558         });
10559         if (temp == NULL)
10560             break;
10561         scan = temp;
10562     }
10563
10564     if (reg_off_by_arg[OP(scan)]) {
10565         ARG_SET(scan, val - scan);
10566     }
10567     else {
10568         NEXT_OFF(scan) = val - scan;
10569     }
10570 }
10571
10572 #ifdef DEBUGGING
10573 /*
10574 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10575 - Look for optimizable sequences at the same time.
10576 - currently only looks for EXACT chains.
10577
10578 This is experimental code. The idea is to use this routine to perform
10579 in place optimizations on branches and groups as they are constructed,
10580 with the long term intention of removing optimization from study_chunk so
10581 that it is purely analytical.
10582
10583 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10584 to control which is which.
10585
10586 */
10587 /* TODO: All four parms should be const */
10588
10589 STATIC U8
10590 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10591 {
10592     dVAR;
10593     register regnode *scan;
10594     U8 exact = PSEUDO;
10595 #ifdef EXPERIMENTAL_INPLACESCAN
10596     I32 min = 0;
10597 #endif
10598     GET_RE_DEBUG_FLAGS_DECL;
10599
10600     PERL_ARGS_ASSERT_REGTAIL_STUDY;
10601
10602
10603     if (SIZE_ONLY)
10604         return exact;
10605
10606     /* Find last node. */
10607
10608     scan = p;
10609     for (;;) {
10610         regnode * const temp = regnext(scan);
10611 #ifdef EXPERIMENTAL_INPLACESCAN
10612         if (PL_regkind[OP(scan)] == EXACT)
10613             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10614                 return EXACT;
10615 #endif
10616         if ( exact ) {
10617             switch (OP(scan)) {
10618                 case EXACT:
10619                 case EXACTF:
10620                 case EXACTFA:
10621                 case EXACTFU:
10622                 case EXACTFL:
10623                         if( exact == PSEUDO )
10624                             exact= OP(scan);
10625                         else if ( exact != OP(scan) )
10626                             exact= 0;
10627                 case NOTHING:
10628                     break;
10629                 default:
10630                     exact= 0;
10631             }
10632         }
10633         DEBUG_PARSE_r({
10634             SV * const mysv=sv_newmortal();
10635             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10636             regprop(RExC_rx, mysv, scan);
10637             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10638                 SvPV_nolen_const(mysv),
10639                 REG_NODE_NUM(scan),
10640                 PL_reg_name[exact]);
10641         });
10642         if (temp == NULL)
10643             break;
10644         scan = temp;
10645     }
10646     DEBUG_PARSE_r({
10647         SV * const mysv_val=sv_newmortal();
10648         DEBUG_PARSE_MSG("");
10649         regprop(RExC_rx, mysv_val, val);
10650         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10651                       SvPV_nolen_const(mysv_val),
10652                       (IV)REG_NODE_NUM(val),
10653                       (IV)(val - scan)
10654         );
10655     });
10656     if (reg_off_by_arg[OP(scan)]) {
10657         ARG_SET(scan, val - scan);
10658     }
10659     else {
10660         NEXT_OFF(scan) = val - scan;
10661     }
10662
10663     return exact;
10664 }
10665 #endif
10666
10667 /*
10668  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10669  */
10670 #ifdef DEBUGGING
10671 static void
10672 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10673 {
10674     int bit;
10675     int set=0;
10676     regex_charset cs;
10677
10678     for (bit=0; bit<32; bit++) {
10679         if (flags & (1<<bit)) {
10680             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
10681                 continue;
10682             }
10683             if (!set++ && lead)
10684                 PerlIO_printf(Perl_debug_log, "%s",lead);
10685             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10686         }
10687     }
10688     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10689             if (!set++ && lead) {
10690                 PerlIO_printf(Perl_debug_log, "%s",lead);
10691             }
10692             switch (cs) {
10693                 case REGEX_UNICODE_CHARSET:
10694                     PerlIO_printf(Perl_debug_log, "UNICODE");
10695                     break;
10696                 case REGEX_LOCALE_CHARSET:
10697                     PerlIO_printf(Perl_debug_log, "LOCALE");
10698                     break;
10699                 case REGEX_ASCII_RESTRICTED_CHARSET:
10700                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10701                     break;
10702                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10703                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10704                     break;
10705                 default:
10706                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10707                     break;
10708             }
10709     }
10710     if (lead)  {
10711         if (set)
10712             PerlIO_printf(Perl_debug_log, "\n");
10713         else
10714             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10715     }
10716 }
10717 #endif
10718
10719 void
10720 Perl_regdump(pTHX_ const regexp *r)
10721 {
10722 #ifdef DEBUGGING
10723     dVAR;
10724     SV * const sv = sv_newmortal();
10725     SV *dsv= sv_newmortal();
10726     RXi_GET_DECL(r,ri);
10727     GET_RE_DEBUG_FLAGS_DECL;
10728
10729     PERL_ARGS_ASSERT_REGDUMP;
10730
10731     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10732
10733     /* Header fields of interest. */
10734     if (r->anchored_substr) {
10735         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10736             RE_SV_DUMPLEN(r->anchored_substr), 30);
10737         PerlIO_printf(Perl_debug_log,
10738                       "anchored %s%s at %"IVdf" ",
10739                       s, RE_SV_TAIL(r->anchored_substr),
10740                       (IV)r->anchored_offset);
10741     } else if (r->anchored_utf8) {
10742         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10743             RE_SV_DUMPLEN(r->anchored_utf8), 30);
10744         PerlIO_printf(Perl_debug_log,
10745                       "anchored utf8 %s%s at %"IVdf" ",
10746                       s, RE_SV_TAIL(r->anchored_utf8),
10747                       (IV)r->anchored_offset);
10748     }
10749     if (r->float_substr) {
10750         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10751             RE_SV_DUMPLEN(r->float_substr), 30);
10752         PerlIO_printf(Perl_debug_log,
10753                       "floating %s%s at %"IVdf"..%"UVuf" ",
10754                       s, RE_SV_TAIL(r->float_substr),
10755                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10756     } else if (r->float_utf8) {
10757         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10758             RE_SV_DUMPLEN(r->float_utf8), 30);
10759         PerlIO_printf(Perl_debug_log,
10760                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10761                       s, RE_SV_TAIL(r->float_utf8),
10762                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10763     }
10764     if (r->check_substr || r->check_utf8)
10765         PerlIO_printf(Perl_debug_log,
10766                       (const char *)
10767                       (r->check_substr == r->float_substr
10768                        && r->check_utf8 == r->float_utf8
10769                        ? "(checking floating" : "(checking anchored"));
10770     if (r->extflags & RXf_NOSCAN)
10771         PerlIO_printf(Perl_debug_log, " noscan");
10772     if (r->extflags & RXf_CHECK_ALL)
10773         PerlIO_printf(Perl_debug_log, " isall");
10774     if (r->check_substr || r->check_utf8)
10775         PerlIO_printf(Perl_debug_log, ") ");
10776
10777     if (ri->regstclass) {
10778         regprop(r, sv, ri->regstclass);
10779         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10780     }
10781     if (r->extflags & RXf_ANCH) {
10782         PerlIO_printf(Perl_debug_log, "anchored");
10783         if (r->extflags & RXf_ANCH_BOL)
10784             PerlIO_printf(Perl_debug_log, "(BOL)");
10785         if (r->extflags & RXf_ANCH_MBOL)
10786             PerlIO_printf(Perl_debug_log, "(MBOL)");
10787         if (r->extflags & RXf_ANCH_SBOL)
10788             PerlIO_printf(Perl_debug_log, "(SBOL)");
10789         if (r->extflags & RXf_ANCH_GPOS)
10790             PerlIO_printf(Perl_debug_log, "(GPOS)");
10791         PerlIO_putc(Perl_debug_log, ' ');
10792     }
10793     if (r->extflags & RXf_GPOS_SEEN)
10794         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10795     if (r->intflags & PREGf_SKIP)
10796         PerlIO_printf(Perl_debug_log, "plus ");
10797     if (r->intflags & PREGf_IMPLICIT)
10798         PerlIO_printf(Perl_debug_log, "implicit ");
10799     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10800     if (r->extflags & RXf_EVAL_SEEN)
10801         PerlIO_printf(Perl_debug_log, "with eval ");
10802     PerlIO_printf(Perl_debug_log, "\n");
10803     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10804 #else
10805     PERL_ARGS_ASSERT_REGDUMP;
10806     PERL_UNUSED_CONTEXT;
10807     PERL_UNUSED_ARG(r);
10808 #endif  /* DEBUGGING */
10809 }
10810
10811 /*
10812 - regprop - printable representation of opcode
10813 */
10814 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10815 STMT_START { \
10816         if (do_sep) {                           \
10817             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10818             if (flags & ANYOF_INVERT)           \
10819                 /*make sure the invert info is in each */ \
10820                 sv_catpvs(sv, "^");             \
10821             do_sep = 0;                         \
10822         }                                       \
10823 } STMT_END
10824
10825 void
10826 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10827 {
10828 #ifdef DEBUGGING
10829     dVAR;
10830     register int k;
10831     RXi_GET_DECL(prog,progi);
10832     GET_RE_DEBUG_FLAGS_DECL;
10833
10834     PERL_ARGS_ASSERT_REGPROP;
10835
10836     sv_setpvs(sv, "");
10837
10838     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
10839         /* It would be nice to FAIL() here, but this may be called from
10840            regexec.c, and it would be hard to supply pRExC_state. */
10841         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10842     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10843
10844     k = PL_regkind[OP(o)];
10845
10846     if (k == EXACT) {
10847         sv_catpvs(sv, " ");
10848         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10849          * is a crude hack but it may be the best for now since
10850          * we have no flag "this EXACTish node was UTF-8"
10851          * --jhi */
10852         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
10853                   PERL_PV_ESCAPE_UNI_DETECT |
10854                   PERL_PV_ESCAPE_NONASCII   |
10855                   PERL_PV_PRETTY_ELLIPSES   |
10856                   PERL_PV_PRETTY_LTGT       |
10857                   PERL_PV_PRETTY_NOCLEAR
10858                   );
10859     } else if (k == TRIE) {
10860         /* print the details of the trie in dumpuntil instead, as
10861          * progi->data isn't available here */
10862         const char op = OP(o);
10863         const U32 n = ARG(o);
10864         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
10865                (reg_ac_data *)progi->data->data[n] :
10866                NULL;
10867         const reg_trie_data * const trie
10868             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
10869
10870         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
10871         DEBUG_TRIE_COMPILE_r(
10872             Perl_sv_catpvf(aTHX_ sv,
10873                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
10874                 (UV)trie->startstate,
10875                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
10876                 (UV)trie->wordcount,
10877                 (UV)trie->minlen,
10878                 (UV)trie->maxlen,
10879                 (UV)TRIE_CHARCOUNT(trie),
10880                 (UV)trie->uniquecharcount
10881             )
10882         );
10883         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
10884             int i;
10885             int rangestart = -1;
10886             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
10887             sv_catpvs(sv, "[");
10888             for (i = 0; i <= 256; i++) {
10889                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
10890                     if (rangestart == -1)
10891                         rangestart = i;
10892                 } else if (rangestart != -1) {
10893                     if (i <= rangestart + 3)
10894                         for (; rangestart < i; rangestart++)
10895                             put_byte(sv, rangestart);
10896                     else {
10897                         put_byte(sv, rangestart);
10898                         sv_catpvs(sv, "-");
10899                         put_byte(sv, i - 1);
10900                     }
10901                     rangestart = -1;
10902                 }
10903             }
10904             sv_catpvs(sv, "]");
10905         }
10906
10907     } else if (k == CURLY) {
10908         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
10909             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
10910         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
10911     }
10912     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
10913         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
10914     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
10915         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
10916         if ( RXp_PAREN_NAMES(prog) ) {
10917             if ( k != REF || (OP(o) < NREF)) {
10918                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
10919                 SV **name= av_fetch(list, ARG(o), 0 );
10920                 if (name)
10921                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
10922             }
10923             else {
10924                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
10925                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
10926                 I32 *nums=(I32*)SvPVX(sv_dat);
10927                 SV **name= av_fetch(list, nums[0], 0 );
10928                 I32 n;
10929                 if (name) {
10930                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
10931                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
10932                                     (n ? "," : ""), (IV)nums[n]);
10933                     }
10934                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
10935                 }
10936             }
10937         }
10938     } else if (k == GOSUB)
10939         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
10940     else if (k == VERB) {
10941         if (!o->flags)
10942             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
10943                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
10944     } else if (k == LOGICAL)
10945         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
10946     else if (k == FOLDCHAR)
10947         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
10948     else if (k == ANYOF) {
10949         int i, rangestart = -1;
10950         const U8 flags = ANYOF_FLAGS(o);
10951         int do_sep = 0;
10952
10953         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
10954         static const char * const anyofs[] = {
10955             "\\w",
10956             "\\W",
10957             "\\s",
10958             "\\S",
10959             "\\d",
10960             "\\D",
10961             "[:alnum:]",
10962             "[:^alnum:]",
10963             "[:alpha:]",
10964             "[:^alpha:]",
10965             "[:ascii:]",
10966             "[:^ascii:]",
10967             "[:cntrl:]",
10968             "[:^cntrl:]",
10969             "[:graph:]",
10970             "[:^graph:]",
10971             "[:lower:]",
10972             "[:^lower:]",
10973             "[:print:]",
10974             "[:^print:]",
10975             "[:punct:]",
10976             "[:^punct:]",
10977             "[:upper:]",
10978             "[:^upper:]",
10979             "[:xdigit:]",
10980             "[:^xdigit:]",
10981             "[:space:]",
10982             "[:^space:]",
10983             "[:blank:]",
10984             "[:^blank:]"
10985         };
10986
10987         if (flags & ANYOF_LOCALE)
10988             sv_catpvs(sv, "{loc}");
10989         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
10990             sv_catpvs(sv, "{i}");
10991         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
10992         if (flags & ANYOF_INVERT)
10993             sv_catpvs(sv, "^");
10994
10995         /* output what the standard cp 0-255 bitmap matches */
10996         for (i = 0; i <= 256; i++) {
10997             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
10998                 if (rangestart == -1)
10999                     rangestart = i;
11000             } else if (rangestart != -1) {
11001                 if (i <= rangestart + 3)
11002                     for (; rangestart < i; rangestart++)
11003                         put_byte(sv, rangestart);
11004                 else {
11005                     put_byte(sv, rangestart);
11006                     sv_catpvs(sv, "-");
11007                     put_byte(sv, i - 1);
11008                 }
11009                 do_sep = 1;
11010                 rangestart = -1;
11011             }
11012         }
11013
11014         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11015         /* output any special charclass tests (used entirely under use locale) */
11016         if (ANYOF_CLASS_TEST_ANY_SET(o))
11017             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11018                 if (ANYOF_CLASS_TEST(o,i)) {
11019                     sv_catpv(sv, anyofs[i]);
11020                     do_sep = 1;
11021                 }
11022
11023         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11024
11025         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11026             sv_catpvs(sv, "{non-utf8-latin1-all}");
11027         }
11028
11029         /* output information about the unicode matching */
11030         if (flags & ANYOF_UNICODE_ALL)
11031             sv_catpvs(sv, "{unicode_all}");
11032         else if (flags & ANYOF_UTF8)
11033             sv_catpvs(sv, "{unicode}");
11034         if (flags & ANYOF_NONBITMAP_NON_UTF8)
11035             sv_catpvs(sv, "{outside bitmap}");
11036
11037         {
11038             SV *lv;
11039             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11040
11041             if (lv) {
11042                 if (sw) {
11043                     U8 s[UTF8_MAXBYTES_CASE+1];
11044
11045                     for (i = 0; i <= 256; i++) { /* just the first 256 */
11046                         uvchr_to_utf8(s, i);
11047
11048                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
11049                             if (rangestart == -1)
11050                                 rangestart = i;
11051                         } else if (rangestart != -1) {
11052                             if (i <= rangestart + 3)
11053                                 for (; rangestart < i; rangestart++) {
11054                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
11055                                     U8 *p;
11056                                     for(p = s; p < e; p++)
11057                                         put_byte(sv, *p);
11058                                 }
11059                             else {
11060                                 const U8 *e = uvchr_to_utf8(s,rangestart);
11061                                 U8 *p;
11062                                 for (p = s; p < e; p++)
11063                                     put_byte(sv, *p);
11064                                 sv_catpvs(sv, "-");
11065                                 e = uvchr_to_utf8(s, i-1);
11066                                 for (p = s; p < e; p++)
11067                                     put_byte(sv, *p);
11068                                 }
11069                                 rangestart = -1;
11070                             }
11071                         }
11072
11073                     sv_catpvs(sv, "..."); /* et cetera */
11074                 }
11075
11076                 {
11077                     char *s = savesvpv(lv);
11078                     char * const origs = s;
11079
11080                     while (*s && *s != '\n')
11081                         s++;
11082
11083                     if (*s == '\n') {
11084                         const char * const t = ++s;
11085
11086                         while (*s) {
11087                             if (*s == '\n')
11088                                 *s = ' ';
11089                             s++;
11090                         }
11091                         if (s[-1] == ' ')
11092                             s[-1] = 0;
11093
11094                         sv_catpv(sv, t);
11095                     }
11096
11097                     Safefree(origs);
11098                 }
11099             }
11100         }
11101
11102         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11103     }
11104     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11105         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11106 #else
11107     PERL_UNUSED_CONTEXT;
11108     PERL_UNUSED_ARG(sv);
11109     PERL_UNUSED_ARG(o);
11110     PERL_UNUSED_ARG(prog);
11111 #endif  /* DEBUGGING */
11112 }
11113
11114 SV *
11115 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11116 {                               /* Assume that RE_INTUIT is set */
11117     dVAR;
11118     struct regexp *const prog = (struct regexp *)SvANY(r);
11119     GET_RE_DEBUG_FLAGS_DECL;
11120
11121     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11122     PERL_UNUSED_CONTEXT;
11123
11124     DEBUG_COMPILE_r(
11125         {
11126             const char * const s = SvPV_nolen_const(prog->check_substr
11127                       ? prog->check_substr : prog->check_utf8);
11128
11129             if (!PL_colorset) reginitcolors();
11130             PerlIO_printf(Perl_debug_log,
11131                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11132                       PL_colors[4],
11133                       prog->check_substr ? "" : "utf8 ",
11134                       PL_colors[5],PL_colors[0],
11135                       s,
11136                       PL_colors[1],
11137                       (strlen(s) > 60 ? "..." : ""));
11138         } );
11139
11140     return prog->check_substr ? prog->check_substr : prog->check_utf8;
11141 }
11142
11143 /*
11144    pregfree()
11145
11146    handles refcounting and freeing the perl core regexp structure. When
11147    it is necessary to actually free the structure the first thing it
11148    does is call the 'free' method of the regexp_engine associated to
11149    the regexp, allowing the handling of the void *pprivate; member
11150    first. (This routine is not overridable by extensions, which is why
11151    the extensions free is called first.)
11152
11153    See regdupe and regdupe_internal if you change anything here.
11154 */
11155 #ifndef PERL_IN_XSUB_RE
11156 void
11157 Perl_pregfree(pTHX_ REGEXP *r)
11158 {
11159     SvREFCNT_dec(r);
11160 }
11161
11162 void
11163 Perl_pregfree2(pTHX_ REGEXP *rx)
11164 {
11165     dVAR;
11166     struct regexp *const r = (struct regexp *)SvANY(rx);
11167     GET_RE_DEBUG_FLAGS_DECL;
11168
11169     PERL_ARGS_ASSERT_PREGFREE2;
11170
11171     if (r->mother_re) {
11172         ReREFCNT_dec(r->mother_re);
11173     } else {
11174         CALLREGFREE_PVT(rx); /* free the private data */
11175         SvREFCNT_dec(RXp_PAREN_NAMES(r));
11176     }
11177     if (r->substrs) {
11178         SvREFCNT_dec(r->anchored_substr);
11179         SvREFCNT_dec(r->anchored_utf8);
11180         SvREFCNT_dec(r->float_substr);
11181         SvREFCNT_dec(r->float_utf8);
11182         Safefree(r->substrs);
11183     }
11184     RX_MATCH_COPY_FREE(rx);
11185 #ifdef PERL_OLD_COPY_ON_WRITE
11186     SvREFCNT_dec(r->saved_copy);
11187 #endif
11188     Safefree(r->offs);
11189 }
11190
11191 /*  reg_temp_copy()
11192
11193     This is a hacky workaround to the structural issue of match results
11194     being stored in the regexp structure which is in turn stored in
11195     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11196     could be PL_curpm in multiple contexts, and could require multiple
11197     result sets being associated with the pattern simultaneously, such
11198     as when doing a recursive match with (??{$qr})
11199
11200     The solution is to make a lightweight copy of the regexp structure
11201     when a qr// is returned from the code executed by (??{$qr}) this
11202     lightweight copy doesn't actually own any of its data except for
11203     the starp/end and the actual regexp structure itself.
11204
11205 */
11206
11207
11208 REGEXP *
11209 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11210 {
11211     struct regexp *ret;
11212     struct regexp *const r = (struct regexp *)SvANY(rx);
11213     register const I32 npar = r->nparens+1;
11214
11215     PERL_ARGS_ASSERT_REG_TEMP_COPY;
11216
11217     if (!ret_x)
11218         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11219     ret = (struct regexp *)SvANY(ret_x);
11220
11221     (void)ReREFCNT_inc(rx);
11222     /* We can take advantage of the existing "copied buffer" mechanism in SVs
11223        by pointing directly at the buffer, but flagging that the allocated
11224        space in the copy is zero. As we've just done a struct copy, it's now
11225        a case of zero-ing that, rather than copying the current length.  */
11226     SvPV_set(ret_x, RX_WRAPPED(rx));
11227     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11228     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11229            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11230     SvLEN_set(ret_x, 0);
11231     SvSTASH_set(ret_x, NULL);
11232     SvMAGIC_set(ret_x, NULL);
11233     Newx(ret->offs, npar, regexp_paren_pair);
11234     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11235     if (r->substrs) {
11236         Newx(ret->substrs, 1, struct reg_substr_data);
11237         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11238
11239         SvREFCNT_inc_void(ret->anchored_substr);
11240         SvREFCNT_inc_void(ret->anchored_utf8);
11241         SvREFCNT_inc_void(ret->float_substr);
11242         SvREFCNT_inc_void(ret->float_utf8);
11243
11244         /* check_substr and check_utf8, if non-NULL, point to either their
11245            anchored or float namesakes, and don't hold a second reference.  */
11246     }
11247     RX_MATCH_COPIED_off(ret_x);
11248 #ifdef PERL_OLD_COPY_ON_WRITE
11249     ret->saved_copy = NULL;
11250 #endif
11251     ret->mother_re = rx;
11252
11253     return ret_x;
11254 }
11255 #endif
11256
11257 /* regfree_internal()
11258
11259    Free the private data in a regexp. This is overloadable by
11260    extensions. Perl takes care of the regexp structure in pregfree(),
11261    this covers the *pprivate pointer which technically perl doesn't
11262    know about, however of course we have to handle the
11263    regexp_internal structure when no extension is in use.
11264
11265    Note this is called before freeing anything in the regexp
11266    structure.
11267  */
11268
11269 void
11270 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11271 {
11272     dVAR;
11273     struct regexp *const r = (struct regexp *)SvANY(rx);
11274     RXi_GET_DECL(r,ri);
11275     GET_RE_DEBUG_FLAGS_DECL;
11276
11277     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11278
11279     DEBUG_COMPILE_r({
11280         if (!PL_colorset)
11281             reginitcolors();
11282         {
11283             SV *dsv= sv_newmortal();
11284             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11285                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11286             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11287                 PL_colors[4],PL_colors[5],s);
11288         }
11289     });
11290 #ifdef RE_TRACK_PATTERN_OFFSETS
11291     if (ri->u.offsets)
11292         Safefree(ri->u.offsets);             /* 20010421 MJD */
11293 #endif
11294     if (ri->data) {
11295         int n = ri->data->count;
11296         PAD* new_comppad = NULL;
11297         PAD* old_comppad;
11298         PADOFFSET refcnt;
11299
11300         while (--n >= 0) {
11301           /* If you add a ->what type here, update the comment in regcomp.h */
11302             switch (ri->data->what[n]) {
11303             case 'a':
11304             case 's':
11305             case 'S':
11306             case 'u':
11307                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11308                 break;
11309             case 'f':
11310                 Safefree(ri->data->data[n]);
11311                 break;
11312             case 'p':
11313                 new_comppad = MUTABLE_AV(ri->data->data[n]);
11314                 break;
11315             case 'o':
11316                 if (new_comppad == NULL)
11317                     Perl_croak(aTHX_ "panic: pregfree comppad");
11318                 PAD_SAVE_LOCAL(old_comppad,
11319                     /* Watch out for global destruction's random ordering. */
11320                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11321                 );
11322                 OP_REFCNT_LOCK;
11323                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11324                 OP_REFCNT_UNLOCK;
11325                 if (!refcnt)
11326                     op_free((OP_4tree*)ri->data->data[n]);
11327
11328                 PAD_RESTORE_LOCAL(old_comppad);
11329                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11330                 new_comppad = NULL;
11331                 break;
11332             case 'n':
11333                 break;
11334             case 'T':
11335                 { /* Aho Corasick add-on structure for a trie node.
11336                      Used in stclass optimization only */
11337                     U32 refcount;
11338                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11339                     OP_REFCNT_LOCK;
11340                     refcount = --aho->refcount;
11341                     OP_REFCNT_UNLOCK;
11342                     if ( !refcount ) {
11343                         PerlMemShared_free(aho->states);
11344                         PerlMemShared_free(aho->fail);
11345                          /* do this last!!!! */
11346                         PerlMemShared_free(ri->data->data[n]);
11347                         PerlMemShared_free(ri->regstclass);
11348                     }
11349                 }
11350                 break;
11351             case 't':
11352                 {
11353                     /* trie structure. */
11354                     U32 refcount;
11355                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11356                     OP_REFCNT_LOCK;
11357                     refcount = --trie->refcount;
11358                     OP_REFCNT_UNLOCK;
11359                     if ( !refcount ) {
11360                         PerlMemShared_free(trie->charmap);
11361                         PerlMemShared_free(trie->states);
11362                         PerlMemShared_free(trie->trans);
11363                         if (trie->bitmap)
11364                             PerlMemShared_free(trie->bitmap);
11365                         if (trie->jump)
11366                             PerlMemShared_free(trie->jump);
11367                         PerlMemShared_free(trie->wordinfo);
11368                         /* do this last!!!! */
11369                         PerlMemShared_free(ri->data->data[n]);
11370                     }
11371                 }
11372                 break;
11373             default:
11374                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11375             }
11376         }
11377         Safefree(ri->data->what);
11378         Safefree(ri->data);
11379     }
11380
11381     Safefree(ri);
11382 }
11383
11384 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11385 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11386 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
11387
11388 /*
11389    re_dup - duplicate a regexp.
11390
11391    This routine is expected to clone a given regexp structure. It is only
11392    compiled under USE_ITHREADS.
11393
11394    After all of the core data stored in struct regexp is duplicated
11395    the regexp_engine.dupe method is used to copy any private data
11396    stored in the *pprivate pointer. This allows extensions to handle
11397    any duplication it needs to do.
11398
11399    See pregfree() and regfree_internal() if you change anything here.
11400 */
11401 #if defined(USE_ITHREADS)
11402 #ifndef PERL_IN_XSUB_RE
11403 void
11404 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11405 {
11406     dVAR;
11407     I32 npar;
11408     const struct regexp *r = (const struct regexp *)SvANY(sstr);
11409     struct regexp *ret = (struct regexp *)SvANY(dstr);
11410
11411     PERL_ARGS_ASSERT_RE_DUP_GUTS;
11412
11413     npar = r->nparens+1;
11414     Newx(ret->offs, npar, regexp_paren_pair);
11415     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11416     if(ret->swap) {
11417         /* no need to copy these */
11418         Newx(ret->swap, npar, regexp_paren_pair);
11419     }
11420
11421     if (ret->substrs) {
11422         /* Do it this way to avoid reading from *r after the StructCopy().
11423            That way, if any of the sv_dup_inc()s dislodge *r from the L1
11424            cache, it doesn't matter.  */
11425         const bool anchored = r->check_substr
11426             ? r->check_substr == r->anchored_substr
11427             : r->check_utf8 == r->anchored_utf8;
11428         Newx(ret->substrs, 1, struct reg_substr_data);
11429         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11430
11431         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11432         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11433         ret->float_substr = sv_dup_inc(ret->float_substr, param);
11434         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11435
11436         /* check_substr and check_utf8, if non-NULL, point to either their
11437            anchored or float namesakes, and don't hold a second reference.  */
11438
11439         if (ret->check_substr) {
11440             if (anchored) {
11441                 assert(r->check_utf8 == r->anchored_utf8);
11442                 ret->check_substr = ret->anchored_substr;
11443                 ret->check_utf8 = ret->anchored_utf8;
11444             } else {
11445                 assert(r->check_substr == r->float_substr);
11446                 assert(r->check_utf8 == r->float_utf8);
11447                 ret->check_substr = ret->float_substr;
11448                 ret->check_utf8 = ret->float_utf8;
11449             }
11450         } else if (ret->check_utf8) {
11451             if (anchored) {
11452                 ret->check_utf8 = ret->anchored_utf8;
11453             } else {
11454                 ret->check_utf8 = ret->float_utf8;
11455             }
11456         }
11457     }
11458
11459     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11460
11461     if (ret->pprivate)
11462         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11463
11464     if (RX_MATCH_COPIED(dstr))
11465         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
11466     else
11467         ret->subbeg = NULL;
11468 #ifdef PERL_OLD_COPY_ON_WRITE
11469     ret->saved_copy = NULL;
11470 #endif
11471
11472     if (ret->mother_re) {
11473         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11474             /* Our storage points directly to our mother regexp, but that's
11475                1: a buffer in a different thread
11476                2: something we no longer hold a reference on
11477                so we need to copy it locally.  */
11478             /* Note we need to sue SvCUR() on our mother_re, because it, in
11479                turn, may well be pointing to its own mother_re.  */
11480             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11481                                    SvCUR(ret->mother_re)+1));
11482             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11483         }
11484         ret->mother_re      = NULL;
11485     }
11486     ret->gofs = 0;
11487 }
11488 #endif /* PERL_IN_XSUB_RE */
11489
11490 /*
11491    regdupe_internal()
11492
11493    This is the internal complement to regdupe() which is used to copy
11494    the structure pointed to by the *pprivate pointer in the regexp.
11495    This is the core version of the extension overridable cloning hook.
11496    The regexp structure being duplicated will be copied by perl prior
11497    to this and will be provided as the regexp *r argument, however
11498    with the /old/ structures pprivate pointer value. Thus this routine
11499    may override any copying normally done by perl.
11500
11501    It returns a pointer to the new regexp_internal structure.
11502 */
11503
11504 void *
11505 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11506 {
11507     dVAR;
11508     struct regexp *const r = (struct regexp *)SvANY(rx);
11509     regexp_internal *reti;
11510     int len, npar;
11511     RXi_GET_DECL(r,ri);
11512
11513     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11514
11515     npar = r->nparens+1;
11516     len = ProgLen(ri);
11517
11518     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11519     Copy(ri->program, reti->program, len+1, regnode);
11520
11521
11522     reti->regstclass = NULL;
11523
11524     if (ri->data) {
11525         struct reg_data *d;
11526         const int count = ri->data->count;
11527         int i;
11528
11529         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11530                 char, struct reg_data);
11531         Newx(d->what, count, U8);
11532
11533         d->count = count;
11534         for (i = 0; i < count; i++) {
11535             d->what[i] = ri->data->what[i];
11536             switch (d->what[i]) {
11537                 /* legal options are one of: sSfpontTua
11538                    see also regcomp.h and pregfree() */
11539             case 'a': /* actually an AV, but the dup function is identical.  */
11540             case 's':
11541             case 'S':
11542             case 'p': /* actually an AV, but the dup function is identical.  */
11543             case 'u': /* actually an HV, but the dup function is identical.  */
11544                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11545                 break;
11546             case 'f':
11547                 /* This is cheating. */
11548                 Newx(d->data[i], 1, struct regnode_charclass_class);
11549                 StructCopy(ri->data->data[i], d->data[i],
11550                             struct regnode_charclass_class);
11551                 reti->regstclass = (regnode*)d->data[i];
11552                 break;
11553             case 'o':
11554                 /* Compiled op trees are readonly and in shared memory,
11555                    and can thus be shared without duplication. */
11556                 OP_REFCNT_LOCK;
11557                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11558                 OP_REFCNT_UNLOCK;
11559                 break;
11560             case 'T':
11561                 /* Trie stclasses are readonly and can thus be shared
11562                  * without duplication. We free the stclass in pregfree
11563                  * when the corresponding reg_ac_data struct is freed.
11564                  */
11565                 reti->regstclass= ri->regstclass;
11566                 /* Fall through */
11567             case 't':
11568                 OP_REFCNT_LOCK;
11569                 ((reg_trie_data*)ri->data->data[i])->refcount++;
11570                 OP_REFCNT_UNLOCK;
11571                 /* Fall through */
11572             case 'n':
11573                 d->data[i] = ri->data->data[i];
11574                 break;
11575             default:
11576                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11577             }
11578         }
11579
11580         reti->data = d;
11581     }
11582     else
11583         reti->data = NULL;
11584
11585     reti->name_list_idx = ri->name_list_idx;
11586
11587 #ifdef RE_TRACK_PATTERN_OFFSETS
11588     if (ri->u.offsets) {
11589         Newx(reti->u.offsets, 2*len+1, U32);
11590         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11591     }
11592 #else
11593     SetProgLen(reti,len);
11594 #endif
11595
11596     return (void*)reti;
11597 }
11598
11599 #endif    /* USE_ITHREADS */
11600
11601 #ifndef PERL_IN_XSUB_RE
11602
11603 /*
11604  - regnext - dig the "next" pointer out of a node
11605  */
11606 regnode *
11607 Perl_regnext(pTHX_ register regnode *p)
11608 {
11609     dVAR;
11610     register I32 offset;
11611
11612     if (!p)
11613         return(NULL);
11614
11615     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
11616         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11617     }
11618
11619     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11620     if (offset == 0)
11621         return(NULL);
11622
11623     return(p+offset);
11624 }
11625 #endif
11626
11627 STATIC void
11628 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11629 {
11630     va_list args;
11631     STRLEN l1 = strlen(pat1);
11632     STRLEN l2 = strlen(pat2);
11633     char buf[512];
11634     SV *msv;
11635     const char *message;
11636
11637     PERL_ARGS_ASSERT_RE_CROAK2;
11638
11639     if (l1 > 510)
11640         l1 = 510;
11641     if (l1 + l2 > 510)
11642         l2 = 510 - l1;
11643     Copy(pat1, buf, l1 , char);
11644     Copy(pat2, buf + l1, l2 , char);
11645     buf[l1 + l2] = '\n';
11646     buf[l1 + l2 + 1] = '\0';
11647 #ifdef I_STDARG
11648     /* ANSI variant takes additional second argument */
11649     va_start(args, pat2);
11650 #else
11651     va_start(args);
11652 #endif
11653     msv = vmess(buf, &args);
11654     va_end(args);
11655     message = SvPV_const(msv,l1);
11656     if (l1 > 512)
11657         l1 = 512;
11658     Copy(message, buf, l1 , char);
11659     buf[l1-1] = '\0';                   /* Overwrite \n */
11660     Perl_croak(aTHX_ "%s", buf);
11661 }
11662
11663 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
11664
11665 #ifndef PERL_IN_XSUB_RE
11666 void
11667 Perl_save_re_context(pTHX)
11668 {
11669     dVAR;
11670
11671     struct re_save_state *state;
11672
11673     SAVEVPTR(PL_curcop);
11674     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11675
11676     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11677     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11678     SSPUSHUV(SAVEt_RE_STATE);
11679
11680     Copy(&PL_reg_state, state, 1, struct re_save_state);
11681
11682     PL_reg_start_tmp = 0;
11683     PL_reg_start_tmpl = 0;
11684     PL_reg_oldsaved = NULL;
11685     PL_reg_oldsavedlen = 0;
11686     PL_reg_maxiter = 0;
11687     PL_reg_leftiter = 0;
11688     PL_reg_poscache = NULL;
11689     PL_reg_poscache_size = 0;
11690 #ifdef PERL_OLD_COPY_ON_WRITE
11691     PL_nrs = NULL;
11692 #endif
11693
11694     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11695     if (PL_curpm) {
11696         const REGEXP * const rx = PM_GETRE(PL_curpm);
11697         if (rx) {
11698             U32 i;
11699             for (i = 1; i <= RX_NPARENS(rx); i++) {
11700                 char digits[TYPE_CHARS(long)];
11701                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11702                 GV *const *const gvp
11703                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11704
11705                 if (gvp) {
11706                     GV * const gv = *gvp;
11707                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11708                         save_scalar(gv);
11709                 }
11710             }
11711         }
11712     }
11713 }
11714 #endif
11715
11716 static void
11717 clear_re(pTHX_ void *r)
11718 {
11719     dVAR;
11720     ReREFCNT_dec((REGEXP *)r);
11721 }
11722
11723 #ifdef DEBUGGING
11724
11725 STATIC void
11726 S_put_byte(pTHX_ SV *sv, int c)
11727 {
11728     PERL_ARGS_ASSERT_PUT_BYTE;
11729
11730     /* Our definition of isPRINT() ignores locales, so only bytes that are
11731        not part of UTF-8 are considered printable. I assume that the same
11732        holds for UTF-EBCDIC.
11733        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11734        which Wikipedia says:
11735
11736        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11737        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11738        identical, to the ASCII delete (DEL) or rubout control character.
11739        ) So the old condition can be simplified to !isPRINT(c)  */
11740     if (!isPRINT(c)) {
11741         if (c < 256) {
11742             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11743         }
11744         else {
11745             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11746         }
11747     }
11748     else {
11749         const char string = c;
11750         if (c == '-' || c == ']' || c == '\\' || c == '^')
11751             sv_catpvs(sv, "\\");
11752         sv_catpvn(sv, &string, 1);
11753     }
11754 }
11755
11756
11757 #define CLEAR_OPTSTART \
11758     if (optstart) STMT_START { \
11759             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11760             optstart=NULL; \
11761     } STMT_END
11762
11763 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11764
11765 STATIC const regnode *
11766 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11767             const regnode *last, const regnode *plast,
11768             SV* sv, I32 indent, U32 depth)
11769 {
11770     dVAR;
11771     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
11772     register const regnode *next;
11773     const regnode *optstart= NULL;
11774
11775     RXi_GET_DECL(r,ri);
11776     GET_RE_DEBUG_FLAGS_DECL;
11777
11778     PERL_ARGS_ASSERT_DUMPUNTIL;
11779
11780 #ifdef DEBUG_DUMPUNTIL
11781     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11782         last ? last-start : 0,plast ? plast-start : 0);
11783 #endif
11784
11785     if (plast && plast < last)
11786         last= plast;
11787
11788     while (PL_regkind[op] != END && (!last || node < last)) {
11789         /* While that wasn't END last time... */
11790         NODE_ALIGN(node);
11791         op = OP(node);
11792         if (op == CLOSE || op == WHILEM)
11793             indent--;
11794         next = regnext((regnode *)node);
11795
11796         /* Where, what. */
11797         if (OP(node) == OPTIMIZED) {
11798             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11799                 optstart = node;
11800             else
11801                 goto after_print;
11802         } else
11803             CLEAR_OPTSTART;
11804
11805         regprop(r, sv, node);
11806         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11807                       (int)(2*indent + 1), "", SvPVX_const(sv));
11808
11809         if (OP(node) != OPTIMIZED) {
11810             if (next == NULL)           /* Next ptr. */
11811                 PerlIO_printf(Perl_debug_log, " (0)");
11812             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11813                 PerlIO_printf(Perl_debug_log, " (FAIL)");
11814             else
11815                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11816             (void)PerlIO_putc(Perl_debug_log, '\n');
11817         }
11818
11819       after_print:
11820         if (PL_regkind[(U8)op] == BRANCHJ) {
11821             assert(next);
11822             {
11823                 register const regnode *nnode = (OP(next) == LONGJMP
11824                                              ? regnext((regnode *)next)
11825                                              : next);
11826                 if (last && nnode > last)
11827                     nnode = last;
11828                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11829             }
11830         }
11831         else if (PL_regkind[(U8)op] == BRANCH) {
11832             assert(next);
11833             DUMPUNTIL(NEXTOPER(node), next);
11834         }
11835         else if ( PL_regkind[(U8)op]  == TRIE ) {
11836             const regnode *this_trie = node;
11837             const char op = OP(node);
11838             const U32 n = ARG(node);
11839             const reg_ac_data * const ac = op>=AHOCORASICK ?
11840                (reg_ac_data *)ri->data->data[n] :
11841                NULL;
11842             const reg_trie_data * const trie =
11843                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11844 #ifdef DEBUGGING
11845             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11846 #endif
11847             const regnode *nextbranch= NULL;
11848             I32 word_idx;
11849             sv_setpvs(sv, "");
11850             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
11851                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
11852
11853                 PerlIO_printf(Perl_debug_log, "%*s%s ",
11854                    (int)(2*(indent+3)), "",
11855                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
11856                             PL_colors[0], PL_colors[1],
11857                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
11858                             PERL_PV_PRETTY_ELLIPSES    |
11859                             PERL_PV_PRETTY_LTGT
11860                             )
11861                             : "???"
11862                 );
11863                 if (trie->jump) {
11864                     U16 dist= trie->jump[word_idx+1];
11865                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
11866                                   (UV)((dist ? this_trie + dist : next) - start));
11867                     if (dist) {
11868                         if (!nextbranch)
11869                             nextbranch= this_trie + trie->jump[0];
11870                         DUMPUNTIL(this_trie + dist, nextbranch);
11871                     }
11872                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
11873                         nextbranch= regnext((regnode *)nextbranch);
11874                 } else {
11875                     PerlIO_printf(Perl_debug_log, "\n");
11876                 }
11877             }
11878             if (last && next > last)
11879                 node= last;
11880             else
11881                 node= next;
11882         }
11883         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
11884             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
11885                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
11886         }
11887         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
11888             assert(next);
11889             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
11890         }
11891         else if ( op == PLUS || op == STAR) {
11892             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
11893         }
11894         else if (PL_regkind[(U8)op] == ANYOF) {
11895             /* arglen 1 + class block */
11896             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
11897                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
11898             node = NEXTOPER(node);
11899         }
11900         else if (PL_regkind[(U8)op] == EXACT) {
11901             /* Literal string, where present. */
11902             node += NODE_SZ_STR(node) - 1;
11903             node = NEXTOPER(node);
11904         }
11905         else {
11906             node = NEXTOPER(node);
11907             node += regarglen[(U8)op];
11908         }
11909         if (op == CURLYX || op == OPEN)
11910             indent++;
11911     }
11912     CLEAR_OPTSTART;
11913 #ifdef DEBUG_DUMPUNTIL
11914     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
11915 #endif
11916     return node;
11917 }
11918
11919 #endif  /* DEBUGGING */
11920
11921 /*
11922  * Local variables:
11923  * c-indentation-style: bsd
11924  * c-basic-offset: 4
11925  * indent-tabs-mode: t
11926  * End:
11927  *
11928  * ex: set ts=8 sts=4 sw=4 noet:
11929  */