src/5013009/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     HV          *paren_names;           /* Paren names */
 138
 139     regnode     **recurse;              /* Recurse regops */
 140     I32         recurse_count;          /* Number of recurse regops */
 141     I32         in_lookbehind;
 142 #if ADD_TO_REGEXEC
 143     char        *starttry;              /* -Dr: where regtry was called. */
 144 #define RExC_starttry   (pRExC_state->starttry)
 145 #endif
 146 #ifdef DEBUGGING
 147     const char  *lastparse;
 148     I32         lastnum;
 149     AV          *paren_name_list;       /* idx -> name */
 150 #define RExC_lastparse  (pRExC_state->lastparse)
 151 #define RExC_lastnum    (pRExC_state->lastnum)
 152 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 153 #endif
 154 } RExC_state_t;
 155
 156 #define RExC_flags      (pRExC_state->flags)
 157 #define RExC_precomp    (pRExC_state->precomp)
 158 #define RExC_rx_sv      (pRExC_state->rx_sv)
 159 #define RExC_rx         (pRExC_state->rx)
 160 #define RExC_rxi        (pRExC_state->rxi)
 161 #define RExC_start      (pRExC_state->start)
 162 #define RExC_end        (pRExC_state->end)
 163 #define RExC_parse      (pRExC_state->parse)
 164 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 165 #ifdef RE_TRACK_PATTERN_OFFSETS
 166 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 167 #endif
 168 #define RExC_emit       (pRExC_state->emit)
 169 #define RExC_emit_start (pRExC_state->emit_start)
 170 #define RExC_emit_bound (pRExC_state->emit_bound)
 171 #define RExC_naughty    (pRExC_state->naughty)
 172 #define RExC_sawback    (pRExC_state->sawback)
 173 #define RExC_seen       (pRExC_state->seen)
 174 #define RExC_size       (pRExC_state->size)
 175 #define RExC_npar       (pRExC_state->npar)
 176 #define RExC_nestroot   (pRExC_state->nestroot)
 177 #define RExC_extralen   (pRExC_state->extralen)
 178 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 179 #define RExC_seen_evals (pRExC_state->seen_evals)
 180 #define RExC_utf8       (pRExC_state->utf8)
 181 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 182 #define RExC_open_parens        (pRExC_state->open_parens)
 183 #define RExC_close_parens       (pRExC_state->close_parens)
 184 #define RExC_opend      (pRExC_state->opend)
 185 #define RExC_paren_names        (pRExC_state->paren_names)
 186 #define RExC_recurse    (pRExC_state->recurse)
 187 #define RExC_recurse_count      (pRExC_state->recurse_count)
 188 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 189
 190
 191 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 192 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 193         ((*s) == '{' && regcurly(s)))
 194
 195 #ifdef SPSTART
 196 #undef SPSTART          /* dratted cpp namespace... */
 197 #endif
 198 /*
 199  * Flags to be passed up and down.
 200  */
 201 #define WORST           0       /* Worst case. */
 202 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 203
 204 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 205  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 206 #define SIMPLE          0x02
 207 #define SPSTART         0x04    /* Starts with * or +. */
 208 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 209 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 210
 211 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 212
 213 /* whether trie related optimizations are enabled */
 214 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 215 #define TRIE_STUDY_OPT
 216 #define FULL_TRIE_STUDY
 217 #define TRIE_STCLASS
 218 #endif
 219
 220
 221
 222 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 223 #define PBITVAL(paren) (1 << ((paren) & 7))
 224 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 225 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 226 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 227
 228 /* If not already in utf8, do a longjmp back to the beginning */
 229 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 230 #define REQUIRE_UTF8    STMT_START {                                       \
 231                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 232                         } STMT_END
 233
 234 /* About scan_data_t.
 235
 236   During optimisation we recurse through the regexp program performing
 237   various inplace (keyhole style) optimisations. In addition study_chunk
 238   and scan_commit populate this data structure with information about
 239   what strings MUST appear in the pattern. We look for the longest
 240   string that must appear at a fixed location, and we look for the
 241   longest string that may appear at a floating location. So for instance
 242   in the pattern:
 243
 244     /FOO[xX]A.*B[xX]BAR/
 245
 246   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 247   strings (because they follow a .* construct). study_chunk will identify
 248   both FOO and BAR as being the longest fixed and floating strings respectively.
 249
 250   The strings can be composites, for instance
 251
 252      /(f)(o)(o)/
 253
 254   will result in a composite fixed substring 'foo'.
 255
 256   For each string some basic information is maintained:
 257
 258   - offset or min_offset
 259     This is the position the string must appear at, or not before.
 260     It also implicitly (when combined with minlenp) tells us how many
 261     characters must match before the string we are searching for.
 262     Likewise when combined with minlenp and the length of the string it
 263     tells us how many characters must appear after the string we have
 264     found.
 265
 266   - max_offset
 267     Only used for floating strings. This is the rightmost point that
 268     the string can appear at. If set to I32 max it indicates that the
 269     string can occur infinitely far to the right.
 270
 271   - minlenp
 272     A pointer to the minimum length of the pattern that the string
 273     was found inside. This is important as in the case of positive
 274     lookahead or positive lookbehind we can have multiple patterns
 275     involved. Consider
 276
 277     /(?=FOO).*F/
 278
 279     The minimum length of the pattern overall is 3, the minimum length
 280     of the lookahead part is 3, but the minimum length of the part that
 281     will actually match is 1. So 'FOO's minimum length is 3, but the
 282     minimum length for the F is 1. This is important as the minimum length
 283     is used to determine offsets in front of and behind the string being
 284     looked for.  Since strings can be composites this is the length of the
 285     pattern at the time it was committed with a scan_commit. Note that
 286     the length is calculated by study_chunk, so that the minimum lengths
 287     are not known until the full pattern has been compiled, thus the
 288     pointer to the value.
 289
 290   - lookbehind
 291
 292     In the case of lookbehind the string being searched for can be
 293     offset past the start point of the final matching string.
 294     If this value was just blithely removed from the min_offset it would
 295     invalidate some of the calculations for how many chars must match
 296     before or after (as they are derived from min_offset and minlen and
 297     the length of the string being searched for).
 298     When the final pattern is compiled and the data is moved from the
 299     scan_data_t structure into the regexp structure the information
 300     about lookbehind is factored in, with the information that would
 301     have been lost precalculated in the end_shift field for the
 302     associated string.
 303
 304   The fields pos_min and pos_delta are used to store the minimum offset
 305   and the delta to the maximum offset at the current point in the pattern.
 306
 307 */
 308
 309 typedef struct scan_data_t {
 310     /*I32 len_min;      unused */
 311     /*I32 len_delta;    unused */
 312     I32 pos_min;
 313     I32 pos_delta;
 314     SV *last_found;
 315     I32 last_end;           /* min value, <0 unless valid. */
 316     I32 last_start_min;
 317     I32 last_start_max;
 318     SV **longest;           /* Either &l_fixed, or &l_float. */
 319     SV *longest_fixed;      /* longest fixed string found in pattern */
 320     I32 offset_fixed;       /* offset where it starts */
 321     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 322     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 323     SV *longest_float;      /* longest floating string found in pattern */
 324     I32 offset_float_min;   /* earliest point in string it can appear */
 325     I32 offset_float_max;   /* latest point in string it can appear */
 326     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 327     I32 lookbehind_float;   /* is the position of the string modified by LB */
 328     I32 flags;
 329     I32 whilem_c;
 330     I32 *last_closep;
 331     struct regnode_charclass_class *start_class;
 332 } scan_data_t;
 333
 334 /*
 335  * Forward declarations for pregcomp()'s friends.
 336  */
 337
 338 static const scan_data_t zero_scan_data =
 339   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 340
 341 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 342 #define SF_BEFORE_SEOL          0x0001
 343 #define SF_BEFORE_MEOL          0x0002
 344 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 345 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 346
 347 #ifdef NO_UNARY_PLUS
 348 #  define SF_FIX_SHIFT_EOL      (0+2)
 349 #  define SF_FL_SHIFT_EOL               (0+4)
 350 #else
 351 #  define SF_FIX_SHIFT_EOL      (+2)
 352 #  define SF_FL_SHIFT_EOL               (+4)
 353 #endif
 354
 355 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 356 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 357
 358 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 359 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 360 #define SF_IS_INF               0x0040
 361 #define SF_HAS_PAR              0x0080
 362 #define SF_IN_PAR               0x0100
 363 #define SF_HAS_EVAL             0x0200
 364 #define SCF_DO_SUBSTR           0x0400
 365 #define SCF_DO_STCLASS_AND      0x0800
 366 #define SCF_DO_STCLASS_OR       0x1000
 367 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 368 #define SCF_WHILEM_VISITED_POS  0x2000
 369
 370 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 371 #define SCF_SEEN_ACCEPT         0x8000
 372
 373 #define UTF cBOOL(RExC_utf8)
 374 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 375 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 376 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 377 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 378 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 379
 380 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 381
 382 #define OOB_UNICODE             12345678
 383 #define OOB_NAMEDCLASS          -1
 384
 385 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 386 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 387
 388
 389 /* length of regex to show in messages that don't mark a position within */
 390 #define RegexLengthToShowInErrorMessages 127
 391
 392 /*
 393  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 394  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 395  * op/pragma/warn/regcomp.
 396  */
 397 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 398 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 399
 400 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 401
 402 /*
 403  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 404  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 405  * "...".
 406  */
 407 #define _FAIL(code) STMT_START {                                        \
 408     const char *ellipses = "";                                          \
 409     IV len = RExC_end - RExC_precomp;                                   \
 410                                                                         \
 411     if (!SIZE_ONLY)                                                     \
 412         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 413     if (len > RegexLengthToShowInErrorMessages) {                       \
 414         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 415         len = RegexLengthToShowInErrorMessages - 10;                    \
 416         ellipses = "...";                                               \
 417     }                                                                   \
 418     code;                                                               \
 419 } STMT_END
 420
 421 #define FAIL(msg) _FAIL(                            \
 422     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 423             msg, (int)len, RExC_precomp, ellipses))
 424
 425 #define FAIL2(msg,arg) _FAIL(                       \
 426     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 427             arg, (int)len, RExC_precomp, ellipses))
 428
 429 /*
 430  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 431  */
 432 #define Simple_vFAIL(m) STMT_START {                                    \
 433     const IV offset = RExC_parse - RExC_precomp;                        \
 434     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 435             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 436 } STMT_END
 437
 438 /*
 439  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 440  */
 441 #define vFAIL(m) STMT_START {                           \
 442     if (!SIZE_ONLY)                                     \
 443         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 444     Simple_vFAIL(m);                                    \
 445 } STMT_END
 446
 447 /*
 448  * Like Simple_vFAIL(), but accepts two arguments.
 449  */
 450 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 451     const IV offset = RExC_parse - RExC_precomp;                        \
 452     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 453             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 454 } STMT_END
 455
 456 /*
 457  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 458  */
 459 #define vFAIL2(m,a1) STMT_START {                       \
 460     if (!SIZE_ONLY)                                     \
 461         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 462     Simple_vFAIL2(m, a1);                               \
 463 } STMT_END
 464
 465
 466 /*
 467  * Like Simple_vFAIL(), but accepts three arguments.
 468  */
 469 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 470     const IV offset = RExC_parse - RExC_precomp;                \
 471     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 472             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 473 } STMT_END
 474
 475 /*
 476  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 477  */
 478 #define vFAIL3(m,a1,a2) STMT_START {                    \
 479     if (!SIZE_ONLY)                                     \
 480         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 481     Simple_vFAIL3(m, a1, a2);                           \
 482 } STMT_END
 483
 484 /*
 485  * Like Simple_vFAIL(), but accepts four arguments.
 486  */
 487 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 488     const IV offset = RExC_parse - RExC_precomp;                \
 489     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 490             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 491 } STMT_END
 492
 493 #define ckWARNreg(loc,m) STMT_START {                                   \
 494     const IV offset = loc - RExC_precomp;                               \
 495     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 496             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 497 } STMT_END
 498
 499 #define ckWARNregdep(loc,m) STMT_START {                                \
 500     const IV offset = loc - RExC_precomp;                               \
 501     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 502             m REPORT_LOCATION,                                          \
 503             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 504 } STMT_END
 505
 506 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 509             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 510 } STMT_END
 511
 512 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 515             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 516 } STMT_END
 517
 518 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 521             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 522 } STMT_END
 523
 524 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 527             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 528 } STMT_END
 529
 530 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 533             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 534 } STMT_END
 535
 536 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 537     const IV offset = loc - RExC_precomp;                               \
 538     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 539             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 540 } STMT_END
 541
 542
 543 /* Allow for side effects in s */
 544 #define REGC(c,s) STMT_START {                  \
 545     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 546 } STMT_END
 547
 548 /* Macros for recording node offsets.   20001227 mjd@plover.com
 549  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 550  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 551  * Element 0 holds the number n.
 552  * Position is 1 indexed.
 553  */
 554 #ifndef RE_TRACK_PATTERN_OFFSETS
 555 #define Set_Node_Offset_To_R(node,byte)
 556 #define Set_Node_Offset(node,byte)
 557 #define Set_Cur_Node_Offset
 558 #define Set_Node_Length_To_R(node,len)
 559 #define Set_Node_Length(node,len)
 560 #define Set_Node_Cur_Length(node)
 561 #define Node_Offset(n)
 562 #define Node_Length(n)
 563 #define Set_Node_Offset_Length(node,offset,len)
 564 #define ProgLen(ri) ri->u.proglen
 565 #define SetProgLen(ri,x) ri->u.proglen = x
 566 #else
 567 #define ProgLen(ri) ri->u.offsets[0]
 568 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 569 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 570     if (! SIZE_ONLY) {                                                  \
 571         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 572                     __LINE__, (int)(node), (int)(byte)));               \
 573         if((node) < 0) {                                                \
 574             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 575         } else {                                                        \
 576             RExC_offsets[2*(node)-1] = (byte);                          \
 577         }                                                               \
 578     }                                                                   \
 579 } STMT_END
 580
 581 #define Set_Node_Offset(node,byte) \
 582     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 583 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 584
 585 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 586     if (! SIZE_ONLY) {                                                  \
 587         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 588                 __LINE__, (int)(node), (int)(len)));                    \
 589         if((node) < 0) {                                                \
 590             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 591         } else {                                                        \
 592             RExC_offsets[2*(node)] = (len);                             \
 593         }                                                               \
 594     }                                                                   \
 595 } STMT_END
 596
 597 #define Set_Node_Length(node,len) \
 598     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 599 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 600 #define Set_Node_Cur_Length(node) \
 601     Set_Node_Length(node, RExC_parse - parse_start)
 602
 603 /* Get offsets and lengths */
 604 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 605 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 606
 607 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 608     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 609     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 610 } STMT_END
 611 #endif
 612
 613 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 614 #define EXPERIMENTAL_INPLACESCAN
 615 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 616
 617 #define DEBUG_STUDYDATA(str,data,depth)                              \
 618 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 619     PerlIO_printf(Perl_debug_log,                                    \
 620         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 621         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 622         (int)(depth)*2, "",                                          \
 623         (IV)((data)->pos_min),                                       \
 624         (IV)((data)->pos_delta),                                     \
 625         (UV)((data)->flags),                                         \
 626         (IV)((data)->whilem_c),                                      \
 627         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 628         is_inf ? "INF " : ""                                         \
 629     );                                                               \
 630     if ((data)->last_found)                                          \
 631         PerlIO_printf(Perl_debug_log,                                \
 632             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 633             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 634             SvPVX_const((data)->last_found),                         \
 635             (IV)((data)->last_end),                                  \
 636             (IV)((data)->last_start_min),                            \
 637             (IV)((data)->last_start_max),                            \
 638             ((data)->longest &&                                      \
 639              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 640             SvPVX_const((data)->longest_fixed),                      \
 641             (IV)((data)->offset_fixed),                              \
 642             ((data)->longest &&                                      \
 643              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 644             SvPVX_const((data)->longest_float),                      \
 645             (IV)((data)->offset_float_min),                          \
 646             (IV)((data)->offset_float_max)                           \
 647         );                                                           \
 648     PerlIO_printf(Perl_debug_log,"\n");                              \
 649 });
 650
 651 static void clear_re(pTHX_ void *r);
 652
 653 /* Mark that we cannot extend a found fixed substring at this point.
 654    Update the longest found anchored substring and the longest found
 655    floating substrings if needed. */
 656
 657 STATIC void
 658 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 659 {
 660     const STRLEN l = CHR_SVLEN(data->last_found);
 661     const STRLEN old_l = CHR_SVLEN(*data->longest);
 662     GET_RE_DEBUG_FLAGS_DECL;
 663
 664     PERL_ARGS_ASSERT_SCAN_COMMIT;
 665
 666     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 667         SvSetMagicSV(*data->longest, data->last_found);
 668         if (*data->longest == data->longest_fixed) {
 669             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 670             if (data->flags & SF_BEFORE_EOL)
 671                 data->flags
 672                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 673             else
 674                 data->flags &= ~SF_FIX_BEFORE_EOL;
 675             data->minlen_fixed=minlenp;
 676             data->lookbehind_fixed=0;
 677         }
 678         else { /* *data->longest == data->longest_float */
 679             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 680             data->offset_float_max = (l
 681                                       ? data->last_start_max
 682                                       : data->pos_min + data->pos_delta);
 683             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 684                 data->offset_float_max = I32_MAX;
 685             if (data->flags & SF_BEFORE_EOL)
 686                 data->flags
 687                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 688             else
 689                 data->flags &= ~SF_FL_BEFORE_EOL;
 690             data->minlen_float=minlenp;
 691             data->lookbehind_float=0;
 692         }
 693     }
 694     SvCUR_set(data->last_found, 0);
 695     {
 696         SV * const sv = data->last_found;
 697         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 698             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 699             if (mg)
 700                 mg->mg_len = 0;
 701         }
 702     }
 703     data->last_end = -1;
 704     data->flags &= ~SF_BEFORE_EOL;
 705     DEBUG_STUDYDATA("commit: ",data,0);
 706 }
 707
 708 /* Can match anything (initialization) */
 709 STATIC void
 710 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 711 {
 712     PERL_ARGS_ASSERT_CL_ANYTHING;
 713
 714     ANYOF_CLASS_ZERO(cl);
 715     ANYOF_BITMAP_SETALL(cl);
 716     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 717     if (LOC)
 718         cl->flags |= ANYOF_LOCALE;
 719 }
 720
 721 /* Can match anything (initialization) */
 722 STATIC int
 723 S_cl_is_anything(const struct regnode_charclass_class *cl)
 724 {
 725     int value;
 726
 727     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 728
 729     for (value = 0; value <= ANYOF_MAX; value += 2)
 730         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 731             return 1;
 732     if (!(cl->flags & ANYOF_UNICODE_ALL))
 733         return 0;
 734     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 735         return 0;
 736     return 1;
 737 }
 738
 739 /* Can match anything (initialization) */
 740 STATIC void
 741 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 742 {
 743     PERL_ARGS_ASSERT_CL_INIT;
 744
 745     Zero(cl, 1, struct regnode_charclass_class);
 746     cl->type = ANYOF;
 747     cl_anything(pRExC_state, cl);
 748 }
 749
 750 STATIC void
 751 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 752 {
 753     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 754
 755     Zero(cl, 1, struct regnode_charclass_class);
 756     cl->type = ANYOF;
 757     cl_anything(pRExC_state, cl);
 758     if (LOC)
 759         cl->flags |= ANYOF_LOCALE;
 760 }
 761
 762 /* 'And' a given class with another one.  Can create false positives */
 763 /* We assume that cl is not inverted */
 764 STATIC void
 765 S_cl_and(struct regnode_charclass_class *cl,
 766         const struct regnode_charclass_class *and_with)
 767 {
 768     PERL_ARGS_ASSERT_CL_AND;
 769
 770     assert(and_with->type == ANYOF);
 771
 772     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 773         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 774         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 775         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 776         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 777         int i;
 778
 779         if (and_with->flags & ANYOF_INVERT)
 780             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 781                 cl->bitmap[i] &= ~and_with->bitmap[i];
 782         else
 783             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 784                 cl->bitmap[i] &= and_with->bitmap[i];
 785     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 786     if (!(and_with->flags & ANYOF_EOS))
 787         cl->flags &= ~ANYOF_EOS;
 788
 789     if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
 790         cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
 791     if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
 792         cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 793
 794     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
 795         !(and_with->flags & ANYOF_INVERT)) {
 796         cl->flags &= ~ANYOF_UNICODE_ALL;
 797         cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
 798                                                            only the one(s)
 799                                                            actually set */
 800         ARG_SET(cl, ARG(and_with));
 801     }
 802     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 803         !(and_with->flags & ANYOF_INVERT))
 804         cl->flags &= ~ANYOF_UNICODE_ALL;
 805     if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
 806         !(and_with->flags & ANYOF_INVERT))
 807         cl->flags &= ~ANYOF_NONBITMAP;
 808 }
 809
 810 /* 'OR' a given class with another one.  Can create false positives */
 811 /* We assume that cl is not inverted */
 812 STATIC void
 813 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 814 {
 815     PERL_ARGS_ASSERT_CL_OR;
 816
 817     if (or_with->flags & ANYOF_INVERT) {
 818         /* We do not use
 819          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 820          *   <= (B1 | !B2) | (CL1 | !CL2)
 821          * which is wasteful if CL2 is small, but we ignore CL2:
 822          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 823          * XXXX Can we handle case-fold?  Unclear:
 824          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 825          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 826          */
 827         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 828              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 829              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 830             int i;
 831
 832             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 833                 cl->bitmap[i] |= ~or_with->bitmap[i];
 834         } /* XXXX: logic is complicated otherwise */
 835         else {
 836             cl_anything(pRExC_state, cl);
 837         }
 838     } else {
 839         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 840         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 841              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 842                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 843             int i;
 844
 845             /* OR char bitmap and class bitmap separately */
 846             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 847                 cl->bitmap[i] |= or_with->bitmap[i];
 848             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 849                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 850                     cl->classflags[i] |= or_with->classflags[i];
 851                 cl->flags |= ANYOF_CLASS;
 852             }
 853         }
 854         else { /* XXXX: logic is complicated, leave it along for a moment. */
 855             cl_anything(pRExC_state, cl);
 856         }
 857     }
 858     if (or_with->flags & ANYOF_EOS)
 859         cl->flags |= ANYOF_EOS;
 860     if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
 861         cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
 862
 863     if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 864         cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
 865
 866     /* If both nodes match something outside the bitmap, but what they match
 867      * outside is not the same pointer, and hence not easily compared, give up
 868      * and allow the start class to match everything outside the bitmap */
 869     if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
 870         ARG(cl) != ARG(or_with)) {
 871         cl->flags |= ANYOF_UNICODE_ALL;
 872     }
 873
 874     if (or_with->flags & ANYOF_UNICODE_ALL) {
 875         cl->flags |= ANYOF_UNICODE_ALL;
 876     }
 877 }
 878
 879 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 880 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 881 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 882 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 883
 884
 885 #ifdef DEBUGGING
 886 /*
 887    dump_trie(trie,widecharmap,revcharmap)
 888    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 889    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 890
 891    These routines dump out a trie in a somewhat readable format.
 892    The _interim_ variants are used for debugging the interim
 893    tables that are used to generate the final compressed
 894    representation which is what dump_trie expects.
 895
 896    Part of the reason for their existence is to provide a form
 897    of documentation as to how the different representations function.
 898
 899 */
 900
 901 /*
 902   Dumps the final compressed table form of the trie to Perl_debug_log.
 903   Used for debugging make_trie().
 904 */
 905
 906 STATIC void
 907 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 908             AV *revcharmap, U32 depth)
 909 {
 910     U32 state;
 911     SV *sv=sv_newmortal();
 912     int colwidth= widecharmap ? 6 : 4;
 913     U16 word;
 914     GET_RE_DEBUG_FLAGS_DECL;
 915
 916     PERL_ARGS_ASSERT_DUMP_TRIE;
 917
 918     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 919         (int)depth * 2 + 2,"",
 920         "Match","Base","Ofs" );
 921
 922     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 923         SV ** const tmp = av_fetch( revcharmap, state, 0);
 924         if ( tmp ) {
 925             PerlIO_printf( Perl_debug_log, "%*s",
 926                 colwidth,
 927                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 928                             PL_colors[0], PL_colors[1],
 929                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 930                             PERL_PV_ESCAPE_FIRSTCHAR
 931                 )
 932             );
 933         }
 934     }
 935     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 936         (int)depth * 2 + 2,"");
 937
 938     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 939         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 940     PerlIO_printf( Perl_debug_log, "\n");
 941
 942     for( state = 1 ; state < trie->statecount ; state++ ) {
 943         const U32 base = trie->states[ state ].trans.base;
 944
 945         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 946
 947         if ( trie->states[ state ].wordnum ) {
 948             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 949         } else {
 950             PerlIO_printf( Perl_debug_log, "%6s", "" );
 951         }
 952
 953         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 954
 955         if ( base ) {
 956             U32 ofs = 0;
 957
 958             while( ( base + ofs  < trie->uniquecharcount ) ||
 959                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 960                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 961                     ofs++;
 962
 963             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 964
 965             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 966                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 967                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 968                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 969                 {
 970                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 971                     colwidth,
 972                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 973                 } else {
 974                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 975                 }
 976             }
 977
 978             PerlIO_printf( Perl_debug_log, "]");
 979
 980         }
 981         PerlIO_printf( Perl_debug_log, "\n" );
 982     }
 983     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 984     for (word=1; word <= trie->wordcount; word++) {
 985         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 986             (int)word, (int)(trie->wordinfo[word].prev),
 987             (int)(trie->wordinfo[word].len));
 988     }
 989     PerlIO_printf(Perl_debug_log, "\n" );
 990 }
 991 /*
 992   Dumps a fully constructed but uncompressed trie in list form.
 993   List tries normally only are used for construction when the number of
 994   possible chars (trie->uniquecharcount) is very high.
 995   Used for debugging make_trie().
 996 */
 997 STATIC void
 998 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 999                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1000                          U32 depth)
1001 {
1002     U32 state;
1003     SV *sv=sv_newmortal();
1004     int colwidth= widecharmap ? 6 : 4;
1005     GET_RE_DEBUG_FLAGS_DECL;
1006
1007     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1008
1009     /* print out the table precompression.  */
1010     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1011         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1012         "------:-----+-----------------\n" );
1013
1014     for( state=1 ; state < next_alloc ; state ++ ) {
1015         U16 charid;
1016
1017         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1018             (int)depth * 2 + 2,"", (UV)state  );
1019         if ( ! trie->states[ state ].wordnum ) {
1020             PerlIO_printf( Perl_debug_log, "%5s| ","");
1021         } else {
1022             PerlIO_printf( Perl_debug_log, "W%4x| ",
1023                 trie->states[ state ].wordnum
1024             );
1025         }
1026         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1027             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1028             if ( tmp ) {
1029                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1030                     colwidth,
1031                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1032                             PL_colors[0], PL_colors[1],
1033                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1034                             PERL_PV_ESCAPE_FIRSTCHAR
1035                     ) ,
1036                     TRIE_LIST_ITEM(state,charid).forid,
1037                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1038                 );
1039                 if (!(charid % 10))
1040                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1041                         (int)((depth * 2) + 14), "");
1042             }
1043         }
1044         PerlIO_printf( Perl_debug_log, "\n");
1045     }
1046 }
1047
1048 /*
1049   Dumps a fully constructed but uncompressed trie in table form.
1050   This is the normal DFA style state transition table, with a few
1051   twists to facilitate compression later.
1052   Used for debugging make_trie().
1053 */
1054 STATIC void
1055 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1056                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1057                           U32 depth)
1058 {
1059     U32 state;
1060     U16 charid;
1061     SV *sv=sv_newmortal();
1062     int colwidth= widecharmap ? 6 : 4;
1063     GET_RE_DEBUG_FLAGS_DECL;
1064
1065     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1066
1067     /*
1068        print out the table precompression so that we can do a visual check
1069        that they are identical.
1070      */
1071
1072     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1073
1074     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1075         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1076         if ( tmp ) {
1077             PerlIO_printf( Perl_debug_log, "%*s",
1078                 colwidth,
1079                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1080                             PL_colors[0], PL_colors[1],
1081                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1082                             PERL_PV_ESCAPE_FIRSTCHAR
1083                 )
1084             );
1085         }
1086     }
1087
1088     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1089
1090     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1091         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1092     }
1093
1094     PerlIO_printf( Perl_debug_log, "\n" );
1095
1096     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1097
1098         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1099             (int)depth * 2 + 2,"",
1100             (UV)TRIE_NODENUM( state ) );
1101
1102         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1103             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1104             if (v)
1105                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1106             else
1107                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1108         }
1109         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1110             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1111         } else {
1112             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1113             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1114         }
1115     }
1116 }
1117
1118 #endif
1119
1120
1121 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1122   startbranch: the first branch in the whole branch sequence
1123   first      : start branch of sequence of branch-exact nodes.
1124                May be the same as startbranch
1125   last       : Thing following the last branch.
1126                May be the same as tail.
1127   tail       : item following the branch sequence
1128   count      : words in the sequence
1129   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1130   depth      : indent depth
1131
1132 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1133
1134 A trie is an N'ary tree where the branches are determined by digital
1135 decomposition of the key. IE, at the root node you look up the 1st character and
1136 follow that branch repeat until you find the end of the branches. Nodes can be
1137 marked as "accepting" meaning they represent a complete word. Eg:
1138
1139   /he|she|his|hers/
1140
1141 would convert into the following structure. Numbers represent states, letters
1142 following numbers represent valid transitions on the letter from that state, if
1143 the number is in square brackets it represents an accepting state, otherwise it
1144 will be in parenthesis.
1145
1146       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1147       |    |
1148       |   (2)
1149       |    |
1150      (1)   +-i->(6)-+-s->[7]
1151       |
1152       +-s->(3)-+-h->(4)-+-e->[5]
1153
1154       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1155
1156 This shows that when matching against the string 'hers' we will begin at state 1
1157 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1158 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1159 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1160 single traverse. We store a mapping from accepting to state to which word was
1161 matched, and then when we have multiple possibilities we try to complete the
1162 rest of the regex in the order in which they occured in the alternation.
1163
1164 The only prior NFA like behaviour that would be changed by the TRIE support is
1165 the silent ignoring of duplicate alternations which are of the form:
1166
1167  / (DUPE|DUPE) X? (?{ ... }) Y /x
1168
1169 Thus EVAL blocks following a trie may be called a different number of times with
1170 and without the optimisation. With the optimisations dupes will be silently
1171 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1172 the following demonstrates:
1173
1174  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1175
1176 which prints out 'word' three times, but
1177
1178  'words'=~/(word|word|word)(?{ print $1 })S/
1179
1180 which doesnt print it out at all. This is due to other optimisations kicking in.
1181
1182 Example of what happens on a structural level:
1183
1184 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1185
1186    1: CURLYM[1] {1,32767}(18)
1187    5:   BRANCH(8)
1188    6:     EXACT <ac>(16)
1189    8:   BRANCH(11)
1190    9:     EXACT <ad>(16)
1191   11:   BRANCH(14)
1192   12:     EXACT <ab>(16)
1193   16:   SUCCEED(0)
1194   17:   NOTHING(18)
1195   18: END(0)
1196
1197 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1198 and should turn into:
1199
1200    1: CURLYM[1] {1,32767}(18)
1201    5:   TRIE(16)
1202         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1203           <ac>
1204           <ad>
1205           <ab>
1206   16:   SUCCEED(0)
1207   17:   NOTHING(18)
1208   18: END(0)
1209
1210 Cases where tail != last would be like /(?foo|bar)baz/:
1211
1212    1: BRANCH(4)
1213    2:   EXACT <foo>(8)
1214    4: BRANCH(7)
1215    5:   EXACT <bar>(8)
1216    7: TAIL(8)
1217    8: EXACT <baz>(10)
1218   10: END(0)
1219
1220 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1221 and would end up looking like:
1222
1223     1: TRIE(8)
1224       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1225         <foo>
1226         <bar>
1227    7: TAIL(8)
1228    8: EXACT <baz>(10)
1229   10: END(0)
1230
1231     d = uvuni_to_utf8_flags(d, uv, 0);
1232
1233 is the recommended Unicode-aware way of saying
1234
1235     *(d++) = uv;
1236 */
1237
1238 #define TRIE_STORE_REVCHAR                                                 \
1239     STMT_START {                                                           \
1240         if (UTF) {                                                         \
1241             SV *zlopp = newSV(2);                                          \
1242             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1243             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1244             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1245             SvPOK_on(zlopp);                                               \
1246             SvUTF8_on(zlopp);                                              \
1247             av_push(revcharmap, zlopp);                                    \
1248         } else {                                                           \
1249             char ooooff = (char)uvc;                                               \
1250             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1251         }                                                                  \
1252         } STMT_END
1253
1254 #define TRIE_READ_CHAR STMT_START {                                           \
1255     wordlen++;                                                                \
1256     if ( UTF ) {                                                              \
1257         if ( folder ) {                                                       \
1258             if ( foldlen > 0 ) {                                              \
1259                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1260                foldlen -= len;                                                \
1261                scan += len;                                                   \
1262                len = 0;                                                       \
1263             } else {                                                          \
1264                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1265                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1266                 foldlen -= UNISKIP( uvc );                                    \
1267                 scan = foldbuf + UNISKIP( uvc );                              \
1268             }                                                                 \
1269         } else {                                                              \
1270             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1271         }                                                                     \
1272     } else {                                                                  \
1273         uvc = (U32)*uc;                                                       \
1274         len = 1;                                                              \
1275     }                                                                         \
1276 } STMT_END
1277
1278
1279
1280 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1281     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1282         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1283         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1284     }                                                           \
1285     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1286     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1287     TRIE_LIST_CUR( state )++;                                   \
1288 } STMT_END
1289
1290 #define TRIE_LIST_NEW(state) STMT_START {                       \
1291     Newxz( trie->states[ state ].trans.list,               \
1292         4, reg_trie_trans_le );                                 \
1293      TRIE_LIST_CUR( state ) = 1;                                \
1294      TRIE_LIST_LEN( state ) = 4;                                \
1295 } STMT_END
1296
1297 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1298     U16 dupe= trie->states[ state ].wordnum;                    \
1299     regnode * const noper_next = regnext( noper );              \
1300                                                                 \
1301     DEBUG_r({                                                   \
1302         /* store the word for dumping */                        \
1303         SV* tmp;                                                \
1304         if (OP(noper) != NOTHING)                               \
1305             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1306         else                                                    \
1307             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1308         av_push( trie_words, tmp );                             \
1309     });                                                         \
1310                                                                 \
1311     curword++;                                                  \
1312     trie->wordinfo[curword].prev   = 0;                         \
1313     trie->wordinfo[curword].len    = wordlen;                   \
1314     trie->wordinfo[curword].accept = state;                     \
1315                                                                 \
1316     if ( noper_next < tail ) {                                  \
1317         if (!trie->jump)                                        \
1318             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1319         trie->jump[curword] = (U16)(noper_next - convert);      \
1320         if (!jumper)                                            \
1321             jumper = noper_next;                                \
1322         if (!nextbranch)                                        \
1323             nextbranch= regnext(cur);                           \
1324     }                                                           \
1325                                                                 \
1326     if ( dupe ) {                                               \
1327         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1328         /* chain, so that when the bits of chain are later    */\
1329         /* linked together, the dups appear in the chain      */\
1330         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1331         trie->wordinfo[dupe].prev = curword;                    \
1332     } else {                                                    \
1333         /* we haven't inserted this word yet.                */ \
1334         trie->states[ state ].wordnum = curword;                \
1335     }                                                           \
1336 } STMT_END
1337
1338
1339 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1340      ( ( base + charid >=  ucharcount                                   \
1341          && base + charid < ubound                                      \
1342          && state == trie->trans[ base - ucharcount + charid ].check    \
1343          && trie->trans[ base - ucharcount + charid ].next )            \
1344            ? trie->trans[ base - ucharcount + charid ].next             \
1345            : ( state==1 ? special : 0 )                                 \
1346       )
1347
1348 #define MADE_TRIE       1
1349 #define MADE_JUMP_TRIE  2
1350 #define MADE_EXACT_TRIE 4
1351
1352 STATIC I32
1353 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1354 {
1355     dVAR;
1356     /* first pass, loop through and scan words */
1357     reg_trie_data *trie;
1358     HV *widecharmap = NULL;
1359     AV *revcharmap = newAV();
1360     regnode *cur;
1361     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1362     STRLEN len = 0;
1363     UV uvc = 0;
1364     U16 curword = 0;
1365     U32 next_alloc = 0;
1366     regnode *jumper = NULL;
1367     regnode *nextbranch = NULL;
1368     regnode *convert = NULL;
1369     U32 *prev_states; /* temp array mapping each state to previous one */
1370     /* we just use folder as a flag in utf8 */
1371     const U8 * folder = NULL;
1372
1373 #ifdef DEBUGGING
1374     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1375     AV *trie_words = NULL;
1376     /* along with revcharmap, this only used during construction but both are
1377      * useful during debugging so we store them in the struct when debugging.
1378      */
1379 #else
1380     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1381     STRLEN trie_charcount=0;
1382 #endif
1383     SV *re_trie_maxbuff;
1384     GET_RE_DEBUG_FLAGS_DECL;
1385
1386     PERL_ARGS_ASSERT_MAKE_TRIE;
1387 #ifndef DEBUGGING
1388     PERL_UNUSED_ARG(depth);
1389 #endif
1390
1391     switch (flags) {
1392         case EXACTFU: folder = PL_fold_latin1; break;
1393         case EXACTF:  folder = PL_fold; break;
1394         case EXACTFL: folder = PL_fold_locale; break;
1395     }
1396
1397     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1398     trie->refcount = 1;
1399     trie->startstate = 1;
1400     trie->wordcount = word_count;
1401     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1402     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1403     if (!(UTF && folder))
1404         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1405     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1406                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1407
1408     DEBUG_r({
1409         trie_words = newAV();
1410     });
1411
1412     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1413     if (!SvIOK(re_trie_maxbuff)) {
1414         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1415     }
1416     DEBUG_OPTIMISE_r({
1417                 PerlIO_printf( Perl_debug_log,
1418                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1419                   (int)depth * 2 + 2, "",
1420                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1421                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1422                   (int)depth);
1423     });
1424
1425    /* Find the node we are going to overwrite */
1426     if ( first == startbranch && OP( last ) != BRANCH ) {
1427         /* whole branch chain */
1428         convert = first;
1429     } else {
1430         /* branch sub-chain */
1431         convert = NEXTOPER( first );
1432     }
1433
1434     /*  -- First loop and Setup --
1435
1436        We first traverse the branches and scan each word to determine if it
1437        contains widechars, and how many unique chars there are, this is
1438        important as we have to build a table with at least as many columns as we
1439        have unique chars.
1440
1441        We use an array of integers to represent the character codes 0..255
1442        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1443        native representation of the character value as the key and IV's for the
1444        coded index.
1445
1446        *TODO* If we keep track of how many times each character is used we can
1447        remap the columns so that the table compression later on is more
1448        efficient in terms of memory by ensuring the most common value is in the
1449        middle and the least common are on the outside.  IMO this would be better
1450        than a most to least common mapping as theres a decent chance the most
1451        common letter will share a node with the least common, meaning the node
1452        will not be compressible. With a middle is most common approach the worst
1453        case is when we have the least common nodes twice.
1454
1455      */
1456
1457     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1458         regnode * const noper = NEXTOPER( cur );
1459         const U8 *uc = (U8*)STRING( noper );
1460         const U8 * const e  = uc + STR_LEN( noper );
1461         STRLEN foldlen = 0;
1462         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1463         const U8 *scan = (U8*)NULL;
1464         U32 wordlen      = 0;         /* required init */
1465         STRLEN chars = 0;
1466         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1467
1468         if (OP(noper) == NOTHING) {
1469             trie->minlen= 0;
1470             continue;
1471         }
1472         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1473             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1474                                           regardless of encoding */
1475
1476         for ( ; uc < e ; uc += len ) {
1477             TRIE_CHARCOUNT(trie)++;
1478             TRIE_READ_CHAR;
1479             chars++;
1480             if ( uvc < 256 ) {
1481                 if ( !trie->charmap[ uvc ] ) {
1482                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1483                     if ( folder )
1484                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1485                     TRIE_STORE_REVCHAR;
1486                 }
1487                 if ( set_bit ) {
1488                     /* store the codepoint in the bitmap, and its folded
1489                      * equivalent. */
1490                     TRIE_BITMAP_SET(trie,uvc);
1491
1492                     /* store the folded codepoint */
1493                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1494
1495                     if ( !UTF ) {
1496                         /* store first byte of utf8 representation of
1497                            variant codepoints */
1498                         if (! UNI_IS_INVARIANT(uvc)) {
1499                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1500                         }
1501                     }
1502                     set_bit = 0; /* We've done our bit :-) */
1503                 }
1504             } else {
1505                 SV** svpp;
1506                 if ( !widecharmap )
1507                     widecharmap = newHV();
1508
1509                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1510
1511                 if ( !svpp )
1512                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1513
1514                 if ( !SvTRUE( *svpp ) ) {
1515                     sv_setiv( *svpp, ++trie->uniquecharcount );
1516                     TRIE_STORE_REVCHAR;
1517                 }
1518             }
1519         }
1520         if( cur == first ) {
1521             trie->minlen=chars;
1522             trie->maxlen=chars;
1523         } else if (chars < trie->minlen) {
1524             trie->minlen=chars;
1525         } else if (chars > trie->maxlen) {
1526             trie->maxlen=chars;
1527         }
1528
1529     } /* end first pass */
1530     DEBUG_TRIE_COMPILE_r(
1531         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1532                 (int)depth * 2 + 2,"",
1533                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1534                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1535                 (int)trie->minlen, (int)trie->maxlen )
1536     );
1537
1538     /*
1539         We now know what we are dealing with in terms of unique chars and
1540         string sizes so we can calculate how much memory a naive
1541         representation using a flat table  will take. If it's over a reasonable
1542         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1543         conservative but potentially much slower representation using an array
1544         of lists.
1545
1546         At the end we convert both representations into the same compressed
1547         form that will be used in regexec.c for matching with. The latter
1548         is a form that cannot be used to construct with but has memory
1549         properties similar to the list form and access properties similar
1550         to the table form making it both suitable for fast searches and
1551         small enough that its feasable to store for the duration of a program.
1552
1553         See the comment in the code where the compressed table is produced
1554         inplace from the flat tabe representation for an explanation of how
1555         the compression works.
1556
1557     */
1558
1559
1560     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1561     prev_states[1] = 0;
1562
1563     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1564         /*
1565             Second Pass -- Array Of Lists Representation
1566
1567             Each state will be represented by a list of charid:state records
1568             (reg_trie_trans_le) the first such element holds the CUR and LEN
1569             points of the allocated array. (See defines above).
1570
1571             We build the initial structure using the lists, and then convert
1572             it into the compressed table form which allows faster lookups
1573             (but cant be modified once converted).
1574         */
1575
1576         STRLEN transcount = 1;
1577
1578         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1579             "%*sCompiling trie using list compiler\n",
1580             (int)depth * 2 + 2, ""));
1581
1582         trie->states = (reg_trie_state *)
1583             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1584                                   sizeof(reg_trie_state) );
1585         TRIE_LIST_NEW(1);
1586         next_alloc = 2;
1587
1588         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1589
1590             regnode * const noper = NEXTOPER( cur );
1591             U8 *uc           = (U8*)STRING( noper );
1592             const U8 * const e = uc + STR_LEN( noper );
1593             U32 state        = 1;         /* required init */
1594             U16 charid       = 0;         /* sanity init */
1595             U8 *scan         = (U8*)NULL; /* sanity init */
1596             STRLEN foldlen   = 0;         /* required init */
1597             U32 wordlen      = 0;         /* required init */
1598             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1599
1600             if (OP(noper) != NOTHING) {
1601                 for ( ; uc < e ; uc += len ) {
1602
1603                     TRIE_READ_CHAR;
1604
1605                     if ( uvc < 256 ) {
1606                         charid = trie->charmap[ uvc ];
1607                     } else {
1608                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1609                         if ( !svpp ) {
1610                             charid = 0;
1611                         } else {
1612                             charid=(U16)SvIV( *svpp );
1613                         }
1614                     }
1615                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1616                     if ( charid ) {
1617
1618                         U16 check;
1619                         U32 newstate = 0;
1620
1621                         charid--;
1622                         if ( !trie->states[ state ].trans.list ) {
1623                             TRIE_LIST_NEW( state );
1624                         }
1625                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1626                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1627                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1628                                 break;
1629                             }
1630                         }
1631                         if ( ! newstate ) {
1632                             newstate = next_alloc++;
1633                             prev_states[newstate] = state;
1634                             TRIE_LIST_PUSH( state, charid, newstate );
1635                             transcount++;
1636                         }
1637                         state = newstate;
1638                     } else {
1639                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1640                     }
1641                 }
1642             }
1643             TRIE_HANDLE_WORD(state);
1644
1645         } /* end second pass */
1646
1647         /* next alloc is the NEXT state to be allocated */
1648         trie->statecount = next_alloc;
1649         trie->states = (reg_trie_state *)
1650             PerlMemShared_realloc( trie->states,
1651                                    next_alloc
1652                                    * sizeof(reg_trie_state) );
1653
1654         /* and now dump it out before we compress it */
1655         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1656                                                          revcharmap, next_alloc,
1657                                                          depth+1)
1658         );
1659
1660         trie->trans = (reg_trie_trans *)
1661             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1662         {
1663             U32 state;
1664             U32 tp = 0;
1665             U32 zp = 0;
1666
1667
1668             for( state=1 ; state < next_alloc ; state ++ ) {
1669                 U32 base=0;
1670
1671                 /*
1672                 DEBUG_TRIE_COMPILE_MORE_r(
1673                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1674                 );
1675                 */
1676
1677                 if (trie->states[state].trans.list) {
1678                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1679                     U16 maxid=minid;
1680                     U16 idx;
1681
1682                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1683                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1684                         if ( forid < minid ) {
1685                             minid=forid;
1686                         } else if ( forid > maxid ) {
1687                             maxid=forid;
1688                         }
1689                     }
1690                     if ( transcount < tp + maxid - minid + 1) {
1691                         transcount *= 2;
1692                         trie->trans = (reg_trie_trans *)
1693                             PerlMemShared_realloc( trie->trans,
1694                                                      transcount
1695                                                      * sizeof(reg_trie_trans) );
1696                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1697                     }
1698                     base = trie->uniquecharcount + tp - minid;
1699                     if ( maxid == minid ) {
1700                         U32 set = 0;
1701                         for ( ; zp < tp ; zp++ ) {
1702                             if ( ! trie->trans[ zp ].next ) {
1703                                 base = trie->uniquecharcount + zp - minid;
1704                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1705                                 trie->trans[ zp ].check = state;
1706                                 set = 1;
1707                                 break;
1708                             }
1709                         }
1710                         if ( !set ) {
1711                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1712                             trie->trans[ tp ].check = state;
1713                             tp++;
1714                             zp = tp;
1715                         }
1716                     } else {
1717                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1718                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1719                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1720                             trie->trans[ tid ].check = state;
1721                         }
1722                         tp += ( maxid - minid + 1 );
1723                     }
1724                     Safefree(trie->states[ state ].trans.list);
1725                 }
1726                 /*
1727                 DEBUG_TRIE_COMPILE_MORE_r(
1728                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1729                 );
1730                 */
1731                 trie->states[ state ].trans.base=base;
1732             }
1733             trie->lasttrans = tp + 1;
1734         }
1735     } else {
1736         /*
1737            Second Pass -- Flat Table Representation.
1738
1739            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1740            We know that we will need Charcount+1 trans at most to store the data
1741            (one row per char at worst case) So we preallocate both structures
1742            assuming worst case.
1743
1744            We then construct the trie using only the .next slots of the entry
1745            structs.
1746
1747            We use the .check field of the first entry of the node temporarily to
1748            make compression both faster and easier by keeping track of how many non
1749            zero fields are in the node.
1750
1751            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1752            transition.
1753
1754            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1755            number representing the first entry of the node, and state as a
1756            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1757            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1758            are 2 entrys per node. eg:
1759
1760              A B       A B
1761           1. 2 4    1. 3 7
1762           2. 0 3    3. 0 5
1763           3. 0 0    5. 0 0
1764           4. 0 0    7. 0 0
1765
1766            The table is internally in the right hand, idx form. However as we also
1767            have to deal with the states array which is indexed by nodenum we have to
1768            use TRIE_NODENUM() to convert.
1769
1770         */
1771         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1772             "%*sCompiling trie using table compiler\n",
1773             (int)depth * 2 + 2, ""));
1774
1775         trie->trans = (reg_trie_trans *)
1776             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1777                                   * trie->uniquecharcount + 1,
1778                                   sizeof(reg_trie_trans) );
1779         trie->states = (reg_trie_state *)
1780             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1781                                   sizeof(reg_trie_state) );
1782         next_alloc = trie->uniquecharcount + 1;
1783
1784
1785         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1786
1787             regnode * const noper   = NEXTOPER( cur );
1788             const U8 *uc     = (U8*)STRING( noper );
1789             const U8 * const e = uc + STR_LEN( noper );
1790
1791             U32 state        = 1;         /* required init */
1792
1793             U16 charid       = 0;         /* sanity init */
1794             U32 accept_state = 0;         /* sanity init */
1795             U8 *scan         = (U8*)NULL; /* sanity init */
1796
1797             STRLEN foldlen   = 0;         /* required init */
1798             U32 wordlen      = 0;         /* required init */
1799             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1800
1801             if ( OP(noper) != NOTHING ) {
1802                 for ( ; uc < e ; uc += len ) {
1803
1804                     TRIE_READ_CHAR;
1805
1806                     if ( uvc < 256 ) {
1807                         charid = trie->charmap[ uvc ];
1808                     } else {
1809                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1810                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1811                     }
1812                     if ( charid ) {
1813                         charid--;
1814                         if ( !trie->trans[ state + charid ].next ) {
1815                             trie->trans[ state + charid ].next = next_alloc;
1816                             trie->trans[ state ].check++;
1817                             prev_states[TRIE_NODENUM(next_alloc)]
1818                                     = TRIE_NODENUM(state);
1819                             next_alloc += trie->uniquecharcount;
1820                         }
1821                         state = trie->trans[ state + charid ].next;
1822                     } else {
1823                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1824                     }
1825                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1826                 }
1827             }
1828             accept_state = TRIE_NODENUM( state );
1829             TRIE_HANDLE_WORD(accept_state);
1830
1831         } /* end second pass */
1832
1833         /* and now dump it out before we compress it */
1834         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1835                                                           revcharmap,
1836                                                           next_alloc, depth+1));
1837
1838         {
1839         /*
1840            * Inplace compress the table.*
1841
1842            For sparse data sets the table constructed by the trie algorithm will
1843            be mostly 0/FAIL transitions or to put it another way mostly empty.
1844            (Note that leaf nodes will not contain any transitions.)
1845
1846            This algorithm compresses the tables by eliminating most such
1847            transitions, at the cost of a modest bit of extra work during lookup:
1848
1849            - Each states[] entry contains a .base field which indicates the
1850            index in the state[] array wheres its transition data is stored.
1851
1852            - If .base is 0 there are no valid transitions from that node.
1853
1854            - If .base is nonzero then charid is added to it to find an entry in
1855            the trans array.
1856
1857            -If trans[states[state].base+charid].check!=state then the
1858            transition is taken to be a 0/Fail transition. Thus if there are fail
1859            transitions at the front of the node then the .base offset will point
1860            somewhere inside the previous nodes data (or maybe even into a node
1861            even earlier), but the .check field determines if the transition is
1862            valid.
1863
1864            XXX - wrong maybe?
1865            The following process inplace converts the table to the compressed
1866            table: We first do not compress the root node 1,and mark all its
1867            .check pointers as 1 and set its .base pointer as 1 as well. This
1868            allows us to do a DFA construction from the compressed table later,
1869            and ensures that any .base pointers we calculate later are greater
1870            than 0.
1871
1872            - We set 'pos' to indicate the first entry of the second node.
1873
1874            - We then iterate over the columns of the node, finding the first and
1875            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1876            and set the .check pointers accordingly, and advance pos
1877            appropriately and repreat for the next node. Note that when we copy
1878            the next pointers we have to convert them from the original
1879            NODEIDX form to NODENUM form as the former is not valid post
1880            compression.
1881
1882            - If a node has no transitions used we mark its base as 0 and do not
1883            advance the pos pointer.
1884
1885            - If a node only has one transition we use a second pointer into the
1886            structure to fill in allocated fail transitions from other states.
1887            This pointer is independent of the main pointer and scans forward
1888            looking for null transitions that are allocated to a state. When it
1889            finds one it writes the single transition into the "hole".  If the
1890            pointer doesnt find one the single transition is appended as normal.
1891
1892            - Once compressed we can Renew/realloc the structures to release the
1893            excess space.
1894
1895            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1896            specifically Fig 3.47 and the associated pseudocode.
1897
1898            demq
1899         */
1900         const U32 laststate = TRIE_NODENUM( next_alloc );
1901         U32 state, charid;
1902         U32 pos = 0, zp=0;
1903         trie->statecount = laststate;
1904
1905         for ( state = 1 ; state < laststate ; state++ ) {
1906             U8 flag = 0;
1907             const U32 stateidx = TRIE_NODEIDX( state );
1908             const U32 o_used = trie->trans[ stateidx ].check;
1909             U32 used = trie->trans[ stateidx ].check;
1910             trie->trans[ stateidx ].check = 0;
1911
1912             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1913                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1914                     if ( trie->trans[ stateidx + charid ].next ) {
1915                         if (o_used == 1) {
1916                             for ( ; zp < pos ; zp++ ) {
1917                                 if ( ! trie->trans[ zp ].next ) {
1918                                     break;
1919                                 }
1920                             }
1921                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1922                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1923                             trie->trans[ zp ].check = state;
1924                             if ( ++zp > pos ) pos = zp;
1925                             break;
1926                         }
1927                         used--;
1928                     }
1929                     if ( !flag ) {
1930                         flag = 1;
1931                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1932                     }
1933                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1934                     trie->trans[ pos ].check = state;
1935                     pos++;
1936                 }
1937             }
1938         }
1939         trie->lasttrans = pos + 1;
1940         trie->states = (reg_trie_state *)
1941             PerlMemShared_realloc( trie->states, laststate
1942                                    * sizeof(reg_trie_state) );
1943         DEBUG_TRIE_COMPILE_MORE_r(
1944                 PerlIO_printf( Perl_debug_log,
1945                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1946                     (int)depth * 2 + 2,"",
1947                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1948                     (IV)next_alloc,
1949                     (IV)pos,
1950                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1951             );
1952
1953         } /* end table compress */
1954     }
1955     DEBUG_TRIE_COMPILE_MORE_r(
1956             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1957                 (int)depth * 2 + 2, "",
1958                 (UV)trie->statecount,
1959                 (UV)trie->lasttrans)
1960     );
1961     /* resize the trans array to remove unused space */
1962     trie->trans = (reg_trie_trans *)
1963         PerlMemShared_realloc( trie->trans, trie->lasttrans
1964                                * sizeof(reg_trie_trans) );
1965
1966     {   /* Modify the program and insert the new TRIE node */
1967         U8 nodetype =(U8)(flags & 0xFF);
1968         char *str=NULL;
1969
1970 #ifdef DEBUGGING
1971         regnode *optimize = NULL;
1972 #ifdef RE_TRACK_PATTERN_OFFSETS
1973
1974         U32 mjd_offset = 0;
1975         U32 mjd_nodelen = 0;
1976 #endif /* RE_TRACK_PATTERN_OFFSETS */
1977 #endif /* DEBUGGING */
1978         /*
1979            This means we convert either the first branch or the first Exact,
1980            depending on whether the thing following (in 'last') is a branch
1981            or not and whther first is the startbranch (ie is it a sub part of
1982            the alternation or is it the whole thing.)
1983            Assuming its a sub part we convert the EXACT otherwise we convert
1984            the whole branch sequence, including the first.
1985          */
1986         /* Find the node we are going to overwrite */
1987         if ( first != startbranch || OP( last ) == BRANCH ) {
1988             /* branch sub-chain */
1989             NEXT_OFF( first ) = (U16)(last - first);
1990 #ifdef RE_TRACK_PATTERN_OFFSETS
1991             DEBUG_r({
1992                 mjd_offset= Node_Offset((convert));
1993                 mjd_nodelen= Node_Length((convert));
1994             });
1995 #endif
1996             /* whole branch chain */
1997         }
1998 #ifdef RE_TRACK_PATTERN_OFFSETS
1999         else {
2000             DEBUG_r({
2001                 const  regnode *nop = NEXTOPER( convert );
2002                 mjd_offset= Node_Offset((nop));
2003                 mjd_nodelen= Node_Length((nop));
2004             });
2005         }
2006         DEBUG_OPTIMISE_r(
2007             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2008                 (int)depth * 2 + 2, "",
2009                 (UV)mjd_offset, (UV)mjd_nodelen)
2010         );
2011 #endif
2012         /* But first we check to see if there is a common prefix we can
2013            split out as an EXACT and put in front of the TRIE node.  */
2014         trie->startstate= 1;
2015         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2016             U32 state;
2017             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2018                 U32 ofs = 0;
2019                 I32 idx = -1;
2020                 U32 count = 0;
2021                 const U32 base = trie->states[ state ].trans.base;
2022
2023                 if ( trie->states[state].wordnum )
2024                         count = 1;
2025
2026                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2027                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2028                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2029                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2030                     {
2031                         if ( ++count > 1 ) {
2032                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2033                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2034                             if ( state == 1 ) break;
2035                             if ( count == 2 ) {
2036                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2037                                 DEBUG_OPTIMISE_r(
2038                                     PerlIO_printf(Perl_debug_log,
2039                                         "%*sNew Start State=%"UVuf" Class: [",
2040                                         (int)depth * 2 + 2, "",
2041                                         (UV)state));
2042                                 if (idx >= 0) {
2043                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2044                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2045
2046                                     TRIE_BITMAP_SET(trie,*ch);
2047                                     if ( folder )
2048                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2049                                     DEBUG_OPTIMISE_r(
2050                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2051                                     );
2052                                 }
2053                             }
2054                             TRIE_BITMAP_SET(trie,*ch);
2055                             if ( folder )
2056                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2057                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2058                         }
2059                         idx = ofs;
2060                     }
2061                 }
2062                 if ( count == 1 ) {
2063                     SV **tmp = av_fetch( revcharmap, idx, 0);
2064                     STRLEN len;
2065                     char *ch = SvPV( *tmp, len );
2066                     DEBUG_OPTIMISE_r({
2067                         SV *sv=sv_newmortal();
2068                         PerlIO_printf( Perl_debug_log,
2069                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2070                             (int)depth * 2 + 2, "",
2071                             (UV)state, (UV)idx,
2072                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2073                                 PL_colors[0], PL_colors[1],
2074                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2075                                 PERL_PV_ESCAPE_FIRSTCHAR
2076                             )
2077                         );
2078                     });
2079                     if ( state==1 ) {
2080                         OP( convert ) = nodetype;
2081                         str=STRING(convert);
2082                         STR_LEN(convert)=0;
2083                     }
2084                     STR_LEN(convert) += len;
2085                     while (len--)
2086                         *str++ = *ch++;
2087                 } else {
2088 #ifdef DEBUGGING
2089                     if (state>1)
2090                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2091 #endif
2092                     break;
2093                 }
2094             }
2095             trie->prefixlen = (state-1);
2096             if (str) {
2097                 regnode *n = convert+NODE_SZ_STR(convert);
2098                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2099                 trie->startstate = state;
2100                 trie->minlen -= (state - 1);
2101                 trie->maxlen -= (state - 1);
2102 #ifdef DEBUGGING
2103                /* At least the UNICOS C compiler choked on this
2104                 * being argument to DEBUG_r(), so let's just have
2105                 * it right here. */
2106                if (
2107 #ifdef PERL_EXT_RE_BUILD
2108                    1
2109 #else
2110                    DEBUG_r_TEST
2111 #endif
2112                    ) {
2113                    regnode *fix = convert;
2114                    U32 word = trie->wordcount;
2115                    mjd_nodelen++;
2116                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2117                    while( ++fix < n ) {
2118                        Set_Node_Offset_Length(fix, 0, 0);
2119                    }
2120                    while (word--) {
2121                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2122                        if (tmp) {
2123                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2124                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2125                            else
2126                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2127                        }
2128                    }
2129                }
2130 #endif
2131                 if (trie->maxlen) {
2132                     convert = n;
2133                 } else {
2134                     NEXT_OFF(convert) = (U16)(tail - convert);
2135                     DEBUG_r(optimize= n);
2136                 }
2137             }
2138         }
2139         if (!jumper)
2140             jumper = last;
2141         if ( trie->maxlen ) {
2142             NEXT_OFF( convert ) = (U16)(tail - convert);
2143             ARG_SET( convert, data_slot );
2144             /* Store the offset to the first unabsorbed branch in
2145                jump[0], which is otherwise unused by the jump logic.
2146                We use this when dumping a trie and during optimisation. */
2147             if (trie->jump)
2148                 trie->jump[0] = (U16)(nextbranch - convert);
2149
2150             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2151              *   and there is a bitmap
2152              *   and the first "jump target" node we found leaves enough room
2153              * then convert the TRIE node into a TRIEC node, with the bitmap
2154              * embedded inline in the opcode - this is hypothetically faster.
2155              */
2156             if ( !trie->states[trie->startstate].wordnum
2157                  && trie->bitmap
2158                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2159             {
2160                 OP( convert ) = TRIEC;
2161                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2162                 PerlMemShared_free(trie->bitmap);
2163                 trie->bitmap= NULL;
2164             } else
2165                 OP( convert ) = TRIE;
2166
2167             /* store the type in the flags */
2168             convert->flags = nodetype;
2169             DEBUG_r({
2170             optimize = convert
2171                       + NODE_STEP_REGNODE
2172                       + regarglen[ OP( convert ) ];
2173             });
2174             /* XXX We really should free up the resource in trie now,
2175                    as we won't use them - (which resources?) dmq */
2176         }
2177         /* needed for dumping*/
2178         DEBUG_r(if (optimize) {
2179             regnode *opt = convert;
2180
2181             while ( ++opt < optimize) {
2182                 Set_Node_Offset_Length(opt,0,0);
2183             }
2184             /*
2185                 Try to clean up some of the debris left after the
2186                 optimisation.
2187              */
2188             while( optimize < jumper ) {
2189                 mjd_nodelen += Node_Length((optimize));
2190                 OP( optimize ) = OPTIMIZED;
2191                 Set_Node_Offset_Length(optimize,0,0);
2192                 optimize++;
2193             }
2194             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2195         });
2196     } /* end node insert */
2197     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2198
2199     /*  Finish populating the prev field of the wordinfo array.  Walk back
2200      *  from each accept state until we find another accept state, and if
2201      *  so, point the first word's .prev field at the second word. If the
2202      *  second already has a .prev field set, stop now. This will be the
2203      *  case either if we've already processed that word's accept state,
2204      *  or that state had multiple words, and the overspill words were
2205      *  already linked up earlier.
2206      */
2207     {
2208         U16 word;
2209         U32 state;
2210         U16 prev;
2211
2212         for (word=1; word <= trie->wordcount; word++) {
2213             prev = 0;
2214             if (trie->wordinfo[word].prev)
2215                 continue;
2216             state = trie->wordinfo[word].accept;
2217             while (state) {
2218                 state = prev_states[state];
2219                 if (!state)
2220                     break;
2221                 prev = trie->states[state].wordnum;
2222                 if (prev)
2223                     break;
2224             }
2225             trie->wordinfo[word].prev = prev;
2226         }
2227         Safefree(prev_states);
2228     }
2229
2230
2231     /* and now dump out the compressed format */
2232     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2233
2234     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2235 #ifdef DEBUGGING
2236     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2237     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2238 #else
2239     SvREFCNT_dec(revcharmap);
2240 #endif
2241     return trie->jump
2242            ? MADE_JUMP_TRIE
2243            : trie->startstate>1
2244              ? MADE_EXACT_TRIE
2245              : MADE_TRIE;
2246 }
2247
2248 STATIC void
2249 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2250 {
2251 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2252
2253    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2254    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2255    ISBN 0-201-10088-6
2256
2257    We find the fail state for each state in the trie, this state is the longest proper
2258    suffix of the current state's 'word' that is also a proper prefix of another word in our
2259    trie. State 1 represents the word '' and is thus the default fail state. This allows
2260    the DFA not to have to restart after its tried and failed a word at a given point, it
2261    simply continues as though it had been matching the other word in the first place.
2262    Consider
2263       'abcdgu'=~/abcdefg|cdgu/
2264    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2265    fail, which would bring us to the state representing 'd' in the second word where we would
2266    try 'g' and succeed, proceeding to match 'cdgu'.
2267  */
2268  /* add a fail transition */
2269     const U32 trie_offset = ARG(source);
2270     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2271     U32 *q;
2272     const U32 ucharcount = trie->uniquecharcount;
2273     const U32 numstates = trie->statecount;
2274     const U32 ubound = trie->lasttrans + ucharcount;
2275     U32 q_read = 0;
2276     U32 q_write = 0;
2277     U32 charid;
2278     U32 base = trie->states[ 1 ].trans.base;
2279     U32 *fail;
2280     reg_ac_data *aho;
2281     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2282     GET_RE_DEBUG_FLAGS_DECL;
2283
2284     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2285 #ifndef DEBUGGING
2286     PERL_UNUSED_ARG(depth);
2287 #endif
2288
2289
2290     ARG_SET( stclass, data_slot );
2291     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2292     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2293     aho->trie=trie_offset;
2294     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2295     Copy( trie->states, aho->states, numstates, reg_trie_state );
2296     Newxz( q, numstates, U32);
2297     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2298     aho->refcount = 1;
2299     fail = aho->fail;
2300     /* initialize fail[0..1] to be 1 so that we always have
2301        a valid final fail state */
2302     fail[ 0 ] = fail[ 1 ] = 1;
2303
2304     for ( charid = 0; charid < ucharcount ; charid++ ) {
2305         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2306         if ( newstate ) {
2307             q[ q_write ] = newstate;
2308             /* set to point at the root */
2309             fail[ q[ q_write++ ] ]=1;
2310         }
2311     }
2312     while ( q_read < q_write) {
2313         const U32 cur = q[ q_read++ % numstates ];
2314         base = trie->states[ cur ].trans.base;
2315
2316         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2317             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2318             if (ch_state) {
2319                 U32 fail_state = cur;
2320                 U32 fail_base;
2321                 do {
2322                     fail_state = fail[ fail_state ];
2323                     fail_base = aho->states[ fail_state ].trans.base;
2324                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2325
2326                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2327                 fail[ ch_state ] = fail_state;
2328                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2329                 {
2330                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2331                 }
2332                 q[ q_write++ % numstates] = ch_state;
2333             }
2334         }
2335     }
2336     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2337        when we fail in state 1, this allows us to use the
2338        charclass scan to find a valid start char. This is based on the principle
2339        that theres a good chance the string being searched contains lots of stuff
2340        that cant be a start char.
2341      */
2342     fail[ 0 ] = fail[ 1 ] = 0;
2343     DEBUG_TRIE_COMPILE_r({
2344         PerlIO_printf(Perl_debug_log,
2345                       "%*sStclass Failtable (%"UVuf" states): 0",
2346                       (int)(depth * 2), "", (UV)numstates
2347         );
2348         for( q_read=1; q_read<numstates; q_read++ ) {
2349             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2350         }
2351         PerlIO_printf(Perl_debug_log, "\n");
2352     });
2353     Safefree(q);
2354     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2355 }
2356
2357
2358 /*
2359  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2360  * These need to be revisited when a newer toolchain becomes available.
2361  */
2362 #if defined(__sparc64__) && defined(__GNUC__)
2363 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2364 #       undef  SPARC64_GCC_WORKAROUND
2365 #       define SPARC64_GCC_WORKAROUND 1
2366 #   endif
2367 #endif
2368
2369 #define DEBUG_PEEP(str,scan,depth) \
2370     DEBUG_OPTIMISE_r({if (scan){ \
2371        SV * const mysv=sv_newmortal(); \
2372        regnode *Next = regnext(scan); \
2373        regprop(RExC_rx, mysv, scan); \
2374        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2375        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2376        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2377    }});
2378
2379
2380
2381
2382
2383 #define JOIN_EXACT(scan,min,flags) \
2384     if (PL_regkind[OP(scan)] == EXACT) \
2385         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2386
2387 STATIC U32
2388 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2389     /* Merge several consecutive EXACTish nodes into one. */
2390     regnode *n = regnext(scan);
2391     U32 stringok = 1;
2392     regnode *next = scan + NODE_SZ_STR(scan);
2393     U32 merged = 0;
2394     U32 stopnow = 0;
2395 #ifdef DEBUGGING
2396     regnode *stop = scan;
2397     GET_RE_DEBUG_FLAGS_DECL;
2398 #else
2399     PERL_UNUSED_ARG(depth);
2400 #endif
2401
2402     PERL_ARGS_ASSERT_JOIN_EXACT;
2403 #ifndef EXPERIMENTAL_INPLACESCAN
2404     PERL_UNUSED_ARG(flags);
2405     PERL_UNUSED_ARG(val);
2406 #endif
2407     DEBUG_PEEP("join",scan,depth);
2408
2409     /* Skip NOTHING, merge EXACT*. */
2410     while (n &&
2411            ( PL_regkind[OP(n)] == NOTHING ||
2412              (stringok && (OP(n) == OP(scan))))
2413            && NEXT_OFF(n)
2414            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2415
2416         if (OP(n) == TAIL || n > next)
2417             stringok = 0;
2418         if (PL_regkind[OP(n)] == NOTHING) {
2419             DEBUG_PEEP("skip:",n,depth);
2420             NEXT_OFF(scan) += NEXT_OFF(n);
2421             next = n + NODE_STEP_REGNODE;
2422 #ifdef DEBUGGING
2423             if (stringok)
2424                 stop = n;
2425 #endif
2426             n = regnext(n);
2427         }
2428         else if (stringok) {
2429             const unsigned int oldl = STR_LEN(scan);
2430             regnode * const nnext = regnext(n);
2431
2432             DEBUG_PEEP("merg",n,depth);
2433
2434             merged++;
2435             if (oldl + STR_LEN(n) > U8_MAX)
2436                 break;
2437             NEXT_OFF(scan) += NEXT_OFF(n);
2438             STR_LEN(scan) += STR_LEN(n);
2439             next = n + NODE_SZ_STR(n);
2440             /* Now we can overwrite *n : */
2441             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2442 #ifdef DEBUGGING
2443             stop = next - 1;
2444 #endif
2445             n = nnext;
2446             if (stopnow) break;
2447         }
2448
2449 #ifdef EXPERIMENTAL_INPLACESCAN
2450         if (flags && !NEXT_OFF(n)) {
2451             DEBUG_PEEP("atch", val, depth);
2452             if (reg_off_by_arg[OP(n)]) {
2453                 ARG_SET(n, val - n);
2454             }
2455             else {
2456                 NEXT_OFF(n) = val - n;
2457             }
2458             stopnow = 1;
2459         }
2460 #endif
2461     }
2462 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2463 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2464 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2465 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2466
2467     if (UTF
2468         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2469         && ( STR_LEN(scan) >= 6 ) )
2470     {
2471     /*
2472     Two problematic code points in Unicode casefolding of EXACT nodes:
2473
2474     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2475     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2476
2477     which casefold to
2478
2479     Unicode                      UTF-8
2480
2481     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2482     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2483
2484     This means that in case-insensitive matching (or "loose matching",
2485     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2486     length of the above casefolded versions) can match a target string
2487     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2488     This would rather mess up the minimum length computation.
2489
2490     What we'll do is to look for the tail four bytes, and then peek
2491     at the preceding two bytes to see whether we need to decrease
2492     the minimum length by four (six minus two).
2493
2494     Thanks to the design of UTF-8, there cannot be false matches:
2495     A sequence of valid UTF-8 bytes cannot be a subsequence of
2496     another valid sequence of UTF-8 bytes.
2497
2498     */
2499          char * const s0 = STRING(scan), *s, *t;
2500          char * const s1 = s0 + STR_LEN(scan) - 1;
2501          char * const s2 = s1 - 4;
2502 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2503          const char t0[] = "\xaf\x49\xaf\x42";
2504 #else
2505          const char t0[] = "\xcc\x88\xcc\x81";
2506 #endif
2507          const char * const t1 = t0 + 3;
2508
2509          for (s = s0 + 2;
2510               s < s2 && (t = ninstr(s, s1, t0, t1));
2511               s = t + 4) {
2512 #ifdef EBCDIC
2513               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2514                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2515 #else
2516               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2517                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2518 #endif
2519                    *min -= 4;
2520          }
2521     }
2522
2523 #ifdef DEBUGGING
2524     /* Allow dumping */
2525     n = scan + NODE_SZ_STR(scan);
2526     while (n <= stop) {
2527         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2528             OP(n) = OPTIMIZED;
2529             NEXT_OFF(n) = 0;
2530         }
2531         n++;
2532     }
2533 #endif
2534     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2535     return stopnow;
2536 }
2537
2538 /* REx optimizer.  Converts nodes into quicker variants "in place".
2539    Finds fixed substrings.  */
2540
2541 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2542    to the position after last scanned or to NULL. */
2543
2544 #define INIT_AND_WITHP \
2545     assert(!and_withp); \
2546     Newx(and_withp,1,struct regnode_charclass_class); \
2547     SAVEFREEPV(and_withp)
2548
2549 /* this is a chain of data about sub patterns we are processing that
2550    need to be handled separately/specially in study_chunk. Its so
2551    we can simulate recursion without losing state.  */
2552 struct scan_frame;
2553 typedef struct scan_frame {
2554     regnode *last;  /* last node to process in this frame */
2555     regnode *next;  /* next node to process when last is reached */
2556     struct scan_frame *prev; /*previous frame*/
2557     I32 stop; /* what stopparen do we use */
2558 } scan_frame;
2559
2560
2561 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2562
2563 #define CASE_SYNST_FNC(nAmE)                                       \
2564 case nAmE:                                                         \
2565     if (flags & SCF_DO_STCLASS_AND) {                              \
2566             for (value = 0; value < 256; value++)                  \
2567                 if (!is_ ## nAmE ## _cp(value))                       \
2568                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2569     }                                                              \
2570     else {                                                         \
2571             for (value = 0; value < 256; value++)                  \
2572                 if (is_ ## nAmE ## _cp(value))                        \
2573                     ANYOF_BITMAP_SET(data->start_class, value);    \
2574     }                                                              \
2575     break;                                                         \
2576 case N ## nAmE:                                                    \
2577     if (flags & SCF_DO_STCLASS_AND) {                              \
2578             for (value = 0; value < 256; value++)                   \
2579                 if (is_ ## nAmE ## _cp(value))                         \
2580                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2581     }                                                               \
2582     else {                                                          \
2583             for (value = 0; value < 256; value++)                   \
2584                 if (!is_ ## nAmE ## _cp(value))                        \
2585                     ANYOF_BITMAP_SET(data->start_class, value);     \
2586     }                                                               \
2587     break
2588
2589
2590
2591 STATIC I32
2592 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2593                         I32 *minlenp, I32 *deltap,
2594                         regnode *last,
2595                         scan_data_t *data,
2596                         I32 stopparen,
2597                         U8* recursed,
2598                         struct regnode_charclass_class *and_withp,
2599                         U32 flags, U32 depth)
2600                         /* scanp: Start here (read-write). */
2601                         /* deltap: Write maxlen-minlen here. */
2602                         /* last: Stop before this one. */
2603                         /* data: string data about the pattern */
2604                         /* stopparen: treat close N as END */
2605                         /* recursed: which subroutines have we recursed into */
2606                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2607 {
2608     dVAR;
2609     I32 min = 0, pars = 0, code;
2610     regnode *scan = *scanp, *next;
2611     I32 delta = 0;
2612     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2613     int is_inf_internal = 0;            /* The studied chunk is infinite */
2614     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2615     scan_data_t data_fake;
2616     SV *re_trie_maxbuff = NULL;
2617     regnode *first_non_open = scan;
2618     I32 stopmin = I32_MAX;
2619     scan_frame *frame = NULL;
2620     GET_RE_DEBUG_FLAGS_DECL;
2621
2622     PERL_ARGS_ASSERT_STUDY_CHUNK;
2623
2624 #ifdef DEBUGGING
2625     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2626 #endif
2627
2628     if ( depth == 0 ) {
2629         while (first_non_open && OP(first_non_open) == OPEN)
2630             first_non_open=regnext(first_non_open);
2631     }
2632
2633
2634   fake_study_recurse:
2635     while ( scan && OP(scan) != END && scan < last ){
2636         /* Peephole optimizer: */
2637         DEBUG_STUDYDATA("Peep:", data,depth);
2638         DEBUG_PEEP("Peep",scan,depth);
2639         JOIN_EXACT(scan,&min,0);
2640
2641         /* Follow the next-chain of the current node and optimize
2642            away all the NOTHINGs from it.  */
2643         if (OP(scan) != CURLYX) {
2644             const int max = (reg_off_by_arg[OP(scan)]
2645                        ? I32_MAX
2646                        /* I32 may be smaller than U16 on CRAYs! */
2647                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2648             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2649             int noff;
2650             regnode *n = scan;
2651
2652             /* Skip NOTHING and LONGJMP. */
2653             while ((n = regnext(n))
2654                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2655                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2656                    && off + noff < max)
2657                 off += noff;
2658             if (reg_off_by_arg[OP(scan)])
2659                 ARG(scan) = off;
2660             else
2661                 NEXT_OFF(scan) = off;
2662         }
2663
2664
2665
2666         /* The principal pseudo-switch.  Cannot be a switch, since we
2667            look into several different things.  */
2668         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2669                    || OP(scan) == IFTHEN) {
2670             next = regnext(scan);
2671             code = OP(scan);
2672             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2673
2674             if (OP(next) == code || code == IFTHEN) {
2675                 /* NOTE - There is similar code to this block below for handling
2676                    TRIE nodes on a re-study.  If you change stuff here check there
2677                    too. */
2678                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2679                 struct regnode_charclass_class accum;
2680                 regnode * const startbranch=scan;
2681
2682                 if (flags & SCF_DO_SUBSTR)
2683                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2684                 if (flags & SCF_DO_STCLASS)
2685                     cl_init_zero(pRExC_state, &accum);
2686
2687                 while (OP(scan) == code) {
2688                     I32 deltanext, minnext, f = 0, fake;
2689                     struct regnode_charclass_class this_class;
2690
2691                     num++;
2692                     data_fake.flags = 0;
2693                     if (data) {
2694                         data_fake.whilem_c = data->whilem_c;
2695                         data_fake.last_closep = data->last_closep;
2696                     }
2697                     else
2698                         data_fake.last_closep = &fake;
2699
2700                     data_fake.pos_delta = delta;
2701                     next = regnext(scan);
2702                     scan = NEXTOPER(scan);
2703                     if (code != BRANCH)
2704                         scan = NEXTOPER(scan);
2705                     if (flags & SCF_DO_STCLASS) {
2706                         cl_init(pRExC_state, &this_class);
2707                         data_fake.start_class = &this_class;
2708                         f = SCF_DO_STCLASS_AND;
2709                     }
2710                     if (flags & SCF_WHILEM_VISITED_POS)
2711                         f |= SCF_WHILEM_VISITED_POS;
2712
2713                     /* we suppose the run is continuous, last=next...*/
2714                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2715                                           next, &data_fake,
2716                                           stopparen, recursed, NULL, f,depth+1);
2717                     if (min1 > minnext)
2718                         min1 = minnext;
2719                     if (max1 < minnext + deltanext)
2720                         max1 = minnext + deltanext;
2721                     if (deltanext == I32_MAX)
2722                         is_inf = is_inf_internal = 1;
2723                     scan = next;
2724                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2725                         pars++;
2726                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2727                         if ( stopmin > minnext)
2728                             stopmin = min + min1;
2729                         flags &= ~SCF_DO_SUBSTR;
2730                         if (data)
2731                             data->flags |= SCF_SEEN_ACCEPT;
2732                     }
2733                     if (data) {
2734                         if (data_fake.flags & SF_HAS_EVAL)
2735                             data->flags |= SF_HAS_EVAL;
2736                         data->whilem_c = data_fake.whilem_c;
2737                     }
2738                     if (flags & SCF_DO_STCLASS)
2739                         cl_or(pRExC_state, &accum, &this_class);
2740                 }
2741                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2742                     min1 = 0;
2743                 if (flags & SCF_DO_SUBSTR) {
2744                     data->pos_min += min1;
2745                     data->pos_delta += max1 - min1;
2746                     if (max1 != min1 || is_inf)
2747                         data->longest = &(data->longest_float);
2748                 }
2749                 min += min1;
2750                 delta += max1 - min1;
2751                 if (flags & SCF_DO_STCLASS_OR) {
2752                     cl_or(pRExC_state, data->start_class, &accum);
2753                     if (min1) {
2754                         cl_and(data->start_class, and_withp);
2755                         flags &= ~SCF_DO_STCLASS;
2756                     }
2757                 }
2758                 else if (flags & SCF_DO_STCLASS_AND) {
2759                     if (min1) {
2760                         cl_and(data->start_class, &accum);
2761                         flags &= ~SCF_DO_STCLASS;
2762                     }
2763                     else {
2764                         /* Switch to OR mode: cache the old value of
2765                          * data->start_class */
2766                         INIT_AND_WITHP;
2767                         StructCopy(data->start_class, and_withp,
2768                                    struct regnode_charclass_class);
2769                         flags &= ~SCF_DO_STCLASS_AND;
2770                         StructCopy(&accum, data->start_class,
2771                                    struct regnode_charclass_class);
2772                         flags |= SCF_DO_STCLASS_OR;
2773                         data->start_class->flags |= ANYOF_EOS;
2774                     }
2775                 }
2776
2777                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2778                 /* demq.
2779
2780                    Assuming this was/is a branch we are dealing with: 'scan' now
2781                    points at the item that follows the branch sequence, whatever
2782                    it is. We now start at the beginning of the sequence and look
2783                    for subsequences of
2784
2785                    BRANCH->EXACT=>x1
2786                    BRANCH->EXACT=>x2
2787                    tail
2788
2789                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2790
2791                    If we can find such a subsequence we need to turn the first
2792                    element into a trie and then add the subsequent branch exact
2793                    strings to the trie.
2794
2795                    We have two cases
2796
2797                      1. patterns where the whole set of branches can be converted.
2798
2799                      2. patterns where only a subset can be converted.
2800
2801                    In case 1 we can replace the whole set with a single regop
2802                    for the trie. In case 2 we need to keep the start and end
2803                    branches so
2804
2805                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2806                      becomes BRANCH TRIE; BRANCH X;
2807
2808                   There is an additional case, that being where there is a
2809                   common prefix, which gets split out into an EXACT like node
2810                   preceding the TRIE node.
2811
2812                   If x(1..n)==tail then we can do a simple trie, if not we make
2813                   a "jump" trie, such that when we match the appropriate word
2814                   we "jump" to the appropriate tail node. Essentially we turn
2815                   a nested if into a case structure of sorts.
2816
2817                 */
2818
2819                     int made=0;
2820                     if (!re_trie_maxbuff) {
2821                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2822                         if (!SvIOK(re_trie_maxbuff))
2823                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2824                     }
2825                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2826                         regnode *cur;
2827                         regnode *first = (regnode *)NULL;
2828                         regnode *last = (regnode *)NULL;
2829                         regnode *tail = scan;
2830                         U8 optype = 0;
2831                         U32 count=0;
2832
2833 #ifdef DEBUGGING
2834                         SV * const mysv = sv_newmortal();       /* for dumping */
2835 #endif
2836                         /* var tail is used because there may be a TAIL
2837                            regop in the way. Ie, the exacts will point to the
2838                            thing following the TAIL, but the last branch will
2839                            point at the TAIL. So we advance tail. If we
2840                            have nested (?:) we may have to move through several
2841                            tails.
2842                          */
2843
2844                         while ( OP( tail ) == TAIL ) {
2845                             /* this is the TAIL generated by (?:) */
2846                             tail = regnext( tail );
2847                         }
2848
2849
2850                         DEBUG_OPTIMISE_r({
2851                             regprop(RExC_rx, mysv, tail );
2852                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2853                                 (int)depth * 2 + 2, "",
2854                                 "Looking for TRIE'able sequences. Tail node is: ",
2855                                 SvPV_nolen_const( mysv )
2856                             );
2857                         });
2858
2859                         /*
2860
2861                            step through the branches, cur represents each
2862                            branch, noper is the first thing to be matched
2863                            as part of that branch and noper_next is the
2864                            regnext() of that node. if noper is an EXACT
2865                            and noper_next is the same as scan (our current
2866                            position in the regex) then the EXACT branch is
2867                            a possible optimization target. Once we have
2868                            two or more consecutive such branches we can
2869                            create a trie of the EXACT's contents and stich
2870                            it in place. If the sequence represents all of
2871                            the branches we eliminate the whole thing and
2872                            replace it with a single TRIE. If it is a
2873                            subsequence then we need to stitch it in. This
2874                            means the first branch has to remain, and needs
2875                            to be repointed at the item on the branch chain
2876                            following the last branch optimized. This could
2877                            be either a BRANCH, in which case the
2878                            subsequence is internal, or it could be the
2879                            item following the branch sequence in which
2880                            case the subsequence is at the end.
2881
2882                         */
2883
2884                         /* dont use tail as the end marker for this traverse */
2885                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2886                             regnode * const noper = NEXTOPER( cur );
2887 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2888                             regnode * const noper_next = regnext( noper );
2889 #endif
2890
2891                             DEBUG_OPTIMISE_r({
2892                                 regprop(RExC_rx, mysv, cur);
2893                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2894                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2895
2896                                 regprop(RExC_rx, mysv, noper);
2897                                 PerlIO_printf( Perl_debug_log, " -> %s",
2898                                     SvPV_nolen_const(mysv));
2899
2900                                 if ( noper_next ) {
2901                                   regprop(RExC_rx, mysv, noper_next );
2902                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2903                                     SvPV_nolen_const(mysv));
2904                                 }
2905                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2906                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2907                             });
2908                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2909                                          : PL_regkind[ OP( noper ) ] == EXACT )
2910                                   || OP(noper) == NOTHING )
2911 #ifdef NOJUMPTRIE
2912                                   && noper_next == tail
2913 #endif
2914                                   && count < U16_MAX)
2915                             {
2916                                 count++;
2917                                 if ( !first || optype == NOTHING ) {
2918                                     if (!first) first = cur;
2919                                     optype = OP( noper );
2920                                 } else {
2921                                     last = cur;
2922                                 }
2923                             } else {
2924 /*
2925     Currently we do not believe that the trie logic can
2926     handle case insensitive matching properly when the
2927     pattern is not unicode (thus forcing unicode semantics).
2928
2929     If/when this is fixed the following define can be swapped
2930     in below to fully enable trie logic.
2931
2932 #define TRIE_TYPE_IS_SAFE 1
2933
2934 */
2935 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2936
2937                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2938                                     make_trie( pRExC_state,
2939                                             startbranch, first, cur, tail, count,
2940                                             optype, depth+1 );
2941                                 }
2942                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2943 #ifdef NOJUMPTRIE
2944                                      && noper_next == tail
2945 #endif
2946                                 ){
2947                                     count = 1;
2948                                     first = cur;
2949                                     optype = OP( noper );
2950                                 } else {
2951                                     count = 0;
2952                                     first = NULL;
2953                                     optype = 0;
2954                                 }
2955                                 last = NULL;
2956                             }
2957                         }
2958                         DEBUG_OPTIMISE_r({
2959                             regprop(RExC_rx, mysv, cur);
2960                             PerlIO_printf( Perl_debug_log,
2961                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2962                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2963
2964                         });
2965
2966                         if ( last && TRIE_TYPE_IS_SAFE ) {
2967                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2968 #ifdef TRIE_STUDY_OPT
2969                             if ( ((made == MADE_EXACT_TRIE &&
2970                                  startbranch == first)
2971                                  || ( first_non_open == first )) &&
2972                                  depth==0 ) {
2973                                 flags |= SCF_TRIE_RESTUDY;
2974                                 if ( startbranch == first
2975                                      && scan == tail )
2976                                 {
2977                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2978                                 }
2979                             }
2980 #endif
2981                         }
2982                     }
2983
2984                 } /* do trie */
2985
2986             }
2987             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2988                 scan = NEXTOPER(NEXTOPER(scan));
2989             } else                      /* single branch is optimized. */
2990                 scan = NEXTOPER(scan);
2991             continue;
2992         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2993             scan_frame *newframe = NULL;
2994             I32 paren;
2995             regnode *start;
2996             regnode *end;
2997
2998             if (OP(scan) != SUSPEND) {
2999             /* set the pointer */
3000                 if (OP(scan) == GOSUB) {
3001                     paren = ARG(scan);
3002                     RExC_recurse[ARG2L(scan)] = scan;
3003                     start = RExC_open_parens[paren-1];
3004                     end   = RExC_close_parens[paren-1];
3005                 } else {
3006                     paren = 0;
3007                     start = RExC_rxi->program + 1;
3008                     end   = RExC_opend;
3009                 }
3010                 if (!recursed) {
3011                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3012                     SAVEFREEPV(recursed);
3013                 }
3014                 if (!PAREN_TEST(recursed,paren+1)) {
3015                     PAREN_SET(recursed,paren+1);
3016                     Newx(newframe,1,scan_frame);
3017                 } else {
3018                     if (flags & SCF_DO_SUBSTR) {
3019                         SCAN_COMMIT(pRExC_state,data,minlenp);
3020                         data->longest = &(data->longest_float);
3021                     }
3022                     is_inf = is_inf_internal = 1;
3023                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3024                         cl_anything(pRExC_state, data->start_class);
3025                     flags &= ~SCF_DO_STCLASS;
3026                 }
3027             } else {
3028                 Newx(newframe,1,scan_frame);
3029                 paren = stopparen;
3030                 start = scan+2;
3031                 end = regnext(scan);
3032             }
3033             if (newframe) {
3034                 assert(start);
3035                 assert(end);
3036                 SAVEFREEPV(newframe);
3037                 newframe->next = regnext(scan);
3038                 newframe->last = last;
3039                 newframe->stop = stopparen;
3040                 newframe->prev = frame;
3041
3042                 frame = newframe;
3043                 scan =  start;
3044                 stopparen = paren;
3045                 last = end;
3046
3047                 continue;
3048             }
3049         }
3050         else if (OP(scan) == EXACT) {
3051             I32 l = STR_LEN(scan);
3052             UV uc;
3053             if (UTF) {
3054                 const U8 * const s = (U8*)STRING(scan);
3055                 l = utf8_length(s, s + l);
3056                 uc = utf8_to_uvchr(s, NULL);
3057             } else {
3058                 uc = *((U8*)STRING(scan));
3059             }
3060             min += l;
3061             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3062                 /* The code below prefers earlier match for fixed
3063                    offset, later match for variable offset.  */
3064                 if (data->last_end == -1) { /* Update the start info. */
3065                     data->last_start_min = data->pos_min;
3066                     data->last_start_max = is_inf
3067                         ? I32_MAX : data->pos_min + data->pos_delta;
3068                 }
3069                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3070                 if (UTF)
3071                     SvUTF8_on(data->last_found);
3072                 {
3073                     SV * const sv = data->last_found;
3074                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3075                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3076                     if (mg && mg->mg_len >= 0)
3077                         mg->mg_len += utf8_length((U8*)STRING(scan),
3078                                                   (U8*)STRING(scan)+STR_LEN(scan));
3079                 }
3080                 data->last_end = data->pos_min + l;
3081                 data->pos_min += l; /* As in the first entry. */
3082                 data->flags &= ~SF_BEFORE_EOL;
3083             }
3084             if (flags & SCF_DO_STCLASS_AND) {
3085                 /* Check whether it is compatible with what we know already! */
3086                 int compat = 1;
3087
3088
3089                 /* If compatible, we or it in below.  It is compatible if is
3090                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3091                  * it's for a locale.  Even if there isn't unicode semantics
3092                  * here, at runtime there may be because of matching against a
3093                  * utf8 string, so accept a possible false positive for
3094                  * latin1-range folds */
3095                 if (uc >= 0x100 ||
3096                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3097                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3098                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3099                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3100                     )
3101                     compat = 0;
3102                 ANYOF_CLASS_ZERO(data->start_class);
3103                 ANYOF_BITMAP_ZERO(data->start_class);
3104                 if (compat)
3105                     ANYOF_BITMAP_SET(data->start_class, uc);
3106                 data->start_class->flags &= ~ANYOF_EOS;
3107                 if (uc < 0x100)
3108                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3109             }
3110             else if (flags & SCF_DO_STCLASS_OR) {
3111                 /* false positive possible if the class is case-folded */
3112                 if (uc < 0x100)
3113                     ANYOF_BITMAP_SET(data->start_class, uc);
3114                 else
3115                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3116                 data->start_class->flags &= ~ANYOF_EOS;
3117                 cl_and(data->start_class, and_withp);
3118             }
3119             flags &= ~SCF_DO_STCLASS;
3120         }
3121         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3122             I32 l = STR_LEN(scan);
3123             UV uc = *((U8*)STRING(scan));
3124
3125             /* Search for fixed substrings supports EXACT only. */
3126             if (flags & SCF_DO_SUBSTR) {
3127                 assert(data);
3128                 SCAN_COMMIT(pRExC_state, data, minlenp);
3129             }
3130             if (UTF) {
3131                 const U8 * const s = (U8 *)STRING(scan);
3132                 l = utf8_length(s, s + l);
3133                 uc = utf8_to_uvchr(s, NULL);
3134             }
3135             min += l;
3136             if (flags & SCF_DO_SUBSTR)
3137                 data->pos_min += l;
3138             if (flags & SCF_DO_STCLASS_AND) {
3139                 /* Check whether it is compatible with what we know already! */
3140                 int compat = 1;
3141                 if (uc >= 0x100 ||
3142                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3143                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3144                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3145                 {
3146                     compat = 0;
3147                 }
3148                 ANYOF_CLASS_ZERO(data->start_class);
3149                 ANYOF_BITMAP_ZERO(data->start_class);
3150                 if (compat) {
3151                     ANYOF_BITMAP_SET(data->start_class, uc);
3152                     data->start_class->flags &= ~ANYOF_EOS;
3153                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3154                     if (OP(scan) == EXACTFL) {
3155                         data->start_class->flags |= ANYOF_LOCALE;
3156                     }
3157                     else {
3158
3159                         /* Also set the other member of the fold pair.  In case
3160                          * that unicode semantics is called for at runtime, use
3161                          * the full latin1 fold.  (Can't do this for locale,
3162                          * because not known until runtime */
3163                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3164                     }
3165                 }
3166             }
3167             else if (flags & SCF_DO_STCLASS_OR) {
3168                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3169                     /* false positive possible if the class is case-folded.
3170                        Assume that the locale settings are the same... */
3171                     if (uc < 0x100) {
3172                         ANYOF_BITMAP_SET(data->start_class, uc);
3173                         if (OP(scan) != EXACTFL) {
3174
3175                             /* And set the other member of the fold pair, but
3176                              * can't do that in locale because not known until
3177                              * run-time */
3178                             ANYOF_BITMAP_SET(data->start_class,
3179                                              PL_fold_latin1[uc]);
3180                         }
3181                     }
3182                     data->start_class->flags &= ~ANYOF_EOS;
3183                 }
3184                 cl_and(data->start_class, and_withp);
3185             }
3186             flags &= ~SCF_DO_STCLASS;
3187         }
3188         else if (REGNODE_VARIES(OP(scan))) {
3189             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3190             I32 f = flags, pos_before = 0;
3191             regnode * const oscan = scan;
3192             struct regnode_charclass_class this_class;
3193             struct regnode_charclass_class *oclass = NULL;
3194             I32 next_is_eval = 0;
3195
3196             switch (PL_regkind[OP(scan)]) {
3197             case WHILEM:                /* End of (?:...)* . */
3198                 scan = NEXTOPER(scan);
3199                 goto finish;
3200             case PLUS:
3201                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3202                     next = NEXTOPER(scan);
3203                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3204                         mincount = 1;
3205                         maxcount = REG_INFTY;
3206                         next = regnext(scan);
3207                         scan = NEXTOPER(scan);
3208                         goto do_curly;
3209                     }
3210                 }
3211                 if (flags & SCF_DO_SUBSTR)
3212                     data->pos_min++;
3213                 min++;
3214                 /* Fall through. */
3215             case STAR:
3216                 if (flags & SCF_DO_STCLASS) {
3217                     mincount = 0;
3218                     maxcount = REG_INFTY;
3219                     next = regnext(scan);
3220                     scan = NEXTOPER(scan);
3221                     goto do_curly;
3222                 }
3223                 is_inf = is_inf_internal = 1;
3224                 scan = regnext(scan);
3225                 if (flags & SCF_DO_SUBSTR) {
3226                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3227                     data->longest = &(data->longest_float);
3228                 }
3229                 goto optimize_curly_tail;
3230             case CURLY:
3231                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3232                     && (scan->flags == stopparen))
3233                 {
3234                     mincount = 1;
3235                     maxcount = 1;
3236                 } else {
3237                     mincount = ARG1(scan);
3238                     maxcount = ARG2(scan);
3239                 }
3240                 next = regnext(scan);
3241                 if (OP(scan) == CURLYX) {
3242                     I32 lp = (data ? *(data->last_closep) : 0);
3243                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3244                 }
3245                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3246                 next_is_eval = (OP(scan) == EVAL);
3247               do_curly:
3248                 if (flags & SCF_DO_SUBSTR) {
3249                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3250                     pos_before = data->pos_min;
3251                 }
3252                 if (data) {
3253                     fl = data->flags;
3254                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3255                     if (is_inf)
3256                         data->flags |= SF_IS_INF;
3257                 }
3258                 if (flags & SCF_DO_STCLASS) {
3259                     cl_init(pRExC_state, &this_class);
3260                     oclass = data->start_class;
3261                     data->start_class = &this_class;
3262                     f |= SCF_DO_STCLASS_AND;
3263                     f &= ~SCF_DO_STCLASS_OR;
3264                 }
3265                 /* Exclude from super-linear cache processing any {n,m}
3266                    regops for which the combination of input pos and regex
3267                    pos is not enough information to determine if a match
3268                    will be possible.
3269
3270                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3271                    regex pos at the \s*, the prospects for a match depend not
3272                    only on the input position but also on how many (bar\s*)
3273                    repeats into the {4,8} we are. */
3274                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3275                     f &= ~SCF_WHILEM_VISITED_POS;
3276
3277                 /* This will finish on WHILEM, setting scan, or on NULL: */
3278                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3279                                       last, data, stopparen, recursed, NULL,
3280                                       (mincount == 0
3281                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3282
3283                 if (flags & SCF_DO_STCLASS)
3284                     data->start_class = oclass;
3285                 if (mincount == 0 || minnext == 0) {
3286                     if (flags & SCF_DO_STCLASS_OR) {
3287                         cl_or(pRExC_state, data->start_class, &this_class);
3288                     }
3289                     else if (flags & SCF_DO_STCLASS_AND) {
3290                         /* Switch to OR mode: cache the old value of
3291                          * data->start_class */
3292                         INIT_AND_WITHP;
3293                         StructCopy(data->start_class, and_withp,
3294                                    struct regnode_charclass_class);
3295                         flags &= ~SCF_DO_STCLASS_AND;
3296                         StructCopy(&this_class, data->start_class,
3297                                    struct regnode_charclass_class);
3298                         flags |= SCF_DO_STCLASS_OR;
3299                         data->start_class->flags |= ANYOF_EOS;
3300                     }
3301                 } else {                /* Non-zero len */
3302                     if (flags & SCF_DO_STCLASS_OR) {
3303                         cl_or(pRExC_state, data->start_class, &this_class);
3304                         cl_and(data->start_class, and_withp);
3305                     }
3306                     else if (flags & SCF_DO_STCLASS_AND)
3307                         cl_and(data->start_class, &this_class);
3308                     flags &= ~SCF_DO_STCLASS;
3309                 }
3310                 if (!scan)              /* It was not CURLYX, but CURLY. */
3311                     scan = next;
3312                 if ( /* ? quantifier ok, except for (?{ ... }) */
3313                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3314                     && (minnext == 0) && (deltanext == 0)
3315                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3316                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3317                 {
3318                     ckWARNreg(RExC_parse,
3319                               "Quantifier unexpected on zero-length expression");
3320                 }
3321
3322                 min += minnext * mincount;
3323                 is_inf_internal |= ((maxcount == REG_INFTY
3324                                      && (minnext + deltanext) > 0)
3325                                     || deltanext == I32_MAX);
3326                 is_inf |= is_inf_internal;
3327                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3328
3329                 /* Try powerful optimization CURLYX => CURLYN. */
3330                 if (  OP(oscan) == CURLYX && data
3331                       && data->flags & SF_IN_PAR
3332                       && !(data->flags & SF_HAS_EVAL)
3333                       && !deltanext && minnext == 1 ) {
3334                     /* Try to optimize to CURLYN.  */
3335                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3336                     regnode * const nxt1 = nxt;
3337 #ifdef DEBUGGING
3338                     regnode *nxt2;
3339 #endif
3340
3341                     /* Skip open. */
3342                     nxt = regnext(nxt);
3343                     if (!REGNODE_SIMPLE(OP(nxt))
3344                         && !(PL_regkind[OP(nxt)] == EXACT
3345                              && STR_LEN(nxt) == 1))
3346                         goto nogo;
3347 #ifdef DEBUGGING
3348                     nxt2 = nxt;
3349 #endif
3350                     nxt = regnext(nxt);
3351                     if (OP(nxt) != CLOSE)
3352                         goto nogo;
3353                     if (RExC_open_parens) {
3354                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3355                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3356                     }
3357                     /* Now we know that nxt2 is the only contents: */
3358                     oscan->flags = (U8)ARG(nxt);
3359                     OP(oscan) = CURLYN;
3360                     OP(nxt1) = NOTHING; /* was OPEN. */
3361
3362 #ifdef DEBUGGING
3363                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3364                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3365                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3366                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3367                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3368                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3369 #endif
3370                 }
3371               nogo:
3372
3373                 /* Try optimization CURLYX => CURLYM. */
3374                 if (  OP(oscan) == CURLYX && data
3375                       && !(data->flags & SF_HAS_PAR)
3376                       && !(data->flags & SF_HAS_EVAL)
3377                       && !deltanext     /* atom is fixed width */
3378                       && minnext != 0   /* CURLYM can't handle zero width */
3379                 ) {
3380                     /* XXXX How to optimize if data == 0? */
3381                     /* Optimize to a simpler form.  */
3382                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3383                     regnode *nxt2;
3384
3385                     OP(oscan) = CURLYM;
3386                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3387                             && (OP(nxt2) != WHILEM))
3388                         nxt = nxt2;
3389                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3390                     /* Need to optimize away parenths. */
3391                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3392                         /* Set the parenth number.  */
3393                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3394
3395                         oscan->flags = (U8)ARG(nxt);
3396                         if (RExC_open_parens) {
3397                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3398                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3399                         }
3400                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3401                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3402
3403 #ifdef DEBUGGING
3404                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3405                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3406                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3407                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3408 #endif
3409 #if 0
3410                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3411                             regnode *nnxt = regnext(nxt1);
3412                             if (nnxt == nxt) {
3413                                 if (reg_off_by_arg[OP(nxt1)])
3414                                     ARG_SET(nxt1, nxt2 - nxt1);
3415                                 else if (nxt2 - nxt1 < U16_MAX)
3416                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3417                                 else
3418                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3419                             }
3420                             nxt1 = nnxt;
3421                         }
3422 #endif
3423                         /* Optimize again: */
3424                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3425                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3426                     }
3427                     else
3428                         oscan->flags = 0;
3429                 }
3430                 else if ((OP(oscan) == CURLYX)
3431                          && (flags & SCF_WHILEM_VISITED_POS)
3432                          /* See the comment on a similar expression above.
3433                             However, this time it's not a subexpression
3434                             we care about, but the expression itself. */
3435                          && (maxcount == REG_INFTY)
3436                          && data && ++data->whilem_c < 16) {
3437                     /* This stays as CURLYX, we can put the count/of pair. */
3438                     /* Find WHILEM (as in regexec.c) */
3439                     regnode *nxt = oscan + NEXT_OFF(oscan);
3440
3441                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3442                         nxt += ARG(nxt);
3443                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3444                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3445                 }
3446                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3447                     pars++;
3448                 if (flags & SCF_DO_SUBSTR) {
3449                     SV *last_str = NULL;
3450                     int counted = mincount != 0;
3451
3452                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3453 #if defined(SPARC64_GCC_WORKAROUND)
3454                         I32 b = 0;
3455                         STRLEN l = 0;
3456                         const char *s = NULL;
3457                         I32 old = 0;
3458
3459                         if (pos_before >= data->last_start_min)
3460                             b = pos_before;
3461                         else
3462                             b = data->last_start_min;
3463
3464                         l = 0;
3465                         s = SvPV_const(data->last_found, l);
3466                         old = b - data->last_start_min;
3467
3468 #else
3469                         I32 b = pos_before >= data->last_start_min
3470                             ? pos_before : data->last_start_min;
3471                         STRLEN l;
3472                         const char * const s = SvPV_const(data->last_found, l);
3473                         I32 old = b - data->last_start_min;
3474 #endif
3475
3476                         if (UTF)
3477                             old = utf8_hop((U8*)s, old) - (U8*)s;
3478                         l -= old;
3479                         /* Get the added string: */
3480                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3481                         if (deltanext == 0 && pos_before == b) {
3482                             /* What was added is a constant string */
3483                             if (mincount > 1) {
3484                                 SvGROW(last_str, (mincount * l) + 1);
3485                                 repeatcpy(SvPVX(last_str) + l,
3486                                           SvPVX_const(last_str), l, mincount - 1);
3487                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3488                                 /* Add additional parts. */
3489                                 SvCUR_set(data->last_found,
3490                                           SvCUR(data->last_found) - l);
3491                                 sv_catsv(data->last_found, last_str);
3492                                 {
3493                                     SV * sv = data->last_found;
3494                                     MAGIC *mg =
3495                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3496                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3497                                     if (mg && mg->mg_len >= 0)
3498                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3499                                 }
3500                                 data->last_end += l * (mincount - 1);
3501                             }
3502                         } else {
3503                             /* start offset must point into the last copy */
3504                             data->last_start_min += minnext * (mincount - 1);
3505                             data->last_start_max += is_inf ? I32_MAX
3506                                 : (maxcount - 1) * (minnext + data->pos_delta);
3507                         }
3508                     }
3509                     /* It is counted once already... */
3510                     data->pos_min += minnext * (mincount - counted);
3511                     data->pos_delta += - counted * deltanext +
3512                         (minnext + deltanext) * maxcount - minnext * mincount;
3513                     if (mincount != maxcount) {
3514                          /* Cannot extend fixed substrings found inside
3515                             the group.  */
3516                         SCAN_COMMIT(pRExC_state,data,minlenp);
3517                         if (mincount && last_str) {
3518                             SV * const sv = data->last_found;
3519                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3520                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3521
3522                             if (mg)
3523                                 mg->mg_len = -1;
3524                             sv_setsv(sv, last_str);
3525                             data->last_end = data->pos_min;
3526                             data->last_start_min =
3527                                 data->pos_min - CHR_SVLEN(last_str);
3528                             data->last_start_max = is_inf
3529                                 ? I32_MAX
3530                                 : data->pos_min + data->pos_delta
3531                                 - CHR_SVLEN(last_str);
3532                         }
3533                         data->longest = &(data->longest_float);
3534                     }
3535                     SvREFCNT_dec(last_str);
3536                 }
3537                 if (data && (fl & SF_HAS_EVAL))
3538                     data->flags |= SF_HAS_EVAL;
3539               optimize_curly_tail:
3540                 if (OP(oscan) != CURLYX) {
3541                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3542                            && NEXT_OFF(next))
3543                         NEXT_OFF(oscan) += NEXT_OFF(next);
3544                 }
3545                 continue;
3546             default:                    /* REF, ANYOFV, and CLUMP only? */
3547                 if (flags & SCF_DO_SUBSTR) {
3548                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3549                     data->longest = &(data->longest_float);
3550                 }
3551                 is_inf = is_inf_internal = 1;
3552                 if (flags & SCF_DO_STCLASS_OR)
3553                     cl_anything(pRExC_state, data->start_class);
3554                 flags &= ~SCF_DO_STCLASS;
3555                 break;
3556             }
3557         }
3558         else if (OP(scan) == LNBREAK) {
3559             if (flags & SCF_DO_STCLASS) {
3560                 int value = 0;
3561                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3562                 if (flags & SCF_DO_STCLASS_AND) {
3563                     for (value = 0; value < 256; value++)
3564                         if (!is_VERTWS_cp(value))
3565                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3566                 }
3567                 else {
3568                     for (value = 0; value < 256; value++)
3569                         if (is_VERTWS_cp(value))
3570                             ANYOF_BITMAP_SET(data->start_class, value);
3571                 }
3572                 if (flags & SCF_DO_STCLASS_OR)
3573                     cl_and(data->start_class, and_withp);
3574                 flags &= ~SCF_DO_STCLASS;
3575             }
3576             min += 1;
3577             delta += 1;
3578             if (flags & SCF_DO_SUBSTR) {
3579                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3580                 data->pos_min += 1;
3581                 data->pos_delta += 1;
3582                 data->longest = &(data->longest_float);
3583             }
3584         }
3585         else if (OP(scan) == FOLDCHAR) {
3586             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3587             flags &= ~SCF_DO_STCLASS;
3588             min += 1;
3589             delta += d;
3590             if (flags & SCF_DO_SUBSTR) {
3591                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3592                 data->pos_min += 1;
3593                 data->pos_delta += d;
3594                 data->longest = &(data->longest_float);
3595             }
3596         }
3597         else if (REGNODE_SIMPLE(OP(scan))) {
3598             int value = 0;
3599
3600             if (flags & SCF_DO_SUBSTR) {
3601                 SCAN_COMMIT(pRExC_state,data,minlenp);
3602                 data->pos_min++;
3603             }
3604             min++;
3605             if (flags & SCF_DO_STCLASS) {
3606                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3607
3608                 /* Some of the logic below assumes that switching
3609                    locale on will only add false positives. */
3610                 switch (PL_regkind[OP(scan)]) {
3611                 case SANY:
3612                 default:
3613                   do_default:
3614                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3615                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3616                         cl_anything(pRExC_state, data->start_class);
3617                     break;
3618                 case REG_ANY:
3619                     if (OP(scan) == SANY)
3620                         goto do_default;
3621                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3622                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3623                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3624                         cl_anything(pRExC_state, data->start_class);
3625                     }
3626                     if (flags & SCF_DO_STCLASS_AND || !value)
3627                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3628                     break;
3629                 case ANYOF:
3630                     if (flags & SCF_DO_STCLASS_AND)
3631                         cl_and(data->start_class,
3632                                (struct regnode_charclass_class*)scan);
3633                     else
3634                         cl_or(pRExC_state, data->start_class,
3635                               (struct regnode_charclass_class*)scan);
3636                     break;
3637                 case ALNUM:
3638                     if (flags & SCF_DO_STCLASS_AND) {
3639                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3640                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3641                             if (OP(scan) == ALNUMU) {
3642                                 for (value = 0; value < 256; value++) {
3643                                     if (!isWORDCHAR_L1(value)) {
3644                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3645                                     }
3646                                 }
3647                             } else {
3648                                 for (value = 0; value < 256; value++) {
3649                                     if (!isALNUM(value)) {
3650                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3651                                     }
3652                                 }
3653                             }
3654                         }
3655                     }
3656                     else {
3657                         if (data->start_class->flags & ANYOF_LOCALE)
3658                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3659                         else if (OP(scan) == ALNUMU) {
3660                             for (value = 0; value < 256; value++) {
3661                                 if (isWORDCHAR_L1(value)) {
3662                                     ANYOF_BITMAP_SET(data->start_class, value);
3663                                 }
3664                             }
3665                         } else {
3666                             for (value = 0; value < 256; value++) {
3667                                 if (isALNUM(value)) {
3668                                     ANYOF_BITMAP_SET(data->start_class, value);
3669                                 }
3670                             }
3671                         }
3672                     }
3673                     break;
3674                 case NALNUM:
3675                     if (flags & SCF_DO_STCLASS_AND) {
3676                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3677                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3678                             if (OP(scan) == NALNUMU) {
3679                                 for (value = 0; value < 256; value++) {
3680                                     if (isWORDCHAR_L1(value)) {
3681                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3682                                     }
3683                                 }
3684                             } else {
3685                                 for (value = 0; value < 256; value++) {
3686                                     if (isALNUM(value)) {
3687                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3688                                     }
3689                                 }
3690                             }
3691                         }
3692                     }
3693                     else {
3694                         if (data->start_class->flags & ANYOF_LOCALE)
3695                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3696                         else {
3697                             if (OP(scan) == NALNUMU) {
3698                                 for (value = 0; value < 256; value++) {
3699                                     if (! isWORDCHAR_L1(value)) {
3700                                         ANYOF_BITMAP_SET(data->start_class, value);
3701                                     }
3702                                 }
3703                             } else {
3704                                 for (value = 0; value < 256; value++) {
3705                                     if (! isALNUM(value)) {
3706                                         ANYOF_BITMAP_SET(data->start_class, value);
3707                                     }
3708                                 }
3709                             }
3710                         }
3711                     }
3712                     break;
3713                 case SPACE:
3714                     if (flags & SCF_DO_STCLASS_AND) {
3715                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3716                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3717                             if (OP(scan) == SPACEU) {
3718                                 for (value = 0; value < 256; value++) {
3719                                     if (!isSPACE_L1(value)) {
3720                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3721                                     }
3722                                 }
3723                             } else {
3724                                 for (value = 0; value < 256; value++) {
3725                                     if (!isSPACE(value)) {
3726                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3727                                     }
3728                                 }
3729                             }
3730                         }
3731                     }
3732                     else {
3733                         if (data->start_class->flags & ANYOF_LOCALE) {
3734                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3735                         }
3736                         else if (OP(scan) == SPACEU) {
3737                             for (value = 0; value < 256; value++) {
3738                                 if (isSPACE_L1(value)) {
3739                                     ANYOF_BITMAP_SET(data->start_class, value);
3740                                 }
3741                             }
3742                         } else {
3743                             for (value = 0; value < 256; value++) {
3744                                 if (isSPACE(value)) {
3745                                     ANYOF_BITMAP_SET(data->start_class, value);
3746                                 }
3747                             }
3748                         }
3749                     }
3750                     break;
3751                 case NSPACE:
3752                     if (flags & SCF_DO_STCLASS_AND) {
3753                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3754                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3755                             if (OP(scan) == NSPACEU) {
3756                                 for (value = 0; value < 256; value++) {
3757                                     if (isSPACE_L1(value)) {
3758                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3759                                     }
3760                                 }
3761                             } else {
3762                                 for (value = 0; value < 256; value++) {
3763                                     if (isSPACE(value)) {
3764                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3765                                     }
3766                                 }
3767                             }
3768                         }
3769                     }
3770                     else {
3771                         if (data->start_class->flags & ANYOF_LOCALE)
3772                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3773                         else if (OP(scan) == NSPACEU) {
3774                             for (value = 0; value < 256; value++) {
3775                                 if (!isSPACE_L1(value)) {
3776                                     ANYOF_BITMAP_SET(data->start_class, value);
3777                                 }
3778                             }
3779                         }
3780                         else {
3781                             for (value = 0; value < 256; value++) {
3782                                 if (!isSPACE(value)) {
3783                                     ANYOF_BITMAP_SET(data->start_class, value);
3784                                 }
3785                             }
3786                         }
3787                     }
3788                     break;
3789                 case DIGIT:
3790                     if (flags & SCF_DO_STCLASS_AND) {
3791                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3792                         for (value = 0; value < 256; value++)
3793                             if (!isDIGIT(value))
3794                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3795                     }
3796                     else {
3797                         if (data->start_class->flags & ANYOF_LOCALE)
3798                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3799                         else {
3800                             for (value = 0; value < 256; value++)
3801                                 if (isDIGIT(value))
3802                                     ANYOF_BITMAP_SET(data->start_class, value);
3803                         }
3804                     }
3805                     break;
3806                 case NDIGIT:
3807                     if (flags & SCF_DO_STCLASS_AND) {
3808                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3809                         for (value = 0; value < 256; value++)
3810                             if (isDIGIT(value))
3811                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3812                     }
3813                     else {
3814                         if (data->start_class->flags & ANYOF_LOCALE)
3815                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3816                         else {
3817                             for (value = 0; value < 256; value++)
3818                                 if (!isDIGIT(value))
3819                                     ANYOF_BITMAP_SET(data->start_class, value);
3820                         }
3821                     }
3822                     break;
3823                 CASE_SYNST_FNC(VERTWS);
3824                 CASE_SYNST_FNC(HORIZWS);
3825
3826                 }
3827                 if (flags & SCF_DO_STCLASS_OR)
3828                     cl_and(data->start_class, and_withp);
3829                 flags &= ~SCF_DO_STCLASS;
3830             }
3831         }
3832         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3833             data->flags |= (OP(scan) == MEOL
3834                             ? SF_BEFORE_MEOL
3835                             : SF_BEFORE_SEOL);
3836         }
3837         else if (  PL_regkind[OP(scan)] == BRANCHJ
3838                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3839                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3840                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3841             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3842                 || OP(scan) == UNLESSM )
3843             {
3844                 /* Negative Lookahead/lookbehind
3845                    In this case we can't do fixed string optimisation.
3846                 */
3847
3848                 I32 deltanext, minnext, fake = 0;
3849                 regnode *nscan;
3850                 struct regnode_charclass_class intrnl;
3851                 int f = 0;
3852
3853                 data_fake.flags = 0;
3854                 if (data) {
3855                     data_fake.whilem_c = data->whilem_c;
3856                     data_fake.last_closep = data->last_closep;
3857                 }
3858                 else
3859                     data_fake.last_closep = &fake;
3860                 data_fake.pos_delta = delta;
3861                 if ( flags & SCF_DO_STCLASS && !scan->flags
3862                      && OP(scan) == IFMATCH ) { /* Lookahead */
3863                     cl_init(pRExC_state, &intrnl);
3864                     data_fake.start_class = &intrnl;
3865                     f |= SCF_DO_STCLASS_AND;
3866                 }
3867                 if (flags & SCF_WHILEM_VISITED_POS)
3868                     f |= SCF_WHILEM_VISITED_POS;
3869                 next = regnext(scan);
3870                 nscan = NEXTOPER(NEXTOPER(scan));
3871                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3872                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3873                 if (scan->flags) {
3874                     if (deltanext) {
3875                         FAIL("Variable length lookbehind not implemented");
3876                     }
3877                     else if (minnext > (I32)U8_MAX) {
3878                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3879                     }
3880                     scan->flags = (U8)minnext;
3881                 }
3882                 if (data) {
3883                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3884                         pars++;
3885                     if (data_fake.flags & SF_HAS_EVAL)
3886                         data->flags |= SF_HAS_EVAL;
3887                     data->whilem_c = data_fake.whilem_c;
3888                 }
3889                 if (f & SCF_DO_STCLASS_AND) {
3890                     if (flags & SCF_DO_STCLASS_OR) {
3891                         /* OR before, AND after: ideally we would recurse with
3892                          * data_fake to get the AND applied by study of the
3893                          * remainder of the pattern, and then derecurse;
3894                          * *** HACK *** for now just treat as "no information".
3895                          * See [perl #56690].
3896                          */
3897                         cl_init(pRExC_state, data->start_class);
3898                     }  else {
3899                         /* AND before and after: combine and continue */
3900                         const int was = (data->start_class->flags & ANYOF_EOS);
3901
3902                         cl_and(data->start_class, &intrnl);
3903                         if (was)
3904                             data->start_class->flags |= ANYOF_EOS;
3905                     }
3906                 }
3907             }
3908 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3909             else {
3910                 /* Positive Lookahead/lookbehind
3911                    In this case we can do fixed string optimisation,
3912                    but we must be careful about it. Note in the case of
3913                    lookbehind the positions will be offset by the minimum
3914                    length of the pattern, something we won't know about
3915                    until after the recurse.
3916                 */
3917                 I32 deltanext, fake = 0;
3918                 regnode *nscan;
3919                 struct regnode_charclass_class intrnl;
3920                 int f = 0;
3921                 /* We use SAVEFREEPV so that when the full compile
3922                     is finished perl will clean up the allocated
3923                     minlens when it's all done. This way we don't
3924                     have to worry about freeing them when we know
3925                     they wont be used, which would be a pain.
3926                  */
3927                 I32 *minnextp;
3928                 Newx( minnextp, 1, I32 );
3929                 SAVEFREEPV(minnextp);
3930
3931                 if (data) {
3932                     StructCopy(data, &data_fake, scan_data_t);
3933                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3934                         f |= SCF_DO_SUBSTR;
3935                         if (scan->flags)
3936                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3937                         data_fake.last_found=newSVsv(data->last_found);
3938                     }
3939                 }
3940                 else
3941                     data_fake.last_closep = &fake;
3942                 data_fake.flags = 0;
3943                 data_fake.pos_delta = delta;
3944                 if (is_inf)
3945                     data_fake.flags |= SF_IS_INF;
3946                 if ( flags & SCF_DO_STCLASS && !scan->flags
3947                      && OP(scan) == IFMATCH ) { /* Lookahead */
3948                     cl_init(pRExC_state, &intrnl);
3949                     data_fake.start_class = &intrnl;
3950                     f |= SCF_DO_STCLASS_AND;
3951                 }
3952                 if (flags & SCF_WHILEM_VISITED_POS)
3953                     f |= SCF_WHILEM_VISITED_POS;
3954                 next = regnext(scan);
3955                 nscan = NEXTOPER(NEXTOPER(scan));
3956
3957                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3958                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3959                 if (scan->flags) {
3960                     if (deltanext) {
3961                         FAIL("Variable length lookbehind not implemented");
3962                     }
3963                     else if (*minnextp > (I32)U8_MAX) {
3964                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3965                     }
3966                     scan->flags = (U8)*minnextp;
3967                 }
3968
3969                 *minnextp += min;
3970
3971                 if (f & SCF_DO_STCLASS_AND) {
3972                     const int was = (data->start_class->flags & ANYOF_EOS);
3973
3974                     cl_and(data->start_class, &intrnl);
3975                     if (was)
3976                         data->start_class->flags |= ANYOF_EOS;
3977                 }
3978                 if (data) {
3979                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3980                         pars++;
3981                     if (data_fake.flags & SF_HAS_EVAL)
3982                         data->flags |= SF_HAS_EVAL;
3983                     data->whilem_c = data_fake.whilem_c;
3984                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3985                         if (RExC_rx->minlen<*minnextp)
3986                             RExC_rx->minlen=*minnextp;
3987                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3988                         SvREFCNT_dec(data_fake.last_found);
3989
3990                         if ( data_fake.minlen_fixed != minlenp )
3991                         {
3992                             data->offset_fixed= data_fake.offset_fixed;
3993                             data->minlen_fixed= data_fake.minlen_fixed;
3994                             data->lookbehind_fixed+= scan->flags;
3995                         }
3996                         if ( data_fake.minlen_float != minlenp )
3997                         {
3998                             data->minlen_float= data_fake.minlen_float;
3999                             data->offset_float_min=data_fake.offset_float_min;
4000                             data->offset_float_max=data_fake.offset_float_max;
4001                             data->lookbehind_float+= scan->flags;
4002                         }
4003                     }
4004                 }
4005
4006
4007             }
4008 #endif
4009         }
4010         else if (OP(scan) == OPEN) {
4011             if (stopparen != (I32)ARG(scan))
4012                 pars++;
4013         }
4014         else if (OP(scan) == CLOSE) {
4015             if (stopparen == (I32)ARG(scan)) {
4016                 break;
4017             }
4018             if ((I32)ARG(scan) == is_par) {
4019                 next = regnext(scan);
4020
4021                 if ( next && (OP(next) != WHILEM) && next < last)
4022                     is_par = 0;         /* Disable optimization */
4023             }
4024             if (data)
4025                 *(data->last_closep) = ARG(scan);
4026         }
4027         else if (OP(scan) == EVAL) {
4028                 if (data)
4029                     data->flags |= SF_HAS_EVAL;
4030         }
4031         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4032             if (flags & SCF_DO_SUBSTR) {
4033                 SCAN_COMMIT(pRExC_state,data,minlenp);
4034                 flags &= ~SCF_DO_SUBSTR;
4035             }
4036             if (data && OP(scan)==ACCEPT) {
4037                 data->flags |= SCF_SEEN_ACCEPT;
4038                 if (stopmin > min)
4039                     stopmin = min;
4040             }
4041         }
4042         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4043         {
4044                 if (flags & SCF_DO_SUBSTR) {
4045                     SCAN_COMMIT(pRExC_state,data,minlenp);
4046                     data->longest = &(data->longest_float);
4047                 }
4048                 is_inf = is_inf_internal = 1;
4049                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4050                     cl_anything(pRExC_state, data->start_class);
4051                 flags &= ~SCF_DO_STCLASS;
4052         }
4053         else if (OP(scan) == GPOS) {
4054             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4055                 !(delta || is_inf || (data && data->pos_delta)))
4056             {
4057                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4058                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4059                 if (RExC_rx->gofs < (U32)min)
4060                     RExC_rx->gofs = min;
4061             } else {
4062                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4063                 RExC_rx->gofs = 0;
4064             }
4065         }
4066 #ifdef TRIE_STUDY_OPT
4067 #ifdef FULL_TRIE_STUDY
4068         else if (PL_regkind[OP(scan)] == TRIE) {
4069             /* NOTE - There is similar code to this block above for handling
4070                BRANCH nodes on the initial study.  If you change stuff here
4071                check there too. */
4072             regnode *trie_node= scan;
4073             regnode *tail= regnext(scan);
4074             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4075             I32 max1 = 0, min1 = I32_MAX;
4076             struct regnode_charclass_class accum;
4077
4078             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4079                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4080             if (flags & SCF_DO_STCLASS)
4081                 cl_init_zero(pRExC_state, &accum);
4082
4083             if (!trie->jump) {
4084                 min1= trie->minlen;
4085                 max1= trie->maxlen;
4086             } else {
4087                 const regnode *nextbranch= NULL;
4088                 U32 word;
4089
4090                 for ( word=1 ; word <= trie->wordcount ; word++)
4091                 {
4092                     I32 deltanext=0, minnext=0, f = 0, fake;
4093                     struct regnode_charclass_class this_class;
4094
4095                     data_fake.flags = 0;
4096                     if (data) {
4097                         data_fake.whilem_c = data->whilem_c;
4098                         data_fake.last_closep = data->last_closep;
4099                     }
4100                     else
4101                         data_fake.last_closep = &fake;
4102                     data_fake.pos_delta = delta;
4103                     if (flags & SCF_DO_STCLASS) {
4104                         cl_init(pRExC_state, &this_class);
4105                         data_fake.start_class = &this_class;
4106                         f = SCF_DO_STCLASS_AND;
4107                     }
4108                     if (flags & SCF_WHILEM_VISITED_POS)
4109                         f |= SCF_WHILEM_VISITED_POS;
4110
4111                     if (trie->jump[word]) {
4112                         if (!nextbranch)
4113                             nextbranch = trie_node + trie->jump[0];
4114                         scan= trie_node + trie->jump[word];
4115                         /* We go from the jump point to the branch that follows
4116                            it. Note this means we need the vestigal unused branches
4117                            even though they arent otherwise used.
4118                          */
4119                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4120                             &deltanext, (regnode *)nextbranch, &data_fake,
4121                             stopparen, recursed, NULL, f,depth+1);
4122                     }
4123                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4124                         nextbranch= regnext((regnode*)nextbranch);
4125
4126                     if (min1 > (I32)(minnext + trie->minlen))
4127                         min1 = minnext + trie->minlen;
4128                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4129                         max1 = minnext + deltanext + trie->maxlen;
4130                     if (deltanext == I32_MAX)
4131                         is_inf = is_inf_internal = 1;
4132
4133                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4134                         pars++;
4135                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4136                         if ( stopmin > min + min1)
4137                             stopmin = min + min1;
4138                         flags &= ~SCF_DO_SUBSTR;
4139                         if (data)
4140                             data->flags |= SCF_SEEN_ACCEPT;
4141                     }
4142                     if (data) {
4143                         if (data_fake.flags & SF_HAS_EVAL)
4144                             data->flags |= SF_HAS_EVAL;
4145                         data->whilem_c = data_fake.whilem_c;
4146                     }
4147                     if (flags & SCF_DO_STCLASS)
4148                         cl_or(pRExC_state, &accum, &this_class);
4149                 }
4150             }
4151             if (flags & SCF_DO_SUBSTR) {
4152                 data->pos_min += min1;
4153                 data->pos_delta += max1 - min1;
4154                 if (max1 != min1 || is_inf)
4155                     data->longest = &(data->longest_float);
4156             }
4157             min += min1;
4158             delta += max1 - min1;
4159             if (flags & SCF_DO_STCLASS_OR) {
4160                 cl_or(pRExC_state, data->start_class, &accum);
4161                 if (min1) {
4162                     cl_and(data->start_class, and_withp);
4163                     flags &= ~SCF_DO_STCLASS;
4164                 }
4165             }
4166             else if (flags & SCF_DO_STCLASS_AND) {
4167                 if (min1) {
4168                     cl_and(data->start_class, &accum);
4169                     flags &= ~SCF_DO_STCLASS;
4170                 }
4171                 else {
4172                     /* Switch to OR mode: cache the old value of
4173                      * data->start_class */
4174                     INIT_AND_WITHP;
4175                     StructCopy(data->start_class, and_withp,
4176                                struct regnode_charclass_class);
4177                     flags &= ~SCF_DO_STCLASS_AND;
4178                     StructCopy(&accum, data->start_class,
4179                                struct regnode_charclass_class);
4180                     flags |= SCF_DO_STCLASS_OR;
4181                     data->start_class->flags |= ANYOF_EOS;
4182                 }
4183             }
4184             scan= tail;
4185             continue;
4186         }
4187 #else
4188         else if (PL_regkind[OP(scan)] == TRIE) {
4189             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4190             U8*bang=NULL;
4191
4192             min += trie->minlen;
4193             delta += (trie->maxlen - trie->minlen);
4194             flags &= ~SCF_DO_STCLASS; /* xxx */
4195             if (flags & SCF_DO_SUBSTR) {
4196                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4197                 data->pos_min += trie->minlen;
4198                 data->pos_delta += (trie->maxlen - trie->minlen);
4199                 if (trie->maxlen != trie->minlen)
4200                     data->longest = &(data->longest_float);
4201             }
4202             if (trie->jump) /* no more substrings -- for now /grr*/
4203                 flags &= ~SCF_DO_SUBSTR;
4204         }
4205 #endif /* old or new */
4206 #endif /* TRIE_STUDY_OPT */
4207
4208         /* Else: zero-length, ignore. */
4209         scan = regnext(scan);
4210     }
4211     if (frame) {
4212         last = frame->last;
4213         scan = frame->next;
4214         stopparen = frame->stop;
4215         frame = frame->prev;
4216         goto fake_study_recurse;
4217     }
4218
4219   finish:
4220     assert(!frame);
4221     DEBUG_STUDYDATA("pre-fin:",data,depth);
4222
4223     *scanp = scan;
4224     *deltap = is_inf_internal ? I32_MAX : delta;
4225     if (flags & SCF_DO_SUBSTR && is_inf)
4226         data->pos_delta = I32_MAX - data->pos_min;
4227     if (is_par > (I32)U8_MAX)
4228         is_par = 0;
4229     if (is_par && pars==1 && data) {
4230         data->flags |= SF_IN_PAR;
4231         data->flags &= ~SF_HAS_PAR;
4232     }
4233     else if (pars && data) {
4234         data->flags |= SF_HAS_PAR;
4235         data->flags &= ~SF_IN_PAR;
4236     }
4237     if (flags & SCF_DO_STCLASS_OR)
4238         cl_and(data->start_class, and_withp);
4239     if (flags & SCF_TRIE_RESTUDY)
4240         data->flags |=  SCF_TRIE_RESTUDY;
4241
4242     DEBUG_STUDYDATA("post-fin:",data,depth);
4243
4244     return min < stopmin ? min : stopmin;
4245 }
4246
4247 STATIC U32
4248 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4249 {
4250     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4251
4252     PERL_ARGS_ASSERT_ADD_DATA;
4253
4254     Renewc(RExC_rxi->data,
4255            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4256            char, struct reg_data);
4257     if(count)
4258         Renew(RExC_rxi->data->what, count + n, U8);
4259     else
4260         Newx(RExC_rxi->data->what, n, U8);
4261     RExC_rxi->data->count = count + n;
4262     Copy(s, RExC_rxi->data->what + count, n, U8);
4263     return count;
4264 }
4265
4266 /*XXX: todo make this not included in a non debugging perl */
4267 #ifndef PERL_IN_XSUB_RE
4268 void
4269 Perl_reginitcolors(pTHX)
4270 {
4271     dVAR;
4272     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4273     if (s) {
4274         char *t = savepv(s);
4275         int i = 0;
4276         PL_colors[0] = t;
4277         while (++i < 6) {
4278             t = strchr(t, '\t');
4279             if (t) {
4280                 *t = '\0';
4281                 PL_colors[i] = ++t;
4282             }
4283             else
4284                 PL_colors[i] = t = (char *)"";
4285         }
4286     } else {
4287         int i = 0;
4288         while (i < 6)
4289             PL_colors[i++] = (char *)"";
4290     }
4291     PL_colorset = 1;
4292 }
4293 #endif
4294
4295
4296 #ifdef TRIE_STUDY_OPT
4297 #define CHECK_RESTUDY_GOTO                                  \
4298         if (                                                \
4299               (data.flags & SCF_TRIE_RESTUDY)               \
4300               && ! restudied++                              \
4301         )     goto reStudy
4302 #else
4303 #define CHECK_RESTUDY_GOTO
4304 #endif
4305
4306 /*
4307  - pregcomp - compile a regular expression into internal code
4308  *
4309  * We can't allocate space until we know how big the compiled form will be,
4310  * but we can't compile it (and thus know how big it is) until we've got a
4311  * place to put the code.  So we cheat:  we compile it twice, once with code
4312  * generation turned off and size counting turned on, and once "for real".
4313  * This also means that we don't allocate space until we are sure that the
4314  * thing really will compile successfully, and we never have to move the
4315  * code and thus invalidate pointers into it.  (Note that it has to be in
4316  * one piece because free() must be able to free it all.) [NB: not true in perl]
4317  *
4318  * Beware that the optimization-preparation code in here knows about some
4319  * of the structure of the compiled regexp.  [I'll say.]
4320  */
4321
4322
4323
4324 #ifndef PERL_IN_XSUB_RE
4325 #define RE_ENGINE_PTR &reh_regexp_engine
4326 #else
4327 extern const struct regexp_engine my_reg_engine;
4328 #define RE_ENGINE_PTR &my_reg_engine
4329 #endif
4330
4331 #ifndef PERL_IN_XSUB_RE
4332 REGEXP *
4333 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4334 {
4335     dVAR;
4336     HV * const table = GvHV(PL_hintgv);
4337
4338     PERL_ARGS_ASSERT_PREGCOMP;
4339
4340     /* Dispatch a request to compile a regexp to correct
4341        regexp engine. */
4342     if (table) {
4343         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4344         GET_RE_DEBUG_FLAGS_DECL;
4345         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4346             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4347             DEBUG_COMPILE_r({
4348                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4349                     SvIV(*ptr));
4350             });
4351             return CALLREGCOMP_ENG(eng, pattern, flags);
4352         }
4353     }
4354     return Perl_re_compile(aTHX_ pattern, flags);
4355 }
4356 #endif
4357
4358 REGEXP *
4359 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4360 {
4361     dVAR;
4362     REGEXP *rx;
4363     struct regexp *r;
4364     register regexp_internal *ri;
4365     STRLEN plen;
4366     char  *exp;
4367     char* xend;
4368     regnode *scan;
4369     I32 flags;
4370     I32 minlen = 0;
4371     U32 pm_flags;
4372
4373     /* these are all flags - maybe they should be turned
4374      * into a single int with different bit masks */
4375     I32 sawlookahead = 0;
4376     I32 sawplus = 0;
4377     I32 sawopen = 0;
4378     bool used_setjump = FALSE;
4379
4380     U8 jump_ret = 0;
4381     dJMPENV;
4382     scan_data_t data;
4383     RExC_state_t RExC_state;
4384     RExC_state_t * const pRExC_state = &RExC_state;
4385 #ifdef TRIE_STUDY_OPT
4386     int restudied;
4387     RExC_state_t copyRExC_state;
4388 #endif
4389     GET_RE_DEBUG_FLAGS_DECL;
4390
4391     PERL_ARGS_ASSERT_RE_COMPILE;
4392
4393     DEBUG_r(if (!PL_colorset) reginitcolors());
4394
4395     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4396
4397     /****************** LONG JUMP TARGET HERE***********************/
4398     /* Longjmp back to here if have to switch in midstream to utf8 */
4399     if (! RExC_orig_utf8) {
4400         JMPENV_PUSH(jump_ret);
4401         used_setjump = TRUE;
4402     }
4403
4404     if (jump_ret == 0) {    /* First time through */
4405         exp = SvPV(pattern, plen);
4406         xend = exp + plen;
4407         /* ignore the utf8ness if the pattern is 0 length */
4408         if (plen == 0) {
4409             RExC_utf8 = RExC_orig_utf8 = 0;
4410         }
4411
4412         DEBUG_COMPILE_r({
4413             SV *dsv= sv_newmortal();
4414             RE_PV_QUOTED_DECL(s, RExC_utf8,
4415                 dsv, exp, plen, 60);
4416             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4417                            PL_colors[4],PL_colors[5],s);
4418         });
4419     }
4420     else {  /* longjumped back */
4421         STRLEN len = plen;
4422
4423         /* If the cause for the longjmp was other than changing to utf8, pop
4424          * our own setjmp, and longjmp to the correct handler */
4425         if (jump_ret != UTF8_LONGJMP) {
4426             JMPENV_POP;
4427             JMPENV_JUMP(jump_ret);
4428         }
4429
4430         GET_RE_DEBUG_FLAGS;
4431
4432         /* It's possible to write a regexp in ascii that represents Unicode
4433         codepoints outside of the byte range, such as via \x{100}. If we
4434         detect such a sequence we have to convert the entire pattern to utf8
4435         and then recompile, as our sizing calculation will have been based
4436         on 1 byte == 1 character, but we will need to use utf8 to encode
4437         at least some part of the pattern, and therefore must convert the whole
4438         thing.
4439         -- dmq */
4440         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4441             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4442         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4443         xend = exp + len;
4444         RExC_orig_utf8 = RExC_utf8 = 1;
4445         SAVEFREEPV(exp);
4446     }
4447
4448 #ifdef TRIE_STUDY_OPT
4449     restudied = 0;
4450 #endif
4451
4452     /* Set to use unicode semantics if the pattern is in utf8 and has the
4453      * 'depends' charset specified, as it means unicode when utf8  */
4454     pm_flags = orig_pm_flags;
4455
4456     if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4457         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4458     }
4459
4460     RExC_precomp = exp;
4461     RExC_flags = pm_flags;
4462     RExC_sawback = 0;
4463
4464     RExC_seen = 0;
4465     RExC_in_lookbehind = 0;
4466     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4467     RExC_seen_evals = 0;
4468     RExC_extralen = 0;
4469
4470     /* First pass: determine size, legality. */
4471     RExC_parse = exp;
4472     RExC_start = exp;
4473     RExC_end = xend;
4474     RExC_naughty = 0;
4475     RExC_npar = 1;
4476     RExC_nestroot = 0;
4477     RExC_size = 0L;
4478     RExC_emit = &PL_regdummy;
4479     RExC_whilem_seen = 0;
4480     RExC_open_parens = NULL;
4481     RExC_close_parens = NULL;
4482     RExC_opend = NULL;
4483     RExC_paren_names = NULL;
4484 #ifdef DEBUGGING
4485     RExC_paren_name_list = NULL;
4486 #endif
4487     RExC_recurse = NULL;
4488     RExC_recurse_count = 0;
4489
4490 #if 0 /* REGC() is (currently) a NOP at the first pass.
4491        * Clever compilers notice this and complain. --jhi */
4492     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4493 #endif
4494     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4495     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4496         RExC_precomp = NULL;
4497         return(NULL);
4498     }
4499
4500     /* Here, finished first pass.  Get rid of any added setjmp */
4501     if (used_setjump) {
4502         JMPENV_POP;
4503     }
4504     DEBUG_PARSE_r({
4505         PerlIO_printf(Perl_debug_log,
4506             "Required size %"IVdf" nodes\n"
4507             "Starting second pass (creation)\n",
4508             (IV)RExC_size);
4509         RExC_lastnum=0;
4510         RExC_lastparse=NULL;
4511     });
4512     /* Small enough for pointer-storage convention?
4513        If extralen==0, this means that we will not need long jumps. */
4514     if (RExC_size >= 0x10000L && RExC_extralen)
4515         RExC_size += RExC_extralen;
4516     else
4517         RExC_extralen = 0;
4518     if (RExC_whilem_seen > 15)
4519         RExC_whilem_seen = 15;
4520
4521     /* Allocate space and zero-initialize. Note, the two step process
4522        of zeroing when in debug mode, thus anything assigned has to
4523        happen after that */
4524     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4525     r = (struct regexp*)SvANY(rx);
4526     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4527          char, regexp_internal);
4528     if ( r == NULL || ri == NULL )
4529         FAIL("Regexp out of space");
4530 #ifdef DEBUGGING
4531     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4532     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4533 #else
4534     /* bulk initialize base fields with 0. */
4535     Zero(ri, sizeof(regexp_internal), char);
4536 #endif
4537
4538     /* non-zero initialization begins here */
4539     RXi_SET( r, ri );
4540     r->engine= RE_ENGINE_PTR;
4541     r->extflags = pm_flags;
4542     {
4543         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4544         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4545
4546         /* The caret is output if there are any defaults: if not all the STD
4547          * flags are set, or if no character set specifier is needed */
4548         bool has_default =
4549                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4550                     || ! has_charset);
4551         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4552         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4553                             >> RXf_PMf_STD_PMMOD_SHIFT);
4554         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4555         char *p;
4556         /* Allocate for the worst case, which is all the std flags are turned
4557          * on.  If more precision is desired, we could do a population count of
4558          * the flags set.  This could be done with a small lookup table, or by
4559          * shifting, masking and adding, or even, when available, assembly
4560          * language for a machine-language population count.
4561          * We never output a minus, as all those are defaults, so are
4562          * covered by the caret */
4563         const STRLEN wraplen = plen + has_p + has_runon
4564             + has_default       /* If needs a caret */
4565
4566                 /* If needs a character set specifier */
4567             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4568             + (sizeof(STD_PAT_MODS) - 1)
4569             + (sizeof("(?:)") - 1);
4570
4571         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4572         SvPOK_on(rx);
4573         SvFLAGS(rx) |= SvUTF8(pattern);
4574         *p++='('; *p++='?';
4575
4576         /* If a default, cover it using the caret */
4577         if (has_default) {
4578             *p++= DEFAULT_PAT_MOD;
4579         }
4580         if (has_charset) {
4581             STRLEN len;
4582             const char* const name = get_regex_charset_name(r->extflags, &len);
4583             Copy(name, p, len, char);
4584             p += len;
4585         }
4586         if (has_p)
4587             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4588         {
4589             char ch;
4590             while((ch = *fptr++)) {
4591                 if(reganch & 1)
4592                     *p++ = ch;
4593                 reganch >>= 1;
4594             }
4595         }
4596
4597         *p++ = ':';
4598         Copy(RExC_precomp, p, plen, char);
4599         assert ((RX_WRAPPED(rx) - p) < 16);
4600         r->pre_prefix = p - RX_WRAPPED(rx);
4601         p += plen;
4602         if (has_runon)
4603             *p++ = '\n';
4604         *p++ = ')';
4605         *p = 0;
4606         SvCUR_set(rx, p - SvPVX_const(rx));
4607     }
4608
4609     r->intflags = 0;
4610     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4611
4612     if (RExC_seen & REG_SEEN_RECURSE) {
4613         Newxz(RExC_open_parens, RExC_npar,regnode *);
4614         SAVEFREEPV(RExC_open_parens);
4615         Newxz(RExC_close_parens,RExC_npar,regnode *);
4616         SAVEFREEPV(RExC_close_parens);
4617     }
4618
4619     /* Useful during FAIL. */
4620 #ifdef RE_TRACK_PATTERN_OFFSETS
4621     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4622     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4623                           "%s %"UVuf" bytes for offset annotations.\n",
4624                           ri->u.offsets ? "Got" : "Couldn't get",
4625                           (UV)((2*RExC_size+1) * sizeof(U32))));
4626 #endif
4627     SetProgLen(ri,RExC_size);
4628     RExC_rx_sv = rx;
4629     RExC_rx = r;
4630     RExC_rxi = ri;
4631     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4632
4633     /* Second pass: emit code. */
4634     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4635     RExC_parse = exp;
4636     RExC_end = xend;
4637     RExC_naughty = 0;
4638     RExC_npar = 1;
4639     RExC_emit_start = ri->program;
4640     RExC_emit = ri->program;
4641     RExC_emit_bound = ri->program + RExC_size + 1;
4642
4643     /* Store the count of eval-groups for security checks: */
4644     RExC_rx->seen_evals = RExC_seen_evals;
4645     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4646     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4647         ReREFCNT_dec(rx);
4648         return(NULL);
4649     }
4650     /* XXXX To minimize changes to RE engine we always allocate
4651        3-units-long substrs field. */
4652     Newx(r->substrs, 1, struct reg_substr_data);
4653     if (RExC_recurse_count) {
4654         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4655         SAVEFREEPV(RExC_recurse);
4656     }
4657
4658 reStudy:
4659     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4660     Zero(r->substrs, 1, struct reg_substr_data);
4661
4662 #ifdef TRIE_STUDY_OPT
4663     if (!restudied) {
4664         StructCopy(&zero_scan_data, &data, scan_data_t);
4665         copyRExC_state = RExC_state;
4666     } else {
4667         U32 seen=RExC_seen;
4668         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4669
4670         RExC_state = copyRExC_state;
4671         if (seen & REG_TOP_LEVEL_BRANCHES)
4672             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4673         else
4674             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4675         if (data.last_found) {
4676             SvREFCNT_dec(data.longest_fixed);
4677             SvREFCNT_dec(data.longest_float);
4678             SvREFCNT_dec(data.last_found);
4679         }
4680         StructCopy(&zero_scan_data, &data, scan_data_t);
4681     }
4682 #else
4683     StructCopy(&zero_scan_data, &data, scan_data_t);
4684 #endif
4685
4686     /* Dig out information for optimizations. */
4687     r->extflags = RExC_flags; /* was pm_op */
4688     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4689
4690     if (UTF)
4691         SvUTF8_on(rx);  /* Unicode in it? */
4692     ri->regstclass = NULL;
4693     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4694         r->intflags |= PREGf_NAUGHTY;
4695     scan = ri->program + 1;             /* First BRANCH. */
4696
4697     /* testing for BRANCH here tells us whether there is "must appear"
4698        data in the pattern. If there is then we can use it for optimisations */
4699     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4700         I32 fake;
4701         STRLEN longest_float_length, longest_fixed_length;
4702         struct regnode_charclass_class ch_class; /* pointed to by data */
4703         int stclass_flag;
4704         I32 last_close = 0; /* pointed to by data */
4705         regnode *first= scan;
4706         regnode *first_next= regnext(first);
4707         /*
4708          * Skip introductions and multiplicators >= 1
4709          * so that we can extract the 'meat' of the pattern that must
4710          * match in the large if() sequence following.
4711          * NOTE that EXACT is NOT covered here, as it is normally
4712          * picked up by the optimiser separately.
4713          *
4714          * This is unfortunate as the optimiser isnt handling lookahead
4715          * properly currently.
4716          *
4717          */
4718         while ((OP(first) == OPEN && (sawopen = 1)) ||
4719                /* An OR of *one* alternative - should not happen now. */
4720             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4721             /* for now we can't handle lookbehind IFMATCH*/
4722             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4723             (OP(first) == PLUS) ||
4724             (OP(first) == MINMOD) ||
4725                /* An {n,m} with n>0 */
4726             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4727             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4728         {
4729                 /*
4730                  * the only op that could be a regnode is PLUS, all the rest
4731                  * will be regnode_1 or regnode_2.
4732                  *
4733                  */
4734                 if (OP(first) == PLUS)
4735                     sawplus = 1;
4736                 else
4737                     first += regarglen[OP(first)];
4738
4739                 first = NEXTOPER(first);
4740                 first_next= regnext(first);
4741         }
4742
4743         /* Starting-point info. */
4744       again:
4745         DEBUG_PEEP("first:",first,0);
4746         /* Ignore EXACT as we deal with it later. */
4747         if (PL_regkind[OP(first)] == EXACT) {
4748             if (OP(first) == EXACT)
4749                 NOOP;   /* Empty, get anchored substr later. */
4750             else
4751                 ri->regstclass = first;
4752         }
4753 #ifdef TRIE_STCLASS
4754         else if (PL_regkind[OP(first)] == TRIE &&
4755                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4756         {
4757             regnode *trie_op;
4758             /* this can happen only on restudy */
4759             if ( OP(first) == TRIE ) {
4760                 struct regnode_1 *trieop = (struct regnode_1 *)
4761                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4762                 StructCopy(first,trieop,struct regnode_1);
4763                 trie_op=(regnode *)trieop;
4764             } else {
4765                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4766                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4767                 StructCopy(first,trieop,struct regnode_charclass);
4768                 trie_op=(regnode *)trieop;
4769             }
4770             OP(trie_op)+=2;
4771             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4772             ri->regstclass = trie_op;
4773         }
4774 #endif
4775         else if (REGNODE_SIMPLE(OP(first)))
4776             ri->regstclass = first;
4777         else if (PL_regkind[OP(first)] == BOUND ||
4778                  PL_regkind[OP(first)] == NBOUND)
4779             ri->regstclass = first;
4780         else if (PL_regkind[OP(first)] == BOL) {
4781             r->extflags |= (OP(first) == MBOL
4782                            ? RXf_ANCH_MBOL
4783                            : (OP(first) == SBOL
4784                               ? RXf_ANCH_SBOL
4785                               : RXf_ANCH_BOL));
4786             first = NEXTOPER(first);
4787             goto again;
4788         }
4789         else if (OP(first) == GPOS) {
4790             r->extflags |= RXf_ANCH_GPOS;
4791             first = NEXTOPER(first);
4792             goto again;
4793         }
4794         else if ((!sawopen || !RExC_sawback) &&
4795             (OP(first) == STAR &&
4796             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4797             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4798         {
4799             /* turn .* into ^.* with an implied $*=1 */
4800             const int type =
4801                 (OP(NEXTOPER(first)) == REG_ANY)
4802                     ? RXf_ANCH_MBOL
4803                     : RXf_ANCH_SBOL;
4804             r->extflags |= type;
4805             r->intflags |= PREGf_IMPLICIT;
4806             first = NEXTOPER(first);
4807             goto again;
4808         }
4809         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4810             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4811             /* x+ must match at the 1st pos of run of x's */
4812             r->intflags |= PREGf_SKIP;
4813
4814         /* Scan is after the zeroth branch, first is atomic matcher. */
4815 #ifdef TRIE_STUDY_OPT
4816         DEBUG_PARSE_r(
4817             if (!restudied)
4818                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4819                               (IV)(first - scan + 1))
4820         );
4821 #else
4822         DEBUG_PARSE_r(
4823             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4824                 (IV)(first - scan + 1))
4825         );
4826 #endif
4827
4828
4829         /*
4830         * If there's something expensive in the r.e., find the
4831         * longest literal string that must appear and make it the
4832         * regmust.  Resolve ties in favor of later strings, since
4833         * the regstart check works with the beginning of the r.e.
4834         * and avoiding duplication strengthens checking.  Not a
4835         * strong reason, but sufficient in the absence of others.
4836         * [Now we resolve ties in favor of the earlier string if
4837         * it happens that c_offset_min has been invalidated, since the
4838         * earlier string may buy us something the later one won't.]
4839         */
4840
4841         data.longest_fixed = newSVpvs("");
4842         data.longest_float = newSVpvs("");
4843         data.last_found = newSVpvs("");
4844         data.longest = &(data.longest_fixed);
4845         first = scan;
4846         if (!ri->regstclass) {
4847             cl_init(pRExC_state, &ch_class);
4848             data.start_class = &ch_class;
4849             stclass_flag = SCF_DO_STCLASS_AND;
4850         } else                          /* XXXX Check for BOUND? */
4851             stclass_flag = 0;
4852         data.last_closep = &last_close;
4853
4854         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4855             &data, -1, NULL, NULL,
4856             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4857
4858
4859         CHECK_RESTUDY_GOTO;
4860
4861
4862         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4863              && data.last_start_min == 0 && data.last_end > 0
4864              && !RExC_seen_zerolen
4865              && !(RExC_seen & REG_SEEN_VERBARG)
4866              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4867             r->extflags |= RXf_CHECK_ALL;
4868         scan_commit(pRExC_state, &data,&minlen,0);
4869         SvREFCNT_dec(data.last_found);
4870
4871         /* Note that code very similar to this but for anchored string
4872            follows immediately below, changes may need to be made to both.
4873            Be careful.
4874          */
4875         longest_float_length = CHR_SVLEN(data.longest_float);
4876         if (longest_float_length
4877             || (data.flags & SF_FL_BEFORE_EOL
4878                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4879                     || (RExC_flags & RXf_PMf_MULTILINE))))
4880         {
4881             I32 t,ml;
4882
4883             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4884                 && data.offset_fixed == data.offset_float_min
4885                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4886                     goto remove_float;          /* As in (a)+. */
4887
4888             /* copy the information about the longest float from the reg_scan_data
4889                over to the program. */
4890             if (SvUTF8(data.longest_float)) {
4891                 r->float_utf8 = data.longest_float;
4892                 r->float_substr = NULL;
4893             } else {
4894                 r->float_substr = data.longest_float;
4895                 r->float_utf8 = NULL;
4896             }
4897             /* float_end_shift is how many chars that must be matched that
4898                follow this item. We calculate it ahead of time as once the
4899                lookbehind offset is added in we lose the ability to correctly
4900                calculate it.*/
4901             ml = data.minlen_float ? *(data.minlen_float)
4902                                    : (I32)longest_float_length;
4903             r->float_end_shift = ml - data.offset_float_min
4904                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4905                 + data.lookbehind_float;
4906             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4907             r->float_max_offset = data.offset_float_max;
4908             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4909                 r->float_max_offset -= data.lookbehind_float;
4910
4911             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4912                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4913                            || (RExC_flags & RXf_PMf_MULTILINE)));
4914             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4915         }
4916         else {
4917           remove_float:
4918             r->float_substr = r->float_utf8 = NULL;
4919             SvREFCNT_dec(data.longest_float);
4920             longest_float_length = 0;
4921         }
4922
4923         /* Note that code very similar to this but for floating string
4924            is immediately above, changes may need to be made to both.
4925            Be careful.
4926          */
4927         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4928         if (longest_fixed_length
4929             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4930                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4931                     || (RExC_flags & RXf_PMf_MULTILINE))))
4932         {
4933             I32 t,ml;
4934
4935             /* copy the information about the longest fixed
4936                from the reg_scan_data over to the program. */
4937             if (SvUTF8(data.longest_fixed)) {
4938                 r->anchored_utf8 = data.longest_fixed;
4939                 r->anchored_substr = NULL;
4940             } else {
4941                 r->anchored_substr = data.longest_fixed;
4942                 r->anchored_utf8 = NULL;
4943             }
4944             /* fixed_end_shift is how many chars that must be matched that
4945                follow this item. We calculate it ahead of time as once the
4946                lookbehind offset is added in we lose the ability to correctly
4947                calculate it.*/
4948             ml = data.minlen_fixed ? *(data.minlen_fixed)
4949                                    : (I32)longest_fixed_length;
4950             r->anchored_end_shift = ml - data.offset_fixed
4951                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4952                 + data.lookbehind_fixed;
4953             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4954
4955             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4956                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4957                      || (RExC_flags & RXf_PMf_MULTILINE)));
4958             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4959         }
4960         else {
4961             r->anchored_substr = r->anchored_utf8 = NULL;
4962             SvREFCNT_dec(data.longest_fixed);
4963             longest_fixed_length = 0;
4964         }
4965         if (ri->regstclass
4966             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4967             ri->regstclass = NULL;
4968
4969         /* If the synthetic start class were to ever be used when EOS is set,
4970          * that bit would have to be cleared, as it is shared with another */
4971         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4972             && stclass_flag
4973             && !(data.start_class->flags & ANYOF_EOS)
4974             && !cl_is_anything(data.start_class))
4975         {
4976             const U32 n = add_data(pRExC_state, 1, "f");
4977
4978             Newx(RExC_rxi->data->data[n], 1,
4979                 struct regnode_charclass_class);
4980             StructCopy(data.start_class,
4981                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4982                        struct regnode_charclass_class);
4983             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4984             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4985             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4986                       regprop(r, sv, (regnode*)data.start_class);
4987                       PerlIO_printf(Perl_debug_log,
4988                                     "synthetic stclass \"%s\".\n",
4989                                     SvPVX_const(sv));});
4990         }
4991
4992         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4993         if (longest_fixed_length > longest_float_length) {
4994             r->check_end_shift = r->anchored_end_shift;
4995             r->check_substr = r->anchored_substr;
4996             r->check_utf8 = r->anchored_utf8;
4997             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4998             if (r->extflags & RXf_ANCH_SINGLE)
4999                 r->extflags |= RXf_NOSCAN;
5000         }
5001         else {
5002             r->check_end_shift = r->float_end_shift;
5003             r->check_substr = r->float_substr;
5004             r->check_utf8 = r->float_utf8;
5005             r->check_offset_min = r->float_min_offset;
5006             r->check_offset_max = r->float_max_offset;
5007         }
5008         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5009            This should be changed ASAP!  */
5010         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5011             r->extflags |= RXf_USE_INTUIT;
5012             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5013                 r->extflags |= RXf_INTUIT_TAIL;
5014         }
5015         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5016         if ( (STRLEN)minlen < longest_float_length )
5017             minlen= longest_float_length;
5018         if ( (STRLEN)minlen < longest_fixed_length )
5019             minlen= longest_fixed_length;
5020         */
5021     }
5022     else {
5023         /* Several toplevels. Best we can is to set minlen. */
5024         I32 fake;
5025         struct regnode_charclass_class ch_class;
5026         I32 last_close = 0;
5027
5028         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5029
5030         scan = ri->program + 1;
5031         cl_init(pRExC_state, &ch_class);
5032         data.start_class = &ch_class;
5033         data.last_closep = &last_close;
5034
5035
5036         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5037             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5038
5039         CHECK_RESTUDY_GOTO;
5040
5041         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5042                 = r->float_substr = r->float_utf8 = NULL;
5043
5044         /* If the synthetic start class were to ever be used when EOS is set,
5045          * that bit would have to be cleared, as it is shared with another */
5046         if (!(data.start_class->flags & ANYOF_EOS)
5047             && !cl_is_anything(data.start_class))
5048         {
5049             const U32 n = add_data(pRExC_state, 1, "f");
5050
5051             Newx(RExC_rxi->data->data[n], 1,
5052                 struct regnode_charclass_class);
5053             StructCopy(data.start_class,
5054                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5055                        struct regnode_charclass_class);
5056             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5057             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5058             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5059                       regprop(r, sv, (regnode*)data.start_class);
5060                       PerlIO_printf(Perl_debug_log,
5061                                     "synthetic stclass \"%s\".\n",
5062                                     SvPVX_const(sv));});
5063         }
5064     }
5065
5066     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5067        the "real" pattern. */
5068     DEBUG_OPTIMISE_r({
5069         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5070                       (IV)minlen, (IV)r->minlen);
5071     });
5072     r->minlenret = minlen;
5073     if (r->minlen < minlen)
5074         r->minlen = minlen;
5075
5076     if (RExC_seen & REG_SEEN_GPOS)
5077         r->extflags |= RXf_GPOS_SEEN;
5078     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5079         r->extflags |= RXf_LOOKBEHIND_SEEN;
5080     if (RExC_seen & REG_SEEN_EVAL)
5081         r->extflags |= RXf_EVAL_SEEN;
5082     if (RExC_seen & REG_SEEN_CANY)
5083         r->extflags |= RXf_CANY_SEEN;
5084     if (RExC_seen & REG_SEEN_VERBARG)
5085         r->intflags |= PREGf_VERBARG_SEEN;
5086     if (RExC_seen & REG_SEEN_CUTGROUP)
5087         r->intflags |= PREGf_CUTGROUP_SEEN;
5088     if (RExC_paren_names)
5089         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5090     else
5091         RXp_PAREN_NAMES(r) = NULL;
5092
5093 #ifdef STUPID_PATTERN_CHECKS
5094     if (RX_PRELEN(rx) == 0)
5095         r->extflags |= RXf_NULL;
5096     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5097         /* XXX: this should happen BEFORE we compile */
5098         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5099     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5100         r->extflags |= RXf_WHITE;
5101     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5102         r->extflags |= RXf_START_ONLY;
5103 #else
5104     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5105             /* XXX: this should happen BEFORE we compile */
5106             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5107     else {
5108         regnode *first = ri->program + 1;
5109         U8 fop = OP(first);
5110         U8 nop = OP(NEXTOPER(first));
5111
5112         if (PL_regkind[fop] == NOTHING && nop == END)
5113             r->extflags |= RXf_NULL;
5114         else if (PL_regkind[fop] == BOL && nop == END)
5115             r->extflags |= RXf_START_ONLY;
5116         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5117             r->extflags |= RXf_WHITE;
5118     }
5119 #endif
5120 #ifdef DEBUGGING
5121     if (RExC_paren_names) {
5122         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5123         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5124     } else
5125 #endif
5126         ri->name_list_idx = 0;
5127
5128     if (RExC_recurse_count) {
5129         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5130             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5131             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5132         }
5133     }
5134     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5135     /* assume we don't need to swap parens around before we match */
5136
5137     DEBUG_DUMP_r({
5138         PerlIO_printf(Perl_debug_log,"Final program:\n");
5139         regdump(r);
5140     });
5141 #ifdef RE_TRACK_PATTERN_OFFSETS
5142     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5143         const U32 len = ri->u.offsets[0];
5144         U32 i;
5145         GET_RE_DEBUG_FLAGS_DECL;
5146         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5147         for (i = 1; i <= len; i++) {
5148             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5149                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5150                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5151             }
5152         PerlIO_printf(Perl_debug_log, "\n");
5153     });
5154 #endif
5155     return rx;
5156 }
5157
5158 #undef RE_ENGINE_PTR
5159
5160
5161 SV*
5162 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5163                     const U32 flags)
5164 {
5165     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5166
5167     PERL_UNUSED_ARG(value);
5168
5169     if (flags & RXapif_FETCH) {
5170         return reg_named_buff_fetch(rx, key, flags);
5171     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5172         Perl_croak_no_modify(aTHX);
5173         return NULL;
5174     } else if (flags & RXapif_EXISTS) {
5175         return reg_named_buff_exists(rx, key, flags)
5176             ? &PL_sv_yes
5177             : &PL_sv_no;
5178     } else if (flags & RXapif_REGNAMES) {
5179         return reg_named_buff_all(rx, flags);
5180     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5181         return reg_named_buff_scalar(rx, flags);
5182     } else {
5183         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5184         return NULL;
5185     }
5186 }
5187
5188 SV*
5189 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5190                          const U32 flags)
5191 {
5192     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5193     PERL_UNUSED_ARG(lastkey);
5194
5195     if (flags & RXapif_FIRSTKEY)
5196         return reg_named_buff_firstkey(rx, flags);
5197     else if (flags & RXapif_NEXTKEY)
5198         return reg_named_buff_nextkey(rx, flags);
5199     else {
5200         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5201         return NULL;
5202     }
5203 }
5204
5205 SV*
5206 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5207                           const U32 flags)
5208 {
5209     AV *retarray = NULL;
5210     SV *ret;
5211     struct regexp *const rx = (struct regexp *)SvANY(r);
5212
5213     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5214
5215     if (flags & RXapif_ALL)
5216         retarray=newAV();
5217
5218     if (rx && RXp_PAREN_NAMES(rx)) {
5219         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5220         if (he_str) {
5221             IV i;
5222             SV* sv_dat=HeVAL(he_str);
5223             I32 *nums=(I32*)SvPVX(sv_dat);
5224             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5225                 if ((I32)(rx->nparens) >= nums[i]
5226                     && rx->offs[nums[i]].start != -1
5227                     && rx->offs[nums[i]].end != -1)
5228                 {
5229                     ret = newSVpvs("");
5230                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5231                     if (!retarray)
5232                         return ret;
5233                 } else {
5234                     ret = newSVsv(&PL_sv_undef);
5235                 }
5236                 if (retarray)
5237                     av_push(retarray, ret);
5238             }
5239             if (retarray)
5240                 return newRV_noinc(MUTABLE_SV(retarray));
5241         }
5242     }
5243     return NULL;
5244 }
5245
5246 bool
5247 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5248                            const U32 flags)
5249 {
5250     struct regexp *const rx = (struct regexp *)SvANY(r);
5251
5252     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5253
5254     if (rx && RXp_PAREN_NAMES(rx)) {
5255         if (flags & RXapif_ALL) {
5256             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5257         } else {
5258             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5259             if (sv) {
5260                 SvREFCNT_dec(sv);
5261                 return TRUE;
5262             } else {
5263                 return FALSE;
5264             }
5265         }
5266     } else {
5267         return FALSE;
5268     }
5269 }
5270
5271 SV*
5272 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5273 {
5274     struct regexp *const rx = (struct regexp *)SvANY(r);
5275
5276     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5277
5278     if ( rx && RXp_PAREN_NAMES(rx) ) {
5279         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5280
5281         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5282     } else {
5283         return FALSE;
5284     }
5285 }
5286
5287 SV*
5288 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5289 {
5290     struct regexp *const rx = (struct regexp *)SvANY(r);
5291     GET_RE_DEBUG_FLAGS_DECL;
5292
5293     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5294
5295     if (rx && RXp_PAREN_NAMES(rx)) {
5296         HV *hv = RXp_PAREN_NAMES(rx);
5297         HE *temphe;
5298         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5299             IV i;
5300             IV parno = 0;
5301             SV* sv_dat = HeVAL(temphe);
5302             I32 *nums = (I32*)SvPVX(sv_dat);
5303             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5304                 if ((I32)(rx->lastparen) >= nums[i] &&
5305                     rx->offs[nums[i]].start != -1 &&
5306                     rx->offs[nums[i]].end != -1)
5307                 {
5308                     parno = nums[i];
5309                     break;
5310                 }
5311             }
5312             if (parno || flags & RXapif_ALL) {
5313                 return newSVhek(HeKEY_hek(temphe));
5314             }
5315         }
5316     }
5317     return NULL;
5318 }
5319
5320 SV*
5321 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5322 {
5323     SV *ret;
5324     AV *av;
5325     I32 length;
5326     struct regexp *const rx = (struct regexp *)SvANY(r);
5327
5328     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5329
5330     if (rx && RXp_PAREN_NAMES(rx)) {
5331         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5332             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5333         } else if (flags & RXapif_ONE) {
5334             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5335             av = MUTABLE_AV(SvRV(ret));
5336             length = av_len(av);
5337             SvREFCNT_dec(ret);
5338             return newSViv(length + 1);
5339         } else {
5340             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5341             return NULL;
5342         }
5343     }
5344     return &PL_sv_undef;
5345 }
5346
5347 SV*
5348 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5349 {
5350     struct regexp *const rx = (struct regexp *)SvANY(r);
5351     AV *av = newAV();
5352
5353     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5354
5355     if (rx && RXp_PAREN_NAMES(rx)) {
5356         HV *hv= RXp_PAREN_NAMES(rx);
5357         HE *temphe;
5358         (void)hv_iterinit(hv);
5359         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5360             IV i;
5361             IV parno = 0;
5362             SV* sv_dat = HeVAL(temphe);
5363             I32 *nums = (I32*)SvPVX(sv_dat);
5364             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5365                 if ((I32)(rx->lastparen) >= nums[i] &&
5366                     rx->offs[nums[i]].start != -1 &&
5367                     rx->offs[nums[i]].end != -1)
5368                 {
5369                     parno = nums[i];
5370                     break;
5371                 }
5372             }
5373             if (parno || flags & RXapif_ALL) {
5374                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5375             }
5376         }
5377     }
5378
5379     return newRV_noinc(MUTABLE_SV(av));
5380 }
5381
5382 void
5383 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5384                              SV * const sv)
5385 {
5386     struct regexp *const rx = (struct regexp *)SvANY(r);
5387     char *s = NULL;
5388     I32 i = 0;
5389     I32 s1, t1;
5390
5391     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5392
5393     if (!rx->subbeg) {
5394         sv_setsv(sv,&PL_sv_undef);
5395         return;
5396     }
5397     else
5398     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5399         /* $` */
5400         i = rx->offs[0].start;
5401         s = rx->subbeg;
5402     }
5403     else
5404     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5405         /* $' */
5406         s = rx->subbeg + rx->offs[0].end;
5407         i = rx->sublen - rx->offs[0].end;
5408     }
5409     else
5410     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5411         (s1 = rx->offs[paren].start) != -1 &&
5412         (t1 = rx->offs[paren].end) != -1)
5413     {
5414         /* $& $1 ... */
5415         i = t1 - s1;
5416         s = rx->subbeg + s1;
5417     } else {
5418         sv_setsv(sv,&PL_sv_undef);
5419         return;
5420     }
5421     assert(rx->sublen >= (s - rx->subbeg) + i );
5422     if (i >= 0) {
5423         const int oldtainted = PL_tainted;
5424         TAINT_NOT;
5425         sv_setpvn(sv, s, i);
5426         PL_tainted = oldtainted;
5427         if ( (rx->extflags & RXf_CANY_SEEN)
5428             ? (RXp_MATCH_UTF8(rx)
5429                         && (!i || is_utf8_string((U8*)s, i)))
5430             : (RXp_MATCH_UTF8(rx)) )
5431         {
5432             SvUTF8_on(sv);
5433         }
5434         else
5435             SvUTF8_off(sv);
5436         if (PL_tainting) {
5437             if (RXp_MATCH_TAINTED(rx)) {
5438                 if (SvTYPE(sv) >= SVt_PVMG) {
5439                     MAGIC* const mg = SvMAGIC(sv);
5440                     MAGIC* mgt;
5441                     PL_tainted = 1;
5442                     SvMAGIC_set(sv, mg->mg_moremagic);
5443                     SvTAINT(sv);
5444                     if ((mgt = SvMAGIC(sv))) {
5445                         mg->mg_moremagic = mgt;
5446                         SvMAGIC_set(sv, mg);
5447                     }
5448                 } else {
5449                     PL_tainted = 1;
5450                     SvTAINT(sv);
5451                 }
5452             } else
5453                 SvTAINTED_off(sv);
5454         }
5455     } else {
5456         sv_setsv(sv,&PL_sv_undef);
5457         return;
5458     }
5459 }
5460
5461 void
5462 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5463                                                          SV const * const value)
5464 {
5465     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5466
5467     PERL_UNUSED_ARG(rx);
5468     PERL_UNUSED_ARG(paren);
5469     PERL_UNUSED_ARG(value);
5470
5471     if (!PL_localizing)
5472         Perl_croak_no_modify(aTHX);
5473 }
5474
5475 I32
5476 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5477                               const I32 paren)
5478 {
5479     struct regexp *const rx = (struct regexp *)SvANY(r);
5480     I32 i;
5481     I32 s1, t1;
5482
5483     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5484
5485     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5486         switch (paren) {
5487       /* $` / ${^PREMATCH} */
5488       case RX_BUFF_IDX_PREMATCH:
5489         if (rx->offs[0].start != -1) {
5490                         i = rx->offs[0].start;
5491                         if (i > 0) {
5492                                 s1 = 0;
5493                                 t1 = i;
5494                                 goto getlen;
5495                         }
5496             }
5497         return 0;
5498       /* $' / ${^POSTMATCH} */
5499       case RX_BUFF_IDX_POSTMATCH:
5500             if (rx->offs[0].end != -1) {
5501                         i = rx->sublen - rx->offs[0].end;
5502                         if (i > 0) {
5503                                 s1 = rx->offs[0].end;
5504                                 t1 = rx->sublen;
5505                                 goto getlen;
5506                         }
5507             }
5508         return 0;
5509       /* $& / ${^MATCH}, $1, $2, ... */
5510       default:
5511             if (paren <= (I32)rx->nparens &&
5512             (s1 = rx->offs[paren].start) != -1 &&
5513             (t1 = rx->offs[paren].end) != -1)
5514             {
5515             i = t1 - s1;
5516             goto getlen;
5517         } else {
5518             if (ckWARN(WARN_UNINITIALIZED))
5519                 report_uninit((const SV *)sv);
5520             return 0;
5521         }
5522     }
5523   getlen:
5524     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5525         const char * const s = rx->subbeg + s1;
5526         const U8 *ep;
5527         STRLEN el;
5528
5529         i = t1 - s1;
5530         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5531                         i = el;
5532     }
5533     return i;
5534 }
5535
5536 SV*
5537 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5538 {
5539     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5540         PERL_UNUSED_ARG(rx);
5541         if (0)
5542             return NULL;
5543         else
5544             return newSVpvs("Regexp");
5545 }
5546
5547 /* Scans the name of a named buffer from the pattern.
5548  * If flags is REG_RSN_RETURN_NULL returns null.
5549  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5550  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5551  * to the parsed name as looked up in the RExC_paren_names hash.
5552  * If there is an error throws a vFAIL().. type exception.
5553  */
5554
5555 #define REG_RSN_RETURN_NULL    0
5556 #define REG_RSN_RETURN_NAME    1
5557 #define REG_RSN_RETURN_DATA    2
5558
5559 STATIC SV*
5560 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5561 {
5562     char *name_start = RExC_parse;
5563
5564     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5565
5566     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5567          /* skip IDFIRST by using do...while */
5568         if (UTF)
5569             do {
5570                 RExC_parse += UTF8SKIP(RExC_parse);
5571             } while (isALNUM_utf8((U8*)RExC_parse));
5572         else
5573             do {
5574                 RExC_parse++;
5575             } while (isALNUM(*RExC_parse));
5576     }
5577
5578     if ( flags ) {
5579         SV* sv_name
5580             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5581                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5582         if ( flags == REG_RSN_RETURN_NAME)
5583             return sv_name;
5584         else if (flags==REG_RSN_RETURN_DATA) {
5585             HE *he_str = NULL;
5586             SV *sv_dat = NULL;
5587             if ( ! sv_name )      /* should not happen*/
5588                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5589             if (RExC_paren_names)
5590                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5591             if ( he_str )
5592                 sv_dat = HeVAL(he_str);
5593             if ( ! sv_dat )
5594                 vFAIL("Reference to nonexistent named group");
5595             return sv_dat;
5596         }
5597         else {
5598             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5599         }
5600         /* NOT REACHED */
5601     }
5602     return NULL;
5603 }
5604
5605 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5606     int rem=(int)(RExC_end - RExC_parse);                       \
5607     int cut;                                                    \
5608     int num;                                                    \
5609     int iscut=0;                                                \
5610     if (rem>10) {                                               \
5611         rem=10;                                                 \
5612         iscut=1;                                                \
5613     }                                                           \
5614     cut=10-rem;                                                 \
5615     if (RExC_lastparse!=RExC_parse)                             \
5616         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5617             rem, RExC_parse,                                    \
5618             cut + 4,                                            \
5619             iscut ? "..." : "<"                                 \
5620         );                                                      \
5621     else                                                        \
5622         PerlIO_printf(Perl_debug_log,"%16s","");                \
5623                                                                 \
5624     if (SIZE_ONLY)                                              \
5625        num = RExC_size + 1;                                     \
5626     else                                                        \
5627        num=REG_NODE_NUM(RExC_emit);                             \
5628     if (RExC_lastnum!=num)                                      \
5629        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5630     else                                                        \
5631        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5632     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5633         (int)((depth*2)), "",                                   \
5634         (funcname)                                              \
5635     );                                                          \
5636     RExC_lastnum=num;                                           \
5637     RExC_lastparse=RExC_parse;                                  \
5638 })
5639
5640
5641
5642 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5643     DEBUG_PARSE_MSG((funcname));                            \
5644     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5645 })
5646 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5647     DEBUG_PARSE_MSG((funcname));                            \
5648     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5649 })
5650 /*
5651  - reg - regular expression, i.e. main body or parenthesized thing
5652  *
5653  * Caller must absorb opening parenthesis.
5654  *
5655  * Combining parenthesis handling with the base level of regular expression
5656  * is a trifle forced, but the need to tie the tails of the branches to what
5657  * follows makes it hard to avoid.
5658  */
5659 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5660 #ifdef DEBUGGING
5661 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5662 #else
5663 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5664 #endif
5665
5666 STATIC regnode *
5667 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5668     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5669 {
5670     dVAR;
5671     register regnode *ret;              /* Will be the head of the group. */
5672     register regnode *br;
5673     register regnode *lastbr;
5674     register regnode *ender = NULL;
5675     register I32 parno = 0;
5676     I32 flags;
5677     U32 oregflags = RExC_flags;
5678     bool have_branch = 0;
5679     bool is_open = 0;
5680     I32 freeze_paren = 0;
5681     I32 after_freeze = 0;
5682
5683     /* for (?g), (?gc), and (?o) warnings; warning
5684        about (?c) will warn about (?g) -- japhy    */
5685
5686 #define WASTED_O  0x01
5687 #define WASTED_G  0x02
5688 #define WASTED_C  0x04
5689 #define WASTED_GC (0x02|0x04)
5690     I32 wastedflags = 0x00;
5691
5692     char * parse_start = RExC_parse; /* MJD */
5693     char * const oregcomp_parse = RExC_parse;
5694
5695     GET_RE_DEBUG_FLAGS_DECL;
5696
5697     PERL_ARGS_ASSERT_REG;
5698     DEBUG_PARSE("reg ");
5699
5700     *flagp = 0;                         /* Tentatively. */
5701
5702
5703     /* Make an OPEN node, if parenthesized. */
5704     if (paren) {
5705         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5706             char *start_verb = RExC_parse;
5707             STRLEN verb_len = 0;
5708             char *start_arg = NULL;
5709             unsigned char op = 0;
5710             int argok = 1;
5711             int internal_argval = 0; /* internal_argval is only useful if !argok */
5712             while ( *RExC_parse && *RExC_parse != ')' ) {
5713                 if ( *RExC_parse == ':' ) {
5714                     start_arg = RExC_parse + 1;
5715                     break;
5716                 }
5717                 RExC_parse++;
5718             }
5719             ++start_verb;
5720             verb_len = RExC_parse - start_verb;
5721             if ( start_arg ) {
5722                 RExC_parse++;
5723                 while ( *RExC_parse && *RExC_parse != ')' )
5724                     RExC_parse++;
5725                 if ( *RExC_parse != ')' )
5726                     vFAIL("Unterminated verb pattern argument");
5727                 if ( RExC_parse == start_arg )
5728                     start_arg = NULL;
5729             } else {
5730                 if ( *RExC_parse != ')' )
5731                     vFAIL("Unterminated verb pattern");
5732             }
5733
5734             switch ( *start_verb ) {
5735             case 'A':  /* (*ACCEPT) */
5736                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5737                     op = ACCEPT;
5738                     internal_argval = RExC_nestroot;
5739                 }
5740                 break;
5741             case 'C':  /* (*COMMIT) */
5742                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5743                     op = COMMIT;
5744                 break;
5745             case 'F':  /* (*FAIL) */
5746                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5747                     op = OPFAIL;
5748                     argok = 0;
5749                 }
5750                 break;
5751             case ':':  /* (*:NAME) */
5752             case 'M':  /* (*MARK:NAME) */
5753                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5754                     op = MARKPOINT;
5755                     argok = -1;
5756                 }
5757                 break;
5758             case 'P':  /* (*PRUNE) */
5759                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5760                     op = PRUNE;
5761                 break;
5762             case 'S':   /* (*SKIP) */
5763                 if ( memEQs(start_verb,verb_len,"SKIP") )
5764                     op = SKIP;
5765                 break;
5766             case 'T':  /* (*THEN) */
5767                 /* [19:06] <TimToady> :: is then */
5768                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5769                     op = CUTGROUP;
5770                     RExC_seen |= REG_SEEN_CUTGROUP;
5771                 }
5772                 break;
5773             }
5774             if ( ! op ) {
5775                 RExC_parse++;
5776                 vFAIL3("Unknown verb pattern '%.*s'",
5777                     verb_len, start_verb);
5778             }
5779             if ( argok ) {
5780                 if ( start_arg && internal_argval ) {
5781                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5782                         verb_len, start_verb);
5783                 } else if ( argok < 0 && !start_arg ) {
5784                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5785                         verb_len, start_verb);
5786                 } else {
5787                     ret = reganode(pRExC_state, op, internal_argval);
5788                     if ( ! internal_argval && ! SIZE_ONLY ) {
5789                         if (start_arg) {
5790                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5791                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5792                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5793                             ret->flags = 0;
5794                         } else {
5795                             ret->flags = 1;
5796                         }
5797                     }
5798                 }
5799                 if (!internal_argval)
5800                     RExC_seen |= REG_SEEN_VERBARG;
5801             } else if ( start_arg ) {
5802                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5803                         verb_len, start_verb);
5804             } else {
5805                 ret = reg_node(pRExC_state, op);
5806             }
5807             nextchar(pRExC_state);
5808             return ret;
5809         } else
5810         if (*RExC_parse == '?') { /* (?...) */
5811             bool is_logical = 0;
5812             const char * const seqstart = RExC_parse;
5813             bool has_use_defaults = FALSE;
5814
5815             RExC_parse++;
5816             paren = *RExC_parse++;
5817             ret = NULL;                 /* For look-ahead/behind. */
5818             switch (paren) {
5819
5820             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5821                 paren = *RExC_parse++;
5822                 if ( paren == '<')         /* (?P<...>) named capture */
5823                     goto named_capture;
5824                 else if (paren == '>') {   /* (?P>name) named recursion */
5825                     goto named_recursion;
5826                 }
5827                 else if (paren == '=') {   /* (?P=...)  named backref */
5828                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5829                        you change this make sure you change that */
5830                     char* name_start = RExC_parse;
5831                     U32 num = 0;
5832                     SV *sv_dat = reg_scan_name(pRExC_state,
5833                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5834                     if (RExC_parse == name_start || *RExC_parse != ')')
5835                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5836
5837                     if (!SIZE_ONLY) {
5838                         num = add_data( pRExC_state, 1, "S" );
5839                         RExC_rxi->data->data[num]=(void*)sv_dat;
5840                         SvREFCNT_inc_simple_void(sv_dat);
5841                     }
5842                     RExC_sawback = 1;
5843                     ret = reganode(pRExC_state,
5844                                    ((! FOLD)
5845                                      ? NREF
5846                                      : (UNI_SEMANTICS)
5847                                        ? NREFFU
5848                                        : (LOC)
5849                                          ? NREFFL
5850                                          : NREFF),
5851                                     num);
5852                     *flagp |= HASWIDTH;
5853
5854                     Set_Node_Offset(ret, parse_start+1);
5855                     Set_Node_Cur_Length(ret); /* MJD */
5856
5857                     nextchar(pRExC_state);
5858                     return ret;
5859                 }
5860                 RExC_parse++;
5861                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5862                 /*NOTREACHED*/
5863             case '<':           /* (?<...) */
5864                 if (*RExC_parse == '!')
5865                     paren = ',';
5866                 else if (*RExC_parse != '=')
5867               named_capture:
5868                 {               /* (?<...>) */
5869                     char *name_start;
5870                     SV *svname;
5871                     paren= '>';
5872             case '\'':          /* (?'...') */
5873                     name_start= RExC_parse;
5874                     svname = reg_scan_name(pRExC_state,
5875                         SIZE_ONLY ?  /* reverse test from the others */
5876                         REG_RSN_RETURN_NAME :
5877                         REG_RSN_RETURN_NULL);
5878                     if (RExC_parse == name_start) {
5879                         RExC_parse++;
5880                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5881                         /*NOTREACHED*/
5882                     }
5883                     if (*RExC_parse != paren)
5884                         vFAIL2("Sequence (?%c... not terminated",
5885                             paren=='>' ? '<' : paren);
5886                     if (SIZE_ONLY) {
5887                         HE *he_str;
5888                         SV *sv_dat = NULL;
5889                         if (!svname) /* shouldn't happen */
5890                             Perl_croak(aTHX_
5891                                 "panic: reg_scan_name returned NULL");
5892                         if (!RExC_paren_names) {
5893                             RExC_paren_names= newHV();
5894                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5895 #ifdef DEBUGGING
5896                             RExC_paren_name_list= newAV();
5897                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5898 #endif
5899                         }
5900                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5901                         if ( he_str )
5902                             sv_dat = HeVAL(he_str);
5903                         if ( ! sv_dat ) {
5904                             /* croak baby croak */
5905                             Perl_croak(aTHX_
5906                                 "panic: paren_name hash element allocation failed");
5907                         } else if ( SvPOK(sv_dat) ) {
5908                             /* (?|...) can mean we have dupes so scan to check
5909                                its already been stored. Maybe a flag indicating
5910                                we are inside such a construct would be useful,
5911                                but the arrays are likely to be quite small, so
5912                                for now we punt -- dmq */
5913                             IV count = SvIV(sv_dat);
5914                             I32 *pv = (I32*)SvPVX(sv_dat);
5915                             IV i;
5916                             for ( i = 0 ; i < count ; i++ ) {
5917                                 if ( pv[i] == RExC_npar ) {
5918                                     count = 0;
5919                                     break;
5920                                 }
5921                             }
5922                             if ( count ) {
5923                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5924                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5925                                 pv[count] = RExC_npar;
5926                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5927                             }
5928                         } else {
5929                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5930                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5931                             SvIOK_on(sv_dat);
5932                             SvIV_set(sv_dat, 1);
5933                         }
5934 #ifdef DEBUGGING
5935                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5936                             SvREFCNT_dec(svname);
5937 #endif
5938
5939                         /*sv_dump(sv_dat);*/
5940                     }
5941                     nextchar(pRExC_state);
5942                     paren = 1;
5943                     goto capturing_parens;
5944                 }
5945                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5946                 RExC_in_lookbehind++;
5947                 RExC_parse++;
5948             case '=':           /* (?=...) */
5949                 RExC_seen_zerolen++;
5950                 break;
5951             case '!':           /* (?!...) */
5952                 RExC_seen_zerolen++;
5953                 if (*RExC_parse == ')') {
5954                     ret=reg_node(pRExC_state, OPFAIL);
5955                     nextchar(pRExC_state);
5956                     return ret;
5957                 }
5958                 break;
5959             case '|':           /* (?|...) */
5960                 /* branch reset, behave like a (?:...) except that
5961                    buffers in alternations share the same numbers */
5962                 paren = ':';
5963                 after_freeze = freeze_paren = RExC_npar;
5964                 break;
5965             case ':':           /* (?:...) */
5966             case '>':           /* (?>...) */
5967                 break;
5968             case '$':           /* (?$...) */
5969             case '@':           /* (?@...) */
5970                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5971                 break;
5972             case '#':           /* (?#...) */
5973                 while (*RExC_parse && *RExC_parse != ')')
5974                     RExC_parse++;
5975                 if (*RExC_parse != ')')
5976                     FAIL("Sequence (?#... not terminated");
5977                 nextchar(pRExC_state);
5978                 *flagp = TRYAGAIN;
5979                 return NULL;
5980             case '0' :           /* (?0) */
5981             case 'R' :           /* (?R) */
5982                 if (*RExC_parse != ')')
5983                     FAIL("Sequence (?R) not terminated");
5984                 ret = reg_node(pRExC_state, GOSTART);
5985                 *flagp |= POSTPONED;
5986                 nextchar(pRExC_state);
5987                 return ret;
5988                 /*notreached*/
5989             { /* named and numeric backreferences */
5990                 I32 num;
5991             case '&':            /* (?&NAME) */
5992                 parse_start = RExC_parse - 1;
5993               named_recursion:
5994                 {
5995                     SV *sv_dat = reg_scan_name(pRExC_state,
5996                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5997                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5998                 }
5999                 goto gen_recurse_regop;
6000                 /* NOT REACHED */
6001             case '+':
6002                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6003                     RExC_parse++;
6004                     vFAIL("Illegal pattern");
6005                 }
6006                 goto parse_recursion;
6007                 /* NOT REACHED*/
6008             case '-': /* (?-1) */
6009                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6010                     RExC_parse--; /* rewind to let it be handled later */
6011                     goto parse_flags;
6012                 }
6013                 /*FALLTHROUGH */
6014             case '1': case '2': case '3': case '4': /* (?1) */
6015             case '5': case '6': case '7': case '8': case '9':
6016                 RExC_parse--;
6017               parse_recursion:
6018                 num = atoi(RExC_parse);
6019                 parse_start = RExC_parse - 1; /* MJD */
6020                 if (*RExC_parse == '-')
6021                     RExC_parse++;
6022                 while (isDIGIT(*RExC_parse))
6023                         RExC_parse++;
6024                 if (*RExC_parse!=')')
6025                     vFAIL("Expecting close bracket");
6026
6027               gen_recurse_regop:
6028                 if ( paren == '-' ) {
6029                     /*
6030                     Diagram of capture buffer numbering.
6031                     Top line is the normal capture buffer numbers
6032                     Bottom line is the negative indexing as from
6033                     the X (the (?-2))
6034
6035                     +   1 2    3 4 5 X          6 7
6036                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6037                     -   5 4    3 2 1 X          x x
6038
6039                     */
6040                     num = RExC_npar + num;
6041                     if (num < 1)  {
6042                         RExC_parse++;
6043                         vFAIL("Reference to nonexistent group");
6044                     }
6045                 } else if ( paren == '+' ) {
6046                     num = RExC_npar + num - 1;
6047                 }
6048
6049                 ret = reganode(pRExC_state, GOSUB, num);
6050                 if (!SIZE_ONLY) {
6051                     if (num > (I32)RExC_rx->nparens) {
6052                         RExC_parse++;
6053                         vFAIL("Reference to nonexistent group");
6054                     }
6055                     ARG2L_SET( ret, RExC_recurse_count++);
6056                     RExC_emit++;
6057                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6058                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6059                 } else {
6060                     RExC_size++;
6061                 }
6062                 RExC_seen |= REG_SEEN_RECURSE;
6063                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6064                 Set_Node_Offset(ret, parse_start); /* MJD */
6065
6066                 *flagp |= POSTPONED;
6067                 nextchar(pRExC_state);
6068                 return ret;
6069             } /* named and numeric backreferences */
6070             /* NOT REACHED */
6071
6072             case '?':           /* (??...) */
6073                 is_logical = 1;
6074                 if (*RExC_parse != '{') {
6075                     RExC_parse++;
6076                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6077                     /*NOTREACHED*/
6078                 }
6079                 *flagp |= POSTPONED;
6080                 paren = *RExC_parse++;
6081                 /* FALL THROUGH */
6082             case '{':           /* (?{...}) */
6083             {
6084                 I32 count = 1;
6085                 U32 n = 0;
6086                 char c;
6087                 char *s = RExC_parse;
6088
6089                 RExC_seen_zerolen++;
6090                 RExC_seen |= REG_SEEN_EVAL;
6091                 while (count && (c = *RExC_parse)) {
6092                     if (c == '\\') {
6093                         if (RExC_parse[1])
6094                             RExC_parse++;
6095                     }
6096                     else if (c == '{')
6097                         count++;
6098                     else if (c == '}')
6099                         count--;
6100                     RExC_parse++;
6101                 }
6102                 if (*RExC_parse != ')') {
6103                     RExC_parse = s;
6104                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6105                 }
6106                 if (!SIZE_ONLY) {
6107                     PAD *pad;
6108                     OP_4tree *sop, *rop;
6109                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6110
6111                     ENTER;
6112                     Perl_save_re_context(aTHX);
6113                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6114                     sop->op_private |= OPpREFCOUNTED;
6115                     /* re_dup will OpREFCNT_inc */
6116                     OpREFCNT_set(sop, 1);
6117                     LEAVE;
6118
6119                     n = add_data(pRExC_state, 3, "nop");
6120                     RExC_rxi->data->data[n] = (void*)rop;
6121                     RExC_rxi->data->data[n+1] = (void*)sop;
6122                     RExC_rxi->data->data[n+2] = (void*)pad;
6123                     SvREFCNT_dec(sv);
6124                 }
6125                 else {                                          /* First pass */
6126                     if (PL_reginterp_cnt < ++RExC_seen_evals
6127                         && IN_PERL_RUNTIME)
6128                         /* No compiled RE interpolated, has runtime
6129                            components ===> unsafe.  */
6130                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6131                     if (PL_tainting && PL_tainted)
6132                         FAIL("Eval-group in insecure regular expression");
6133 #if PERL_VERSION > 8
6134                     if (IN_PERL_COMPILETIME)
6135                         PL_cv_has_eval = 1;
6136 #endif
6137                 }
6138
6139                 nextchar(pRExC_state);
6140                 if (is_logical) {
6141                     ret = reg_node(pRExC_state, LOGICAL);
6142                     if (!SIZE_ONLY)
6143                         ret->flags = 2;
6144                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6145                     /* deal with the length of this later - MJD */
6146                     return ret;
6147                 }
6148                 ret = reganode(pRExC_state, EVAL, n);
6149                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6150                 Set_Node_Offset(ret, parse_start);
6151                 return ret;
6152             }
6153             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6154             {
6155                 int is_define= 0;
6156                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6157                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6158                         || RExC_parse[1] == '<'
6159                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6160                         I32 flag;
6161
6162                         ret = reg_node(pRExC_state, LOGICAL);
6163                         if (!SIZE_ONLY)
6164                             ret->flags = 1;
6165                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6166                         goto insert_if;
6167                     }
6168                 }
6169                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6170                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6171                 {
6172                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6173                     char *name_start= RExC_parse++;
6174                     U32 num = 0;
6175                     SV *sv_dat=reg_scan_name(pRExC_state,
6176                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6177                     if (RExC_parse == name_start || *RExC_parse != ch)
6178                         vFAIL2("Sequence (?(%c... not terminated",
6179                             (ch == '>' ? '<' : ch));
6180                     RExC_parse++;
6181                     if (!SIZE_ONLY) {
6182                         num = add_data( pRExC_state, 1, "S" );
6183                         RExC_rxi->data->data[num]=(void*)sv_dat;
6184                         SvREFCNT_inc_simple_void(sv_dat);
6185                     }
6186                     ret = reganode(pRExC_state,NGROUPP,num);
6187                     goto insert_if_check_paren;
6188                 }
6189                 else if (RExC_parse[0] == 'D' &&
6190                          RExC_parse[1] == 'E' &&
6191                          RExC_parse[2] == 'F' &&
6192                          RExC_parse[3] == 'I' &&
6193                          RExC_parse[4] == 'N' &&
6194                          RExC_parse[5] == 'E')
6195                 {
6196                     ret = reganode(pRExC_state,DEFINEP,0);
6197                     RExC_parse +=6 ;
6198                     is_define = 1;
6199                     goto insert_if_check_paren;
6200                 }
6201                 else if (RExC_parse[0] == 'R') {
6202                     RExC_parse++;
6203                     parno = 0;
6204                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6205                         parno = atoi(RExC_parse++);
6206                         while (isDIGIT(*RExC_parse))
6207                             RExC_parse++;
6208                     } else if (RExC_parse[0] == '&') {
6209                         SV *sv_dat;
6210                         RExC_parse++;
6211                         sv_dat = reg_scan_name(pRExC_state,
6212                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6213                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6214                     }
6215                     ret = reganode(pRExC_state,INSUBP,parno);
6216                     goto insert_if_check_paren;
6217                 }
6218                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6219                     /* (?(1)...) */
6220                     char c;
6221                     parno = atoi(RExC_parse++);
6222
6223                     while (isDIGIT(*RExC_parse))
6224                         RExC_parse++;
6225                     ret = reganode(pRExC_state, GROUPP, parno);
6226
6227                  insert_if_check_paren:
6228                     if ((c = *nextchar(pRExC_state)) != ')')
6229                         vFAIL("Switch condition not recognized");
6230                   insert_if:
6231                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6232                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6233                     if (br == NULL)
6234                         br = reganode(pRExC_state, LONGJMP, 0);
6235                     else
6236                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6237                     c = *nextchar(pRExC_state);
6238                     if (flags&HASWIDTH)
6239                         *flagp |= HASWIDTH;
6240                     if (c == '|') {
6241                         if (is_define)
6242                             vFAIL("(?(DEFINE)....) does not allow branches");
6243                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6244                         regbranch(pRExC_state, &flags, 1,depth+1);
6245                         REGTAIL(pRExC_state, ret, lastbr);
6246                         if (flags&HASWIDTH)
6247                             *flagp |= HASWIDTH;
6248                         c = *nextchar(pRExC_state);
6249                     }
6250                     else
6251                         lastbr = NULL;
6252                     if (c != ')')
6253                         vFAIL("Switch (?(condition)... contains too many branches");
6254                     ender = reg_node(pRExC_state, TAIL);
6255                     REGTAIL(pRExC_state, br, ender);
6256                     if (lastbr) {
6257                         REGTAIL(pRExC_state, lastbr, ender);
6258                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6259                     }
6260                     else
6261                         REGTAIL(pRExC_state, ret, ender);
6262                     RExC_size++; /* XXX WHY do we need this?!!
6263                                     For large programs it seems to be required
6264                                     but I can't figure out why. -- dmq*/
6265                     return ret;
6266                 }
6267                 else {
6268                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6269                 }
6270             }
6271             case 0:
6272                 RExC_parse--; /* for vFAIL to print correctly */
6273                 vFAIL("Sequence (? incomplete");
6274                 break;
6275             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6276                                        that follow */
6277                 has_use_defaults = TRUE;
6278                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6279                 if (RExC_utf8) {    /* But the default for a utf8 pattern is
6280                                        unicode semantics */
6281                     set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
6282                 }
6283                 goto parse_flags;
6284             default:
6285                 --RExC_parse;
6286                 parse_flags:      /* (?i) */
6287             {
6288                 U32 posflags = 0, negflags = 0;
6289                 U32 *flagsp = &posflags;
6290                 bool has_charset_modifier = 0;
6291                 regex_charset cs = REGEX_DEPENDS_CHARSET;
6292
6293                 while (*RExC_parse) {
6294                     /* && strchr("iogcmsx", *RExC_parse) */
6295                     /* (?g), (?gc) and (?o) are useless here
6296                        and must be globally applied -- japhy */
6297                     switch (*RExC_parse) {
6298                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6299                     case LOCALE_PAT_MOD:
6300                         if (has_charset_modifier || flagsp == &negflags) {
6301                             goto fail_modifiers;
6302                         }
6303                         cs = REGEX_LOCALE_CHARSET;
6304                         has_charset_modifier = 1;
6305                         break;
6306                     case UNICODE_PAT_MOD:
6307                         if (has_charset_modifier || flagsp == &negflags) {
6308                             goto fail_modifiers;
6309                         }
6310                         cs = REGEX_UNICODE_CHARSET;
6311                         has_charset_modifier = 1;
6312                         break;
6313                     case ASCII_RESTRICT_PAT_MOD:
6314                         if (has_charset_modifier || flagsp == &negflags) {
6315                             goto fail_modifiers;
6316                         }
6317                         cs = REGEX_ASCII_RESTRICTED_CHARSET;
6318                         has_charset_modifier = 1;
6319                         break;
6320                     case DEPENDS_PAT_MOD:
6321                         if (has_use_defaults
6322                             || has_charset_modifier
6323                             || flagsp == &negflags)
6324                         {
6325                             goto fail_modifiers;
6326                         }
6327
6328                         /* The dual charset means unicode semantics if the
6329                          * pattern (or target, not known until runtime) are
6330                          * utf8 */
6331                         cs = (RExC_utf8)
6332                              ? REGEX_UNICODE_CHARSET
6333                              : REGEX_DEPENDS_CHARSET;
6334                         has_charset_modifier = 1;
6335                         break;
6336                     case ONCE_PAT_MOD: /* 'o' */
6337                     case GLOBAL_PAT_MOD: /* 'g' */
6338                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6339                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6340                             if (! (wastedflags & wflagbit) ) {
6341                                 wastedflags |= wflagbit;
6342                                 vWARN5(
6343                                     RExC_parse + 1,
6344                                     "Useless (%s%c) - %suse /%c modifier",
6345                                     flagsp == &negflags ? "?-" : "?",
6346                                     *RExC_parse,
6347                                     flagsp == &negflags ? "don't " : "",
6348                                     *RExC_parse
6349                                 );
6350                             }
6351                         }
6352                         break;
6353
6354                     case CONTINUE_PAT_MOD: /* 'c' */
6355                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6356                             if (! (wastedflags & WASTED_C) ) {
6357                                 wastedflags |= WASTED_GC;
6358                                 vWARN3(
6359                                     RExC_parse + 1,
6360                                     "Useless (%sc) - %suse /gc modifier",
6361                                     flagsp == &negflags ? "?-" : "?",
6362                                     flagsp == &negflags ? "don't " : ""
6363                                 );
6364                             }
6365                         }
6366                         break;
6367                     case KEEPCOPY_PAT_MOD: /* 'p' */
6368                         if (flagsp == &negflags) {
6369                             if (SIZE_ONLY)
6370                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6371                         } else {
6372                             *flagsp |= RXf_PMf_KEEPCOPY;
6373                         }
6374                         break;
6375                     case '-':
6376                         /* A flag is a default iff it is following a minus, so
6377                          * if there is a minus, it means will be trying to
6378                          * re-specify a default which is an error */
6379                         if (has_use_defaults || flagsp == &negflags) {
6380             fail_modifiers:
6381                             RExC_parse++;
6382                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6383                             /*NOTREACHED*/
6384                         }
6385                         flagsp = &negflags;
6386                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6387                         break;
6388                     case ':':
6389                         paren = ':';
6390                         /*FALLTHROUGH*/
6391                     case ')':
6392                         RExC_flags |= posflags;
6393                         RExC_flags &= ~negflags;
6394                         set_regex_charset(&RExC_flags, cs);
6395                         if (paren != ':') {
6396                             oregflags |= posflags;
6397                             oregflags &= ~negflags;
6398                             set_regex_charset(&oregflags, cs);
6399                         }
6400                         nextchar(pRExC_state);
6401                         if (paren != ':') {
6402                             *flagp = TRYAGAIN;
6403                             return NULL;
6404                         } else {
6405                             ret = NULL;
6406                             goto parse_rest;
6407                         }
6408                         /*NOTREACHED*/
6409                     default:
6410                         RExC_parse++;
6411                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6412                         /*NOTREACHED*/
6413                     }
6414                     ++RExC_parse;
6415                 }
6416             }} /* one for the default block, one for the switch */
6417         }
6418         else {                  /* (...) */
6419           capturing_parens:
6420             parno = RExC_npar;
6421             RExC_npar++;
6422
6423             ret = reganode(pRExC_state, OPEN, parno);
6424             if (!SIZE_ONLY ){
6425                 if (!RExC_nestroot)
6426                     RExC_nestroot = parno;
6427                 if (RExC_seen & REG_SEEN_RECURSE
6428                     && !RExC_open_parens[parno-1])
6429                 {
6430                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6431                         "Setting open paren #%"IVdf" to %d\n",
6432                         (IV)parno, REG_NODE_NUM(ret)));
6433                     RExC_open_parens[parno-1]= ret;
6434                 }
6435             }
6436             Set_Node_Length(ret, 1); /* MJD */
6437             Set_Node_Offset(ret, RExC_parse); /* MJD */
6438             is_open = 1;
6439         }
6440     }
6441     else                        /* ! paren */
6442         ret = NULL;
6443
6444    parse_rest:
6445     /* Pick up the branches, linking them together. */
6446     parse_start = RExC_parse;   /* MJD */
6447     br = regbranch(pRExC_state, &flags, 1,depth+1);
6448
6449     if (freeze_paren) {
6450         if (RExC_npar > after_freeze)
6451             after_freeze = RExC_npar;
6452         RExC_npar = freeze_paren;
6453     }
6454
6455     /*     branch_len = (paren != 0); */
6456
6457     if (br == NULL)
6458         return(NULL);
6459     if (*RExC_parse == '|') {
6460         if (!SIZE_ONLY && RExC_extralen) {
6461             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6462         }
6463         else {                  /* MJD */
6464             reginsert(pRExC_state, BRANCH, br, depth+1);
6465             Set_Node_Length(br, paren != 0);
6466             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6467         }
6468         have_branch = 1;
6469         if (SIZE_ONLY)
6470             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6471     }
6472     else if (paren == ':') {
6473         *flagp |= flags&SIMPLE;
6474     }
6475     if (is_open) {                              /* Starts with OPEN. */
6476         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6477     }
6478     else if (paren != '?')              /* Not Conditional */
6479         ret = br;
6480     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6481     lastbr = br;
6482     while (*RExC_parse == '|') {
6483         if (!SIZE_ONLY && RExC_extralen) {
6484             ender = reganode(pRExC_state, LONGJMP,0);
6485             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6486         }
6487         if (SIZE_ONLY)
6488             RExC_extralen += 2;         /* Account for LONGJMP. */
6489         nextchar(pRExC_state);
6490         if (freeze_paren) {
6491             if (RExC_npar > after_freeze)
6492                 after_freeze = RExC_npar;
6493             RExC_npar = freeze_paren;
6494         }
6495         br = regbranch(pRExC_state, &flags, 0, depth+1);
6496
6497         if (br == NULL)
6498             return(NULL);
6499         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6500         lastbr = br;
6501         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6502     }
6503
6504     if (have_branch || paren != ':') {
6505         /* Make a closing node, and hook it on the end. */
6506         switch (paren) {
6507         case ':':
6508             ender = reg_node(pRExC_state, TAIL);
6509             break;
6510         case 1:
6511             ender = reganode(pRExC_state, CLOSE, parno);
6512             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6513                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6514                         "Setting close paren #%"IVdf" to %d\n",
6515                         (IV)parno, REG_NODE_NUM(ender)));
6516                 RExC_close_parens[parno-1]= ender;
6517                 if (RExC_nestroot == parno)
6518                     RExC_nestroot = 0;
6519             }
6520             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6521             Set_Node_Length(ender,1); /* MJD */
6522             break;
6523         case '<':
6524         case ',':
6525         case '=':
6526         case '!':
6527             *flagp &= ~HASWIDTH;
6528             /* FALL THROUGH */
6529         case '>':
6530             ender = reg_node(pRExC_state, SUCCEED);
6531             break;
6532         case 0:
6533             ender = reg_node(pRExC_state, END);
6534             if (!SIZE_ONLY) {
6535                 assert(!RExC_opend); /* there can only be one! */
6536                 RExC_opend = ender;
6537             }
6538             break;
6539         }
6540         REGTAIL(pRExC_state, lastbr, ender);
6541
6542         if (have_branch && !SIZE_ONLY) {
6543             if (depth==1)
6544                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6545
6546             /* Hook the tails of the branches to the closing node. */
6547             for (br = ret; br; br = regnext(br)) {
6548                 const U8 op = PL_regkind[OP(br)];
6549                 if (op == BRANCH) {
6550                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6551                 }
6552                 else if (op == BRANCHJ) {
6553                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6554                 }
6555             }
6556         }
6557     }
6558
6559     {
6560         const char *p;
6561         static const char parens[] = "=!<,>";
6562
6563         if (paren && (p = strchr(parens, paren))) {
6564             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6565             int flag = (p - parens) > 1;
6566
6567             if (paren == '>')
6568                 node = SUSPEND, flag = 0;
6569             reginsert(pRExC_state, node,ret, depth+1);
6570             Set_Node_Cur_Length(ret);
6571             Set_Node_Offset(ret, parse_start + 1);
6572             ret->flags = flag;
6573             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6574         }
6575     }
6576
6577     /* Check for proper termination. */
6578     if (paren) {
6579         RExC_flags = oregflags;
6580         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6581             RExC_parse = oregcomp_parse;
6582             vFAIL("Unmatched (");
6583         }
6584     }
6585     else if (!paren && RExC_parse < RExC_end) {
6586         if (*RExC_parse == ')') {
6587             RExC_parse++;
6588             vFAIL("Unmatched )");
6589         }
6590         else
6591             FAIL("Junk on end of regexp");      /* "Can't happen". */
6592         /* NOTREACHED */
6593     }
6594
6595     if (RExC_in_lookbehind) {
6596         RExC_in_lookbehind--;
6597     }
6598     if (after_freeze)
6599         RExC_npar = after_freeze;
6600     return(ret);
6601 }
6602
6603 /*
6604  - regbranch - one alternative of an | operator
6605  *
6606  * Implements the concatenation operator.
6607  */
6608 STATIC regnode *
6609 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6610 {
6611     dVAR;
6612     register regnode *ret;
6613     register regnode *chain = NULL;
6614     register regnode *latest;
6615     I32 flags = 0, c = 0;
6616     GET_RE_DEBUG_FLAGS_DECL;
6617
6618     PERL_ARGS_ASSERT_REGBRANCH;
6619
6620     DEBUG_PARSE("brnc");
6621
6622     if (first)
6623         ret = NULL;
6624     else {
6625         if (!SIZE_ONLY && RExC_extralen)
6626             ret = reganode(pRExC_state, BRANCHJ,0);
6627         else {
6628             ret = reg_node(pRExC_state, BRANCH);
6629             Set_Node_Length(ret, 1);
6630         }
6631     }
6632
6633     if (!first && SIZE_ONLY)
6634         RExC_extralen += 1;                     /* BRANCHJ */
6635
6636     *flagp = WORST;                     /* Tentatively. */
6637
6638     RExC_parse--;
6639     nextchar(pRExC_state);
6640     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6641         flags &= ~TRYAGAIN;
6642         latest = regpiece(pRExC_state, &flags,depth+1);
6643         if (latest == NULL) {
6644             if (flags & TRYAGAIN)
6645                 continue;
6646             return(NULL);
6647         }
6648         else if (ret == NULL)
6649             ret = latest;
6650         *flagp |= flags&(HASWIDTH|POSTPONED);
6651         if (chain == NULL)      /* First piece. */
6652             *flagp |= flags&SPSTART;
6653         else {
6654             RExC_naughty++;
6655             REGTAIL(pRExC_state, chain, latest);
6656         }
6657         chain = latest;
6658         c++;
6659     }
6660     if (chain == NULL) {        /* Loop ran zero times. */
6661         chain = reg_node(pRExC_state, NOTHING);
6662         if (ret == NULL)
6663             ret = chain;
6664     }
6665     if (c == 1) {
6666         *flagp |= flags&SIMPLE;
6667     }
6668
6669     return ret;
6670 }
6671
6672 /*
6673  - regpiece - something followed by possible [*+?]
6674  *
6675  * Note that the branching code sequences used for ? and the general cases
6676  * of * and + are somewhat optimized:  they use the same NOTHING node as
6677  * both the endmarker for their branch list and the body of the last branch.
6678  * It might seem that this node could be dispensed with entirely, but the
6679  * endmarker role is not redundant.
6680  */
6681 STATIC regnode *
6682 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6683 {
6684     dVAR;
6685     register regnode *ret;
6686     register char op;
6687     register char *next;
6688     I32 flags;
6689     const char * const origparse = RExC_parse;
6690     I32 min;
6691     I32 max = REG_INFTY;
6692     char *parse_start;
6693     const char *maxpos = NULL;
6694     GET_RE_DEBUG_FLAGS_DECL;
6695
6696     PERL_ARGS_ASSERT_REGPIECE;
6697
6698     DEBUG_PARSE("piec");
6699
6700     ret = regatom(pRExC_state, &flags,depth+1);
6701     if (ret == NULL) {
6702         if (flags & TRYAGAIN)
6703             *flagp |= TRYAGAIN;
6704         return(NULL);
6705     }
6706
6707     op = *RExC_parse;
6708
6709     if (op == '{' && regcurly(RExC_parse)) {
6710         maxpos = NULL;
6711         parse_start = RExC_parse; /* MJD */
6712         next = RExC_parse + 1;
6713         while (isDIGIT(*next) || *next == ',') {
6714             if (*next == ',') {
6715                 if (maxpos)
6716                     break;
6717                 else
6718                     maxpos = next;
6719             }
6720             next++;
6721         }
6722         if (*next == '}') {             /* got one */
6723             if (!maxpos)
6724                 maxpos = next;
6725             RExC_parse++;
6726             min = atoi(RExC_parse);
6727             if (*maxpos == ',')
6728                 maxpos++;
6729             else
6730                 maxpos = RExC_parse;
6731             max = atoi(maxpos);
6732             if (!max && *maxpos != '0')
6733                 max = REG_INFTY;                /* meaning "infinity" */
6734             else if (max >= REG_INFTY)
6735                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6736             RExC_parse = next;
6737             nextchar(pRExC_state);
6738
6739         do_curly:
6740             if ((flags&SIMPLE)) {
6741                 RExC_naughty += 2 + RExC_naughty / 2;
6742                 reginsert(pRExC_state, CURLY, ret, depth+1);
6743                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6744                 Set_Node_Cur_Length(ret);
6745             }
6746             else {
6747                 regnode * const w = reg_node(pRExC_state, WHILEM);
6748
6749                 w->flags = 0;
6750                 REGTAIL(pRExC_state, ret, w);
6751                 if (!SIZE_ONLY && RExC_extralen) {
6752                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6753                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6754                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6755                 }
6756                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6757                                 /* MJD hk */
6758                 Set_Node_Offset(ret, parse_start+1);
6759                 Set_Node_Length(ret,
6760                                 op == '{' ? (RExC_parse - parse_start) : 1);
6761
6762                 if (!SIZE_ONLY && RExC_extralen)
6763                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6764                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6765                 if (SIZE_ONLY)
6766                     RExC_whilem_seen++, RExC_extralen += 3;
6767                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6768             }
6769             ret->flags = 0;
6770
6771             if (min > 0)
6772                 *flagp = WORST;
6773             if (max > 0)
6774                 *flagp |= HASWIDTH;
6775             if (max < min)
6776                 vFAIL("Can't do {n,m} with n > m");
6777             if (!SIZE_ONLY) {
6778                 ARG1_SET(ret, (U16)min);
6779                 ARG2_SET(ret, (U16)max);
6780             }
6781
6782             goto nest_check;
6783         }
6784     }
6785
6786     if (!ISMULT1(op)) {
6787         *flagp = flags;
6788         return(ret);
6789     }
6790
6791 #if 0                           /* Now runtime fix should be reliable. */
6792
6793     /* if this is reinstated, don't forget to put this back into perldiag:
6794
6795             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6796
6797            (F) The part of the regexp subject to either the * or + quantifier
6798            could match an empty string. The {#} shows in the regular
6799            expression about where the problem was discovered.
6800
6801     */
6802
6803     if (!(flags&HASWIDTH) && op != '?')
6804       vFAIL("Regexp *+ operand could be empty");
6805 #endif
6806
6807     parse_start = RExC_parse;
6808     nextchar(pRExC_state);
6809
6810     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6811
6812     if (op == '*' && (flags&SIMPLE)) {
6813         reginsert(pRExC_state, STAR, ret, depth+1);
6814         ret->flags = 0;
6815         RExC_naughty += 4;
6816     }
6817     else if (op == '*') {
6818         min = 0;
6819         goto do_curly;
6820     }
6821     else if (op == '+' && (flags&SIMPLE)) {
6822         reginsert(pRExC_state, PLUS, ret, depth+1);
6823         ret->flags = 0;
6824         RExC_naughty += 3;
6825     }
6826     else if (op == '+') {
6827         min = 1;
6828         goto do_curly;
6829     }
6830     else if (op == '?') {
6831         min = 0; max = 1;
6832         goto do_curly;
6833     }
6834   nest_check:
6835     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6836         ckWARN3reg(RExC_parse,
6837                    "%.*s matches null string many times",
6838                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6839                    origparse);
6840     }
6841
6842     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6843         nextchar(pRExC_state);
6844         reginsert(pRExC_state, MINMOD, ret, depth+1);
6845         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6846     }
6847 #ifndef REG_ALLOW_MINMOD_SUSPEND
6848     else
6849 #endif
6850     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6851         regnode *ender;
6852         nextchar(pRExC_state);
6853         ender = reg_node(pRExC_state, SUCCEED);
6854         REGTAIL(pRExC_state, ret, ender);
6855         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6856         ret->flags = 0;
6857         ender = reg_node(pRExC_state, TAIL);
6858         REGTAIL(pRExC_state, ret, ender);
6859         /*ret= ender;*/
6860     }
6861
6862     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6863         RExC_parse++;
6864         vFAIL("Nested quantifiers");
6865     }
6866
6867     return(ret);
6868 }
6869
6870
6871 /* reg_namedseq(pRExC_state,UVp)
6872
6873    This is expected to be called by a parser routine that has
6874    recognized '\N' and needs to handle the rest. RExC_parse is
6875    expected to point at the first char following the N at the time
6876    of the call.
6877
6878    The \N may be inside (indicated by valuep not being NULL) or outside a
6879    character class.
6880
6881    \N may begin either a named sequence, or if outside a character class, mean
6882    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6883    attempted to decide which, and in the case of a named sequence converted it
6884    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6885    where c1... are the characters in the sequence.  For single-quoted regexes,
6886    the tokenizer passes the \N sequence through unchanged; this code will not
6887    attempt to determine this nor expand those.  The net effect is that if the
6888    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6889    signals that this \N occurrence means to match a non-newline.
6890
6891    Only the \N{U+...} form should occur in a character class, for the same
6892    reason that '.' inside a character class means to just match a period: it
6893    just doesn't make sense.
6894
6895    If valuep is non-null then it is assumed that we are parsing inside
6896    of a charclass definition and the first codepoint in the resolved
6897    string is returned via *valuep and the routine will return NULL.
6898    In this mode if a multichar string is returned from the charnames
6899    handler, a warning will be issued, and only the first char in the
6900    sequence will be examined. If the string returned is zero length
6901    then the value of *valuep is undefined and NON-NULL will
6902    be returned to indicate failure. (This will NOT be a valid pointer
6903    to a regnode.)
6904
6905    If valuep is null then it is assumed that we are parsing normal text and a
6906    new EXACT node is inserted into the program containing the resolved string,
6907    and a pointer to the new node is returned.  But if the string is zero length
6908    a NOTHING node is emitted instead.
6909
6910    On success RExC_parse is set to the char following the endbrace.
6911    Parsing failures will generate a fatal error via vFAIL(...)
6912  */
6913 STATIC regnode *
6914 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6915 {
6916     char * endbrace;    /* '}' following the name */
6917     regnode *ret = NULL;
6918 #ifdef DEBUGGING
6919     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6920 #endif
6921     char* p;
6922
6923     GET_RE_DEBUG_FLAGS_DECL;
6924
6925     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6926
6927     GET_RE_DEBUG_FLAGS;
6928
6929     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6930      * modifier.  The other meaning does not */
6931     p = (RExC_flags & RXf_PMf_EXTENDED)
6932         ? regwhite( pRExC_state, RExC_parse )
6933         : RExC_parse;
6934
6935     /* Disambiguate between \N meaning a named character versus \N meaning
6936      * [^\n].  The former is assumed when it can't be the latter. */
6937     if (*p != '{' || regcurly(p)) {
6938         RExC_parse = p;
6939         if (valuep) {
6940             /* no bare \N in a charclass */
6941             vFAIL("\\N in a character class must be a named character: \\N{...}");
6942         }
6943         nextchar(pRExC_state);
6944         ret = reg_node(pRExC_state, REG_ANY);
6945         *flagp |= HASWIDTH|SIMPLE;
6946         RExC_naughty++;
6947         RExC_parse--;
6948         Set_Node_Length(ret, 1); /* MJD */
6949         return ret;
6950     }
6951
6952     /* Here, we have decided it should be a named sequence */
6953
6954     /* The test above made sure that the next real character is a '{', but
6955      * under the /x modifier, it could be separated by space (or a comment and
6956      * \n) and this is not allowed (for consistency with \x{...} and the
6957      * tokenizer handling of \N{NAME}). */
6958     if (*RExC_parse != '{') {
6959         vFAIL("Missing braces on \\N{}");
6960     }
6961
6962     RExC_parse++;       /* Skip past the '{' */
6963
6964     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6965         || ! (endbrace == RExC_parse            /* nothing between the {} */
6966               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6967                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6968     {
6969         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6970         vFAIL("\\N{NAME} must be resolved by the lexer");
6971     }
6972
6973     if (endbrace == RExC_parse) {   /* empty: \N{} */
6974         if (! valuep) {
6975             RExC_parse = endbrace + 1;
6976             return reg_node(pRExC_state,NOTHING);
6977         }
6978
6979         if (SIZE_ONLY) {
6980             ckWARNreg(RExC_parse,
6981                     "Ignoring zero length \\N{} in character class"
6982             );
6983             RExC_parse = endbrace + 1;
6984         }
6985         *valuep = 0;
6986         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6987     }
6988
6989     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6990     RExC_parse += 2;    /* Skip past the 'U+' */
6991
6992     if (valuep) {   /* In a bracketed char class */
6993         /* We only pay attention to the first char of
6994         multichar strings being returned. I kinda wonder
6995         if this makes sense as it does change the behaviour
6996         from earlier versions, OTOH that behaviour was broken
6997         as well. XXX Solution is to recharacterize as
6998         [rest-of-class]|multi1|multi2... */
6999
7000         STRLEN length_of_hex;
7001         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7002             | PERL_SCAN_DISALLOW_PREFIX
7003             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7004
7005         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7006         if (endchar < endbrace) {
7007             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7008         }
7009
7010         length_of_hex = (STRLEN)(endchar - RExC_parse);
7011         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7012
7013         /* The tokenizer should have guaranteed validity, but it's possible to
7014          * bypass it by using single quoting, so check */
7015         if (length_of_hex == 0
7016             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7017         {
7018             RExC_parse += length_of_hex;        /* Includes all the valid */
7019             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7020                             ? UTF8SKIP(RExC_parse)
7021                             : 1;
7022             /* Guard against malformed utf8 */
7023             if (RExC_parse >= endchar) RExC_parse = endchar;
7024             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7025         }
7026
7027         RExC_parse = endbrace + 1;
7028         if (endchar == endbrace) return NULL;
7029
7030         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7031     }
7032     else {      /* Not a char class */
7033         char *s;            /* String to put in generated EXACT node */
7034         STRLEN len = 0;     /* Its current byte length */
7035         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7036                                stream */
7037
7038         ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7039                                                    : (LOC)
7040                                                       ? EXACTFL
7041                                                       : UNI_SEMANTICS
7042                                                         ? EXACTFU
7043                                                         : EXACTF));
7044         s= STRING(ret);
7045
7046         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7047          * the input which is of the form now 'c1.c2.c3...}' until find the
7048          * ending brace or exceed length 255.  The characters that exceed this
7049          * limit are dropped.  The limit could be relaxed should it become
7050          * desirable by reparsing this as (?:\N{NAME}), so could generate
7051          * multiple EXACT nodes, as is done for just regular input.  But this
7052          * is primarily a named character, and not intended to be a huge long
7053          * string, so 255 bytes should be good enough */
7054         while (1) {
7055             STRLEN length_of_hex;
7056             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7057                             | PERL_SCAN_DISALLOW_PREFIX
7058                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7059             UV cp;  /* Ord of current character */
7060
7061             /* Code points are separated by dots.  If none, there is only one
7062              * code point, and is terminated by the brace */
7063             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7064
7065             /* The values are Unicode even on EBCDIC machines */
7066             length_of_hex = (STRLEN)(endchar - RExC_parse);
7067             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7068             if ( length_of_hex == 0
7069                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7070             {
7071                 RExC_parse += length_of_hex;        /* Includes all the valid */
7072                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7073                                 ? UTF8SKIP(RExC_parse)
7074                                 : 1;
7075                 /* Guard against malformed utf8 */
7076                 if (RExC_parse >= endchar) RExC_parse = endchar;
7077                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7078             }
7079
7080             if (! FOLD) {       /* Not folding, just append to the string */
7081                 STRLEN unilen;
7082
7083                 /* Quit before adding this character if would exceed limit */
7084                 if (len + UNISKIP(cp) > U8_MAX) break;
7085
7086                 unilen = reguni(pRExC_state, cp, s);
7087                 if (unilen > 0) {
7088                     s   += unilen;
7089                     len += unilen;
7090                 }
7091             } else {    /* Folding, output the folded equivalent */
7092                 STRLEN foldlen,numlen;
7093                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7094                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7095
7096                 /* Quit before exceeding size limit */
7097                 if (len + foldlen > U8_MAX) break;
7098
7099                 for (foldbuf = tmpbuf;
7100                     foldlen;
7101                     foldlen -= numlen)
7102                 {
7103                     cp = utf8_to_uvchr(foldbuf, &numlen);
7104                     if (numlen > 0) {
7105                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7106                         s       += unilen;
7107                         len     += unilen;
7108                         /* In EBCDIC the numlen and unilen can differ. */
7109                         foldbuf += numlen;
7110                         if (numlen >= foldlen)
7111                             break;
7112                     }
7113                     else
7114                         break; /* "Can't happen." */
7115                 }
7116             }
7117
7118             /* Point to the beginning of the next character in the sequence. */
7119             RExC_parse = endchar + 1;
7120
7121             /* Quit if no more characters */
7122             if (RExC_parse >= endbrace) break;
7123         }
7124
7125
7126         if (SIZE_ONLY) {
7127             if (RExC_parse < endbrace) {
7128                 ckWARNreg(RExC_parse - 1,
7129                           "Using just the first characters returned by \\N{}");
7130             }
7131
7132             RExC_size += STR_SZ(len);
7133         } else {
7134             STR_LEN(ret) = len;
7135             RExC_emit += STR_SZ(len);
7136         }
7137
7138         RExC_parse = endbrace + 1;
7139
7140         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7141                                with malformed in t/re/pat_advanced.t */
7142         RExC_parse --;
7143         Set_Node_Cur_Length(ret); /* MJD */
7144         nextchar(pRExC_state);
7145     }
7146
7147     return ret;
7148 }
7149
7150
7151 /*
7152  * reg_recode
7153  *
7154  * It returns the code point in utf8 for the value in *encp.
7155  *    value: a code value in the source encoding
7156  *    encp:  a pointer to an Encode object
7157  *
7158  * If the result from Encode is not a single character,
7159  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7160  */
7161 STATIC UV
7162 S_reg_recode(pTHX_ const char value, SV **encp)
7163 {
7164     STRLEN numlen = 1;
7165     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7166     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7167     const STRLEN newlen = SvCUR(sv);
7168     UV uv = UNICODE_REPLACEMENT;
7169
7170     PERL_ARGS_ASSERT_REG_RECODE;
7171
7172     if (newlen)
7173         uv = SvUTF8(sv)
7174              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7175              : *(U8*)s;
7176
7177     if (!newlen || numlen != newlen) {
7178         uv = UNICODE_REPLACEMENT;
7179         *encp = NULL;
7180     }
7181     return uv;
7182 }
7183
7184
7185 /*
7186  - regatom - the lowest level
7187
7188    Try to identify anything special at the start of the pattern. If there
7189    is, then handle it as required. This may involve generating a single regop,
7190    such as for an assertion; or it may involve recursing, such as to
7191    handle a () structure.
7192
7193    If the string doesn't start with something special then we gobble up
7194    as much literal text as we can.
7195
7196    Once we have been able to handle whatever type of thing started the
7197    sequence, we return.
7198
7199    Note: we have to be careful with escapes, as they can be both literal
7200    and special, and in the case of \10 and friends can either, depending
7201    on context. Specifically there are two separate switches for handling
7202    escape sequences, with the one for handling literal escapes requiring
7203    a dummy entry for all of the special escapes that are actually handled
7204    by the other.
7205 */
7206
7207 STATIC regnode *
7208 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7209 {
7210     dVAR;
7211     register regnode *ret = NULL;
7212     I32 flags;
7213     char *parse_start = RExC_parse;
7214     U8 op;
7215     GET_RE_DEBUG_FLAGS_DECL;
7216     DEBUG_PARSE("atom");
7217     *flagp = WORST;             /* Tentatively. */
7218
7219     PERL_ARGS_ASSERT_REGATOM;
7220
7221 tryagain:
7222     switch ((U8)*RExC_parse) {
7223     case '^':
7224         RExC_seen_zerolen++;
7225         nextchar(pRExC_state);
7226         if (RExC_flags & RXf_PMf_MULTILINE)
7227             ret = reg_node(pRExC_state, MBOL);
7228         else if (RExC_flags & RXf_PMf_SINGLELINE)
7229             ret = reg_node(pRExC_state, SBOL);
7230         else
7231             ret = reg_node(pRExC_state, BOL);
7232         Set_Node_Length(ret, 1); /* MJD */
7233         break;
7234     case '$':
7235         nextchar(pRExC_state);
7236         if (*RExC_parse)
7237             RExC_seen_zerolen++;
7238         if (RExC_flags & RXf_PMf_MULTILINE)
7239             ret = reg_node(pRExC_state, MEOL);
7240         else if (RExC_flags & RXf_PMf_SINGLELINE)
7241             ret = reg_node(pRExC_state, SEOL);
7242         else
7243             ret = reg_node(pRExC_state, EOL);
7244         Set_Node_Length(ret, 1); /* MJD */
7245         break;
7246     case '.':
7247         nextchar(pRExC_state);
7248         if (RExC_flags & RXf_PMf_SINGLELINE)
7249             ret = reg_node(pRExC_state, SANY);
7250         else
7251             ret = reg_node(pRExC_state, REG_ANY);
7252         *flagp |= HASWIDTH|SIMPLE;
7253         RExC_naughty++;
7254         Set_Node_Length(ret, 1); /* MJD */
7255         break;
7256     case '[':
7257     {
7258         char * const oregcomp_parse = ++RExC_parse;
7259         ret = regclass(pRExC_state,depth+1);
7260         if (*RExC_parse != ']') {
7261             RExC_parse = oregcomp_parse;
7262             vFAIL("Unmatched [");
7263         }
7264         nextchar(pRExC_state);
7265         *flagp |= HASWIDTH|SIMPLE;
7266         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7267         break;
7268     }
7269     case '(':
7270         nextchar(pRExC_state);
7271         ret = reg(pRExC_state, 1, &flags,depth+1);
7272         if (ret == NULL) {
7273                 if (flags & TRYAGAIN) {
7274                     if (RExC_parse == RExC_end) {
7275                          /* Make parent create an empty node if needed. */
7276                         *flagp |= TRYAGAIN;
7277                         return(NULL);
7278                     }
7279                     goto tryagain;
7280                 }
7281                 return(NULL);
7282         }
7283         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7284         break;
7285     case '|':
7286     case ')':
7287         if (flags & TRYAGAIN) {
7288             *flagp |= TRYAGAIN;
7289             return NULL;
7290         }
7291         vFAIL("Internal urp");
7292                                 /* Supposed to be caught earlier. */
7293         break;
7294     case '{':
7295         if (!regcurly(RExC_parse)) {
7296             RExC_parse++;
7297             goto defchar;
7298         }
7299         /* FALL THROUGH */
7300     case '?':
7301     case '+':
7302     case '*':
7303         RExC_parse++;
7304         vFAIL("Quantifier follows nothing");
7305         break;
7306     case LATIN_SMALL_LETTER_SHARP_S:
7307     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7308     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7309 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7310 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
7311     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7312 #endif
7313         do_foldchar:
7314         if (!LOC && FOLD) {
7315             U32 len,cp;
7316             len=0; /* silence a spurious compiler warning */
7317             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7318                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7319                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7320                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7321                 Set_Node_Length(ret, 1); /* MJD */
7322                 nextchar(pRExC_state); /* kill whitespace under /x */
7323                 return ret;
7324             }
7325         }
7326         goto outer_default;
7327     case '\\':
7328         /* Special Escapes
7329
7330            This switch handles escape sequences that resolve to some kind
7331            of special regop and not to literal text. Escape sequnces that
7332            resolve to literal text are handled below in the switch marked
7333            "Literal Escapes".
7334
7335            Every entry in this switch *must* have a corresponding entry
7336            in the literal escape switch. However, the opposite is not
7337            required, as the default for this switch is to jump to the
7338            literal text handling code.
7339         */
7340         switch ((U8)*++RExC_parse) {
7341         case LATIN_SMALL_LETTER_SHARP_S:
7342         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7343         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7344                    goto do_foldchar;
7345         /* Special Escapes */
7346         case 'A':
7347             RExC_seen_zerolen++;
7348             ret = reg_node(pRExC_state, SBOL);
7349             *flagp |= SIMPLE;
7350             goto finish_meta_pat;
7351         case 'G':
7352             ret = reg_node(pRExC_state, GPOS);
7353             RExC_seen |= REG_SEEN_GPOS;
7354             *flagp |= SIMPLE;
7355             goto finish_meta_pat;
7356         case 'K':
7357             RExC_seen_zerolen++;
7358             ret = reg_node(pRExC_state, KEEPS);
7359             *flagp |= SIMPLE;
7360             /* XXX:dmq : disabling in-place substitution seems to
7361              * be necessary here to avoid cases of memory corruption, as
7362              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7363              */
7364             RExC_seen |= REG_SEEN_LOOKBEHIND;
7365             goto finish_meta_pat;
7366         case 'Z':
7367             ret = reg_node(pRExC_state, SEOL);
7368             *flagp |= SIMPLE;
7369             RExC_seen_zerolen++;                /* Do not optimize RE away */
7370             goto finish_meta_pat;
7371         case 'z':
7372             ret = reg_node(pRExC_state, EOS);
7373             *flagp |= SIMPLE;
7374             RExC_seen_zerolen++;                /* Do not optimize RE away */
7375             goto finish_meta_pat;
7376         case 'C':
7377             ret = reg_node(pRExC_state, CANY);
7378             RExC_seen |= REG_SEEN_CANY;
7379             *flagp |= HASWIDTH|SIMPLE;
7380             goto finish_meta_pat;
7381         case 'X':
7382             ret = reg_node(pRExC_state, CLUMP);
7383             *flagp |= HASWIDTH;
7384             goto finish_meta_pat;
7385         case 'w':
7386             switch (get_regex_charset(RExC_flags)) {
7387                 case REGEX_LOCALE_CHARSET:
7388                     op = ALNUML;
7389                     break;
7390                 case REGEX_UNICODE_CHARSET:
7391                     op = ALNUMU;
7392                     break;
7393                 case REGEX_ASCII_RESTRICTED_CHARSET:
7394                     op = ALNUMA;
7395                     break;
7396                 case REGEX_DEPENDS_CHARSET:
7397                     op = ALNUM;
7398                     break;
7399                 default:
7400                     goto bad_charset;
7401             }
7402             ret = reg_node(pRExC_state, op);
7403             *flagp |= HASWIDTH|SIMPLE;
7404             goto finish_meta_pat;
7405         case 'W':
7406             switch (get_regex_charset(RExC_flags)) {
7407                 case REGEX_LOCALE_CHARSET:
7408                     op = NALNUML;
7409                     break;
7410                 case REGEX_UNICODE_CHARSET:
7411                     op = NALNUMU;
7412                     break;
7413                 case REGEX_ASCII_RESTRICTED_CHARSET:
7414                     op = NALNUMA;
7415                     break;
7416                 case REGEX_DEPENDS_CHARSET:
7417                     op = NALNUM;
7418                     break;
7419                 default:
7420                     goto bad_charset;
7421             }
7422             ret = reg_node(pRExC_state, op);
7423             *flagp |= HASWIDTH|SIMPLE;
7424             goto finish_meta_pat;
7425         case 'b':
7426             RExC_seen_zerolen++;
7427             RExC_seen |= REG_SEEN_LOOKBEHIND;
7428             switch (get_regex_charset(RExC_flags)) {
7429                 case REGEX_LOCALE_CHARSET:
7430                     op = BOUNDL;
7431                     break;
7432                 case REGEX_UNICODE_CHARSET:
7433                     op = BOUNDU;
7434                     break;
7435                 case REGEX_ASCII_RESTRICTED_CHARSET:
7436                     op = BOUNDA;
7437                     break;
7438                 case REGEX_DEPENDS_CHARSET:
7439                     op = BOUND;
7440                     break;
7441                 default:
7442                     goto bad_charset;
7443             }
7444             ret = reg_node(pRExC_state, op);
7445             FLAGS(ret) = get_regex_charset(RExC_flags);
7446             *flagp |= SIMPLE;
7447             goto finish_meta_pat;
7448         case 'B':
7449             RExC_seen_zerolen++;
7450             RExC_seen |= REG_SEEN_LOOKBEHIND;
7451             switch (get_regex_charset(RExC_flags)) {
7452                 case REGEX_LOCALE_CHARSET:
7453                     op = NBOUNDL;
7454                     break;
7455                 case REGEX_UNICODE_CHARSET:
7456                     op = NBOUNDU;
7457                     break;
7458                 case REGEX_ASCII_RESTRICTED_CHARSET:
7459                     op = NBOUNDA;
7460                     break;
7461                 case REGEX_DEPENDS_CHARSET:
7462                     op = NBOUND;
7463                     break;
7464                 default:
7465                     goto bad_charset;
7466             }
7467             ret = reg_node(pRExC_state, op);
7468             FLAGS(ret) = get_regex_charset(RExC_flags);
7469             *flagp |= SIMPLE;
7470             goto finish_meta_pat;
7471         case 's':
7472             switch (get_regex_charset(RExC_flags)) {
7473                 case REGEX_LOCALE_CHARSET:
7474                     op = SPACEL;
7475                     break;
7476                 case REGEX_UNICODE_CHARSET:
7477                     op = SPACEU;
7478                     break;
7479                 case REGEX_ASCII_RESTRICTED_CHARSET:
7480                     op = SPACEA;
7481                     break;
7482                 case REGEX_DEPENDS_CHARSET:
7483                     op = SPACE;
7484                     break;
7485                 default:
7486                     goto bad_charset;
7487             }
7488             ret = reg_node(pRExC_state, op);
7489             *flagp |= HASWIDTH|SIMPLE;
7490             goto finish_meta_pat;
7491         case 'S':
7492             switch (get_regex_charset(RExC_flags)) {
7493                 case REGEX_LOCALE_CHARSET:
7494                     op = NSPACEL;
7495                     break;
7496                 case REGEX_UNICODE_CHARSET:
7497                     op = NSPACEU;
7498                     break;
7499                 case REGEX_ASCII_RESTRICTED_CHARSET:
7500                     op = NSPACEA;
7501                     break;
7502                 case REGEX_DEPENDS_CHARSET:
7503                     op = NSPACE;
7504                     break;
7505                 default:
7506                     goto bad_charset;
7507             }
7508             ret = reg_node(pRExC_state, op);
7509             *flagp |= HASWIDTH|SIMPLE;
7510             goto finish_meta_pat;
7511         case 'd':
7512             switch (get_regex_charset(RExC_flags)) {
7513                 case REGEX_LOCALE_CHARSET:
7514                     op = DIGITL;
7515                     break;
7516                 case REGEX_ASCII_RESTRICTED_CHARSET:
7517                     op = DIGITA;
7518                     break;
7519                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
7520                 case REGEX_UNICODE_CHARSET:
7521                     op = DIGIT;
7522                     break;
7523                 default:
7524                     goto bad_charset;
7525             }
7526             ret = reg_node(pRExC_state, op);
7527             *flagp |= HASWIDTH|SIMPLE;
7528             goto finish_meta_pat;
7529         case 'D':
7530             switch (get_regex_charset(RExC_flags)) {
7531                 case REGEX_LOCALE_CHARSET:
7532                     op = NDIGITL;
7533                     break;
7534                 case REGEX_ASCII_RESTRICTED_CHARSET:
7535                     op = NDIGITA;
7536                     break;
7537                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
7538                 case REGEX_UNICODE_CHARSET:
7539                     op = NDIGIT;
7540                     break;
7541                 default:
7542                     goto bad_charset;
7543             }
7544             ret = reg_node(pRExC_state, op);
7545             *flagp |= HASWIDTH|SIMPLE;
7546             goto finish_meta_pat;
7547         case 'R':
7548             ret = reg_node(pRExC_state, LNBREAK);
7549             *flagp |= HASWIDTH|SIMPLE;
7550             goto finish_meta_pat;
7551         case 'h':
7552             ret = reg_node(pRExC_state, HORIZWS);
7553             *flagp |= HASWIDTH|SIMPLE;
7554             goto finish_meta_pat;
7555         case 'H':
7556             ret = reg_node(pRExC_state, NHORIZWS);
7557             *flagp |= HASWIDTH|SIMPLE;
7558             goto finish_meta_pat;
7559         case 'v':
7560             ret = reg_node(pRExC_state, VERTWS);
7561             *flagp |= HASWIDTH|SIMPLE;
7562             goto finish_meta_pat;
7563         case 'V':
7564             ret = reg_node(pRExC_state, NVERTWS);
7565             *flagp |= HASWIDTH|SIMPLE;
7566          finish_meta_pat:
7567             nextchar(pRExC_state);
7568             Set_Node_Length(ret, 2); /* MJD */
7569             break;
7570         case 'p':
7571         case 'P':
7572             {
7573                 char* const oldregxend = RExC_end;
7574 #ifdef DEBUGGING
7575                 char* parse_start = RExC_parse - 2;
7576 #endif
7577
7578                 if (RExC_parse[1] == '{') {
7579                   /* a lovely hack--pretend we saw [\pX] instead */
7580                     RExC_end = strchr(RExC_parse, '}');
7581                     if (!RExC_end) {
7582                         const U8 c = (U8)*RExC_parse;
7583                         RExC_parse += 2;
7584                         RExC_end = oldregxend;
7585                         vFAIL2("Missing right brace on \\%c{}", c);
7586                     }
7587                     RExC_end++;
7588                 }
7589                 else {
7590                     RExC_end = RExC_parse + 2;
7591                     if (RExC_end > oldregxend)
7592                         RExC_end = oldregxend;
7593                 }
7594                 RExC_parse--;
7595
7596                 ret = regclass(pRExC_state,depth+1);
7597
7598                 RExC_end = oldregxend;
7599                 RExC_parse--;
7600
7601                 Set_Node_Offset(ret, parse_start + 2);
7602                 Set_Node_Cur_Length(ret);
7603                 nextchar(pRExC_state);
7604                 *flagp |= HASWIDTH|SIMPLE;
7605             }
7606             break;
7607         case 'N':
7608             /* Handle \N and \N{NAME} here and not below because it can be
7609             multicharacter. join_exact() will join them up later on.
7610             Also this makes sure that things like /\N{BLAH}+/ and
7611             \N{BLAH} being multi char Just Happen. dmq*/
7612             ++RExC_parse;
7613             ret= reg_namedseq(pRExC_state, NULL, flagp);
7614             break;
7615         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7616         parse_named_seq:
7617         {
7618             char ch= RExC_parse[1];
7619             if (ch != '<' && ch != '\'' && ch != '{') {
7620                 RExC_parse++;
7621                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7622             } else {
7623                 /* this pretty much dupes the code for (?P=...) in reg(), if
7624                    you change this make sure you change that */
7625                 char* name_start = (RExC_parse += 2);
7626                 U32 num = 0;
7627                 SV *sv_dat = reg_scan_name(pRExC_state,
7628                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7629                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7630                 if (RExC_parse == name_start || *RExC_parse != ch)
7631                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7632
7633                 if (!SIZE_ONLY) {
7634                     num = add_data( pRExC_state, 1, "S" );
7635                     RExC_rxi->data->data[num]=(void*)sv_dat;
7636                     SvREFCNT_inc_simple_void(sv_dat);
7637                 }
7638
7639                 RExC_sawback = 1;
7640                 ret = reganode(pRExC_state,
7641                                ((! FOLD)
7642                                  ? NREF
7643                                  : (AT_LEAST_UNI_SEMANTICS)
7644                                    ? NREFFU
7645                                    : (LOC)
7646                                      ? NREFFL
7647                                      : NREFF),
7648                                 num);
7649                 *flagp |= HASWIDTH;
7650
7651                 /* override incorrect value set in reganode MJD */
7652                 Set_Node_Offset(ret, parse_start+1);
7653                 Set_Node_Cur_Length(ret); /* MJD */
7654                 nextchar(pRExC_state);
7655
7656             }
7657             break;
7658         }
7659         case 'g':
7660         case '1': case '2': case '3': case '4':
7661         case '5': case '6': case '7': case '8': case '9':
7662             {
7663                 I32 num;
7664                 bool isg = *RExC_parse == 'g';
7665                 bool isrel = 0;
7666                 bool hasbrace = 0;
7667                 if (isg) {
7668                     RExC_parse++;
7669                     if (*RExC_parse == '{') {
7670                         RExC_parse++;
7671                         hasbrace = 1;
7672                     }
7673                     if (*RExC_parse == '-') {
7674                         RExC_parse++;
7675                         isrel = 1;
7676                     }
7677                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7678                         if (isrel) RExC_parse--;
7679                         RExC_parse -= 2;
7680                         goto parse_named_seq;
7681                 }   }
7682                 num = atoi(RExC_parse);
7683                 if (isg && num == 0)
7684                     vFAIL("Reference to invalid group 0");
7685                 if (isrel) {
7686                     num = RExC_npar - num;
7687                     if (num < 1)
7688                         vFAIL("Reference to nonexistent or unclosed group");
7689                 }
7690                 if (!isg && num > 9 && num >= RExC_npar)
7691                     goto defchar;
7692                 else {
7693                     char * const parse_start = RExC_parse - 1; /* MJD */
7694                     while (isDIGIT(*RExC_parse))
7695                         RExC_parse++;
7696                     if (parse_start == RExC_parse - 1)
7697                         vFAIL("Unterminated \\g... pattern");
7698                     if (hasbrace) {
7699                         if (*RExC_parse != '}')
7700                             vFAIL("Unterminated \\g{...} pattern");
7701                         RExC_parse++;
7702                     }
7703                     if (!SIZE_ONLY) {
7704                         if (num > (I32)RExC_rx->nparens)
7705                             vFAIL("Reference to nonexistent group");
7706                     }
7707                     RExC_sawback = 1;
7708                     ret = reganode(pRExC_state,
7709                                    ((! FOLD)
7710                                      ? REF
7711                                      : (AT_LEAST_UNI_SEMANTICS)
7712                                        ? REFFU
7713                                        : (LOC)
7714                                          ? REFFL
7715                                          : REFF),
7716                                     num);
7717                     *flagp |= HASWIDTH;
7718
7719                     /* override incorrect value set in reganode MJD */
7720                     Set_Node_Offset(ret, parse_start+1);
7721                     Set_Node_Cur_Length(ret); /* MJD */
7722                     RExC_parse--;
7723                     nextchar(pRExC_state);
7724                 }
7725             }
7726             break;
7727         case '\0':
7728             if (RExC_parse >= RExC_end)
7729                 FAIL("Trailing \\");
7730             /* FALL THROUGH */
7731         default:
7732             /* Do not generate "unrecognized" warnings here, we fall
7733                back into the quick-grab loop below */
7734             parse_start--;
7735             goto defchar;
7736         }
7737         break;
7738
7739     case '#':
7740         if (RExC_flags & RXf_PMf_EXTENDED) {
7741             if ( reg_skipcomment( pRExC_state ) )
7742                 goto tryagain;
7743         }
7744         /* FALL THROUGH */
7745
7746     default:
7747         outer_default:{
7748             register STRLEN len;
7749             register UV ender;
7750             register char *p;
7751             char *s;
7752             STRLEN foldlen;
7753             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7754
7755             parse_start = RExC_parse - 1;
7756
7757             RExC_parse++;
7758
7759         defchar:
7760             ender = 0;
7761             ret = reg_node(pRExC_state,
7762                            (U8) ((! FOLD) ? EXACT
7763                                           : (LOC)
7764                                              ? EXACTFL
7765                                              : (AT_LEAST_UNI_SEMANTICS)
7766                                                ? EXACTFU
7767                                                : EXACTF)
7768                     );
7769             s = STRING(ret);
7770             for (len = 0, p = RExC_parse - 1;
7771               len < 127 && p < RExC_end;
7772               len++)
7773             {
7774                 char * const oldp = p;
7775
7776                 if (RExC_flags & RXf_PMf_EXTENDED)
7777                     p = regwhite( pRExC_state, p );
7778                 switch ((U8)*p) {
7779                 case LATIN_SMALL_LETTER_SHARP_S:
7780                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7781                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7782                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7783                                 goto normal_default;
7784                 case '^':
7785                 case '$':
7786                 case '.':
7787                 case '[':
7788                 case '(':
7789                 case ')':
7790                 case '|':
7791                     goto loopdone;
7792                 case '\\':
7793                     /* Literal Escapes Switch
7794
7795                        This switch is meant to handle escape sequences that
7796                        resolve to a literal character.
7797
7798                        Every escape sequence that represents something
7799                        else, like an assertion or a char class, is handled
7800                        in the switch marked 'Special Escapes' above in this
7801                        routine, but also has an entry here as anything that
7802                        isn't explicitly mentioned here will be treated as
7803                        an unescaped equivalent literal.
7804                     */
7805
7806                     switch ((U8)*++p) {
7807                     /* These are all the special escapes. */
7808                     case LATIN_SMALL_LETTER_SHARP_S:
7809                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7810                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7811                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7812                                 goto normal_default;
7813                     case 'A':             /* Start assertion */
7814                     case 'b': case 'B':   /* Word-boundary assertion*/
7815                     case 'C':             /* Single char !DANGEROUS! */
7816                     case 'd': case 'D':   /* digit class */
7817                     case 'g': case 'G':   /* generic-backref, pos assertion */
7818                     case 'h': case 'H':   /* HORIZWS */
7819                     case 'k': case 'K':   /* named backref, keep marker */
7820                     case 'N':             /* named char sequence */
7821                     case 'p': case 'P':   /* Unicode property */
7822                               case 'R':   /* LNBREAK */
7823                     case 's': case 'S':   /* space class */
7824                     case 'v': case 'V':   /* VERTWS */
7825                     case 'w': case 'W':   /* word class */
7826                     case 'X':             /* eXtended Unicode "combining character sequence" */
7827                     case 'z': case 'Z':   /* End of line/string assertion */
7828                         --p;
7829                         goto loopdone;
7830
7831                     /* Anything after here is an escape that resolves to a
7832                        literal. (Except digits, which may or may not)
7833                      */
7834                     case 'n':
7835                         ender = '\n';
7836                         p++;
7837                         break;
7838                     case 'r':
7839                         ender = '\r';
7840                         p++;
7841                         break;
7842                     case 't':
7843                         ender = '\t';
7844                         p++;
7845                         break;
7846                     case 'f':
7847                         ender = '\f';
7848                         p++;
7849                         break;
7850                     case 'e':
7851                           ender = ASCII_TO_NATIVE('\033');
7852                         p++;
7853                         break;
7854                     case 'a':
7855                           ender = ASCII_TO_NATIVE('\007');
7856                         p++;
7857                         break;
7858                     case 'o':
7859                         {
7860                             STRLEN brace_len = len;
7861                             UV result;
7862                             const char* error_msg;
7863
7864                             bool valid = grok_bslash_o(p,
7865                                                        &result,
7866                                                        &brace_len,
7867                                                        &error_msg,
7868                                                        1);
7869                             p += brace_len;
7870                             if (! valid) {
7871                                 RExC_parse = p; /* going to die anyway; point
7872                                                    to exact spot of failure */
7873                                 vFAIL(error_msg);
7874                             }
7875                             else
7876                             {
7877                                 ender = result;
7878                             }
7879                             if (PL_encoding && ender < 0x100) {
7880                                 goto recode_encoding;
7881                             }
7882                             if (ender > 0xff) {
7883                                 REQUIRE_UTF8;
7884                             }
7885                             break;
7886                         }
7887                     case 'x':
7888                         if (*++p == '{') {
7889                             char* const e = strchr(p, '}');
7890
7891                             if (!e) {
7892                                 RExC_parse = p + 1;
7893                                 vFAIL("Missing right brace on \\x{}");
7894                             }
7895                             else {
7896                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7897                                     | PERL_SCAN_DISALLOW_PREFIX;
7898                                 STRLEN numlen = e - p - 1;
7899                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7900                                 if (ender > 0xff)
7901                                     REQUIRE_UTF8;
7902                                 p = e + 1;
7903                             }
7904                         }
7905                         else {
7906                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7907                             STRLEN numlen = 2;
7908                             ender = grok_hex(p, &numlen, &flags, NULL);
7909                             p += numlen;
7910                         }
7911                         if (PL_encoding && ender < 0x100)
7912                             goto recode_encoding;
7913                         break;
7914                     case 'c':
7915                         p++;
7916                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7917                         break;
7918                     case '0': case '1': case '2': case '3':case '4':
7919                     case '5': case '6': case '7': case '8':case '9':
7920                         if (*p == '0' ||
7921                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7922                         {
7923                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7924                             STRLEN numlen = 3;
7925                             ender = grok_oct(p, &numlen, &flags, NULL);
7926                             if (ender > 0xff) {
7927                                 REQUIRE_UTF8;
7928                             }
7929                             p += numlen;
7930                         }
7931                         else {
7932                             --p;
7933                             goto loopdone;
7934                         }
7935                         if (PL_encoding && ender < 0x100)
7936                             goto recode_encoding;
7937                         break;
7938                     recode_encoding:
7939                         {
7940                             SV* enc = PL_encoding;
7941                             ender = reg_recode((const char)(U8)ender, &enc);
7942                             if (!enc && SIZE_ONLY)
7943                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7944                             REQUIRE_UTF8;
7945                         }
7946                         break;
7947                     case '\0':
7948                         if (p >= RExC_end)
7949                             FAIL("Trailing \\");
7950                         /* FALL THROUGH */
7951                     default:
7952                         if (!SIZE_ONLY&& isALPHA(*p))
7953                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7954                         goto normal_default;
7955                     }
7956                     break;
7957                 default:
7958                   normal_default:
7959                     if (UTF8_IS_START(*p) && UTF) {
7960                         STRLEN numlen;
7961                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7962                                                &numlen, UTF8_ALLOW_DEFAULT);
7963                         p += numlen;
7964                     }
7965                     else
7966                         ender = *p++;
7967                     break;
7968                 }
7969                 if ( RExC_flags & RXf_PMf_EXTENDED)
7970                     p = regwhite( pRExC_state, p );
7971                 if (UTF && FOLD) {
7972                     /* Prime the casefolded buffer. */
7973                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7974                 }
7975                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7976                     if (len)
7977                         p = oldp;
7978                     else if (UTF) {
7979                          if (FOLD) {
7980                               /* Emit all the Unicode characters. */
7981                               STRLEN numlen;
7982                               for (foldbuf = tmpbuf;
7983                                    foldlen;
7984                                    foldlen -= numlen) {
7985                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7986                                    if (numlen > 0) {
7987                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7988                                         s       += unilen;
7989                                         len     += unilen;
7990                                         /* In EBCDIC the numlen
7991                                          * and unilen can differ. */
7992                                         foldbuf += numlen;
7993                                         if (numlen >= foldlen)
7994                                              break;
7995                                    }
7996                                    else
7997                                         break; /* "Can't happen." */
7998                               }
7999                          }
8000                          else {
8001                               const STRLEN unilen = reguni(pRExC_state, ender, s);
8002                               if (unilen > 0) {
8003                                    s   += unilen;
8004                                    len += unilen;
8005                               }
8006                          }
8007                     }
8008                     else {
8009                         len++;
8010                         REGC((char)ender, s++);
8011                     }
8012                     break;
8013                 }
8014                 if (UTF) {
8015                      if (FOLD) {
8016                           /* Emit all the Unicode characters. */
8017                           STRLEN numlen;
8018                           for (foldbuf = tmpbuf;
8019                                foldlen;
8020                                foldlen -= numlen) {
8021                                ender = utf8_to_uvchr(foldbuf, &numlen);
8022                                if (numlen > 0) {
8023                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
8024                                     len     += unilen;
8025                                     s       += unilen;
8026                                     /* In EBCDIC the numlen
8027                                      * and unilen can differ. */
8028                                     foldbuf += numlen;
8029                                     if (numlen >= foldlen)
8030                                          break;
8031                                }
8032                                else
8033                                     break;
8034                           }
8035                      }
8036                      else {
8037                           const STRLEN unilen = reguni(pRExC_state, ender, s);
8038                           if (unilen > 0) {
8039                                s   += unilen;
8040                                len += unilen;
8041                           }
8042                      }
8043                      len--;
8044                 }
8045                 else
8046                     REGC((char)ender, s++);
8047             }
8048         loopdone:
8049             RExC_parse = p - 1;
8050             Set_Node_Cur_Length(ret); /* MJD */
8051             nextchar(pRExC_state);
8052             {
8053                 /* len is STRLEN which is unsigned, need to copy to signed */
8054                 IV iv = len;
8055                 if (iv < 0)
8056                     vFAIL("Internal disaster");
8057             }
8058             if (len > 0)
8059                 *flagp |= HASWIDTH;
8060             if (len == 1 && UNI_IS_INVARIANT(ender))
8061                 *flagp |= SIMPLE;
8062
8063             if (SIZE_ONLY)
8064                 RExC_size += STR_SZ(len);
8065             else {
8066                 STR_LEN(ret) = len;
8067                 RExC_emit += STR_SZ(len);
8068             }
8069         }
8070         break;
8071     }
8072
8073     return(ret);
8074
8075 /* Jumped to when an unrecognized character set is encountered */
8076 bad_charset:
8077     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8078     return(NULL);
8079 }
8080
8081 STATIC char *
8082 S_regwhite( RExC_state_t *pRExC_state, char *p )
8083 {
8084     const char *e = RExC_end;
8085
8086     PERL_ARGS_ASSERT_REGWHITE;
8087
8088     while (p < e) {
8089         if (isSPACE(*p))
8090             ++p;
8091         else if (*p == '#') {
8092             bool ended = 0;
8093             do {
8094                 if (*p++ == '\n') {
8095                     ended = 1;
8096                     break;
8097                 }
8098             } while (p < e);
8099             if (!ended)
8100                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8101         }
8102         else
8103             break;
8104     }
8105     return p;
8106 }
8107
8108 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8109    Character classes ([:foo:]) can also be negated ([:^foo:]).
8110    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8111    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8112    but trigger failures because they are currently unimplemented. */
8113
8114 #define POSIXCC_DONE(c)   ((c) == ':')
8115 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8116 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8117
8118 STATIC I32
8119 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8120 {
8121     dVAR;
8122     I32 namedclass = OOB_NAMEDCLASS;
8123
8124     PERL_ARGS_ASSERT_REGPPOSIXCC;
8125
8126     if (value == '[' && RExC_parse + 1 < RExC_end &&
8127         /* I smell either [: or [= or [. -- POSIX has been here, right? */
8128         POSIXCC(UCHARAT(RExC_parse))) {
8129         const char c = UCHARAT(RExC_parse);
8130         char* const s = RExC_parse++;
8131
8132         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8133             RExC_parse++;
8134         if (RExC_parse == RExC_end)
8135             /* Grandfather lone [:, [=, [. */
8136             RExC_parse = s;
8137         else {
8138             const char* const t = RExC_parse++; /* skip over the c */
8139             assert(*t == c);
8140
8141             if (UCHARAT(RExC_parse) == ']') {
8142                 const char *posixcc = s + 1;
8143                 RExC_parse++; /* skip over the ending ] */
8144
8145                 if (*s == ':') {
8146                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8147                     const I32 skip = t - posixcc;
8148
8149                     /* Initially switch on the length of the name.  */
8150                     switch (skip) {
8151                     case 4:
8152                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8153                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8154                         break;
8155                     case 5:
8156                         /* Names all of length 5.  */
8157                         /* alnum alpha ascii blank cntrl digit graph lower
8158                            print punct space upper  */
8159                         /* Offset 4 gives the best switch position.  */
8160                         switch (posixcc[4]) {
8161                         case 'a':
8162                             if (memEQ(posixcc, "alph", 4)) /* alpha */
8163                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8164                             break;
8165                         case 'e':
8166                             if (memEQ(posixcc, "spac", 4)) /* space */
8167                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8168                             break;
8169                         case 'h':
8170                             if (memEQ(posixcc, "grap", 4)) /* graph */
8171                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8172                             break;
8173                         case 'i':
8174                             if (memEQ(posixcc, "asci", 4)) /* ascii */
8175                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8176                             break;
8177                         case 'k':
8178                             if (memEQ(posixcc, "blan", 4)) /* blank */
8179                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8180                             break;
8181                         case 'l':
8182                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8183                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8184                             break;
8185                         case 'm':
8186                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
8187                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8188                             break;
8189                         case 'r':
8190                             if (memEQ(posixcc, "lowe", 4)) /* lower */
8191                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8192                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
8193                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8194                             break;
8195                         case 't':
8196                             if (memEQ(posixcc, "digi", 4)) /* digit */
8197                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8198                             else if (memEQ(posixcc, "prin", 4)) /* print */
8199                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8200                             else if (memEQ(posixcc, "punc", 4)) /* punct */
8201                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8202                             break;
8203                         }
8204                         break;
8205                     case 6:
8206                         if (memEQ(posixcc, "xdigit", 6))
8207                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8208                         break;
8209                     }
8210
8211                     if (namedclass == OOB_NAMEDCLASS)
8212                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8213                                       t - s - 1, s + 1);
8214                     assert (posixcc[skip] == ':');
8215                     assert (posixcc[skip+1] == ']');
8216                 } else if (!SIZE_ONLY) {
8217                     /* [[=foo=]] and [[.foo.]] are still future. */
8218
8219                     /* adjust RExC_parse so the warning shows after
8220                        the class closes */
8221                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8222                         RExC_parse++;
8223                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8224                 }
8225             } else {
8226                 /* Maternal grandfather:
8227                  * "[:" ending in ":" but not in ":]" */
8228                 RExC_parse = s;
8229             }
8230         }
8231     }
8232
8233     return namedclass;
8234 }
8235
8236 STATIC void
8237 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8238 {
8239     dVAR;
8240
8241     PERL_ARGS_ASSERT_CHECKPOSIXCC;
8242
8243     if (POSIXCC(UCHARAT(RExC_parse))) {
8244         const char *s = RExC_parse;
8245         const char  c = *s++;
8246
8247         while (isALNUM(*s))
8248             s++;
8249         if (*s && c == *s && s[1] == ']') {
8250             ckWARN3reg(s+2,
8251                        "POSIX syntax [%c %c] belongs inside character classes",
8252                        c, c);
8253
8254             /* [[=foo=]] and [[.foo.]] are still future. */
8255             if (POSIXCC_NOTYET(c)) {
8256                 /* adjust RExC_parse so the error shows after
8257                    the class closes */
8258                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8259                     NOOP;
8260                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8261             }
8262         }
8263     }
8264 }
8265
8266 /* No locale test, and always Unicode semantics */
8267 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
8268 ANYOF_##NAME:                                                                  \
8269         for (value = 0; value < 256; value++)                                  \
8270             if (TEST)                                                          \
8271             stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
8272     yesno = '+';                                                               \
8273     what = WORD;                                                               \
8274     break;                                                                     \
8275 case ANYOF_N##NAME:                                                            \
8276         for (value = 0; value < 256; value++)                                  \
8277             if (!TEST)                                                         \
8278             stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
8279     yesno = '!';                                                               \
8280     what = WORD;                                                               \
8281     break
8282
8283 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8284  * there are two tests passed in, to use depending on that. There aren't any
8285  * cases where the label is different from the name, so no need for that
8286  * parameter */
8287 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD)                                       \
8288 ANYOF_##NAME:                                                                  \
8289     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
8290     else if (UNI_SEMANTICS) {                                                  \
8291         for (value = 0; value < 256; value++) {                                \
8292             if (TEST_8) stored +=                                              \
8293                       S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
8294         }                                                                      \
8295     }                                                                          \
8296     else {                                                                     \
8297         for (value = 0; value < 128; value++) {                                \
8298             if (TEST_7) stored +=                                              \
8299                 S_set_regclass_bit(aTHX_ pRExC_state, ret,                     \
8300                                    (U8) UNI_TO_NATIVE(value));                 \
8301         }                                                                      \
8302     }                                                                          \
8303     yesno = '+';                                                               \
8304     what = WORD;                                                               \
8305     break;                                                                     \
8306 case ANYOF_N##NAME:                                                            \
8307     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
8308     else if (UNI_SEMANTICS) {                                                  \
8309         for (value = 0; value < 256; value++) {                                \
8310             if (! TEST_8) stored +=                                            \
8311                     S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
8312         }                                                                      \
8313     }                                                                          \
8314     else {                                                                     \
8315         for (value = 0; value < 128; value++) {                                \
8316             if (! TEST_7) stored +=                                            \
8317                     S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
8318         }                                                                      \
8319         if (ASCII_RESTRICTED) {                                                \
8320             for (value = 128; value < 256; value++) {                          \
8321              stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8322             }                                                                  \
8323             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8;                  \
8324         }                                                                      \
8325         else {                                                                 \
8326             /* For a non-ut8 target string with DEPENDS semantics, all above   \
8327              * ASCII Latin1 code points match the complement of any of the     \
8328              * classes.  But in utf8, they have their Unicode semantics, so    \
8329              * can't just set them in the bitmap, or else regexec.c will think \
8330              * they matched when they shouldn't. */                            \
8331             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;          \
8332         }                                                                      \
8333     }                                                                          \
8334     yesno = '!';                                                               \
8335     what = WORD;                                                               \
8336     break
8337
8338 /*
8339    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8340    so that it is possible to override the option here without having to
8341    rebuild the entire core. as we are required to do if we change regcomp.h
8342    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8343 */
8344 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8345 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8346 #endif
8347
8348 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8349 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8350 #else
8351 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8352 #endif
8353
8354 STATIC U8
8355 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8356 {
8357
8358     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8359      * Locale folding is done at run-time, so this function should not be
8360      * called for nodes that are for locales.
8361      *
8362      * This function simply sets the bit corresponding to the fold of the input
8363      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
8364      * 'F' is 'f'.
8365      *
8366      * It also sets any necessary flags, and returns the number of bits that
8367      * actually changed from 0 to 1 */
8368
8369     U8 stored = 0;
8370     U8 fold;
8371
8372     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
8373                            : PL_fold[value];
8374
8375     /* It assumes the bit for 'value' has already been set */
8376     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8377         ANYOF_BITMAP_SET(node, fold);
8378         stored++;
8379     }
8380     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8381         || (! UNI_SEMANTICS
8382             && ! isASCII(value)
8383             && PL_fold_latin1[value] != value))
8384     {   /* A character that has a fold outside of Latin1 matches outside the
8385            bitmap, but only when the target string is utf8.  Similarly when we
8386            don't have unicode semantics for the above ASCII Latin-1 characters,
8387            and they have a fold, they should match if the target is utf8, and
8388            not otherwise */
8389         ANYOF_FLAGS(node) |= ANYOF_UTF8;
8390     }
8391
8392     return stored;
8393 }
8394
8395
8396 PERL_STATIC_INLINE U8
8397 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8398 {
8399     /* This inline function sets a bit in the bitmap if not already set, and if
8400      * appropriate, its fold, returning the number of bits that actually
8401      * changed from 0 to 1 */
8402
8403     U8 stored;
8404
8405     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
8406         return 0;
8407     }
8408
8409     ANYOF_BITMAP_SET(node, value);
8410     stored = 1;
8411
8412     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
8413         stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8414     }
8415
8416     return stored;
8417 }
8418
8419 /*
8420    parse a class specification and produce either an ANYOF node that
8421    matches the pattern or if the pattern matches a single char only and
8422    that char is < 256 and we are case insensitive then we produce an
8423    EXACT node instead.
8424 */
8425
8426 STATIC regnode *
8427 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8428 {
8429     dVAR;
8430     register UV nextvalue;
8431     register IV prevvalue = OOB_UNICODE;
8432     register IV range = 0;
8433     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8434     register regnode *ret;
8435     STRLEN numlen;
8436     IV namedclass;
8437     char *rangebegin = NULL;
8438     bool need_class = 0;
8439     SV *listsv = NULL;
8440     UV n;
8441     AV* unicode_alternate  = NULL;
8442 #ifdef EBCDIC
8443     UV literal_endpoint = 0;
8444 #endif
8445     UV stored = 0;  /* how many chars stored in the bitmap */
8446
8447     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8448         case we need to change the emitted regop to an EXACT. */
8449     const char * orig_parse = RExC_parse;
8450     GET_RE_DEBUG_FLAGS_DECL;
8451
8452     PERL_ARGS_ASSERT_REGCLASS;
8453 #ifndef DEBUGGING
8454     PERL_UNUSED_ARG(depth);
8455 #endif
8456
8457     DEBUG_PARSE("clas");
8458
8459     /* Assume we are going to generate an ANYOF node. */
8460     ret = reganode(pRExC_state, ANYOF, 0);
8461
8462     if (!SIZE_ONLY)
8463         ANYOF_FLAGS(ret) = 0;
8464
8465     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8466         RExC_naughty++;
8467         RExC_parse++;
8468         if (!SIZE_ONLY)
8469             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8470     }
8471
8472     if (SIZE_ONLY) {
8473         RExC_size += ANYOF_SKIP;
8474 #ifdef ANYOF_ADD_LOC_SKIP
8475         if (LOC) {
8476             RExC_size += ANYOF_ADD_LOC_SKIP;
8477         }
8478 #endif
8479         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8480     }
8481     else {
8482         RExC_emit += ANYOF_SKIP;
8483         if (LOC) {
8484             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8485 #ifdef ANYOF_ADD_LOC_SKIP
8486             RExC_emit += ANYOF_ADD_LOC_SKIP;
8487 #endif
8488         }
8489         ANYOF_BITMAP_ZERO(ret);
8490         listsv = newSVpvs("# comment\n");
8491     }
8492
8493     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8494
8495     if (!SIZE_ONLY && POSIXCC(nextvalue))
8496         checkposixcc(pRExC_state);
8497
8498     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8499     if (UCHARAT(RExC_parse) == ']')
8500         goto charclassloop;
8501
8502 parseit:
8503     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8504
8505     charclassloop:
8506
8507         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8508
8509         if (!range)
8510             rangebegin = RExC_parse;
8511         if (UTF) {
8512             value = utf8n_to_uvchr((U8*)RExC_parse,
8513                                    RExC_end - RExC_parse,
8514                                    &numlen, UTF8_ALLOW_DEFAULT);
8515             RExC_parse += numlen;
8516         }
8517         else
8518             value = UCHARAT(RExC_parse++);
8519
8520         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8521         if (value == '[' && POSIXCC(nextvalue))
8522             namedclass = regpposixcc(pRExC_state, value);
8523         else if (value == '\\') {
8524             if (UTF) {
8525                 value = utf8n_to_uvchr((U8*)RExC_parse,
8526                                    RExC_end - RExC_parse,
8527                                    &numlen, UTF8_ALLOW_DEFAULT);
8528                 RExC_parse += numlen;
8529             }
8530             else
8531                 value = UCHARAT(RExC_parse++);
8532             /* Some compilers cannot handle switching on 64-bit integer
8533              * values, therefore value cannot be an UV.  Yes, this will
8534              * be a problem later if we want switch on Unicode.
8535              * A similar issue a little bit later when switching on
8536              * namedclass. --jhi */
8537             switch ((I32)value) {
8538             case 'w':   namedclass = ANYOF_ALNUM;       break;
8539             case 'W':   namedclass = ANYOF_NALNUM;      break;
8540             case 's':   namedclass = ANYOF_SPACE;       break;
8541             case 'S':   namedclass = ANYOF_NSPACE;      break;
8542             case 'd':   namedclass = ANYOF_DIGIT;       break;
8543             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8544             case 'v':   namedclass = ANYOF_VERTWS;      break;
8545             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8546             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8547             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8548             case 'N':  /* Handle \N{NAME} in class */
8549                 {
8550                     /* We only pay attention to the first char of
8551                     multichar strings being returned. I kinda wonder
8552                     if this makes sense as it does change the behaviour
8553                     from earlier versions, OTOH that behaviour was broken
8554                     as well. */
8555                     UV v; /* value is register so we cant & it /grrr */
8556                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8557                         goto parseit;
8558                     }
8559                     value= v;
8560                 }
8561                 break;
8562             case 'p':
8563             case 'P':
8564                 {
8565                 char *e;
8566                 if (RExC_parse >= RExC_end)
8567                     vFAIL2("Empty \\%c{}", (U8)value);
8568                 if (*RExC_parse == '{') {
8569                     const U8 c = (U8)value;
8570                     e = strchr(RExC_parse++, '}');
8571                     if (!e)
8572                         vFAIL2("Missing right brace on \\%c{}", c);
8573                     while (isSPACE(UCHARAT(RExC_parse)))
8574                         RExC_parse++;
8575                     if (e == RExC_parse)
8576                         vFAIL2("Empty \\%c{}", c);
8577                     n = e - RExC_parse;
8578                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8579                         n--;
8580                 }
8581                 else {
8582                     e = RExC_parse;
8583                     n = 1;
8584                 }
8585                 if (!SIZE_ONLY) {
8586                     if (UCHARAT(RExC_parse) == '^') {
8587                          RExC_parse++;
8588                          n--;
8589                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8590                          while (isSPACE(UCHARAT(RExC_parse))) {
8591                               RExC_parse++;
8592                               n--;
8593                          }
8594                     }
8595                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8596                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8597                 }
8598                 RExC_parse = e + 1;
8599
8600                 /* The \p could match something in the Latin1 range, hence
8601                  * something that isn't utf8 */
8602                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
8603                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8604                 }
8605                 break;
8606             case 'n':   value = '\n';                   break;
8607             case 'r':   value = '\r';                   break;
8608             case 't':   value = '\t';                   break;
8609             case 'f':   value = '\f';                   break;
8610             case 'b':   value = '\b';                   break;
8611             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8612             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8613             case 'o':
8614                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8615                 {
8616                     const char* error_msg;
8617                     bool valid = grok_bslash_o(RExC_parse,
8618                                                &value,
8619                                                &numlen,
8620                                                &error_msg,
8621                                                SIZE_ONLY);
8622                     RExC_parse += numlen;
8623                     if (! valid) {
8624                         vFAIL(error_msg);
8625                     }
8626                 }
8627                 if (PL_encoding && value < 0x100) {
8628                     goto recode_encoding;
8629                 }
8630                 break;
8631             case 'x':
8632                 if (*RExC_parse == '{') {
8633                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8634                         | PERL_SCAN_DISALLOW_PREFIX;
8635                     char * const e = strchr(RExC_parse++, '}');
8636                     if (!e)
8637                         vFAIL("Missing right brace on \\x{}");
8638
8639                     numlen = e - RExC_parse;
8640                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8641                     RExC_parse = e + 1;
8642                 }
8643                 else {
8644                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8645                     numlen = 2;
8646                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8647                     RExC_parse += numlen;
8648                 }
8649                 if (PL_encoding && value < 0x100)
8650                     goto recode_encoding;
8651                 break;
8652             case 'c':
8653                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8654                 break;
8655             case '0': case '1': case '2': case '3': case '4':
8656             case '5': case '6': case '7':
8657                 {
8658                     /* Take 1-3 octal digits */
8659                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8660                     numlen = 3;
8661                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8662                     RExC_parse += numlen;
8663                     if (PL_encoding && value < 0x100)
8664                         goto recode_encoding;
8665                     break;
8666                 }
8667             recode_encoding:
8668                 {
8669                     SV* enc = PL_encoding;
8670                     value = reg_recode((const char)(U8)value, &enc);
8671                     if (!enc && SIZE_ONLY)
8672                         ckWARNreg(RExC_parse,
8673                                   "Invalid escape in the specified encoding");
8674                     break;
8675                 }
8676             default:
8677                 /* Allow \_ to not give an error */
8678                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8679                     ckWARN2reg(RExC_parse,
8680                                "Unrecognized escape \\%c in character class passed through",
8681                                (int)value);
8682                 }
8683                 break;
8684             }
8685         } /* end of \blah */
8686 #ifdef EBCDIC
8687         else
8688             literal_endpoint++;
8689 #endif
8690
8691         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8692
8693             /* What matches in a locale is not known until runtime, so need to
8694              * (one time per class) allocate extra space to pass to regexec.
8695              * The space will contain a bit for each named class that is to be
8696              * matched against.  This isn't needed for \p{} and pseudo-classes,
8697              * as they are not affected by locale, and hence are dealt with
8698              * separately */
8699             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8700                 need_class = 1;
8701                 if (SIZE_ONLY) {
8702 #ifdef ANYOF_CLASS_ADD_SKIP
8703                     RExC_size += ANYOF_CLASS_ADD_SKIP;
8704 #endif
8705                 }
8706                 else {
8707 #ifdef ANYOF_CLASS_ADD_SKIP
8708                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
8709 #endif
8710                     ANYOF_CLASS_ZERO(ret);
8711                 }
8712                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8713             }
8714
8715             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
8716              * literal */
8717             if (range) {
8718                 if (!SIZE_ONLY) {
8719                     const int w =
8720                         RExC_parse >= rangebegin ?
8721                         RExC_parse - rangebegin : 0;
8722                     ckWARN4reg(RExC_parse,
8723                                "False [] range \"%*.*s\"",
8724                                w, w, rangebegin);
8725
8726                     if (prevvalue < 256) {
8727                         stored +=
8728                          S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
8729                         stored +=
8730                          S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8731                     }
8732                     else {
8733                         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8734                         Perl_sv_catpvf(aTHX_ listsv,
8735                            "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8736                     }
8737                 }
8738
8739                 range = 0; /* this was not a true range */
8740             }
8741
8742
8743
8744             if (!SIZE_ONLY) {
8745                 const char *what = NULL;
8746                 char yesno = 0;
8747
8748                 /* Possible truncation here but in some 64-bit environments
8749                  * the compiler gets heartburn about switch on 64-bit values.
8750                  * A similar issue a little earlier when switching on value.
8751                  * --jhi */
8752                 switch ((I32)namedclass) {
8753
8754                 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8755                 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8756                 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8757                 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8758                 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8759                 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8760                 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8761                 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8762                 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8763                 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8764 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8765                 /* \s, \w match all unicode if utf8. */
8766                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8767                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8768 #else
8769                 /* \s, \w match ascii and locale only */
8770                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8771                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8772 #endif
8773                 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8774                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8775                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8776                 case ANYOF_ASCII:
8777                     if (LOC)
8778                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8779                     else {
8780                         for (value = 0; value < 128; value++)
8781                             stored +=
8782                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8783                     }
8784                     yesno = '+';
8785                     what = NULL;        /* Doesn't match outside ascii, so
8786                                            don't want to add +utf8:: */
8787                     break;
8788                 case ANYOF_NASCII:
8789                     if (LOC)
8790                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8791                     else {
8792                         for (value = 128; value < 256; value++)
8793                             stored +=
8794                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8795                     }
8796                     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
8797                     yesno = '!';
8798                     what = "ASCII";
8799                     break;
8800                 case ANYOF_DIGIT:
8801                     if (LOC)
8802                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8803                     else {
8804                         /* consecutive digits assumed */
8805                         for (value = '0'; value <= '9'; value++)
8806                             stored +=
8807                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8808                     }
8809                     yesno = '+';
8810                     what = POSIX_CC_UNI_NAME("Digit");
8811                     break;
8812                 case ANYOF_NDIGIT:
8813                     if (LOC)
8814                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8815                     else {
8816                         /* consecutive digits assumed */
8817                         for (value = 0; value < '0'; value++)
8818                             stored +=
8819                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8820                         for (value = '9' + 1; value < 256; value++)
8821                             stored +=
8822                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8823                     }
8824                     yesno = '!';
8825                     what = POSIX_CC_UNI_NAME("Digit");
8826                     if (ASCII_RESTRICTED ) {
8827                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
8828                     }
8829                     break;
8830                 case ANYOF_MAX:
8831                     /* this is to handle \p and \P */
8832                     break;
8833                 default:
8834                     vFAIL("Invalid [::] class");
8835                     break;
8836                 }
8837                 if (what && ! (ASCII_RESTRICTED)) {
8838                     /* Strings such as "+utf8::isWord\n" */
8839                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8840                     ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8841                 }
8842
8843                 continue;
8844             }
8845         } /* end of namedclass \blah */
8846
8847         if (range) {
8848             if (prevvalue > (IV)value) /* b-a */ {
8849                 const int w = RExC_parse - rangebegin;
8850                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8851                 range = 0; /* not a valid range */
8852             }
8853         }
8854         else {
8855             prevvalue = value; /* save the beginning of the range */
8856             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8857                 RExC_parse[1] != ']') {
8858                 RExC_parse++;
8859
8860                 /* a bad range like \w-, [:word:]- ? */
8861                 if (namedclass > OOB_NAMEDCLASS) {
8862                     if (ckWARN(WARN_REGEXP)) {
8863                         const int w =
8864                             RExC_parse >= rangebegin ?
8865                             RExC_parse - rangebegin : 0;
8866                         vWARN4(RExC_parse,
8867                                "False [] range \"%*.*s\"",
8868                                w, w, rangebegin);
8869                     }
8870                     if (!SIZE_ONLY)
8871                         stored +=
8872                             S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8873                 } else
8874                     range = 1;  /* yeah, it's a range! */
8875                 continue;       /* but do it the next time */
8876             }
8877         }
8878
8879         /* now is the next time */
8880         if (!SIZE_ONLY) {
8881             if (prevvalue < 256) {
8882                 const IV ceilvalue = value < 256 ? value : 255;
8883                 IV i;
8884 #ifdef EBCDIC
8885                 /* In EBCDIC [\x89-\x91] should include
8886                  * the \x8e but [i-j] should not. */
8887                 if (literal_endpoint == 2 &&
8888                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8889                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8890                 {
8891                     if (isLOWER(prevvalue)) {
8892                         for (i = prevvalue; i <= ceilvalue; i++)
8893                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8894                                 stored +=
8895                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8896                             }
8897                     } else {
8898                         for (i = prevvalue; i <= ceilvalue; i++)
8899                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8900                                 stored +=
8901                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8902                             }
8903                     }
8904                 }
8905                 else
8906 #endif
8907                       for (i = prevvalue; i <= ceilvalue; i++) {
8908                         stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8909                       }
8910           }
8911           if (value > 255 || UTF) {
8912                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8913                 const UV natvalue      = NATIVE_TO_UNI(value);
8914
8915                 /* If the code point requires utf8 to represent, and we are not
8916                  * folding, it can't match unless the target is in utf8.  Only
8917                  * a few code points above 255 fold to below it, so XXX an
8918                  * optimization would be to know which ones and set the flag
8919                  * appropriately. */
8920                 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8921                                     ? ANYOF_NONBITMAP
8922                                     : ANYOF_UTF8;
8923                 if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
8924
8925                     /* The \t sets the whole range */
8926                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8927                                    prevnatvalue, natvalue);
8928
8929                     /* Currently, we don't look at every value in the range.
8930                      * Therefore we have to assume the worst case: that if
8931                      * folding, it will match more than one character.  But in
8932                      * lookbehind patterns, can only be single character
8933                      * length, so disallow those folds */
8934                     if (FOLD && ! RExC_in_lookbehind) {
8935                       OP(ret) = ANYOFV;
8936                     }
8937                 }
8938                 else if (prevnatvalue == natvalue) {
8939                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8940                     if (FOLD) {
8941                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8942                          STRLEN foldlen;
8943                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8944
8945 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8946                          if (RExC_precomp[0] == ':' &&
8947                              RExC_precomp[1] == '[' &&
8948                              (f == 0xDF || f == 0x92)) {
8949                              f = NATIVE_TO_UNI(f);
8950                         }
8951 #endif
8952                          /* If folding and foldable and a single
8953                           * character, insert also the folded version
8954                           * to the charclass. */
8955                          if (f != value) {
8956 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8957                              if ((RExC_precomp[0] == ':' &&
8958                                   RExC_precomp[1] == '[' &&
8959                                   (f == 0xA2 &&
8960                                    (value == 0xFB05 || value == 0xFB06))) ?
8961                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8962                                  foldlen == (STRLEN)UNISKIP(f) )
8963 #else
8964                               if (foldlen == (STRLEN)UNISKIP(f))
8965 #endif
8966                                   Perl_sv_catpvf(aTHX_ listsv,
8967                                                  "%04"UVxf"\n", f);
8968                               else if (! RExC_in_lookbehind) {
8969                                   /* Any multicharacter foldings
8970                                    * (disallowed in lookbehind patterns)
8971                                    * require the following transform:
8972                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8973                                    * where E folds into "pq" and F folds
8974                                    * into "rst", all other characters
8975                                    * fold to single characters.  We save
8976                                    * away these multicharacter foldings,
8977                                    * to be later saved as part of the
8978                                    * additional "s" data. */
8979                                   SV *sv;
8980
8981                                   if (!unicode_alternate)
8982                                       unicode_alternate = newAV();
8983                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8984                                                      TRUE);
8985                                   av_push(unicode_alternate, sv);
8986                                   OP(ret) = ANYOFV;
8987                               }
8988                          }
8989
8990                          /* If folding and the value is one of the Greek
8991                           * sigmas insert a few more sigmas to make the
8992                           * folding rules of the sigmas to work right.
8993                           * Note that not all the possible combinations
8994                           * are handled here: some of them are handled
8995                           * by the standard folding rules, and some of
8996                           * them (literal or EXACTF cases) are handled
8997                           * during runtime in regexec.c:S_find_byclass(). */
8998                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8999                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9000                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
9001                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9002                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
9003                          }
9004                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
9005                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9006                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
9007                     }
9008                 }
9009             }
9010 #ifdef EBCDIC
9011             literal_endpoint = 0;
9012 #endif
9013         }
9014
9015         range = 0; /* this range (if it was one) is done now */
9016     }
9017
9018
9019
9020     if (SIZE_ONLY)
9021         return ret;
9022     /****** !SIZE_ONLY AFTER HERE *********/
9023
9024     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
9025      * set the FOLD flag yet, so this this does optimize those.  It doesn't
9026      * optimize locale.  Doing so perhaps could be done as long as there is
9027      * nothing like \w in it; some thought also would have to be given to the
9028      * interaction with above 0x100 chars */
9029     if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
9030         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
9031             ANYOF_BITMAP(ret)[value] ^= 0xFF;
9032         stored = 256 - stored;
9033
9034         /* The inversion means that everything above 255 is matched; and at the
9035          * same time we clear the invert flag */
9036         ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
9037     }
9038
9039     if (FOLD) {
9040         SV *sv;
9041
9042         /* This is the one character in the bitmap that needs special handling
9043          * under non-locale folding, as it folds to two characters 'ss'.  This
9044          * happens if it is set and not inverting, or isn't set and are
9045          * inverting (disallowed in lookbehind patterns because they can't be
9046          * variable length) */
9047         if (! LOC
9048             && ! RExC_in_lookbehind
9049             && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
9050                 ^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
9051         {
9052             OP(ret) = ANYOFV;   /* Can match more than a single char */
9053
9054             /* Under Unicode semantics), it can do this when the target string
9055              * isn't in utf8 */
9056             if (UNI_SEMANTICS) {
9057                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9058             }
9059
9060             if (!unicode_alternate) {
9061                 unicode_alternate = newAV();
9062             }
9063             sv = newSVpvn_utf8("ss", 2, TRUE);
9064             av_push(unicode_alternate, sv);
9065         }
9066
9067         /* Folding in the bitmap is taken care of above, but not for locale
9068          * (for which we have to wait to see what folding is in effect at
9069          * runtime), and for things not in the bitmap.  Set run-time fold flag
9070          * for these */
9071         if ((LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
9072             ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
9073         }
9074     }
9075
9076     /* A single character class can be "optimized" into an EXACTish node.
9077      * Note that since we don't currently count how many characters there are
9078      * outside the bitmap, we are XXX missing optimization possibilities for
9079      * them.  This optimization can't happen unless this is a truly single
9080      * character class, which means that it can't be an inversion into a
9081      * many-character class, and there must be no possibility of there being
9082      * things outside the bitmap.  'stored' (only) for locales doesn't include
9083      * \w, etc, so have to make a special test that they aren't present
9084      *
9085      * Similarly A 2-character class of the very special form like [bB] can be
9086      * optimized into an EXACTFish node, but only for non-locales, and for
9087      * characters which only have the two folds; so things like 'fF' and 'Ii'
9088      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
9089      * FI'. */
9090     if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
9091         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9092                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
9093             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9094                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
9095                                  /* If the latest code point has a fold whose
9096                                   * bit is set, it must be the only other one */
9097                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
9098                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
9099     {
9100         /* Note that the information needed to decide to do this optimization
9101          * is not currently available until the 2nd pass, and that the actually
9102          * used EXACTish node takes less space than the calculated ANYOF node,
9103          * and hence the amount of space calculated in the first pass is larger
9104          * than actually used, so this optimization doesn't gain us any space.
9105          * But an EXACT node is faster than an ANYOF node, and can be combined
9106          * with any adjacent EXACT nodes later by the optimizer for further
9107          * gains.  The speed of executing an EXACTF is similar to an ANYOF
9108          * node, so the optimization advantage comes from the ability to join
9109          * it to adjacent EXACT nodes */
9110
9111         const char * cur_parse= RExC_parse;
9112         U8 op;
9113         RExC_emit = (regnode *)orig_emit;
9114         RExC_parse = (char *)orig_parse;
9115
9116         if (stored == 1) {
9117
9118             /* A locale node with one point can be folded; all the other cases
9119              * with folding will have two points, since we calculate them above
9120              */
9121             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
9122                  op = EXACTFL;
9123             }
9124             else {
9125                 op = EXACT;
9126             }
9127         }   /* else 2 chars in the bit map: the folds of each other */
9128         else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
9129
9130             /* To join adjacent nodes, they must be the exact EXACTish type.
9131              * Try to use the most likely type, by using EXACTFU if the regex
9132              * calls for them, or is required because the character is
9133              * non-ASCII */
9134             op = EXACTFU;
9135         }
9136         else {    /* Otherwise, more likely to be EXACTF type */
9137             op = EXACTF;
9138         }
9139
9140         ret = reg_node(pRExC_state, op);
9141         RExC_parse = (char *)cur_parse;
9142         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
9143             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9144             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9145             STR_LEN(ret)= 2;
9146             RExC_emit += STR_SZ(2);
9147         }
9148         else {
9149             *STRING(ret)= (char)value;
9150             STR_LEN(ret)= 1;
9151             RExC_emit += STR_SZ(1);
9152         }
9153         SvREFCNT_dec(listsv);
9154         return ret;
9155     }
9156
9157     {
9158         AV * const av = newAV();
9159         SV *rv;
9160         /* The 0th element stores the character class description
9161          * in its textual form: used later (regexec.c:Perl_regclass_swash())
9162          * to initialize the appropriate swash (which gets stored in
9163          * the 1st element), and also useful for dumping the regnode.
9164          * The 2nd element stores the multicharacter foldings,
9165          * used later (regexec.c:S_reginclass()). */
9166         av_store(av, 0, listsv);
9167         av_store(av, 1, NULL);
9168         av_store(av, 2, MUTABLE_SV(unicode_alternate));
9169         rv = newRV_noinc(MUTABLE_SV(av));
9170         n = add_data(pRExC_state, 1, "s");
9171         RExC_rxi->data->data[n] = (void*)rv;
9172         ARG_SET(ret, n);
9173     }
9174     return ret;
9175 }
9176 #undef _C_C_T_
9177
9178
9179 /* reg_skipcomment()
9180
9181    Absorbs an /x style # comments from the input stream.
9182    Returns true if there is more text remaining in the stream.
9183    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9184    terminates the pattern without including a newline.
9185
9186    Note its the callers responsibility to ensure that we are
9187    actually in /x mode
9188
9189 */
9190
9191 STATIC bool
9192 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9193 {
9194     bool ended = 0;
9195
9196     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9197
9198     while (RExC_parse < RExC_end)
9199         if (*RExC_parse++ == '\n') {
9200             ended = 1;
9201             break;
9202         }
9203     if (!ended) {
9204         /* we ran off the end of the pattern without ending
9205            the comment, so we have to add an \n when wrapping */
9206         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9207         return 0;
9208     } else
9209         return 1;
9210 }
9211
9212 /* nextchar()
9213
9214    Advances the parse position, and optionally absorbs
9215    "whitespace" from the inputstream.
9216
9217    Without /x "whitespace" means (?#...) style comments only,
9218    with /x this means (?#...) and # comments and whitespace proper.
9219
9220    Returns the RExC_parse point from BEFORE the scan occurs.
9221
9222    This is the /x friendly way of saying RExC_parse++.
9223 */
9224
9225 STATIC char*
9226 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
9227 {
9228     char* const retval = RExC_parse++;
9229
9230     PERL_ARGS_ASSERT_NEXTCHAR;
9231
9232     for (;;) {
9233         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9234                 RExC_parse[2] == '#') {
9235             while (*RExC_parse != ')') {
9236                 if (RExC_parse == RExC_end)
9237                     FAIL("Sequence (?#... not terminated");
9238                 RExC_parse++;
9239             }
9240             RExC_parse++;
9241             continue;
9242         }
9243         if (RExC_flags & RXf_PMf_EXTENDED) {
9244             if (isSPACE(*RExC_parse)) {
9245                 RExC_parse++;
9246                 continue;
9247             }
9248             else if (*RExC_parse == '#') {
9249                 if ( reg_skipcomment( pRExC_state ) )
9250                     continue;
9251             }
9252         }
9253         return retval;
9254     }
9255 }
9256
9257 /*
9258 - reg_node - emit a node
9259 */
9260 STATIC regnode *                        /* Location. */
9261 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
9262 {
9263     dVAR;
9264     register regnode *ptr;
9265     regnode * const ret = RExC_emit;
9266     GET_RE_DEBUG_FLAGS_DECL;
9267
9268     PERL_ARGS_ASSERT_REG_NODE;
9269
9270     if (SIZE_ONLY) {
9271         SIZE_ALIGN(RExC_size);
9272         RExC_size += 1;
9273         return(ret);
9274     }
9275     if (RExC_emit >= RExC_emit_bound)
9276         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9277
9278     NODE_ALIGN_FILL(ret);
9279     ptr = ret;
9280     FILL_ADVANCE_NODE(ptr, op);
9281     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
9282 #ifdef RE_TRACK_PATTERN_OFFSETS
9283     if (RExC_offsets) {         /* MJD */
9284         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
9285               "reg_node", __LINE__,
9286               PL_reg_name[op],
9287               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9288                 ? "Overwriting end of array!\n" : "OK",
9289               (UV)(RExC_emit - RExC_emit_start),
9290               (UV)(RExC_parse - RExC_start),
9291               (UV)RExC_offsets[0]));
9292         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
9293     }
9294 #endif
9295     RExC_emit = ptr;
9296     return(ret);
9297 }
9298
9299 /*
9300 - reganode - emit a node with an argument
9301 */
9302 STATIC regnode *                        /* Location. */
9303 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
9304 {
9305     dVAR;
9306     register regnode *ptr;
9307     regnode * const ret = RExC_emit;
9308     GET_RE_DEBUG_FLAGS_DECL;
9309
9310     PERL_ARGS_ASSERT_REGANODE;
9311
9312     if (SIZE_ONLY) {
9313         SIZE_ALIGN(RExC_size);
9314         RExC_size += 2;
9315         /*
9316            We can't do this:
9317
9318            assert(2==regarglen[op]+1);
9319
9320            Anything larger than this has to allocate the extra amount.
9321            If we changed this to be:
9322
9323            RExC_size += (1 + regarglen[op]);
9324
9325            then it wouldn't matter. Its not clear what side effect
9326            might come from that so its not done so far.
9327            -- dmq
9328         */
9329         return(ret);
9330     }
9331     if (RExC_emit >= RExC_emit_bound)
9332         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9333
9334     NODE_ALIGN_FILL(ret);
9335     ptr = ret;
9336     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
9337     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
9338 #ifdef RE_TRACK_PATTERN_OFFSETS
9339     if (RExC_offsets) {         /* MJD */
9340         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9341               "reganode",
9342               __LINE__,
9343               PL_reg_name[op],
9344               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
9345               "Overwriting end of array!\n" : "OK",
9346               (UV)(RExC_emit - RExC_emit_start),
9347               (UV)(RExC_parse - RExC_start),
9348               (UV)RExC_offsets[0]));
9349         Set_Cur_Node_Offset;
9350     }
9351 #endif
9352     RExC_emit = ptr;
9353     return(ret);
9354 }
9355
9356 /*
9357 - reguni - emit (if appropriate) a Unicode character
9358 */
9359 STATIC STRLEN
9360 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
9361 {
9362     dVAR;
9363
9364     PERL_ARGS_ASSERT_REGUNI;
9365
9366     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
9367 }
9368
9369 /*
9370 - reginsert - insert an operator in front of already-emitted operand
9371 *
9372 * Means relocating the operand.
9373 */
9374 STATIC void
9375 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
9376 {
9377     dVAR;
9378     register regnode *src;
9379     register regnode *dst;
9380     register regnode *place;
9381     const int offset = regarglen[(U8)op];
9382     const int size = NODE_STEP_REGNODE + offset;
9383     GET_RE_DEBUG_FLAGS_DECL;
9384
9385     PERL_ARGS_ASSERT_REGINSERT;
9386     PERL_UNUSED_ARG(depth);
9387 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
9388     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9389     if (SIZE_ONLY) {
9390         RExC_size += size;
9391         return;
9392     }
9393
9394     src = RExC_emit;
9395     RExC_emit += size;
9396     dst = RExC_emit;
9397     if (RExC_open_parens) {
9398         int paren;
9399         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9400         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9401             if ( RExC_open_parens[paren] >= opnd ) {
9402                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9403                 RExC_open_parens[paren] += size;
9404             } else {
9405                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9406             }
9407             if ( RExC_close_parens[paren] >= opnd ) {
9408                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9409                 RExC_close_parens[paren] += size;
9410             } else {
9411                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9412             }
9413         }
9414     }
9415
9416     while (src > opnd) {
9417         StructCopy(--src, --dst, regnode);
9418 #ifdef RE_TRACK_PATTERN_OFFSETS
9419         if (RExC_offsets) {     /* MJD 20010112 */
9420             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9421                   "reg_insert",
9422                   __LINE__,
9423                   PL_reg_name[op],
9424                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9425                     ? "Overwriting end of array!\n" : "OK",
9426                   (UV)(src - RExC_emit_start),
9427                   (UV)(dst - RExC_emit_start),
9428                   (UV)RExC_offsets[0]));
9429             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9430             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9431         }
9432 #endif
9433     }
9434
9435
9436     place = opnd;               /* Op node, where operand used to be. */
9437 #ifdef RE_TRACK_PATTERN_OFFSETS
9438     if (RExC_offsets) {         /* MJD */
9439         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9440               "reginsert",
9441               __LINE__,
9442               PL_reg_name[op],
9443               (UV)(place - RExC_emit_start) > RExC_offsets[0]
9444               ? "Overwriting end of array!\n" : "OK",
9445               (UV)(place - RExC_emit_start),
9446               (UV)(RExC_parse - RExC_start),
9447               (UV)RExC_offsets[0]));
9448         Set_Node_Offset(place, RExC_parse);
9449         Set_Node_Length(place, 1);
9450     }
9451 #endif
9452     src = NEXTOPER(place);
9453     FILL_ADVANCE_NODE(place, op);
9454     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
9455     Zero(src, offset, regnode);
9456 }
9457
9458 /*
9459 - regtail - set the next-pointer at the end of a node chain of p to val.
9460 - SEE ALSO: regtail_study
9461 */
9462 /* TODO: All three parms should be const */
9463 STATIC void
9464 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9465 {
9466     dVAR;
9467     register regnode *scan;
9468     GET_RE_DEBUG_FLAGS_DECL;
9469
9470     PERL_ARGS_ASSERT_REGTAIL;
9471 #ifndef DEBUGGING
9472     PERL_UNUSED_ARG(depth);
9473 #endif
9474
9475     if (SIZE_ONLY)
9476         return;
9477
9478     /* Find last node. */
9479     scan = p;
9480     for (;;) {
9481         regnode * const temp = regnext(scan);
9482         DEBUG_PARSE_r({
9483             SV * const mysv=sv_newmortal();
9484             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9485             regprop(RExC_rx, mysv, scan);
9486             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9487                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9488                     (temp == NULL ? "->" : ""),
9489                     (temp == NULL ? PL_reg_name[OP(val)] : "")
9490             );
9491         });
9492         if (temp == NULL)
9493             break;
9494         scan = temp;
9495     }
9496
9497     if (reg_off_by_arg[OP(scan)]) {
9498         ARG_SET(scan, val - scan);
9499     }
9500     else {
9501         NEXT_OFF(scan) = val - scan;
9502     }
9503 }
9504
9505 #ifdef DEBUGGING
9506 /*
9507 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9508 - Look for optimizable sequences at the same time.
9509 - currently only looks for EXACT chains.
9510
9511 This is experimental code. The idea is to use this routine to perform
9512 in place optimizations on branches and groups as they are constructed,
9513 with the long term intention of removing optimization from study_chunk so
9514 that it is purely analytical.
9515
9516 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9517 to control which is which.
9518
9519 */
9520 /* TODO: All four parms should be const */
9521
9522 STATIC U8
9523 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9524 {
9525     dVAR;
9526     register regnode *scan;
9527     U8 exact = PSEUDO;
9528 #ifdef EXPERIMENTAL_INPLACESCAN
9529     I32 min = 0;
9530 #endif
9531     GET_RE_DEBUG_FLAGS_DECL;
9532
9533     PERL_ARGS_ASSERT_REGTAIL_STUDY;
9534
9535
9536     if (SIZE_ONLY)
9537         return exact;
9538
9539     /* Find last node. */
9540
9541     scan = p;
9542     for (;;) {
9543         regnode * const temp = regnext(scan);
9544 #ifdef EXPERIMENTAL_INPLACESCAN
9545         if (PL_regkind[OP(scan)] == EXACT)
9546             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9547                 return EXACT;
9548 #endif
9549         if ( exact ) {
9550             switch (OP(scan)) {
9551                 case EXACT:
9552                 case EXACTF:
9553                 case EXACTFU:
9554                 case EXACTFL:
9555                         if( exact == PSEUDO )
9556                             exact= OP(scan);
9557                         else if ( exact != OP(scan) )
9558                             exact= 0;
9559                 case NOTHING:
9560                     break;
9561                 default:
9562                     exact= 0;
9563             }
9564         }
9565         DEBUG_PARSE_r({
9566             SV * const mysv=sv_newmortal();
9567             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9568             regprop(RExC_rx, mysv, scan);
9569             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9570                 SvPV_nolen_const(mysv),
9571                 REG_NODE_NUM(scan),
9572                 PL_reg_name[exact]);
9573         });
9574         if (temp == NULL)
9575             break;
9576         scan = temp;
9577     }
9578     DEBUG_PARSE_r({
9579         SV * const mysv_val=sv_newmortal();
9580         DEBUG_PARSE_MSG("");
9581         regprop(RExC_rx, mysv_val, val);
9582         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9583                       SvPV_nolen_const(mysv_val),
9584                       (IV)REG_NODE_NUM(val),
9585                       (IV)(val - scan)
9586         );
9587     });
9588     if (reg_off_by_arg[OP(scan)]) {
9589         ARG_SET(scan, val - scan);
9590     }
9591     else {
9592         NEXT_OFF(scan) = val - scan;
9593     }
9594
9595     return exact;
9596 }
9597 #endif
9598
9599 /*
9600  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9601  */
9602 #ifdef DEBUGGING
9603 static void
9604 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9605 {
9606     int bit;
9607     int set=0;
9608     regex_charset cs;
9609
9610     for (bit=0; bit<32; bit++) {
9611         if (flags & (1<<bit)) {
9612             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
9613                 continue;
9614             }
9615             if (!set++ && lead)
9616                 PerlIO_printf(Perl_debug_log, "%s",lead);
9617             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9618         }
9619     }
9620     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
9621             if (!set++ && lead) {
9622                 PerlIO_printf(Perl_debug_log, "%s",lead);
9623             }
9624             switch (cs) {
9625                 case REGEX_UNICODE_CHARSET:
9626                     PerlIO_printf(Perl_debug_log, "UNICODE");
9627                     break;
9628                 case REGEX_LOCALE_CHARSET:
9629                     PerlIO_printf(Perl_debug_log, "LOCALE");
9630                     break;
9631                 case REGEX_ASCII_RESTRICTED_CHARSET:
9632                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
9633                     break;
9634                 default:
9635                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
9636                     break;
9637             }
9638     }
9639     if (lead)  {
9640         if (set)
9641             PerlIO_printf(Perl_debug_log, "\n");
9642         else
9643             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9644     }
9645 }
9646 #endif
9647
9648 void
9649 Perl_regdump(pTHX_ const regexp *r)
9650 {
9651 #ifdef DEBUGGING
9652     dVAR;
9653     SV * const sv = sv_newmortal();
9654     SV *dsv= sv_newmortal();
9655     RXi_GET_DECL(r,ri);
9656     GET_RE_DEBUG_FLAGS_DECL;
9657
9658     PERL_ARGS_ASSERT_REGDUMP;
9659
9660     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9661
9662     /* Header fields of interest. */
9663     if (r->anchored_substr) {
9664         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9665             RE_SV_DUMPLEN(r->anchored_substr), 30);
9666         PerlIO_printf(Perl_debug_log,
9667                       "anchored %s%s at %"IVdf" ",
9668                       s, RE_SV_TAIL(r->anchored_substr),
9669                       (IV)r->anchored_offset);
9670     } else if (r->anchored_utf8) {
9671         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9672             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9673         PerlIO_printf(Perl_debug_log,
9674                       "anchored utf8 %s%s at %"IVdf" ",
9675                       s, RE_SV_TAIL(r->anchored_utf8),
9676                       (IV)r->anchored_offset);
9677     }
9678     if (r->float_substr) {
9679         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9680             RE_SV_DUMPLEN(r->float_substr), 30);
9681         PerlIO_printf(Perl_debug_log,
9682                       "floating %s%s at %"IVdf"..%"UVuf" ",
9683                       s, RE_SV_TAIL(r->float_substr),
9684                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9685     } else if (r->float_utf8) {
9686         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9687             RE_SV_DUMPLEN(r->float_utf8), 30);
9688         PerlIO_printf(Perl_debug_log,
9689                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9690                       s, RE_SV_TAIL(r->float_utf8),
9691                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9692     }
9693     if (r->check_substr || r->check_utf8)
9694         PerlIO_printf(Perl_debug_log,
9695                       (const char *)
9696                       (r->check_substr == r->float_substr
9697                        && r->check_utf8 == r->float_utf8
9698                        ? "(checking floating" : "(checking anchored"));
9699     if (r->extflags & RXf_NOSCAN)
9700         PerlIO_printf(Perl_debug_log, " noscan");
9701     if (r->extflags & RXf_CHECK_ALL)
9702         PerlIO_printf(Perl_debug_log, " isall");
9703     if (r->check_substr || r->check_utf8)
9704         PerlIO_printf(Perl_debug_log, ") ");
9705
9706     if (ri->regstclass) {
9707         regprop(r, sv, ri->regstclass);
9708         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9709     }
9710     if (r->extflags & RXf_ANCH) {
9711         PerlIO_printf(Perl_debug_log, "anchored");
9712         if (r->extflags & RXf_ANCH_BOL)
9713             PerlIO_printf(Perl_debug_log, "(BOL)");
9714         if (r->extflags & RXf_ANCH_MBOL)
9715             PerlIO_printf(Perl_debug_log, "(MBOL)");
9716         if (r->extflags & RXf_ANCH_SBOL)
9717             PerlIO_printf(Perl_debug_log, "(SBOL)");
9718         if (r->extflags & RXf_ANCH_GPOS)
9719             PerlIO_printf(Perl_debug_log, "(GPOS)");
9720         PerlIO_putc(Perl_debug_log, ' ');
9721     }
9722     if (r->extflags & RXf_GPOS_SEEN)
9723         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9724     if (r->intflags & PREGf_SKIP)
9725         PerlIO_printf(Perl_debug_log, "plus ");
9726     if (r->intflags & PREGf_IMPLICIT)
9727         PerlIO_printf(Perl_debug_log, "implicit ");
9728     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9729     if (r->extflags & RXf_EVAL_SEEN)
9730         PerlIO_printf(Perl_debug_log, "with eval ");
9731     PerlIO_printf(Perl_debug_log, "\n");
9732     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9733 #else
9734     PERL_ARGS_ASSERT_REGDUMP;
9735     PERL_UNUSED_CONTEXT;
9736     PERL_UNUSED_ARG(r);
9737 #endif  /* DEBUGGING */
9738 }
9739
9740 /*
9741 - regprop - printable representation of opcode
9742 */
9743 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9744 STMT_START { \
9745         if (do_sep) {                           \
9746             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9747             if (flags & ANYOF_INVERT)           \
9748                 /*make sure the invert info is in each */ \
9749                 sv_catpvs(sv, "^");             \
9750             do_sep = 0;                         \
9751         }                                       \
9752 } STMT_END
9753
9754 void
9755 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9756 {
9757 #ifdef DEBUGGING
9758     dVAR;
9759     register int k;
9760     RXi_GET_DECL(prog,progi);
9761     GET_RE_DEBUG_FLAGS_DECL;
9762
9763     PERL_ARGS_ASSERT_REGPROP;
9764
9765     sv_setpvs(sv, "");
9766
9767     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9768         /* It would be nice to FAIL() here, but this may be called from
9769            regexec.c, and it would be hard to supply pRExC_state. */
9770         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9771     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9772
9773     k = PL_regkind[OP(o)];
9774
9775     if (k == EXACT) {
9776         sv_catpvs(sv, " ");
9777         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9778          * is a crude hack but it may be the best for now since
9779          * we have no flag "this EXACTish node was UTF-8"
9780          * --jhi */
9781         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9782                   PERL_PV_ESCAPE_UNI_DETECT |
9783                   PERL_PV_ESCAPE_NONASCII   |
9784                   PERL_PV_PRETTY_ELLIPSES   |
9785                   PERL_PV_PRETTY_LTGT       |
9786                   PERL_PV_PRETTY_NOCLEAR
9787                   );
9788     } else if (k == TRIE) {
9789         /* print the details of the trie in dumpuntil instead, as
9790          * progi->data isn't available here */
9791         const char op = OP(o);
9792         const U32 n = ARG(o);
9793         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9794                (reg_ac_data *)progi->data->data[n] :
9795                NULL;
9796         const reg_trie_data * const trie
9797             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9798
9799         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9800         DEBUG_TRIE_COMPILE_r(
9801             Perl_sv_catpvf(aTHX_ sv,
9802                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9803                 (UV)trie->startstate,
9804                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9805                 (UV)trie->wordcount,
9806                 (UV)trie->minlen,
9807                 (UV)trie->maxlen,
9808                 (UV)TRIE_CHARCOUNT(trie),
9809                 (UV)trie->uniquecharcount
9810             )
9811         );
9812         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9813             int i;
9814             int rangestart = -1;
9815             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9816             sv_catpvs(sv, "[");
9817             for (i = 0; i <= 256; i++) {
9818                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9819                     if (rangestart == -1)
9820                         rangestart = i;
9821                 } else if (rangestart != -1) {
9822                     if (i <= rangestart + 3)
9823                         for (; rangestart < i; rangestart++)
9824                             put_byte(sv, rangestart);
9825                     else {
9826                         put_byte(sv, rangestart);
9827                         sv_catpvs(sv, "-");
9828                         put_byte(sv, i - 1);
9829                     }
9830                     rangestart = -1;
9831                 }
9832             }
9833             sv_catpvs(sv, "]");
9834         }
9835
9836     } else if (k == CURLY) {
9837         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9838             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9839         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9840     }
9841     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9842         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9843     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9844         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9845         if ( RXp_PAREN_NAMES(prog) ) {
9846             if ( k != REF || (OP(o) < NREF)) {
9847                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9848                 SV **name= av_fetch(list, ARG(o), 0 );
9849                 if (name)
9850                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9851             }
9852             else {
9853                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9854                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9855                 I32 *nums=(I32*)SvPVX(sv_dat);
9856                 SV **name= av_fetch(list, nums[0], 0 );
9857                 I32 n;
9858                 if (name) {
9859                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9860                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9861                                     (n ? "," : ""), (IV)nums[n]);
9862                     }
9863                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9864                 }
9865             }
9866         }
9867     } else if (k == GOSUB)
9868         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9869     else if (k == VERB) {
9870         if (!o->flags)
9871             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9872                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9873     } else if (k == LOGICAL)
9874         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9875     else if (k == FOLDCHAR)
9876         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9877     else if (k == ANYOF) {
9878         int i, rangestart = -1;
9879         const U8 flags = ANYOF_FLAGS(o);
9880         int do_sep = 0;
9881
9882         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9883         static const char * const anyofs[] = {
9884             "\\w",
9885             "\\W",
9886             "\\s",
9887             "\\S",
9888             "\\d",
9889             "\\D",
9890             "[:alnum:]",
9891             "[:^alnum:]",
9892             "[:alpha:]",
9893             "[:^alpha:]",
9894             "[:ascii:]",
9895             "[:^ascii:]",
9896             "[:cntrl:]",
9897             "[:^cntrl:]",
9898             "[:graph:]",
9899             "[:^graph:]",
9900             "[:lower:]",
9901             "[:^lower:]",
9902             "[:print:]",
9903             "[:^print:]",
9904             "[:punct:]",
9905             "[:^punct:]",
9906             "[:upper:]",
9907             "[:^upper:]",
9908             "[:xdigit:]",
9909             "[:^xdigit:]",
9910             "[:space:]",
9911             "[:^space:]",
9912             "[:blank:]",
9913             "[:^blank:]"
9914         };
9915
9916         if (flags & ANYOF_LOCALE)
9917             sv_catpvs(sv, "{loc}");
9918         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
9919             sv_catpvs(sv, "{i}");
9920         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9921         if (flags & ANYOF_INVERT)
9922             sv_catpvs(sv, "^");
9923
9924         /* output what the standard cp 0-255 bitmap matches */
9925         for (i = 0; i <= 256; i++) {
9926             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9927                 if (rangestart == -1)
9928                     rangestart = i;
9929             } else if (rangestart != -1) {
9930                 if (i <= rangestart + 3)
9931                     for (; rangestart < i; rangestart++)
9932                         put_byte(sv, rangestart);
9933                 else {
9934                     put_byte(sv, rangestart);
9935                     sv_catpvs(sv, "-");
9936                     put_byte(sv, i - 1);
9937                 }
9938                 do_sep = 1;
9939                 rangestart = -1;
9940             }
9941         }
9942
9943         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9944         /* output any special charclass tests (used entirely under use locale) */
9945         if (ANYOF_CLASS_TEST_ANY_SET(o))
9946             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9947                 if (ANYOF_CLASS_TEST(o,i)) {
9948                     sv_catpv(sv, anyofs[i]);
9949                     do_sep = 1;
9950                 }
9951
9952         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9953
9954         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
9955             sv_catpvs(sv, "{non-utf8-latin1-all}");
9956         }
9957
9958         /* output information about the unicode matching */
9959         if (flags & ANYOF_UNICODE_ALL)
9960             sv_catpvs(sv, "{unicode_all}");
9961         else if (flags & ANYOF_UTF8)
9962             sv_catpvs(sv, "{unicode}");
9963         if (flags & ANYOF_NONBITMAP_NON_UTF8)
9964             sv_catpvs(sv, "{outside bitmap}");
9965
9966         {
9967             SV *lv;
9968             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9969
9970             if (lv) {
9971                 if (sw) {
9972                     U8 s[UTF8_MAXBYTES_CASE+1];
9973
9974                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9975                         uvchr_to_utf8(s, i);
9976
9977                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9978                             if (rangestart == -1)
9979                                 rangestart = i;
9980                         } else if (rangestart != -1) {
9981                             if (i <= rangestart + 3)
9982                                 for (; rangestart < i; rangestart++) {
9983                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9984                                     U8 *p;
9985                                     for(p = s; p < e; p++)
9986                                         put_byte(sv, *p);
9987                                 }
9988                             else {
9989                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9990                                 U8 *p;
9991                                 for (p = s; p < e; p++)
9992                                     put_byte(sv, *p);
9993                                 sv_catpvs(sv, "-");
9994                                 e = uvchr_to_utf8(s, i-1);
9995                                 for (p = s; p < e; p++)
9996                                     put_byte(sv, *p);
9997                                 }
9998                                 rangestart = -1;
9999                             }
10000                         }
10001
10002                     sv_catpvs(sv, "..."); /* et cetera */
10003                 }
10004
10005                 {
10006                     char *s = savesvpv(lv);
10007                     char * const origs = s;
10008
10009                     while (*s && *s != '\n')
10010                         s++;
10011
10012                     if (*s == '\n') {
10013                         const char * const t = ++s;
10014
10015                         while (*s) {
10016                             if (*s == '\n')
10017                                 *s = ' ';
10018                             s++;
10019                         }
10020                         if (s[-1] == ' ')
10021                             s[-1] = 0;
10022
10023                         sv_catpv(sv, t);
10024                     }
10025
10026                     Safefree(origs);
10027                 }
10028             }
10029         }
10030
10031         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
10032     }
10033     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
10034         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
10035 #else
10036     PERL_UNUSED_CONTEXT;
10037     PERL_UNUSED_ARG(sv);
10038     PERL_UNUSED_ARG(o);
10039     PERL_UNUSED_ARG(prog);
10040 #endif  /* DEBUGGING */
10041 }
10042
10043 SV *
10044 Perl_re_intuit_string(pTHX_ REGEXP * const r)
10045 {                               /* Assume that RE_INTUIT is set */
10046     dVAR;
10047     struct regexp *const prog = (struct regexp *)SvANY(r);
10048     GET_RE_DEBUG_FLAGS_DECL;
10049
10050     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
10051     PERL_UNUSED_CONTEXT;
10052
10053     DEBUG_COMPILE_r(
10054         {
10055             const char * const s = SvPV_nolen_const(prog->check_substr
10056                       ? prog->check_substr : prog->check_utf8);
10057
10058             if (!PL_colorset) reginitcolors();
10059             PerlIO_printf(Perl_debug_log,
10060                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
10061                       PL_colors[4],
10062                       prog->check_substr ? "" : "utf8 ",
10063                       PL_colors[5],PL_colors[0],
10064                       s,
10065                       PL_colors[1],
10066                       (strlen(s) > 60 ? "..." : ""));
10067         } );
10068
10069     return prog->check_substr ? prog->check_substr : prog->check_utf8;
10070 }
10071
10072 /*
10073    pregfree()
10074
10075    handles refcounting and freeing the perl core regexp structure. When
10076    it is necessary to actually free the structure the first thing it
10077    does is call the 'free' method of the regexp_engine associated to
10078    the regexp, allowing the handling of the void *pprivate; member
10079    first. (This routine is not overridable by extensions, which is why
10080    the extensions free is called first.)
10081
10082    See regdupe and regdupe_internal if you change anything here.
10083 */
10084 #ifndef PERL_IN_XSUB_RE
10085 void
10086 Perl_pregfree(pTHX_ REGEXP *r)
10087 {
10088     SvREFCNT_dec(r);
10089 }
10090
10091 void
10092 Perl_pregfree2(pTHX_ REGEXP *rx)
10093 {
10094     dVAR;
10095     struct regexp *const r = (struct regexp *)SvANY(rx);
10096     GET_RE_DEBUG_FLAGS_DECL;
10097
10098     PERL_ARGS_ASSERT_PREGFREE2;
10099
10100     if (r->mother_re) {
10101         ReREFCNT_dec(r->mother_re);
10102     } else {
10103         CALLREGFREE_PVT(rx); /* free the private data */
10104         SvREFCNT_dec(RXp_PAREN_NAMES(r));
10105     }
10106     if (r->substrs) {
10107         SvREFCNT_dec(r->anchored_substr);
10108         SvREFCNT_dec(r->anchored_utf8);
10109         SvREFCNT_dec(r->float_substr);
10110         SvREFCNT_dec(r->float_utf8);
10111         Safefree(r->substrs);
10112     }
10113     RX_MATCH_COPY_FREE(rx);
10114 #ifdef PERL_OLD_COPY_ON_WRITE
10115     SvREFCNT_dec(r->saved_copy);
10116 #endif
10117     Safefree(r->offs);
10118 }
10119
10120 /*  reg_temp_copy()
10121
10122     This is a hacky workaround to the structural issue of match results
10123     being stored in the regexp structure which is in turn stored in
10124     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
10125     could be PL_curpm in multiple contexts, and could require multiple
10126     result sets being associated with the pattern simultaneously, such
10127     as when doing a recursive match with (??{$qr})
10128
10129     The solution is to make a lightweight copy of the regexp structure
10130     when a qr// is returned from the code executed by (??{$qr}) this
10131     lightweight copy doesn't actually own any of its data except for
10132     the starp/end and the actual regexp structure itself.
10133
10134 */
10135
10136
10137 REGEXP *
10138 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
10139 {
10140     struct regexp *ret;
10141     struct regexp *const r = (struct regexp *)SvANY(rx);
10142     register const I32 npar = r->nparens+1;
10143
10144     PERL_ARGS_ASSERT_REG_TEMP_COPY;
10145
10146     if (!ret_x)
10147         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
10148     ret = (struct regexp *)SvANY(ret_x);
10149
10150     (void)ReREFCNT_inc(rx);
10151     /* We can take advantage of the existing "copied buffer" mechanism in SVs
10152        by pointing directly at the buffer, but flagging that the allocated
10153        space in the copy is zero. As we've just done a struct copy, it's now
10154        a case of zero-ing that, rather than copying the current length.  */
10155     SvPV_set(ret_x, RX_WRAPPED(rx));
10156     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
10157     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
10158            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
10159     SvLEN_set(ret_x, 0);
10160     SvSTASH_set(ret_x, NULL);
10161     SvMAGIC_set(ret_x, NULL);
10162     Newx(ret->offs, npar, regexp_paren_pair);
10163     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10164     if (r->substrs) {
10165         Newx(ret->substrs, 1, struct reg_substr_data);
10166         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10167
10168         SvREFCNT_inc_void(ret->anchored_substr);
10169         SvREFCNT_inc_void(ret->anchored_utf8);
10170         SvREFCNT_inc_void(ret->float_substr);
10171         SvREFCNT_inc_void(ret->float_utf8);
10172
10173         /* check_substr and check_utf8, if non-NULL, point to either their
10174            anchored or float namesakes, and don't hold a second reference.  */
10175     }
10176     RX_MATCH_COPIED_off(ret_x);
10177 #ifdef PERL_OLD_COPY_ON_WRITE
10178     ret->saved_copy = NULL;
10179 #endif
10180     ret->mother_re = rx;
10181
10182     return ret_x;
10183 }
10184 #endif
10185
10186 /* regfree_internal()
10187
10188    Free the private data in a regexp. This is overloadable by
10189    extensions. Perl takes care of the regexp structure in pregfree(),
10190    this covers the *pprivate pointer which technically perl doesn't
10191    know about, however of course we have to handle the
10192    regexp_internal structure when no extension is in use.
10193
10194    Note this is called before freeing anything in the regexp
10195    structure.
10196  */
10197
10198 void
10199 Perl_regfree_internal(pTHX_ REGEXP * const rx)
10200 {
10201     dVAR;
10202     struct regexp *const r = (struct regexp *)SvANY(rx);
10203     RXi_GET_DECL(r,ri);
10204     GET_RE_DEBUG_FLAGS_DECL;
10205
10206     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10207
10208     DEBUG_COMPILE_r({
10209         if (!PL_colorset)
10210             reginitcolors();
10211         {
10212             SV *dsv= sv_newmortal();
10213             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
10214                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
10215             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10216                 PL_colors[4],PL_colors[5],s);
10217         }
10218     });
10219 #ifdef RE_TRACK_PATTERN_OFFSETS
10220     if (ri->u.offsets)
10221         Safefree(ri->u.offsets);             /* 20010421 MJD */
10222 #endif
10223     if (ri->data) {
10224         int n = ri->data->count;
10225         PAD* new_comppad = NULL;
10226         PAD* old_comppad;
10227         PADOFFSET refcnt;
10228
10229         while (--n >= 0) {
10230           /* If you add a ->what type here, update the comment in regcomp.h */
10231             switch (ri->data->what[n]) {
10232             case 'a':
10233             case 's':
10234             case 'S':
10235             case 'u':
10236                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
10237                 break;
10238             case 'f':
10239                 Safefree(ri->data->data[n]);
10240                 break;
10241             case 'p':
10242                 new_comppad = MUTABLE_AV(ri->data->data[n]);
10243                 break;
10244             case 'o':
10245                 if (new_comppad == NULL)
10246                     Perl_croak(aTHX_ "panic: pregfree comppad");
10247                 PAD_SAVE_LOCAL(old_comppad,
10248                     /* Watch out for global destruction's random ordering. */
10249                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
10250                 );
10251                 OP_REFCNT_LOCK;
10252                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
10253                 OP_REFCNT_UNLOCK;
10254                 if (!refcnt)
10255                     op_free((OP_4tree*)ri->data->data[n]);
10256
10257                 PAD_RESTORE_LOCAL(old_comppad);
10258                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
10259                 new_comppad = NULL;
10260                 break;
10261             case 'n':
10262                 break;
10263             case 'T':
10264                 { /* Aho Corasick add-on structure for a trie node.
10265                      Used in stclass optimization only */
10266                     U32 refcount;
10267                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
10268                     OP_REFCNT_LOCK;
10269                     refcount = --aho->refcount;
10270                     OP_REFCNT_UNLOCK;
10271                     if ( !refcount ) {
10272                         PerlMemShared_free(aho->states);
10273                         PerlMemShared_free(aho->fail);
10274                          /* do this last!!!! */
10275                         PerlMemShared_free(ri->data->data[n]);
10276                         PerlMemShared_free(ri->regstclass);
10277                     }
10278                 }
10279                 break;
10280             case 't':
10281                 {
10282                     /* trie structure. */
10283                     U32 refcount;
10284                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
10285                     OP_REFCNT_LOCK;
10286                     refcount = --trie->refcount;
10287                     OP_REFCNT_UNLOCK;
10288                     if ( !refcount ) {
10289                         PerlMemShared_free(trie->charmap);
10290                         PerlMemShared_free(trie->states);
10291                         PerlMemShared_free(trie->trans);
10292                         if (trie->bitmap)
10293                             PerlMemShared_free(trie->bitmap);
10294                         if (trie->jump)
10295                             PerlMemShared_free(trie->jump);
10296                         PerlMemShared_free(trie->wordinfo);
10297                         /* do this last!!!! */
10298                         PerlMemShared_free(ri->data->data[n]);
10299                     }
10300                 }
10301                 break;
10302             default:
10303                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
10304             }
10305         }
10306         Safefree(ri->data->what);
10307         Safefree(ri->data);
10308     }
10309
10310     Safefree(ri);
10311 }
10312
10313 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10314 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
10315 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
10316
10317 /*
10318    re_dup - duplicate a regexp.
10319
10320    This routine is expected to clone a given regexp structure. It is only
10321    compiled under USE_ITHREADS.
10322
10323    After all of the core data stored in struct regexp is duplicated
10324    the regexp_engine.dupe method is used to copy any private data
10325    stored in the *pprivate pointer. This allows extensions to handle
10326    any duplication it needs to do.
10327
10328    See pregfree() and regfree_internal() if you change anything here.
10329 */
10330 #if defined(USE_ITHREADS)
10331 #ifndef PERL_IN_XSUB_RE
10332 void
10333 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
10334 {
10335     dVAR;
10336     I32 npar;
10337     const struct regexp *r = (const struct regexp *)SvANY(sstr);
10338     struct regexp *ret = (struct regexp *)SvANY(dstr);
10339
10340     PERL_ARGS_ASSERT_RE_DUP_GUTS;
10341
10342     npar = r->nparens+1;
10343     Newx(ret->offs, npar, regexp_paren_pair);
10344     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10345     if(ret->swap) {
10346         /* no need to copy these */
10347         Newx(ret->swap, npar, regexp_paren_pair);
10348     }
10349
10350     if (ret->substrs) {
10351         /* Do it this way to avoid reading from *r after the StructCopy().
10352            That way, if any of the sv_dup_inc()s dislodge *r from the L1
10353            cache, it doesn't matter.  */
10354         const bool anchored = r->check_substr
10355             ? r->check_substr == r->anchored_substr
10356             : r->check_utf8 == r->anchored_utf8;
10357         Newx(ret->substrs, 1, struct reg_substr_data);
10358         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10359
10360         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10361         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10362         ret->float_substr = sv_dup_inc(ret->float_substr, param);
10363         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
10364
10365         /* check_substr and check_utf8, if non-NULL, point to either their
10366            anchored or float namesakes, and don't hold a second reference.  */
10367
10368         if (ret->check_substr) {
10369             if (anchored) {
10370                 assert(r->check_utf8 == r->anchored_utf8);
10371                 ret->check_substr = ret->anchored_substr;
10372                 ret->check_utf8 = ret->anchored_utf8;
10373             } else {
10374                 assert(r->check_substr == r->float_substr);
10375                 assert(r->check_utf8 == r->float_utf8);
10376                 ret->check_substr = ret->float_substr;
10377                 ret->check_utf8 = ret->float_utf8;
10378             }
10379         } else if (ret->check_utf8) {
10380             if (anchored) {
10381                 ret->check_utf8 = ret->anchored_utf8;
10382             } else {
10383                 ret->check_utf8 = ret->float_utf8;
10384             }
10385         }
10386     }
10387
10388     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
10389
10390     if (ret->pprivate)
10391         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
10392
10393     if (RX_MATCH_COPIED(dstr))
10394         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
10395     else
10396         ret->subbeg = NULL;
10397 #ifdef PERL_OLD_COPY_ON_WRITE
10398     ret->saved_copy = NULL;
10399 #endif
10400
10401     if (ret->mother_re) {
10402         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10403             /* Our storage points directly to our mother regexp, but that's
10404                1: a buffer in a different thread
10405                2: something we no longer hold a reference on
10406                so we need to copy it locally.  */
10407             /* Note we need to sue SvCUR() on our mother_re, because it, in
10408                turn, may well be pointing to its own mother_re.  */
10409             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10410                                    SvCUR(ret->mother_re)+1));
10411             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10412         }
10413         ret->mother_re      = NULL;
10414     }
10415     ret->gofs = 0;
10416 }
10417 #endif /* PERL_IN_XSUB_RE */
10418
10419 /*
10420    regdupe_internal()
10421
10422    This is the internal complement to regdupe() which is used to copy
10423    the structure pointed to by the *pprivate pointer in the regexp.
10424    This is the core version of the extension overridable cloning hook.
10425    The regexp structure being duplicated will be copied by perl prior
10426    to this and will be provided as the regexp *r argument, however
10427    with the /old/ structures pprivate pointer value. Thus this routine
10428    may override any copying normally done by perl.
10429
10430    It returns a pointer to the new regexp_internal structure.
10431 */
10432
10433 void *
10434 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10435 {
10436     dVAR;
10437     struct regexp *const r = (struct regexp *)SvANY(rx);
10438     regexp_internal *reti;
10439     int len, npar;
10440     RXi_GET_DECL(r,ri);
10441
10442     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10443
10444     npar = r->nparens+1;
10445     len = ProgLen(ri);
10446
10447     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10448     Copy(ri->program, reti->program, len+1, regnode);
10449
10450
10451     reti->regstclass = NULL;
10452
10453     if (ri->data) {
10454         struct reg_data *d;
10455         const int count = ri->data->count;
10456         int i;
10457
10458         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10459                 char, struct reg_data);
10460         Newx(d->what, count, U8);
10461
10462         d->count = count;
10463         for (i = 0; i < count; i++) {
10464             d->what[i] = ri->data->what[i];
10465             switch (d->what[i]) {
10466                 /* legal options are one of: sSfpontTua
10467                    see also regcomp.h and pregfree() */
10468             case 'a': /* actually an AV, but the dup function is identical.  */
10469             case 's':
10470             case 'S':
10471             case 'p': /* actually an AV, but the dup function is identical.  */
10472             case 'u': /* actually an HV, but the dup function is identical.  */
10473                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10474                 break;
10475             case 'f':
10476                 /* This is cheating. */
10477                 Newx(d->data[i], 1, struct regnode_charclass_class);
10478                 StructCopy(ri->data->data[i], d->data[i],
10479                             struct regnode_charclass_class);
10480                 reti->regstclass = (regnode*)d->data[i];
10481                 break;
10482             case 'o':
10483                 /* Compiled op trees are readonly and in shared memory,
10484                    and can thus be shared without duplication. */
10485                 OP_REFCNT_LOCK;
10486                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10487                 OP_REFCNT_UNLOCK;
10488                 break;
10489             case 'T':
10490                 /* Trie stclasses are readonly and can thus be shared
10491                  * without duplication. We free the stclass in pregfree
10492                  * when the corresponding reg_ac_data struct is freed.
10493                  */
10494                 reti->regstclass= ri->regstclass;
10495                 /* Fall through */
10496             case 't':
10497                 OP_REFCNT_LOCK;
10498                 ((reg_trie_data*)ri->data->data[i])->refcount++;
10499                 OP_REFCNT_UNLOCK;
10500                 /* Fall through */
10501             case 'n':
10502                 d->data[i] = ri->data->data[i];
10503                 break;
10504             default:
10505                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10506             }
10507         }
10508
10509         reti->data = d;
10510     }
10511     else
10512         reti->data = NULL;
10513
10514     reti->name_list_idx = ri->name_list_idx;
10515
10516 #ifdef RE_TRACK_PATTERN_OFFSETS
10517     if (ri->u.offsets) {
10518         Newx(reti->u.offsets, 2*len+1, U32);
10519         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10520     }
10521 #else
10522     SetProgLen(reti,len);
10523 #endif
10524
10525     return (void*)reti;
10526 }
10527
10528 #endif    /* USE_ITHREADS */
10529
10530 #ifndef PERL_IN_XSUB_RE
10531
10532 /*
10533  - regnext - dig the "next" pointer out of a node
10534  */
10535 regnode *
10536 Perl_regnext(pTHX_ register regnode *p)
10537 {
10538     dVAR;
10539     register I32 offset;
10540
10541     if (!p)
10542         return(NULL);
10543
10544     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
10545         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10546     }
10547
10548     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10549     if (offset == 0)
10550         return(NULL);
10551
10552     return(p+offset);
10553 }
10554 #endif
10555
10556 STATIC void
10557 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10558 {
10559     va_list args;
10560     STRLEN l1 = strlen(pat1);
10561     STRLEN l2 = strlen(pat2);
10562     char buf[512];
10563     SV *msv;
10564     const char *message;
10565
10566     PERL_ARGS_ASSERT_RE_CROAK2;
10567
10568     if (l1 > 510)
10569         l1 = 510;
10570     if (l1 + l2 > 510)
10571         l2 = 510 - l1;
10572     Copy(pat1, buf, l1 , char);
10573     Copy(pat2, buf + l1, l2 , char);
10574     buf[l1 + l2] = '\n';
10575     buf[l1 + l2 + 1] = '\0';
10576 #ifdef I_STDARG
10577     /* ANSI variant takes additional second argument */
10578     va_start(args, pat2);
10579 #else
10580     va_start(args);
10581 #endif
10582     msv = vmess(buf, &args);
10583     va_end(args);
10584     message = SvPV_const(msv,l1);
10585     if (l1 > 512)
10586         l1 = 512;
10587     Copy(message, buf, l1 , char);
10588     buf[l1-1] = '\0';                   /* Overwrite \n */
10589     Perl_croak(aTHX_ "%s", buf);
10590 }
10591
10592 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
10593
10594 #ifndef PERL_IN_XSUB_RE
10595 void
10596 Perl_save_re_context(pTHX)
10597 {
10598     dVAR;
10599
10600     struct re_save_state *state;
10601
10602     SAVEVPTR(PL_curcop);
10603     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10604
10605     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10606     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10607     SSPUSHUV(SAVEt_RE_STATE);
10608
10609     Copy(&PL_reg_state, state, 1, struct re_save_state);
10610
10611     PL_reg_start_tmp = 0;
10612     PL_reg_start_tmpl = 0;
10613     PL_reg_oldsaved = NULL;
10614     PL_reg_oldsavedlen = 0;
10615     PL_reg_maxiter = 0;
10616     PL_reg_leftiter = 0;
10617     PL_reg_poscache = NULL;
10618     PL_reg_poscache_size = 0;
10619 #ifdef PERL_OLD_COPY_ON_WRITE
10620     PL_nrs = NULL;
10621 #endif
10622
10623     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10624     if (PL_curpm) {
10625         const REGEXP * const rx = PM_GETRE(PL_curpm);
10626         if (rx) {
10627             U32 i;
10628             for (i = 1; i <= RX_NPARENS(rx); i++) {
10629                 char digits[TYPE_CHARS(long)];
10630                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10631                 GV *const *const gvp
10632                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10633
10634                 if (gvp) {
10635                     GV * const gv = *gvp;
10636                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10637                         save_scalar(gv);
10638                 }
10639             }
10640         }
10641     }
10642 }
10643 #endif
10644
10645 static void
10646 clear_re(pTHX_ void *r)
10647 {
10648     dVAR;
10649     ReREFCNT_dec((REGEXP *)r);
10650 }
10651
10652 #ifdef DEBUGGING
10653
10654 STATIC void
10655 S_put_byte(pTHX_ SV *sv, int c)
10656 {
10657     PERL_ARGS_ASSERT_PUT_BYTE;
10658
10659     /* Our definition of isPRINT() ignores locales, so only bytes that are
10660        not part of UTF-8 are considered printable. I assume that the same
10661        holds for UTF-EBCDIC.
10662        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10663        which Wikipedia says:
10664
10665        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10666        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10667        identical, to the ASCII delete (DEL) or rubout control character.
10668        ) So the old condition can be simplified to !isPRINT(c)  */
10669     if (!isPRINT(c)) {
10670         if (c < 256) {
10671             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10672         }
10673         else {
10674             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10675         }
10676     }
10677     else {
10678         const char string = c;
10679         if (c == '-' || c == ']' || c == '\\' || c == '^')
10680             sv_catpvs(sv, "\\");
10681         sv_catpvn(sv, &string, 1);
10682     }
10683 }
10684
10685
10686 #define CLEAR_OPTSTART \
10687     if (optstart) STMT_START { \
10688             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10689             optstart=NULL; \
10690     } STMT_END
10691
10692 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10693
10694 STATIC const regnode *
10695 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10696             const regnode *last, const regnode *plast,
10697             SV* sv, I32 indent, U32 depth)
10698 {
10699     dVAR;
10700     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10701     register const regnode *next;
10702     const regnode *optstart= NULL;
10703
10704     RXi_GET_DECL(r,ri);
10705     GET_RE_DEBUG_FLAGS_DECL;
10706
10707     PERL_ARGS_ASSERT_DUMPUNTIL;
10708
10709 #ifdef DEBUG_DUMPUNTIL
10710     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10711         last ? last-start : 0,plast ? plast-start : 0);
10712 #endif
10713
10714     if (plast && plast < last)
10715         last= plast;
10716
10717     while (PL_regkind[op] != END && (!last || node < last)) {
10718         /* While that wasn't END last time... */
10719         NODE_ALIGN(node);
10720         op = OP(node);
10721         if (op == CLOSE || op == WHILEM)
10722             indent--;
10723         next = regnext((regnode *)node);
10724
10725         /* Where, what. */
10726         if (OP(node) == OPTIMIZED) {
10727             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10728                 optstart = node;
10729             else
10730                 goto after_print;
10731         } else
10732             CLEAR_OPTSTART;
10733
10734         regprop(r, sv, node);
10735         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10736                       (int)(2*indent + 1), "", SvPVX_const(sv));
10737
10738         if (OP(node) != OPTIMIZED) {
10739             if (next == NULL)           /* Next ptr. */
10740                 PerlIO_printf(Perl_debug_log, " (0)");
10741             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10742                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10743             else
10744                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10745             (void)PerlIO_putc(Perl_debug_log, '\n');
10746         }
10747
10748       after_print:
10749         if (PL_regkind[(U8)op] == BRANCHJ) {
10750             assert(next);
10751             {
10752                 register const regnode *nnode = (OP(next) == LONGJMP
10753                                              ? regnext((regnode *)next)
10754                                              : next);
10755                 if (last && nnode > last)
10756                     nnode = last;
10757                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10758             }
10759         }
10760         else if (PL_regkind[(U8)op] == BRANCH) {
10761             assert(next);
10762             DUMPUNTIL(NEXTOPER(node), next);
10763         }
10764         else if ( PL_regkind[(U8)op]  == TRIE ) {
10765             const regnode *this_trie = node;
10766             const char op = OP(node);
10767             const U32 n = ARG(node);
10768             const reg_ac_data * const ac = op>=AHOCORASICK ?
10769                (reg_ac_data *)ri->data->data[n] :
10770                NULL;
10771             const reg_trie_data * const trie =
10772                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10773 #ifdef DEBUGGING
10774             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10775 #endif
10776             const regnode *nextbranch= NULL;
10777             I32 word_idx;
10778             sv_setpvs(sv, "");
10779             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10780                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10781
10782                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10783                    (int)(2*(indent+3)), "",
10784                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10785                             PL_colors[0], PL_colors[1],
10786                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10787                             PERL_PV_PRETTY_ELLIPSES    |
10788                             PERL_PV_PRETTY_LTGT
10789                             )
10790                             : "???"
10791                 );
10792                 if (trie->jump) {
10793                     U16 dist= trie->jump[word_idx+1];
10794                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10795                                   (UV)((dist ? this_trie + dist : next) - start));
10796                     if (dist) {
10797                         if (!nextbranch)
10798                             nextbranch= this_trie + trie->jump[0];
10799                         DUMPUNTIL(this_trie + dist, nextbranch);
10800                     }
10801                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10802                         nextbranch= regnext((regnode *)nextbranch);
10803                 } else {
10804                     PerlIO_printf(Perl_debug_log, "\n");
10805                 }
10806             }
10807             if (last && next > last)
10808                 node= last;
10809             else
10810                 node= next;
10811         }
10812         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10813             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10814                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10815         }
10816         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10817             assert(next);
10818             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10819         }
10820         else if ( op == PLUS || op == STAR) {
10821             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10822         }
10823         else if (PL_regkind[(U8)op] == ANYOF) {
10824             /* arglen 1 + class block */
10825             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
10826                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10827             node = NEXTOPER(node);
10828         }
10829         else if (PL_regkind[(U8)op] == EXACT) {
10830             /* Literal string, where present. */
10831             node += NODE_SZ_STR(node) - 1;
10832             node = NEXTOPER(node);
10833         }
10834         else {
10835             node = NEXTOPER(node);
10836             node += regarglen[(U8)op];
10837         }
10838         if (op == CURLYX || op == OPEN)
10839             indent++;
10840     }
10841     CLEAR_OPTSTART;
10842 #ifdef DEBUG_DUMPUNTIL
10843     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10844 #endif
10845     return node;
10846 }
10847
10848 #endif  /* DEBUGGING */
10849
10850 /*
10851  * Local variables:
10852  * c-indentation-style: bsd
10853  * c-basic-offset: 4
10854  * indent-tabs-mode: t
10855  * End:
10856  *
10857  * ex: set ts=8 sts=4 sw=4 noet:
10858  */