src/5013009/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     HV          *paren_names;           /* Paren names */
 138
 139     regnode     **recurse;              /* Recurse regops */
 140     I32         recurse_count;          /* Number of recurse regops */
 141     I32         in_lookbehind;
 142 #if ADD_TO_REGEXEC
 143     char        *starttry;              /* -Dr: where regtry was called. */
 144 #define RExC_starttry   (pRExC_state->starttry)
 145 #endif
 146 #ifdef DEBUGGING
 147     const char  *lastparse;
 148     I32         lastnum;
 149     AV          *paren_name_list;       /* idx -> name */
 150 #define RExC_lastparse  (pRExC_state->lastparse)
 151 #define RExC_lastnum    (pRExC_state->lastnum)
 152 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 153 #endif
 154 } RExC_state_t;
 155
 156 #define RExC_flags      (pRExC_state->flags)
 157 #define RExC_precomp    (pRExC_state->precomp)
 158 #define RExC_rx_sv      (pRExC_state->rx_sv)
 159 #define RExC_rx         (pRExC_state->rx)
 160 #define RExC_rxi        (pRExC_state->rxi)
 161 #define RExC_start      (pRExC_state->start)
 162 #define RExC_end        (pRExC_state->end)
 163 #define RExC_parse      (pRExC_state->parse)
 164 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 165 #ifdef RE_TRACK_PATTERN_OFFSETS
 166 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 167 #endif
 168 #define RExC_emit       (pRExC_state->emit)
 169 #define RExC_emit_start (pRExC_state->emit_start)
 170 #define RExC_emit_bound (pRExC_state->emit_bound)
 171 #define RExC_naughty    (pRExC_state->naughty)
 172 #define RExC_sawback    (pRExC_state->sawback)
 173 #define RExC_seen       (pRExC_state->seen)
 174 #define RExC_size       (pRExC_state->size)
 175 #define RExC_npar       (pRExC_state->npar)
 176 #define RExC_nestroot   (pRExC_state->nestroot)
 177 #define RExC_extralen   (pRExC_state->extralen)
 178 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 179 #define RExC_seen_evals (pRExC_state->seen_evals)
 180 #define RExC_utf8       (pRExC_state->utf8)
 181 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 182 #define RExC_open_parens        (pRExC_state->open_parens)
 183 #define RExC_close_parens       (pRExC_state->close_parens)
 184 #define RExC_opend      (pRExC_state->opend)
 185 #define RExC_paren_names        (pRExC_state->paren_names)
 186 #define RExC_recurse    (pRExC_state->recurse)
 187 #define RExC_recurse_count      (pRExC_state->recurse_count)
 188 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 189
 190
 191 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 192 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 193         ((*s) == '{' && regcurly(s)))
 194
 195 #ifdef SPSTART
 196 #undef SPSTART          /* dratted cpp namespace... */
 197 #endif
 198 /*
 199  * Flags to be passed up and down.
 200  */
 201 #define WORST           0       /* Worst case. */
 202 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 203
 204 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 205  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 206 #define SIMPLE          0x02
 207 #define SPSTART         0x04    /* Starts with * or +. */
 208 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 209 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 210
 211 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 212
 213 /* whether trie related optimizations are enabled */
 214 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 215 #define TRIE_STUDY_OPT
 216 #define FULL_TRIE_STUDY
 217 #define TRIE_STCLASS
 218 #endif
 219
 220
 221
 222 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 223 #define PBITVAL(paren) (1 << ((paren) & 7))
 224 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 225 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 226 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 227
 228 /* If not already in utf8, do a longjmp back to the beginning */
 229 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 230 #define REQUIRE_UTF8    STMT_START {                                       \
 231                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 232                         } STMT_END
 233
 234 /* About scan_data_t.
 235
 236   During optimisation we recurse through the regexp program performing
 237   various inplace (keyhole style) optimisations. In addition study_chunk
 238   and scan_commit populate this data structure with information about
 239   what strings MUST appear in the pattern. We look for the longest
 240   string that must appear at a fixed location, and we look for the
 241   longest string that may appear at a floating location. So for instance
 242   in the pattern:
 243
 244     /FOO[xX]A.*B[xX]BAR/
 245
 246   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 247   strings (because they follow a .* construct). study_chunk will identify
 248   both FOO and BAR as being the longest fixed and floating strings respectively.
 249
 250   The strings can be composites, for instance
 251
 252      /(f)(o)(o)/
 253
 254   will result in a composite fixed substring 'foo'.
 255
 256   For each string some basic information is maintained:
 257
 258   - offset or min_offset
 259     This is the position the string must appear at, or not before.
 260     It also implicitly (when combined with minlenp) tells us how many
 261     characters must match before the string we are searching for.
 262     Likewise when combined with minlenp and the length of the string it
 263     tells us how many characters must appear after the string we have
 264     found.
 265
 266   - max_offset
 267     Only used for floating strings. This is the rightmost point that
 268     the string can appear at. If set to I32 max it indicates that the
 269     string can occur infinitely far to the right.
 270
 271   - minlenp
 272     A pointer to the minimum length of the pattern that the string
 273     was found inside. This is important as in the case of positive
 274     lookahead or positive lookbehind we can have multiple patterns
 275     involved. Consider
 276
 277     /(?=FOO).*F/
 278
 279     The minimum length of the pattern overall is 3, the minimum length
 280     of the lookahead part is 3, but the minimum length of the part that
 281     will actually match is 1. So 'FOO's minimum length is 3, but the
 282     minimum length for the F is 1. This is important as the minimum length
 283     is used to determine offsets in front of and behind the string being
 284     looked for.  Since strings can be composites this is the length of the
 285     pattern at the time it was committed with a scan_commit. Note that
 286     the length is calculated by study_chunk, so that the minimum lengths
 287     are not known until the full pattern has been compiled, thus the
 288     pointer to the value.
 289
 290   - lookbehind
 291
 292     In the case of lookbehind the string being searched for can be
 293     offset past the start point of the final matching string.
 294     If this value was just blithely removed from the min_offset it would
 295     invalidate some of the calculations for how many chars must match
 296     before or after (as they are derived from min_offset and minlen and
 297     the length of the string being searched for).
 298     When the final pattern is compiled and the data is moved from the
 299     scan_data_t structure into the regexp structure the information
 300     about lookbehind is factored in, with the information that would
 301     have been lost precalculated in the end_shift field for the
 302     associated string.
 303
 304   The fields pos_min and pos_delta are used to store the minimum offset
 305   and the delta to the maximum offset at the current point in the pattern.
 306
 307 */
 308
 309 typedef struct scan_data_t {
 310     /*I32 len_min;      unused */
 311     /*I32 len_delta;    unused */
 312     I32 pos_min;
 313     I32 pos_delta;
 314     SV *last_found;
 315     I32 last_end;           /* min value, <0 unless valid. */
 316     I32 last_start_min;
 317     I32 last_start_max;
 318     SV **longest;           /* Either &l_fixed, or &l_float. */
 319     SV *longest_fixed;      /* longest fixed string found in pattern */
 320     I32 offset_fixed;       /* offset where it starts */
 321     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 322     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 323     SV *longest_float;      /* longest floating string found in pattern */
 324     I32 offset_float_min;   /* earliest point in string it can appear */
 325     I32 offset_float_max;   /* latest point in string it can appear */
 326     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 327     I32 lookbehind_float;   /* is the position of the string modified by LB */
 328     I32 flags;
 329     I32 whilem_c;
 330     I32 *last_closep;
 331     struct regnode_charclass_class *start_class;
 332 } scan_data_t;
 333
 334 /*
 335  * Forward declarations for pregcomp()'s friends.
 336  */
 337
 338 static const scan_data_t zero_scan_data =
 339   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 340
 341 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 342 #define SF_BEFORE_SEOL          0x0001
 343 #define SF_BEFORE_MEOL          0x0002
 344 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 345 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 346
 347 #ifdef NO_UNARY_PLUS
 348 #  define SF_FIX_SHIFT_EOL      (0+2)
 349 #  define SF_FL_SHIFT_EOL               (0+4)
 350 #else
 351 #  define SF_FIX_SHIFT_EOL      (+2)
 352 #  define SF_FL_SHIFT_EOL               (+4)
 353 #endif
 354
 355 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 356 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 357
 358 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 359 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 360 #define SF_IS_INF               0x0040
 361 #define SF_HAS_PAR              0x0080
 362 #define SF_IN_PAR               0x0100
 363 #define SF_HAS_EVAL             0x0200
 364 #define SCF_DO_SUBSTR           0x0400
 365 #define SCF_DO_STCLASS_AND      0x0800
 366 #define SCF_DO_STCLASS_OR       0x1000
 367 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 368 #define SCF_WHILEM_VISITED_POS  0x2000
 369
 370 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 371 #define SCF_SEEN_ACCEPT         0x8000
 372
 373 #define UTF cBOOL(RExC_utf8)
 374 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 375 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 376 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 377 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 378 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 379
 380 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 381
 382 #define OOB_UNICODE             12345678
 383 #define OOB_NAMEDCLASS          -1
 384
 385 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 386 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 387
 388
 389 /* length of regex to show in messages that don't mark a position within */
 390 #define RegexLengthToShowInErrorMessages 127
 391
 392 /*
 393  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 394  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 395  * op/pragma/warn/regcomp.
 396  */
 397 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 398 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 399
 400 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 401
 402 /*
 403  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 404  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 405  * "...".
 406  */
 407 #define _FAIL(code) STMT_START {                                        \
 408     const char *ellipses = "";                                          \
 409     IV len = RExC_end - RExC_precomp;                                   \
 410                                                                         \
 411     if (!SIZE_ONLY)                                                     \
 412         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 413     if (len > RegexLengthToShowInErrorMessages) {                       \
 414         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 415         len = RegexLengthToShowInErrorMessages - 10;                    \
 416         ellipses = "...";                                               \
 417     }                                                                   \
 418     code;                                                               \
 419 } STMT_END
 420
 421 #define FAIL(msg) _FAIL(                            \
 422     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 423             msg, (int)len, RExC_precomp, ellipses))
 424
 425 #define FAIL2(msg,arg) _FAIL(                       \
 426     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 427             arg, (int)len, RExC_precomp, ellipses))
 428
 429 /*
 430  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 431  */
 432 #define Simple_vFAIL(m) STMT_START {                                    \
 433     const IV offset = RExC_parse - RExC_precomp;                        \
 434     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 435             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 436 } STMT_END
 437
 438 /*
 439  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 440  */
 441 #define vFAIL(m) STMT_START {                           \
 442     if (!SIZE_ONLY)                                     \
 443         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 444     Simple_vFAIL(m);                                    \
 445 } STMT_END
 446
 447 /*
 448  * Like Simple_vFAIL(), but accepts two arguments.
 449  */
 450 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 451     const IV offset = RExC_parse - RExC_precomp;                        \
 452     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 453             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 454 } STMT_END
 455
 456 /*
 457  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 458  */
 459 #define vFAIL2(m,a1) STMT_START {                       \
 460     if (!SIZE_ONLY)                                     \
 461         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 462     Simple_vFAIL2(m, a1);                               \
 463 } STMT_END
 464
 465
 466 /*
 467  * Like Simple_vFAIL(), but accepts three arguments.
 468  */
 469 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 470     const IV offset = RExC_parse - RExC_precomp;                \
 471     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 472             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 473 } STMT_END
 474
 475 /*
 476  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 477  */
 478 #define vFAIL3(m,a1,a2) STMT_START {                    \
 479     if (!SIZE_ONLY)                                     \
 480         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 481     Simple_vFAIL3(m, a1, a2);                           \
 482 } STMT_END
 483
 484 /*
 485  * Like Simple_vFAIL(), but accepts four arguments.
 486  */
 487 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 488     const IV offset = RExC_parse - RExC_precomp;                \
 489     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 490             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 491 } STMT_END
 492
 493 #define ckWARNreg(loc,m) STMT_START {                                   \
 494     const IV offset = loc - RExC_precomp;                               \
 495     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 496             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 497 } STMT_END
 498
 499 #define ckWARNregdep(loc,m) STMT_START {                                \
 500     const IV offset = loc - RExC_precomp;                               \
 501     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 502             m REPORT_LOCATION,                                          \
 503             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 504 } STMT_END
 505
 506 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 509             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 510 } STMT_END
 511
 512 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 515             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 516 } STMT_END
 517
 518 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 521             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 522 } STMT_END
 523
 524 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 527             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 528 } STMT_END
 529
 530 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 533             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 534 } STMT_END
 535
 536 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 537     const IV offset = loc - RExC_precomp;                               \
 538     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 539             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 540 } STMT_END
 541
 542
 543 /* Allow for side effects in s */
 544 #define REGC(c,s) STMT_START {                  \
 545     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 546 } STMT_END
 547
 548 /* Macros for recording node offsets.   20001227 mjd@plover.com
 549  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 550  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 551  * Element 0 holds the number n.
 552  * Position is 1 indexed.
 553  */
 554 #ifndef RE_TRACK_PATTERN_OFFSETS
 555 #define Set_Node_Offset_To_R(node,byte)
 556 #define Set_Node_Offset(node,byte)
 557 #define Set_Cur_Node_Offset
 558 #define Set_Node_Length_To_R(node,len)
 559 #define Set_Node_Length(node,len)
 560 #define Set_Node_Cur_Length(node)
 561 #define Node_Offset(n)
 562 #define Node_Length(n)
 563 #define Set_Node_Offset_Length(node,offset,len)
 564 #define ProgLen(ri) ri->u.proglen
 565 #define SetProgLen(ri,x) ri->u.proglen = x
 566 #else
 567 #define ProgLen(ri) ri->u.offsets[0]
 568 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 569 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 570     if (! SIZE_ONLY) {                                                  \
 571         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 572                     __LINE__, (int)(node), (int)(byte)));               \
 573         if((node) < 0) {                                                \
 574             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 575         } else {                                                        \
 576             RExC_offsets[2*(node)-1] = (byte);                          \
 577         }                                                               \
 578     }                                                                   \
 579 } STMT_END
 580
 581 #define Set_Node_Offset(node,byte) \
 582     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 583 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 584
 585 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 586     if (! SIZE_ONLY) {                                                  \
 587         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 588                 __LINE__, (int)(node), (int)(len)));                    \
 589         if((node) < 0) {                                                \
 590             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 591         } else {                                                        \
 592             RExC_offsets[2*(node)] = (len);                             \
 593         }                                                               \
 594     }                                                                   \
 595 } STMT_END
 596
 597 #define Set_Node_Length(node,len) \
 598     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 599 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 600 #define Set_Node_Cur_Length(node) \
 601     Set_Node_Length(node, RExC_parse - parse_start)
 602
 603 /* Get offsets and lengths */
 604 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 605 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 606
 607 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 608     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 609     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 610 } STMT_END
 611 #endif
 612
 613 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 614 #define EXPERIMENTAL_INPLACESCAN
 615 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 616
 617 #define DEBUG_STUDYDATA(str,data,depth)                              \
 618 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 619     PerlIO_printf(Perl_debug_log,                                    \
 620         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 621         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 622         (int)(depth)*2, "",                                          \
 623         (IV)((data)->pos_min),                                       \
 624         (IV)((data)->pos_delta),                                     \
 625         (UV)((data)->flags),                                         \
 626         (IV)((data)->whilem_c),                                      \
 627         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 628         is_inf ? "INF " : ""                                         \
 629     );                                                               \
 630     if ((data)->last_found)                                          \
 631         PerlIO_printf(Perl_debug_log,                                \
 632             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 633             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 634             SvPVX_const((data)->last_found),                         \
 635             (IV)((data)->last_end),                                  \
 636             (IV)((data)->last_start_min),                            \
 637             (IV)((data)->last_start_max),                            \
 638             ((data)->longest &&                                      \
 639              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 640             SvPVX_const((data)->longest_fixed),                      \
 641             (IV)((data)->offset_fixed),                              \
 642             ((data)->longest &&                                      \
 643              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 644             SvPVX_const((data)->longest_float),                      \
 645             (IV)((data)->offset_float_min),                          \
 646             (IV)((data)->offset_float_max)                           \
 647         );                                                           \
 648     PerlIO_printf(Perl_debug_log,"\n");                              \
 649 });
 650
 651 static void clear_re(pTHX_ void *r);
 652
 653 /* Mark that we cannot extend a found fixed substring at this point.
 654    Update the longest found anchored substring and the longest found
 655    floating substrings if needed. */
 656
 657 STATIC void
 658 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 659 {
 660     const STRLEN l = CHR_SVLEN(data->last_found);
 661     const STRLEN old_l = CHR_SVLEN(*data->longest);
 662     GET_RE_DEBUG_FLAGS_DECL;
 663
 664     PERL_ARGS_ASSERT_SCAN_COMMIT;
 665
 666     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 667         SvSetMagicSV(*data->longest, data->last_found);
 668         if (*data->longest == data->longest_fixed) {
 669             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 670             if (data->flags & SF_BEFORE_EOL)
 671                 data->flags
 672                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 673             else
 674                 data->flags &= ~SF_FIX_BEFORE_EOL;
 675             data->minlen_fixed=minlenp;
 676             data->lookbehind_fixed=0;
 677         }
 678         else { /* *data->longest == data->longest_float */
 679             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 680             data->offset_float_max = (l
 681                                       ? data->last_start_max
 682                                       : data->pos_min + data->pos_delta);
 683             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 684                 data->offset_float_max = I32_MAX;
 685             if (data->flags & SF_BEFORE_EOL)
 686                 data->flags
 687                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 688             else
 689                 data->flags &= ~SF_FL_BEFORE_EOL;
 690             data->minlen_float=minlenp;
 691             data->lookbehind_float=0;
 692         }
 693     }
 694     SvCUR_set(data->last_found, 0);
 695     {
 696         SV * const sv = data->last_found;
 697         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 698             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 699             if (mg)
 700                 mg->mg_len = 0;
 701         }
 702     }
 703     data->last_end = -1;
 704     data->flags &= ~SF_BEFORE_EOL;
 705     DEBUG_STUDYDATA("commit: ",data,0);
 706 }
 707
 708 /* Can match anything (initialization) */
 709 STATIC void
 710 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 711 {
 712     PERL_ARGS_ASSERT_CL_ANYTHING;
 713
 714     ANYOF_CLASS_ZERO(cl);
 715     ANYOF_BITMAP_SETALL(cl);
 716     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 717     if (LOC)
 718         cl->flags |= ANYOF_LOCALE;
 719 }
 720
 721 /* Can match anything (initialization) */
 722 STATIC int
 723 S_cl_is_anything(const struct regnode_charclass_class *cl)
 724 {
 725     int value;
 726
 727     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 728
 729     for (value = 0; value <= ANYOF_MAX; value += 2)
 730         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 731             return 1;
 732     if (!(cl->flags & ANYOF_UNICODE_ALL))
 733         return 0;
 734     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 735         return 0;
 736     return 1;
 737 }
 738
 739 /* Can match anything (initialization) */
 740 STATIC void
 741 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 742 {
 743     PERL_ARGS_ASSERT_CL_INIT;
 744
 745     Zero(cl, 1, struct regnode_charclass_class);
 746     cl->type = ANYOF;
 747     cl_anything(pRExC_state, cl);
 748 }
 749
 750 STATIC void
 751 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 752 {
 753     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 754
 755     Zero(cl, 1, struct regnode_charclass_class);
 756     cl->type = ANYOF;
 757     cl_anything(pRExC_state, cl);
 758     if (LOC)
 759         cl->flags |= ANYOF_LOCALE;
 760 }
 761
 762 /* 'And' a given class with another one.  Can create false positives */
 763 /* We assume that cl is not inverted */
 764 STATIC void
 765 S_cl_and(struct regnode_charclass_class *cl,
 766         const struct regnode_charclass_class *and_with)
 767 {
 768     PERL_ARGS_ASSERT_CL_AND;
 769
 770     assert(and_with->type == ANYOF);
 771
 772     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 773         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 774         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 775         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 776         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 777         int i;
 778
 779         if (and_with->flags & ANYOF_INVERT)
 780             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 781                 cl->bitmap[i] &= ~and_with->bitmap[i];
 782         else
 783             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 784                 cl->bitmap[i] &= and_with->bitmap[i];
 785     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 786     if (!(and_with->flags & ANYOF_EOS))
 787         cl->flags &= ~ANYOF_EOS;
 788
 789     if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
 790         cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
 791     if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
 792         cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 793
 794     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
 795         !(and_with->flags & ANYOF_INVERT)) {
 796         cl->flags &= ~ANYOF_UNICODE_ALL;
 797         cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
 798                                                            only the one(s)
 799                                                            actually set */
 800         ARG_SET(cl, ARG(and_with));
 801     }
 802     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 803         !(and_with->flags & ANYOF_INVERT))
 804         cl->flags &= ~ANYOF_UNICODE_ALL;
 805     if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
 806         !(and_with->flags & ANYOF_INVERT))
 807         cl->flags &= ~ANYOF_NONBITMAP;
 808 }
 809
 810 /* 'OR' a given class with another one.  Can create false positives */
 811 /* We assume that cl is not inverted */
 812 STATIC void
 813 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 814 {
 815     PERL_ARGS_ASSERT_CL_OR;
 816
 817     if (or_with->flags & ANYOF_INVERT) {
 818         /* We do not use
 819          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 820          *   <= (B1 | !B2) | (CL1 | !CL2)
 821          * which is wasteful if CL2 is small, but we ignore CL2:
 822          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 823          * XXXX Can we handle case-fold?  Unclear:
 824          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 825          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 826          */
 827         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 828              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 829              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 830             int i;
 831
 832             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 833                 cl->bitmap[i] |= ~or_with->bitmap[i];
 834         } /* XXXX: logic is complicated otherwise */
 835         else {
 836             cl_anything(pRExC_state, cl);
 837         }
 838     } else {
 839         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 840         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 841              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 842                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 843             int i;
 844
 845             /* OR char bitmap and class bitmap separately */
 846             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 847                 cl->bitmap[i] |= or_with->bitmap[i];
 848             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 849                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 850                     cl->classflags[i] |= or_with->classflags[i];
 851                 cl->flags |= ANYOF_CLASS;
 852             }
 853         }
 854         else { /* XXXX: logic is complicated, leave it along for a moment. */
 855             cl_anything(pRExC_state, cl);
 856         }
 857     }
 858     if (or_with->flags & ANYOF_EOS)
 859         cl->flags |= ANYOF_EOS;
 860     if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
 861         cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
 862
 863     if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 864         cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
 865
 866     /* If both nodes match something outside the bitmap, but what they match
 867      * outside is not the same pointer, and hence not easily compared, give up
 868      * and allow the start class to match everything outside the bitmap */
 869     if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
 870         ARG(cl) != ARG(or_with)) {
 871         cl->flags |= ANYOF_UNICODE_ALL;
 872     }
 873
 874     if (or_with->flags & ANYOF_UNICODE_ALL) {
 875         cl->flags |= ANYOF_UNICODE_ALL;
 876     }
 877 }
 878
 879 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 880 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 881 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 882 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 883
 884
 885 #ifdef DEBUGGING
 886 /*
 887    dump_trie(trie,widecharmap,revcharmap)
 888    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 889    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 890
 891    These routines dump out a trie in a somewhat readable format.
 892    The _interim_ variants are used for debugging the interim
 893    tables that are used to generate the final compressed
 894    representation which is what dump_trie expects.
 895
 896    Part of the reason for their existence is to provide a form
 897    of documentation as to how the different representations function.
 898
 899 */
 900
 901 /*
 902   Dumps the final compressed table form of the trie to Perl_debug_log.
 903   Used for debugging make_trie().
 904 */
 905
 906 STATIC void
 907 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 908             AV *revcharmap, U32 depth)
 909 {
 910     U32 state;
 911     SV *sv=sv_newmortal();
 912     int colwidth= widecharmap ? 6 : 4;
 913     U16 word;
 914     GET_RE_DEBUG_FLAGS_DECL;
 915
 916     PERL_ARGS_ASSERT_DUMP_TRIE;
 917
 918     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 919         (int)depth * 2 + 2,"",
 920         "Match","Base","Ofs" );
 921
 922     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 923         SV ** const tmp = av_fetch( revcharmap, state, 0);
 924         if ( tmp ) {
 925             PerlIO_printf( Perl_debug_log, "%*s",
 926                 colwidth,
 927                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 928                             PL_colors[0], PL_colors[1],
 929                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 930                             PERL_PV_ESCAPE_FIRSTCHAR
 931                 )
 932             );
 933         }
 934     }
 935     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 936         (int)depth * 2 + 2,"");
 937
 938     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 939         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 940     PerlIO_printf( Perl_debug_log, "\n");
 941
 942     for( state = 1 ; state < trie->statecount ; state++ ) {
 943         const U32 base = trie->states[ state ].trans.base;
 944
 945         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 946
 947         if ( trie->states[ state ].wordnum ) {
 948             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 949         } else {
 950             PerlIO_printf( Perl_debug_log, "%6s", "" );
 951         }
 952
 953         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 954
 955         if ( base ) {
 956             U32 ofs = 0;
 957
 958             while( ( base + ofs  < trie->uniquecharcount ) ||
 959                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 960                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 961                     ofs++;
 962
 963             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 964
 965             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 966                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 967                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 968                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 969                 {
 970                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 971                     colwidth,
 972                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 973                 } else {
 974                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 975                 }
 976             }
 977
 978             PerlIO_printf( Perl_debug_log, "]");
 979
 980         }
 981         PerlIO_printf( Perl_debug_log, "\n" );
 982     }
 983     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 984     for (word=1; word <= trie->wordcount; word++) {
 985         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 986             (int)word, (int)(trie->wordinfo[word].prev),
 987             (int)(trie->wordinfo[word].len));
 988     }
 989     PerlIO_printf(Perl_debug_log, "\n" );
 990 }
 991 /*
 992   Dumps a fully constructed but uncompressed trie in list form.
 993   List tries normally only are used for construction when the number of
 994   possible chars (trie->uniquecharcount) is very high.
 995   Used for debugging make_trie().
 996 */
 997 STATIC void
 998 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 999                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1000                          U32 depth)
1001 {
1002     U32 state;
1003     SV *sv=sv_newmortal();
1004     int colwidth= widecharmap ? 6 : 4;
1005     GET_RE_DEBUG_FLAGS_DECL;
1006
1007     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1008
1009     /* print out the table precompression.  */
1010     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1011         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1012         "------:-----+-----------------\n" );
1013
1014     for( state=1 ; state < next_alloc ; state ++ ) {
1015         U16 charid;
1016
1017         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1018             (int)depth * 2 + 2,"", (UV)state  );
1019         if ( ! trie->states[ state ].wordnum ) {
1020             PerlIO_printf( Perl_debug_log, "%5s| ","");
1021         } else {
1022             PerlIO_printf( Perl_debug_log, "W%4x| ",
1023                 trie->states[ state ].wordnum
1024             );
1025         }
1026         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1027             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1028             if ( tmp ) {
1029                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1030                     colwidth,
1031                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1032                             PL_colors[0], PL_colors[1],
1033                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1034                             PERL_PV_ESCAPE_FIRSTCHAR
1035                     ) ,
1036                     TRIE_LIST_ITEM(state,charid).forid,
1037                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1038                 );
1039                 if (!(charid % 10))
1040                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1041                         (int)((depth * 2) + 14), "");
1042             }
1043         }
1044         PerlIO_printf( Perl_debug_log, "\n");
1045     }
1046 }
1047
1048 /*
1049   Dumps a fully constructed but uncompressed trie in table form.
1050   This is the normal DFA style state transition table, with a few
1051   twists to facilitate compression later.
1052   Used for debugging make_trie().
1053 */
1054 STATIC void
1055 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1056                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1057                           U32 depth)
1058 {
1059     U32 state;
1060     U16 charid;
1061     SV *sv=sv_newmortal();
1062     int colwidth= widecharmap ? 6 : 4;
1063     GET_RE_DEBUG_FLAGS_DECL;
1064
1065     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1066
1067     /*
1068        print out the table precompression so that we can do a visual check
1069        that they are identical.
1070      */
1071
1072     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1073
1074     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1075         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1076         if ( tmp ) {
1077             PerlIO_printf( Perl_debug_log, "%*s",
1078                 colwidth,
1079                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1080                             PL_colors[0], PL_colors[1],
1081                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1082                             PERL_PV_ESCAPE_FIRSTCHAR
1083                 )
1084             );
1085         }
1086     }
1087
1088     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1089
1090     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1091         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1092     }
1093
1094     PerlIO_printf( Perl_debug_log, "\n" );
1095
1096     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1097
1098         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1099             (int)depth * 2 + 2,"",
1100             (UV)TRIE_NODENUM( state ) );
1101
1102         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1103             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1104             if (v)
1105                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1106             else
1107                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1108         }
1109         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1110             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1111         } else {
1112             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1113             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1114         }
1115     }
1116 }
1117
1118 #endif
1119
1120
1121 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1122   startbranch: the first branch in the whole branch sequence
1123   first      : start branch of sequence of branch-exact nodes.
1124                May be the same as startbranch
1125   last       : Thing following the last branch.
1126                May be the same as tail.
1127   tail       : item following the branch sequence
1128   count      : words in the sequence
1129   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1130   depth      : indent depth
1131
1132 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1133
1134 A trie is an N'ary tree where the branches are determined by digital
1135 decomposition of the key. IE, at the root node you look up the 1st character and
1136 follow that branch repeat until you find the end of the branches. Nodes can be
1137 marked as "accepting" meaning they represent a complete word. Eg:
1138
1139   /he|she|his|hers/
1140
1141 would convert into the following structure. Numbers represent states, letters
1142 following numbers represent valid transitions on the letter from that state, if
1143 the number is in square brackets it represents an accepting state, otherwise it
1144 will be in parenthesis.
1145
1146       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1147       |    |
1148       |   (2)
1149       |    |
1150      (1)   +-i->(6)-+-s->[7]
1151       |
1152       +-s->(3)-+-h->(4)-+-e->[5]
1153
1154       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1155
1156 This shows that when matching against the string 'hers' we will begin at state 1
1157 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1158 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1159 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1160 single traverse. We store a mapping from accepting to state to which word was
1161 matched, and then when we have multiple possibilities we try to complete the
1162 rest of the regex in the order in which they occured in the alternation.
1163
1164 The only prior NFA like behaviour that would be changed by the TRIE support is
1165 the silent ignoring of duplicate alternations which are of the form:
1166
1167  / (DUPE|DUPE) X? (?{ ... }) Y /x
1168
1169 Thus EVAL blocks following a trie may be called a different number of times with
1170 and without the optimisation. With the optimisations dupes will be silently
1171 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1172 the following demonstrates:
1173
1174  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1175
1176 which prints out 'word' three times, but
1177
1178  'words'=~/(word|word|word)(?{ print $1 })S/
1179
1180 which doesnt print it out at all. This is due to other optimisations kicking in.
1181
1182 Example of what happens on a structural level:
1183
1184 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1185
1186    1: CURLYM[1] {1,32767}(18)
1187    5:   BRANCH(8)
1188    6:     EXACT <ac>(16)
1189    8:   BRANCH(11)
1190    9:     EXACT <ad>(16)
1191   11:   BRANCH(14)
1192   12:     EXACT <ab>(16)
1193   16:   SUCCEED(0)
1194   17:   NOTHING(18)
1195   18: END(0)
1196
1197 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1198 and should turn into:
1199
1200    1: CURLYM[1] {1,32767}(18)
1201    5:   TRIE(16)
1202         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1203           <ac>
1204           <ad>
1205           <ab>
1206   16:   SUCCEED(0)
1207   17:   NOTHING(18)
1208   18: END(0)
1209
1210 Cases where tail != last would be like /(?foo|bar)baz/:
1211
1212    1: BRANCH(4)
1213    2:   EXACT <foo>(8)
1214    4: BRANCH(7)
1215    5:   EXACT <bar>(8)
1216    7: TAIL(8)
1217    8: EXACT <baz>(10)
1218   10: END(0)
1219
1220 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1221 and would end up looking like:
1222
1223     1: TRIE(8)
1224       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1225         <foo>
1226         <bar>
1227    7: TAIL(8)
1228    8: EXACT <baz>(10)
1229   10: END(0)
1230
1231     d = uvuni_to_utf8_flags(d, uv, 0);
1232
1233 is the recommended Unicode-aware way of saying
1234
1235     *(d++) = uv;
1236 */
1237
1238 #define TRIE_STORE_REVCHAR                                                 \
1239     STMT_START {                                                           \
1240         if (UTF) {                                                         \
1241             SV *zlopp = newSV(2);                                          \
1242             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1243             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1244             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1245             SvPOK_on(zlopp);                                               \
1246             SvUTF8_on(zlopp);                                              \
1247             av_push(revcharmap, zlopp);                                    \
1248         } else {                                                           \
1249             char ooooff = (char)uvc;                                               \
1250             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1251         }                                                                  \
1252         } STMT_END
1253
1254 #define TRIE_READ_CHAR STMT_START {                                           \
1255     wordlen++;                                                                \
1256     if ( UTF ) {                                                              \
1257         if ( folder ) {                                                       \
1258             if ( foldlen > 0 ) {                                              \
1259                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1260                foldlen -= len;                                                \
1261                scan += len;                                                   \
1262                len = 0;                                                       \
1263             } else {                                                          \
1264                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1265                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1266                 foldlen -= UNISKIP( uvc );                                    \
1267                 scan = foldbuf + UNISKIP( uvc );                              \
1268             }                                                                 \
1269         } else {                                                              \
1270             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1271         }                                                                     \
1272     } else {                                                                  \
1273         uvc = (U32)*uc;                                                       \
1274         len = 1;                                                              \
1275     }                                                                         \
1276 } STMT_END
1277
1278
1279
1280 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1281     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1282         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1283         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1284     }                                                           \
1285     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1286     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1287     TRIE_LIST_CUR( state )++;                                   \
1288 } STMT_END
1289
1290 #define TRIE_LIST_NEW(state) STMT_START {                       \
1291     Newxz( trie->states[ state ].trans.list,               \
1292         4, reg_trie_trans_le );                                 \
1293      TRIE_LIST_CUR( state ) = 1;                                \
1294      TRIE_LIST_LEN( state ) = 4;                                \
1295 } STMT_END
1296
1297 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1298     U16 dupe= trie->states[ state ].wordnum;                    \
1299     regnode * const noper_next = regnext( noper );              \
1300                                                                 \
1301     DEBUG_r({                                                   \
1302         /* store the word for dumping */                        \
1303         SV* tmp;                                                \
1304         if (OP(noper) != NOTHING)                               \
1305             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1306         else                                                    \
1307             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1308         av_push( trie_words, tmp );                             \
1309     });                                                         \
1310                                                                 \
1311     curword++;                                                  \
1312     trie->wordinfo[curword].prev   = 0;                         \
1313     trie->wordinfo[curword].len    = wordlen;                   \
1314     trie->wordinfo[curword].accept = state;                     \
1315                                                                 \
1316     if ( noper_next < tail ) {                                  \
1317         if (!trie->jump)                                        \
1318             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1319         trie->jump[curword] = (U16)(noper_next - convert);      \
1320         if (!jumper)                                            \
1321             jumper = noper_next;                                \
1322         if (!nextbranch)                                        \
1323             nextbranch= regnext(cur);                           \
1324     }                                                           \
1325                                                                 \
1326     if ( dupe ) {                                               \
1327         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1328         /* chain, so that when the bits of chain are later    */\
1329         /* linked together, the dups appear in the chain      */\
1330         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1331         trie->wordinfo[dupe].prev = curword;                    \
1332     } else {                                                    \
1333         /* we haven't inserted this word yet.                */ \
1334         trie->states[ state ].wordnum = curword;                \
1335     }                                                           \
1336 } STMT_END
1337
1338
1339 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1340      ( ( base + charid >=  ucharcount                                   \
1341          && base + charid < ubound                                      \
1342          && state == trie->trans[ base - ucharcount + charid ].check    \
1343          && trie->trans[ base - ucharcount + charid ].next )            \
1344            ? trie->trans[ base - ucharcount + charid ].next             \
1345            : ( state==1 ? special : 0 )                                 \
1346       )
1347
1348 #define MADE_TRIE       1
1349 #define MADE_JUMP_TRIE  2
1350 #define MADE_EXACT_TRIE 4
1351
1352 STATIC I32
1353 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1354 {
1355     dVAR;
1356     /* first pass, loop through and scan words */
1357     reg_trie_data *trie;
1358     HV *widecharmap = NULL;
1359     AV *revcharmap = newAV();
1360     regnode *cur;
1361     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1362     STRLEN len = 0;
1363     UV uvc = 0;
1364     U16 curword = 0;
1365     U32 next_alloc = 0;
1366     regnode *jumper = NULL;
1367     regnode *nextbranch = NULL;
1368     regnode *convert = NULL;
1369     U32 *prev_states; /* temp array mapping each state to previous one */
1370     /* we just use folder as a flag in utf8 */
1371     const U8 * folder = NULL;
1372
1373 #ifdef DEBUGGING
1374     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1375     AV *trie_words = NULL;
1376     /* along with revcharmap, this only used during construction but both are
1377      * useful during debugging so we store them in the struct when debugging.
1378      */
1379 #else
1380     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1381     STRLEN trie_charcount=0;
1382 #endif
1383     SV *re_trie_maxbuff;
1384     GET_RE_DEBUG_FLAGS_DECL;
1385
1386     PERL_ARGS_ASSERT_MAKE_TRIE;
1387 #ifndef DEBUGGING
1388     PERL_UNUSED_ARG(depth);
1389 #endif
1390
1391     switch (flags) {
1392         case EXACTFU: folder = PL_fold_latin1; break;
1393         case EXACTF:  folder = PL_fold; break;
1394         case EXACTFL: folder = PL_fold_locale; break;
1395     }
1396
1397     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1398     trie->refcount = 1;
1399     trie->startstate = 1;
1400     trie->wordcount = word_count;
1401     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1402     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1403     if (!(UTF && folder))
1404         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1405     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1406                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1407
1408     DEBUG_r({
1409         trie_words = newAV();
1410     });
1411
1412     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1413     if (!SvIOK(re_trie_maxbuff)) {
1414         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1415     }
1416     DEBUG_OPTIMISE_r({
1417                 PerlIO_printf( Perl_debug_log,
1418                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1419                   (int)depth * 2 + 2, "",
1420                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1421                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1422                   (int)depth);
1423     });
1424
1425    /* Find the node we are going to overwrite */
1426     if ( first == startbranch && OP( last ) != BRANCH ) {
1427         /* whole branch chain */
1428         convert = first;
1429     } else {
1430         /* branch sub-chain */
1431         convert = NEXTOPER( first );
1432     }
1433
1434     /*  -- First loop and Setup --
1435
1436        We first traverse the branches and scan each word to determine if it
1437        contains widechars, and how many unique chars there are, this is
1438        important as we have to build a table with at least as many columns as we
1439        have unique chars.
1440
1441        We use an array of integers to represent the character codes 0..255
1442        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1443        native representation of the character value as the key and IV's for the
1444        coded index.
1445
1446        *TODO* If we keep track of how many times each character is used we can
1447        remap the columns so that the table compression later on is more
1448        efficient in terms of memory by ensuring the most common value is in the
1449        middle and the least common are on the outside.  IMO this would be better
1450        than a most to least common mapping as theres a decent chance the most
1451        common letter will share a node with the least common, meaning the node
1452        will not be compressible. With a middle is most common approach the worst
1453        case is when we have the least common nodes twice.
1454
1455      */
1456
1457     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1458         regnode * const noper = NEXTOPER( cur );
1459         const U8 *uc = (U8*)STRING( noper );
1460         const U8 * const e  = uc + STR_LEN( noper );
1461         STRLEN foldlen = 0;
1462         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1463         const U8 *scan = (U8*)NULL;
1464         U32 wordlen      = 0;         /* required init */
1465         STRLEN chars = 0;
1466         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1467
1468         if (OP(noper) == NOTHING) {
1469             trie->minlen= 0;
1470             continue;
1471         }
1472         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1473             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1474                                           regardless of encoding */
1475
1476         for ( ; uc < e ; uc += len ) {
1477             TRIE_CHARCOUNT(trie)++;
1478             TRIE_READ_CHAR;
1479             chars++;
1480             if ( uvc < 256 ) {
1481                 if ( !trie->charmap[ uvc ] ) {
1482                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1483                     if ( folder )
1484                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1485                     TRIE_STORE_REVCHAR;
1486                 }
1487                 if ( set_bit ) {
1488                     /* store the codepoint in the bitmap, and its folded
1489                      * equivalent. */
1490                     TRIE_BITMAP_SET(trie,uvc);
1491
1492                     /* store the folded codepoint */
1493                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1494
1495                     if ( !UTF ) {
1496                         /* store first byte of utf8 representation of
1497                            variant codepoints */
1498                         if (! UNI_IS_INVARIANT(uvc)) {
1499                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1500                         }
1501                     }
1502                     set_bit = 0; /* We've done our bit :-) */
1503                 }
1504             } else {
1505                 SV** svpp;
1506                 if ( !widecharmap )
1507                     widecharmap = newHV();
1508
1509                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1510
1511                 if ( !svpp )
1512                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1513
1514                 if ( !SvTRUE( *svpp ) ) {
1515                     sv_setiv( *svpp, ++trie->uniquecharcount );
1516                     TRIE_STORE_REVCHAR;
1517                 }
1518             }
1519         }
1520         if( cur == first ) {
1521             trie->minlen=chars;
1522             trie->maxlen=chars;
1523         } else if (chars < trie->minlen) {
1524             trie->minlen=chars;
1525         } else if (chars > trie->maxlen) {
1526             trie->maxlen=chars;
1527         }
1528
1529     } /* end first pass */
1530     DEBUG_TRIE_COMPILE_r(
1531         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1532                 (int)depth * 2 + 2,"",
1533                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1534                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1535                 (int)trie->minlen, (int)trie->maxlen )
1536     );
1537
1538     /*
1539         We now know what we are dealing with in terms of unique chars and
1540         string sizes so we can calculate how much memory a naive
1541         representation using a flat table  will take. If it's over a reasonable
1542         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1543         conservative but potentially much slower representation using an array
1544         of lists.
1545
1546         At the end we convert both representations into the same compressed
1547         form that will be used in regexec.c for matching with. The latter
1548         is a form that cannot be used to construct with but has memory
1549         properties similar to the list form and access properties similar
1550         to the table form making it both suitable for fast searches and
1551         small enough that its feasable to store for the duration of a program.
1552
1553         See the comment in the code where the compressed table is produced
1554         inplace from the flat tabe representation for an explanation of how
1555         the compression works.
1556
1557     */
1558
1559
1560     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1561     prev_states[1] = 0;
1562
1563     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1564         /*
1565             Second Pass -- Array Of Lists Representation
1566
1567             Each state will be represented by a list of charid:state records
1568             (reg_trie_trans_le) the first such element holds the CUR and LEN
1569             points of the allocated array. (See defines above).
1570
1571             We build the initial structure using the lists, and then convert
1572             it into the compressed table form which allows faster lookups
1573             (but cant be modified once converted).
1574         */
1575
1576         STRLEN transcount = 1;
1577
1578         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1579             "%*sCompiling trie using list compiler\n",
1580             (int)depth * 2 + 2, ""));
1581
1582         trie->states = (reg_trie_state *)
1583             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1584                                   sizeof(reg_trie_state) );
1585         TRIE_LIST_NEW(1);
1586         next_alloc = 2;
1587
1588         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1589
1590             regnode * const noper = NEXTOPER( cur );
1591             U8 *uc           = (U8*)STRING( noper );
1592             const U8 * const e = uc + STR_LEN( noper );
1593             U32 state        = 1;         /* required init */
1594             U16 charid       = 0;         /* sanity init */
1595             U8 *scan         = (U8*)NULL; /* sanity init */
1596             STRLEN foldlen   = 0;         /* required init */
1597             U32 wordlen      = 0;         /* required init */
1598             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1599
1600             if (OP(noper) != NOTHING) {
1601                 for ( ; uc < e ; uc += len ) {
1602
1603                     TRIE_READ_CHAR;
1604
1605                     if ( uvc < 256 ) {
1606                         charid = trie->charmap[ uvc ];
1607                     } else {
1608                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1609                         if ( !svpp ) {
1610                             charid = 0;
1611                         } else {
1612                             charid=(U16)SvIV( *svpp );
1613                         }
1614                     }
1615                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1616                     if ( charid ) {
1617
1618                         U16 check;
1619                         U32 newstate = 0;
1620
1621                         charid--;
1622                         if ( !trie->states[ state ].trans.list ) {
1623                             TRIE_LIST_NEW( state );
1624                         }
1625                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1626                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1627                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1628                                 break;
1629                             }
1630                         }
1631                         if ( ! newstate ) {
1632                             newstate = next_alloc++;
1633                             prev_states[newstate] = state;
1634                             TRIE_LIST_PUSH( state, charid, newstate );
1635                             transcount++;
1636                         }
1637                         state = newstate;
1638                     } else {
1639                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1640                     }
1641                 }
1642             }
1643             TRIE_HANDLE_WORD(state);
1644
1645         } /* end second pass */
1646
1647         /* next alloc is the NEXT state to be allocated */
1648         trie->statecount = next_alloc;
1649         trie->states = (reg_trie_state *)
1650             PerlMemShared_realloc( trie->states,
1651                                    next_alloc
1652                                    * sizeof(reg_trie_state) );
1653
1654         /* and now dump it out before we compress it */
1655         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1656                                                          revcharmap, next_alloc,
1657                                                          depth+1)
1658         );
1659
1660         trie->trans = (reg_trie_trans *)
1661             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1662         {
1663             U32 state;
1664             U32 tp = 0;
1665             U32 zp = 0;
1666
1667
1668             for( state=1 ; state < next_alloc ; state ++ ) {
1669                 U32 base=0;
1670
1671                 /*
1672                 DEBUG_TRIE_COMPILE_MORE_r(
1673                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1674                 );
1675                 */
1676
1677                 if (trie->states[state].trans.list) {
1678                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1679                     U16 maxid=minid;
1680                     U16 idx;
1681
1682                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1683                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1684                         if ( forid < minid ) {
1685                             minid=forid;
1686                         } else if ( forid > maxid ) {
1687                             maxid=forid;
1688                         }
1689                     }
1690                     if ( transcount < tp + maxid - minid + 1) {
1691                         transcount *= 2;
1692                         trie->trans = (reg_trie_trans *)
1693                             PerlMemShared_realloc( trie->trans,
1694                                                      transcount
1695                                                      * sizeof(reg_trie_trans) );
1696                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1697                     }
1698                     base = trie->uniquecharcount + tp - minid;
1699                     if ( maxid == minid ) {
1700                         U32 set = 0;
1701                         for ( ; zp < tp ; zp++ ) {
1702                             if ( ! trie->trans[ zp ].next ) {
1703                                 base = trie->uniquecharcount + zp - minid;
1704                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1705                                 trie->trans[ zp ].check = state;
1706                                 set = 1;
1707                                 break;
1708                             }
1709                         }
1710                         if ( !set ) {
1711                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1712                             trie->trans[ tp ].check = state;
1713                             tp++;
1714                             zp = tp;
1715                         }
1716                     } else {
1717                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1718                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1719                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1720                             trie->trans[ tid ].check = state;
1721                         }
1722                         tp += ( maxid - minid + 1 );
1723                     }
1724                     Safefree(trie->states[ state ].trans.list);
1725                 }
1726                 /*
1727                 DEBUG_TRIE_COMPILE_MORE_r(
1728                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1729                 );
1730                 */
1731                 trie->states[ state ].trans.base=base;
1732             }
1733             trie->lasttrans = tp + 1;
1734         }
1735     } else {
1736         /*
1737            Second Pass -- Flat Table Representation.
1738
1739            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1740            We know that we will need Charcount+1 trans at most to store the data
1741            (one row per char at worst case) So we preallocate both structures
1742            assuming worst case.
1743
1744            We then construct the trie using only the .next slots of the entry
1745            structs.
1746
1747            We use the .check field of the first entry of the node temporarily to
1748            make compression both faster and easier by keeping track of how many non
1749            zero fields are in the node.
1750
1751            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1752            transition.
1753
1754            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1755            number representing the first entry of the node, and state as a
1756            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1757            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1758            are 2 entrys per node. eg:
1759
1760              A B       A B
1761           1. 2 4    1. 3 7
1762           2. 0 3    3. 0 5
1763           3. 0 0    5. 0 0
1764           4. 0 0    7. 0 0
1765
1766            The table is internally in the right hand, idx form. However as we also
1767            have to deal with the states array which is indexed by nodenum we have to
1768            use TRIE_NODENUM() to convert.
1769
1770         */
1771         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1772             "%*sCompiling trie using table compiler\n",
1773             (int)depth * 2 + 2, ""));
1774
1775         trie->trans = (reg_trie_trans *)
1776             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1777                                   * trie->uniquecharcount + 1,
1778                                   sizeof(reg_trie_trans) );
1779         trie->states = (reg_trie_state *)
1780             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1781                                   sizeof(reg_trie_state) );
1782         next_alloc = trie->uniquecharcount + 1;
1783
1784
1785         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1786
1787             regnode * const noper   = NEXTOPER( cur );
1788             const U8 *uc     = (U8*)STRING( noper );
1789             const U8 * const e = uc + STR_LEN( noper );
1790
1791             U32 state        = 1;         /* required init */
1792
1793             U16 charid       = 0;         /* sanity init */
1794             U32 accept_state = 0;         /* sanity init */
1795             U8 *scan         = (U8*)NULL; /* sanity init */
1796
1797             STRLEN foldlen   = 0;         /* required init */
1798             U32 wordlen      = 0;         /* required init */
1799             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1800
1801             if ( OP(noper) != NOTHING ) {
1802                 for ( ; uc < e ; uc += len ) {
1803
1804                     TRIE_READ_CHAR;
1805
1806                     if ( uvc < 256 ) {
1807                         charid = trie->charmap[ uvc ];
1808                     } else {
1809                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1810                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1811                     }
1812                     if ( charid ) {
1813                         charid--;
1814                         if ( !trie->trans[ state + charid ].next ) {
1815                             trie->trans[ state + charid ].next = next_alloc;
1816                             trie->trans[ state ].check++;
1817                             prev_states[TRIE_NODENUM(next_alloc)]
1818                                     = TRIE_NODENUM(state);
1819                             next_alloc += trie->uniquecharcount;
1820                         }
1821                         state = trie->trans[ state + charid ].next;
1822                     } else {
1823                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1824                     }
1825                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1826                 }
1827             }
1828             accept_state = TRIE_NODENUM( state );
1829             TRIE_HANDLE_WORD(accept_state);
1830
1831         } /* end second pass */
1832
1833         /* and now dump it out before we compress it */
1834         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1835                                                           revcharmap,
1836                                                           next_alloc, depth+1));
1837
1838         {
1839         /*
1840            * Inplace compress the table.*
1841
1842            For sparse data sets the table constructed by the trie algorithm will
1843            be mostly 0/FAIL transitions or to put it another way mostly empty.
1844            (Note that leaf nodes will not contain any transitions.)
1845
1846            This algorithm compresses the tables by eliminating most such
1847            transitions, at the cost of a modest bit of extra work during lookup:
1848
1849            - Each states[] entry contains a .base field which indicates the
1850            index in the state[] array wheres its transition data is stored.
1851
1852            - If .base is 0 there are no valid transitions from that node.
1853
1854            - If .base is nonzero then charid is added to it to find an entry in
1855            the trans array.
1856
1857            -If trans[states[state].base+charid].check!=state then the
1858            transition is taken to be a 0/Fail transition. Thus if there are fail
1859            transitions at the front of the node then the .base offset will point
1860            somewhere inside the previous nodes data (or maybe even into a node
1861            even earlier), but the .check field determines if the transition is
1862            valid.
1863
1864            XXX - wrong maybe?
1865            The following process inplace converts the table to the compressed
1866            table: We first do not compress the root node 1,and mark all its
1867            .check pointers as 1 and set its .base pointer as 1 as well. This
1868            allows us to do a DFA construction from the compressed table later,
1869            and ensures that any .base pointers we calculate later are greater
1870            than 0.
1871
1872            - We set 'pos' to indicate the first entry of the second node.
1873
1874            - We then iterate over the columns of the node, finding the first and
1875            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1876            and set the .check pointers accordingly, and advance pos
1877            appropriately and repreat for the next node. Note that when we copy
1878            the next pointers we have to convert them from the original
1879            NODEIDX form to NODENUM form as the former is not valid post
1880            compression.
1881
1882            - If a node has no transitions used we mark its base as 0 and do not
1883            advance the pos pointer.
1884
1885            - If a node only has one transition we use a second pointer into the
1886            structure to fill in allocated fail transitions from other states.
1887            This pointer is independent of the main pointer and scans forward
1888            looking for null transitions that are allocated to a state. When it
1889            finds one it writes the single transition into the "hole".  If the
1890            pointer doesnt find one the single transition is appended as normal.
1891
1892            - Once compressed we can Renew/realloc the structures to release the
1893            excess space.
1894
1895            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1896            specifically Fig 3.47 and the associated pseudocode.
1897
1898            demq
1899         */
1900         const U32 laststate = TRIE_NODENUM( next_alloc );
1901         U32 state, charid;
1902         U32 pos = 0, zp=0;
1903         trie->statecount = laststate;
1904
1905         for ( state = 1 ; state < laststate ; state++ ) {
1906             U8 flag = 0;
1907             const U32 stateidx = TRIE_NODEIDX( state );
1908             const U32 o_used = trie->trans[ stateidx ].check;
1909             U32 used = trie->trans[ stateidx ].check;
1910             trie->trans[ stateidx ].check = 0;
1911
1912             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1913                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1914                     if ( trie->trans[ stateidx + charid ].next ) {
1915                         if (o_used == 1) {
1916                             for ( ; zp < pos ; zp++ ) {
1917                                 if ( ! trie->trans[ zp ].next ) {
1918                                     break;
1919                                 }
1920                             }
1921                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1922                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1923                             trie->trans[ zp ].check = state;
1924                             if ( ++zp > pos ) pos = zp;
1925                             break;
1926                         }
1927                         used--;
1928                     }
1929                     if ( !flag ) {
1930                         flag = 1;
1931                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1932                     }
1933                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1934                     trie->trans[ pos ].check = state;
1935                     pos++;
1936                 }
1937             }
1938         }
1939         trie->lasttrans = pos + 1;
1940         trie->states = (reg_trie_state *)
1941             PerlMemShared_realloc( trie->states, laststate
1942                                    * sizeof(reg_trie_state) );
1943         DEBUG_TRIE_COMPILE_MORE_r(
1944                 PerlIO_printf( Perl_debug_log,
1945                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1946                     (int)depth * 2 + 2,"",
1947                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1948                     (IV)next_alloc,
1949                     (IV)pos,
1950                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1951             );
1952
1953         } /* end table compress */
1954     }
1955     DEBUG_TRIE_COMPILE_MORE_r(
1956             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1957                 (int)depth * 2 + 2, "",
1958                 (UV)trie->statecount,
1959                 (UV)trie->lasttrans)
1960     );
1961     /* resize the trans array to remove unused space */
1962     trie->trans = (reg_trie_trans *)
1963         PerlMemShared_realloc( trie->trans, trie->lasttrans
1964                                * sizeof(reg_trie_trans) );
1965
1966     {   /* Modify the program and insert the new TRIE node */
1967         U8 nodetype =(U8)(flags & 0xFF);
1968         char *str=NULL;
1969
1970 #ifdef DEBUGGING
1971         regnode *optimize = NULL;
1972 #ifdef RE_TRACK_PATTERN_OFFSETS
1973
1974         U32 mjd_offset = 0;
1975         U32 mjd_nodelen = 0;
1976 #endif /* RE_TRACK_PATTERN_OFFSETS */
1977 #endif /* DEBUGGING */
1978         /*
1979            This means we convert either the first branch or the first Exact,
1980            depending on whether the thing following (in 'last') is a branch
1981            or not and whther first is the startbranch (ie is it a sub part of
1982            the alternation or is it the whole thing.)
1983            Assuming its a sub part we convert the EXACT otherwise we convert
1984            the whole branch sequence, including the first.
1985          */
1986         /* Find the node we are going to overwrite */
1987         if ( first != startbranch || OP( last ) == BRANCH ) {
1988             /* branch sub-chain */
1989             NEXT_OFF( first ) = (U16)(last - first);
1990 #ifdef RE_TRACK_PATTERN_OFFSETS
1991             DEBUG_r({
1992                 mjd_offset= Node_Offset((convert));
1993                 mjd_nodelen= Node_Length((convert));
1994             });
1995 #endif
1996             /* whole branch chain */
1997         }
1998 #ifdef RE_TRACK_PATTERN_OFFSETS
1999         else {
2000             DEBUG_r({
2001                 const  regnode *nop = NEXTOPER( convert );
2002                 mjd_offset= Node_Offset((nop));
2003                 mjd_nodelen= Node_Length((nop));
2004             });
2005         }
2006         DEBUG_OPTIMISE_r(
2007             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2008                 (int)depth * 2 + 2, "",
2009                 (UV)mjd_offset, (UV)mjd_nodelen)
2010         );
2011 #endif
2012         /* But first we check to see if there is a common prefix we can
2013            split out as an EXACT and put in front of the TRIE node.  */
2014         trie->startstate= 1;
2015         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2016             U32 state;
2017             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2018                 U32 ofs = 0;
2019                 I32 idx = -1;
2020                 U32 count = 0;
2021                 const U32 base = trie->states[ state ].trans.base;
2022
2023                 if ( trie->states[state].wordnum )
2024                         count = 1;
2025
2026                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2027                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2028                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2029                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2030                     {
2031                         if ( ++count > 1 ) {
2032                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2033                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2034                             if ( state == 1 ) break;
2035                             if ( count == 2 ) {
2036                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2037                                 DEBUG_OPTIMISE_r(
2038                                     PerlIO_printf(Perl_debug_log,
2039                                         "%*sNew Start State=%"UVuf" Class: [",
2040                                         (int)depth * 2 + 2, "",
2041                                         (UV)state));
2042                                 if (idx >= 0) {
2043                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2044                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2045
2046                                     TRIE_BITMAP_SET(trie,*ch);
2047                                     if ( folder )
2048                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2049                                     DEBUG_OPTIMISE_r(
2050                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2051                                     );
2052                                 }
2053                             }
2054                             TRIE_BITMAP_SET(trie,*ch);
2055                             if ( folder )
2056                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2057                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2058                         }
2059                         idx = ofs;
2060                     }
2061                 }
2062                 if ( count == 1 ) {
2063                     SV **tmp = av_fetch( revcharmap, idx, 0);
2064                     STRLEN len;
2065                     char *ch = SvPV( *tmp, len );
2066                     DEBUG_OPTIMISE_r({
2067                         SV *sv=sv_newmortal();
2068                         PerlIO_printf( Perl_debug_log,
2069                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2070                             (int)depth * 2 + 2, "",
2071                             (UV)state, (UV)idx,
2072                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2073                                 PL_colors[0], PL_colors[1],
2074                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2075                                 PERL_PV_ESCAPE_FIRSTCHAR
2076                             )
2077                         );
2078                     });
2079                     if ( state==1 ) {
2080                         OP( convert ) = nodetype;
2081                         str=STRING(convert);
2082                         STR_LEN(convert)=0;
2083                     }
2084                     STR_LEN(convert) += len;
2085                     while (len--)
2086                         *str++ = *ch++;
2087                 } else {
2088 #ifdef DEBUGGING
2089                     if (state>1)
2090                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2091 #endif
2092                     break;
2093                 }
2094             }
2095             trie->prefixlen = (state-1);
2096             if (str) {
2097                 regnode *n = convert+NODE_SZ_STR(convert);
2098                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2099                 trie->startstate = state;
2100                 trie->minlen -= (state - 1);
2101                 trie->maxlen -= (state - 1);
2102 #ifdef DEBUGGING
2103                /* At least the UNICOS C compiler choked on this
2104                 * being argument to DEBUG_r(), so let's just have
2105                 * it right here. */
2106                if (
2107 #ifdef PERL_EXT_RE_BUILD
2108                    1
2109 #else
2110                    DEBUG_r_TEST
2111 #endif
2112                    ) {
2113                    regnode *fix = convert;
2114                    U32 word = trie->wordcount;
2115                    mjd_nodelen++;
2116                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2117                    while( ++fix < n ) {
2118                        Set_Node_Offset_Length(fix, 0, 0);
2119                    }
2120                    while (word--) {
2121                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2122                        if (tmp) {
2123                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2124                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2125                            else
2126                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2127                        }
2128                    }
2129                }
2130 #endif
2131                 if (trie->maxlen) {
2132                     convert = n;
2133                 } else {
2134                     NEXT_OFF(convert) = (U16)(tail - convert);
2135                     DEBUG_r(optimize= n);
2136                 }
2137             }
2138         }
2139         if (!jumper)
2140             jumper = last;
2141         if ( trie->maxlen ) {
2142             NEXT_OFF( convert ) = (U16)(tail - convert);
2143             ARG_SET( convert, data_slot );
2144             /* Store the offset to the first unabsorbed branch in
2145                jump[0], which is otherwise unused by the jump logic.
2146                We use this when dumping a trie and during optimisation. */
2147             if (trie->jump)
2148                 trie->jump[0] = (U16)(nextbranch - convert);
2149
2150             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2151              *   and there is a bitmap
2152              *   and the first "jump target" node we found leaves enough room
2153              * then convert the TRIE node into a TRIEC node, with the bitmap
2154              * embedded inline in the opcode - this is hypothetically faster.
2155              */
2156             if ( !trie->states[trie->startstate].wordnum
2157                  && trie->bitmap
2158                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2159             {
2160                 OP( convert ) = TRIEC;
2161                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2162                 PerlMemShared_free(trie->bitmap);
2163                 trie->bitmap= NULL;
2164             } else
2165                 OP( convert ) = TRIE;
2166
2167             /* store the type in the flags */
2168             convert->flags = nodetype;
2169             DEBUG_r({
2170             optimize = convert
2171                       + NODE_STEP_REGNODE
2172                       + regarglen[ OP( convert ) ];
2173             });
2174             /* XXX We really should free up the resource in trie now,
2175                    as we won't use them - (which resources?) dmq */
2176         }
2177         /* needed for dumping*/
2178         DEBUG_r(if (optimize) {
2179             regnode *opt = convert;
2180
2181             while ( ++opt < optimize) {
2182                 Set_Node_Offset_Length(opt,0,0);
2183             }
2184             /*
2185                 Try to clean up some of the debris left after the
2186                 optimisation.
2187              */
2188             while( optimize < jumper ) {
2189                 mjd_nodelen += Node_Length((optimize));
2190                 OP( optimize ) = OPTIMIZED;
2191                 Set_Node_Offset_Length(optimize,0,0);
2192                 optimize++;
2193             }
2194             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2195         });
2196     } /* end node insert */
2197
2198     /*  Finish populating the prev field of the wordinfo array.  Walk back
2199      *  from each accept state until we find another accept state, and if
2200      *  so, point the first word's .prev field at the second word. If the
2201      *  second already has a .prev field set, stop now. This will be the
2202      *  case either if we've already processed that word's accept state,
2203      *  or that state had multiple words, and the overspill words were
2204      *  already linked up earlier.
2205      */
2206     {
2207         U16 word;
2208         U32 state;
2209         U16 prev;
2210
2211         for (word=1; word <= trie->wordcount; word++) {
2212             prev = 0;
2213             if (trie->wordinfo[word].prev)
2214                 continue;
2215             state = trie->wordinfo[word].accept;
2216             while (state) {
2217                 state = prev_states[state];
2218                 if (!state)
2219                     break;
2220                 prev = trie->states[state].wordnum;
2221                 if (prev)
2222                     break;
2223             }
2224             trie->wordinfo[word].prev = prev;
2225         }
2226         Safefree(prev_states);
2227     }
2228
2229
2230     /* and now dump out the compressed format */
2231     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2232
2233     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2234 #ifdef DEBUGGING
2235     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2236     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2237 #else
2238     SvREFCNT_dec(revcharmap);
2239 #endif
2240     return trie->jump
2241            ? MADE_JUMP_TRIE
2242            : trie->startstate>1
2243              ? MADE_EXACT_TRIE
2244              : MADE_TRIE;
2245 }
2246
2247 STATIC void
2248 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2249 {
2250 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2251
2252    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2253    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2254    ISBN 0-201-10088-6
2255
2256    We find the fail state for each state in the trie, this state is the longest proper
2257    suffix of the current state's 'word' that is also a proper prefix of another word in our
2258    trie. State 1 represents the word '' and is thus the default fail state. This allows
2259    the DFA not to have to restart after its tried and failed a word at a given point, it
2260    simply continues as though it had been matching the other word in the first place.
2261    Consider
2262       'abcdgu'=~/abcdefg|cdgu/
2263    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2264    fail, which would bring us to the state representing 'd' in the second word where we would
2265    try 'g' and succeed, proceeding to match 'cdgu'.
2266  */
2267  /* add a fail transition */
2268     const U32 trie_offset = ARG(source);
2269     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2270     U32 *q;
2271     const U32 ucharcount = trie->uniquecharcount;
2272     const U32 numstates = trie->statecount;
2273     const U32 ubound = trie->lasttrans + ucharcount;
2274     U32 q_read = 0;
2275     U32 q_write = 0;
2276     U32 charid;
2277     U32 base = trie->states[ 1 ].trans.base;
2278     U32 *fail;
2279     reg_ac_data *aho;
2280     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2281     GET_RE_DEBUG_FLAGS_DECL;
2282
2283     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2284 #ifndef DEBUGGING
2285     PERL_UNUSED_ARG(depth);
2286 #endif
2287
2288
2289     ARG_SET( stclass, data_slot );
2290     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2291     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2292     aho->trie=trie_offset;
2293     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2294     Copy( trie->states, aho->states, numstates, reg_trie_state );
2295     Newxz( q, numstates, U32);
2296     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2297     aho->refcount = 1;
2298     fail = aho->fail;
2299     /* initialize fail[0..1] to be 1 so that we always have
2300        a valid final fail state */
2301     fail[ 0 ] = fail[ 1 ] = 1;
2302
2303     for ( charid = 0; charid < ucharcount ; charid++ ) {
2304         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2305         if ( newstate ) {
2306             q[ q_write ] = newstate;
2307             /* set to point at the root */
2308             fail[ q[ q_write++ ] ]=1;
2309         }
2310     }
2311     while ( q_read < q_write) {
2312         const U32 cur = q[ q_read++ % numstates ];
2313         base = trie->states[ cur ].trans.base;
2314
2315         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2316             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2317             if (ch_state) {
2318                 U32 fail_state = cur;
2319                 U32 fail_base;
2320                 do {
2321                     fail_state = fail[ fail_state ];
2322                     fail_base = aho->states[ fail_state ].trans.base;
2323                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2324
2325                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2326                 fail[ ch_state ] = fail_state;
2327                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2328                 {
2329                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2330                 }
2331                 q[ q_write++ % numstates] = ch_state;
2332             }
2333         }
2334     }
2335     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2336        when we fail in state 1, this allows us to use the
2337        charclass scan to find a valid start char. This is based on the principle
2338        that theres a good chance the string being searched contains lots of stuff
2339        that cant be a start char.
2340      */
2341     fail[ 0 ] = fail[ 1 ] = 0;
2342     DEBUG_TRIE_COMPILE_r({
2343         PerlIO_printf(Perl_debug_log,
2344                       "%*sStclass Failtable (%"UVuf" states): 0",
2345                       (int)(depth * 2), "", (UV)numstates
2346         );
2347         for( q_read=1; q_read<numstates; q_read++ ) {
2348             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2349         }
2350         PerlIO_printf(Perl_debug_log, "\n");
2351     });
2352     Safefree(q);
2353     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2354 }
2355
2356
2357 /*
2358  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2359  * These need to be revisited when a newer toolchain becomes available.
2360  */
2361 #if defined(__sparc64__) && defined(__GNUC__)
2362 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2363 #       undef  SPARC64_GCC_WORKAROUND
2364 #       define SPARC64_GCC_WORKAROUND 1
2365 #   endif
2366 #endif
2367
2368 #define DEBUG_PEEP(str,scan,depth) \
2369     DEBUG_OPTIMISE_r({if (scan){ \
2370        SV * const mysv=sv_newmortal(); \
2371        regnode *Next = regnext(scan); \
2372        regprop(RExC_rx, mysv, scan); \
2373        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2374        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2375        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2376    }});
2377
2378
2379
2380
2381
2382 #define JOIN_EXACT(scan,min,flags) \
2383     if (PL_regkind[OP(scan)] == EXACT) \
2384         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2385
2386 STATIC U32
2387 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2388     /* Merge several consecutive EXACTish nodes into one. */
2389     regnode *n = regnext(scan);
2390     U32 stringok = 1;
2391     regnode *next = scan + NODE_SZ_STR(scan);
2392     U32 merged = 0;
2393     U32 stopnow = 0;
2394 #ifdef DEBUGGING
2395     regnode *stop = scan;
2396     GET_RE_DEBUG_FLAGS_DECL;
2397 #else
2398     PERL_UNUSED_ARG(depth);
2399 #endif
2400
2401     PERL_ARGS_ASSERT_JOIN_EXACT;
2402 #ifndef EXPERIMENTAL_INPLACESCAN
2403     PERL_UNUSED_ARG(flags);
2404     PERL_UNUSED_ARG(val);
2405 #endif
2406     DEBUG_PEEP("join",scan,depth);
2407
2408     /* Skip NOTHING, merge EXACT*. */
2409     while (n &&
2410            ( PL_regkind[OP(n)] == NOTHING ||
2411              (stringok && (OP(n) == OP(scan))))
2412            && NEXT_OFF(n)
2413            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2414
2415         if (OP(n) == TAIL || n > next)
2416             stringok = 0;
2417         if (PL_regkind[OP(n)] == NOTHING) {
2418             DEBUG_PEEP("skip:",n,depth);
2419             NEXT_OFF(scan) += NEXT_OFF(n);
2420             next = n + NODE_STEP_REGNODE;
2421 #ifdef DEBUGGING
2422             if (stringok)
2423                 stop = n;
2424 #endif
2425             n = regnext(n);
2426         }
2427         else if (stringok) {
2428             const unsigned int oldl = STR_LEN(scan);
2429             regnode * const nnext = regnext(n);
2430
2431             DEBUG_PEEP("merg",n,depth);
2432
2433             merged++;
2434             if (oldl + STR_LEN(n) > U8_MAX)
2435                 break;
2436             NEXT_OFF(scan) += NEXT_OFF(n);
2437             STR_LEN(scan) += STR_LEN(n);
2438             next = n + NODE_SZ_STR(n);
2439             /* Now we can overwrite *n : */
2440             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2441 #ifdef DEBUGGING
2442             stop = next - 1;
2443 #endif
2444             n = nnext;
2445             if (stopnow) break;
2446         }
2447
2448 #ifdef EXPERIMENTAL_INPLACESCAN
2449         if (flags && !NEXT_OFF(n)) {
2450             DEBUG_PEEP("atch", val, depth);
2451             if (reg_off_by_arg[OP(n)]) {
2452                 ARG_SET(n, val - n);
2453             }
2454             else {
2455                 NEXT_OFF(n) = val - n;
2456             }
2457             stopnow = 1;
2458         }
2459 #endif
2460     }
2461 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2462 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2463 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2464 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2465
2466     if (UTF
2467         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2468         && ( STR_LEN(scan) >= 6 ) )
2469     {
2470     /*
2471     Two problematic code points in Unicode casefolding of EXACT nodes:
2472
2473     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2474     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2475
2476     which casefold to
2477
2478     Unicode                      UTF-8
2479
2480     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2481     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2482
2483     This means that in case-insensitive matching (or "loose matching",
2484     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2485     length of the above casefolded versions) can match a target string
2486     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2487     This would rather mess up the minimum length computation.
2488
2489     What we'll do is to look for the tail four bytes, and then peek
2490     at the preceding two bytes to see whether we need to decrease
2491     the minimum length by four (six minus two).
2492
2493     Thanks to the design of UTF-8, there cannot be false matches:
2494     A sequence of valid UTF-8 bytes cannot be a subsequence of
2495     another valid sequence of UTF-8 bytes.
2496
2497     */
2498          char * const s0 = STRING(scan), *s, *t;
2499          char * const s1 = s0 + STR_LEN(scan) - 1;
2500          char * const s2 = s1 - 4;
2501 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2502          const char t0[] = "\xaf\x49\xaf\x42";
2503 #else
2504          const char t0[] = "\xcc\x88\xcc\x81";
2505 #endif
2506          const char * const t1 = t0 + 3;
2507
2508          for (s = s0 + 2;
2509               s < s2 && (t = ninstr(s, s1, t0, t1));
2510               s = t + 4) {
2511 #ifdef EBCDIC
2512               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2513                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2514 #else
2515               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2516                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2517 #endif
2518                    *min -= 4;
2519          }
2520     }
2521
2522 #ifdef DEBUGGING
2523     /* Allow dumping */
2524     n = scan + NODE_SZ_STR(scan);
2525     while (n <= stop) {
2526         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2527             OP(n) = OPTIMIZED;
2528             NEXT_OFF(n) = 0;
2529         }
2530         n++;
2531     }
2532 #endif
2533     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2534     return stopnow;
2535 }
2536
2537 /* REx optimizer.  Converts nodes into quicker variants "in place".
2538    Finds fixed substrings.  */
2539
2540 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2541    to the position after last scanned or to NULL. */
2542
2543 #define INIT_AND_WITHP \
2544     assert(!and_withp); \
2545     Newx(and_withp,1,struct regnode_charclass_class); \
2546     SAVEFREEPV(and_withp)
2547
2548 /* this is a chain of data about sub patterns we are processing that
2549    need to be handled separately/specially in study_chunk. Its so
2550    we can simulate recursion without losing state.  */
2551 struct scan_frame;
2552 typedef struct scan_frame {
2553     regnode *last;  /* last node to process in this frame */
2554     regnode *next;  /* next node to process when last is reached */
2555     struct scan_frame *prev; /*previous frame*/
2556     I32 stop; /* what stopparen do we use */
2557 } scan_frame;
2558
2559
2560 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2561
2562 #define CASE_SYNST_FNC(nAmE)                                       \
2563 case nAmE:                                                         \
2564     if (flags & SCF_DO_STCLASS_AND) {                              \
2565             for (value = 0; value < 256; value++)                  \
2566                 if (!is_ ## nAmE ## _cp(value))                       \
2567                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2568     }                                                              \
2569     else {                                                         \
2570             for (value = 0; value < 256; value++)                  \
2571                 if (is_ ## nAmE ## _cp(value))                        \
2572                     ANYOF_BITMAP_SET(data->start_class, value);    \
2573     }                                                              \
2574     break;                                                         \
2575 case N ## nAmE:                                                    \
2576     if (flags & SCF_DO_STCLASS_AND) {                              \
2577             for (value = 0; value < 256; value++)                   \
2578                 if (is_ ## nAmE ## _cp(value))                         \
2579                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2580     }                                                               \
2581     else {                                                          \
2582             for (value = 0; value < 256; value++)                   \
2583                 if (!is_ ## nAmE ## _cp(value))                        \
2584                     ANYOF_BITMAP_SET(data->start_class, value);     \
2585     }                                                               \
2586     break
2587
2588
2589
2590 STATIC I32
2591 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2592                         I32 *minlenp, I32 *deltap,
2593                         regnode *last,
2594                         scan_data_t *data,
2595                         I32 stopparen,
2596                         U8* recursed,
2597                         struct regnode_charclass_class *and_withp,
2598                         U32 flags, U32 depth)
2599                         /* scanp: Start here (read-write). */
2600                         /* deltap: Write maxlen-minlen here. */
2601                         /* last: Stop before this one. */
2602                         /* data: string data about the pattern */
2603                         /* stopparen: treat close N as END */
2604                         /* recursed: which subroutines have we recursed into */
2605                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2606 {
2607     dVAR;
2608     I32 min = 0, pars = 0, code;
2609     regnode *scan = *scanp, *next;
2610     I32 delta = 0;
2611     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2612     int is_inf_internal = 0;            /* The studied chunk is infinite */
2613     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2614     scan_data_t data_fake;
2615     SV *re_trie_maxbuff = NULL;
2616     regnode *first_non_open = scan;
2617     I32 stopmin = I32_MAX;
2618     scan_frame *frame = NULL;
2619     GET_RE_DEBUG_FLAGS_DECL;
2620
2621     PERL_ARGS_ASSERT_STUDY_CHUNK;
2622
2623 #ifdef DEBUGGING
2624     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2625 #endif
2626
2627     if ( depth == 0 ) {
2628         while (first_non_open && OP(first_non_open) == OPEN)
2629             first_non_open=regnext(first_non_open);
2630     }
2631
2632
2633   fake_study_recurse:
2634     while ( scan && OP(scan) != END && scan < last ){
2635         /* Peephole optimizer: */
2636         DEBUG_STUDYDATA("Peep:", data,depth);
2637         DEBUG_PEEP("Peep",scan,depth);
2638         JOIN_EXACT(scan,&min,0);
2639
2640         /* Follow the next-chain of the current node and optimize
2641            away all the NOTHINGs from it.  */
2642         if (OP(scan) != CURLYX) {
2643             const int max = (reg_off_by_arg[OP(scan)]
2644                        ? I32_MAX
2645                        /* I32 may be smaller than U16 on CRAYs! */
2646                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2647             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2648             int noff;
2649             regnode *n = scan;
2650
2651             /* Skip NOTHING and LONGJMP. */
2652             while ((n = regnext(n))
2653                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2654                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2655                    && off + noff < max)
2656                 off += noff;
2657             if (reg_off_by_arg[OP(scan)])
2658                 ARG(scan) = off;
2659             else
2660                 NEXT_OFF(scan) = off;
2661         }
2662
2663
2664
2665         /* The principal pseudo-switch.  Cannot be a switch, since we
2666            look into several different things.  */
2667         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2668                    || OP(scan) == IFTHEN) {
2669             next = regnext(scan);
2670             code = OP(scan);
2671             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2672
2673             if (OP(next) == code || code == IFTHEN) {
2674                 /* NOTE - There is similar code to this block below for handling
2675                    TRIE nodes on a re-study.  If you change stuff here check there
2676                    too. */
2677                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2678                 struct regnode_charclass_class accum;
2679                 regnode * const startbranch=scan;
2680
2681                 if (flags & SCF_DO_SUBSTR)
2682                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2683                 if (flags & SCF_DO_STCLASS)
2684                     cl_init_zero(pRExC_state, &accum);
2685
2686                 while (OP(scan) == code) {
2687                     I32 deltanext, minnext, f = 0, fake;
2688                     struct regnode_charclass_class this_class;
2689
2690                     num++;
2691                     data_fake.flags = 0;
2692                     if (data) {
2693                         data_fake.whilem_c = data->whilem_c;
2694                         data_fake.last_closep = data->last_closep;
2695                     }
2696                     else
2697                         data_fake.last_closep = &fake;
2698
2699                     data_fake.pos_delta = delta;
2700                     next = regnext(scan);
2701                     scan = NEXTOPER(scan);
2702                     if (code != BRANCH)
2703                         scan = NEXTOPER(scan);
2704                     if (flags & SCF_DO_STCLASS) {
2705                         cl_init(pRExC_state, &this_class);
2706                         data_fake.start_class = &this_class;
2707                         f = SCF_DO_STCLASS_AND;
2708                     }
2709                     if (flags & SCF_WHILEM_VISITED_POS)
2710                         f |= SCF_WHILEM_VISITED_POS;
2711
2712                     /* we suppose the run is continuous, last=next...*/
2713                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2714                                           next, &data_fake,
2715                                           stopparen, recursed, NULL, f,depth+1);
2716                     if (min1 > minnext)
2717                         min1 = minnext;
2718                     if (max1 < minnext + deltanext)
2719                         max1 = minnext + deltanext;
2720                     if (deltanext == I32_MAX)
2721                         is_inf = is_inf_internal = 1;
2722                     scan = next;
2723                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2724                         pars++;
2725                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2726                         if ( stopmin > minnext)
2727                             stopmin = min + min1;
2728                         flags &= ~SCF_DO_SUBSTR;
2729                         if (data)
2730                             data->flags |= SCF_SEEN_ACCEPT;
2731                     }
2732                     if (data) {
2733                         if (data_fake.flags & SF_HAS_EVAL)
2734                             data->flags |= SF_HAS_EVAL;
2735                         data->whilem_c = data_fake.whilem_c;
2736                     }
2737                     if (flags & SCF_DO_STCLASS)
2738                         cl_or(pRExC_state, &accum, &this_class);
2739                 }
2740                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2741                     min1 = 0;
2742                 if (flags & SCF_DO_SUBSTR) {
2743                     data->pos_min += min1;
2744                     data->pos_delta += max1 - min1;
2745                     if (max1 != min1 || is_inf)
2746                         data->longest = &(data->longest_float);
2747                 }
2748                 min += min1;
2749                 delta += max1 - min1;
2750                 if (flags & SCF_DO_STCLASS_OR) {
2751                     cl_or(pRExC_state, data->start_class, &accum);
2752                     if (min1) {
2753                         cl_and(data->start_class, and_withp);
2754                         flags &= ~SCF_DO_STCLASS;
2755                     }
2756                 }
2757                 else if (flags & SCF_DO_STCLASS_AND) {
2758                     if (min1) {
2759                         cl_and(data->start_class, &accum);
2760                         flags &= ~SCF_DO_STCLASS;
2761                     }
2762                     else {
2763                         /* Switch to OR mode: cache the old value of
2764                          * data->start_class */
2765                         INIT_AND_WITHP;
2766                         StructCopy(data->start_class, and_withp,
2767                                    struct regnode_charclass_class);
2768                         flags &= ~SCF_DO_STCLASS_AND;
2769                         StructCopy(&accum, data->start_class,
2770                                    struct regnode_charclass_class);
2771                         flags |= SCF_DO_STCLASS_OR;
2772                         data->start_class->flags |= ANYOF_EOS;
2773                     }
2774                 }
2775
2776                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2777                 /* demq.
2778
2779                    Assuming this was/is a branch we are dealing with: 'scan' now
2780                    points at the item that follows the branch sequence, whatever
2781                    it is. We now start at the beginning of the sequence and look
2782                    for subsequences of
2783
2784                    BRANCH->EXACT=>x1
2785                    BRANCH->EXACT=>x2
2786                    tail
2787
2788                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2789
2790                    If we can find such a subsequence we need to turn the first
2791                    element into a trie and then add the subsequent branch exact
2792                    strings to the trie.
2793
2794                    We have two cases
2795
2796                      1. patterns where the whole set of branches can be converted.
2797
2798                      2. patterns where only a subset can be converted.
2799
2800                    In case 1 we can replace the whole set with a single regop
2801                    for the trie. In case 2 we need to keep the start and end
2802                    branches so
2803
2804                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2805                      becomes BRANCH TRIE; BRANCH X;
2806
2807                   There is an additional case, that being where there is a
2808                   common prefix, which gets split out into an EXACT like node
2809                   preceding the TRIE node.
2810
2811                   If x(1..n)==tail then we can do a simple trie, if not we make
2812                   a "jump" trie, such that when we match the appropriate word
2813                   we "jump" to the appropriate tail node. Essentially we turn
2814                   a nested if into a case structure of sorts.
2815
2816                 */
2817
2818                     int made=0;
2819                     if (!re_trie_maxbuff) {
2820                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2821                         if (!SvIOK(re_trie_maxbuff))
2822                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2823                     }
2824                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2825                         regnode *cur;
2826                         regnode *first = (regnode *)NULL;
2827                         regnode *last = (regnode *)NULL;
2828                         regnode *tail = scan;
2829                         U8 optype = 0;
2830                         U32 count=0;
2831
2832 #ifdef DEBUGGING
2833                         SV * const mysv = sv_newmortal();       /* for dumping */
2834 #endif
2835                         /* var tail is used because there may be a TAIL
2836                            regop in the way. Ie, the exacts will point to the
2837                            thing following the TAIL, but the last branch will
2838                            point at the TAIL. So we advance tail. If we
2839                            have nested (?:) we may have to move through several
2840                            tails.
2841                          */
2842
2843                         while ( OP( tail ) == TAIL ) {
2844                             /* this is the TAIL generated by (?:) */
2845                             tail = regnext( tail );
2846                         }
2847
2848
2849                         DEBUG_OPTIMISE_r({
2850                             regprop(RExC_rx, mysv, tail );
2851                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2852                                 (int)depth * 2 + 2, "",
2853                                 "Looking for TRIE'able sequences. Tail node is: ",
2854                                 SvPV_nolen_const( mysv )
2855                             );
2856                         });
2857
2858                         /*
2859
2860                            step through the branches, cur represents each
2861                            branch, noper is the first thing to be matched
2862                            as part of that branch and noper_next is the
2863                            regnext() of that node. if noper is an EXACT
2864                            and noper_next is the same as scan (our current
2865                            position in the regex) then the EXACT branch is
2866                            a possible optimization target. Once we have
2867                            two or more consecutive such branches we can
2868                            create a trie of the EXACT's contents and stich
2869                            it in place. If the sequence represents all of
2870                            the branches we eliminate the whole thing and
2871                            replace it with a single TRIE. If it is a
2872                            subsequence then we need to stitch it in. This
2873                            means the first branch has to remain, and needs
2874                            to be repointed at the item on the branch chain
2875                            following the last branch optimized. This could
2876                            be either a BRANCH, in which case the
2877                            subsequence is internal, or it could be the
2878                            item following the branch sequence in which
2879                            case the subsequence is at the end.
2880
2881                         */
2882
2883                         /* dont use tail as the end marker for this traverse */
2884                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2885                             regnode * const noper = NEXTOPER( cur );
2886 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2887                             regnode * const noper_next = regnext( noper );
2888 #endif
2889
2890                             DEBUG_OPTIMISE_r({
2891                                 regprop(RExC_rx, mysv, cur);
2892                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2893                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2894
2895                                 regprop(RExC_rx, mysv, noper);
2896                                 PerlIO_printf( Perl_debug_log, " -> %s",
2897                                     SvPV_nolen_const(mysv));
2898
2899                                 if ( noper_next ) {
2900                                   regprop(RExC_rx, mysv, noper_next );
2901                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2902                                     SvPV_nolen_const(mysv));
2903                                 }
2904                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2905                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2906                             });
2907                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2908                                          : PL_regkind[ OP( noper ) ] == EXACT )
2909                                   || OP(noper) == NOTHING )
2910 #ifdef NOJUMPTRIE
2911                                   && noper_next == tail
2912 #endif
2913                                   && count < U16_MAX)
2914                             {
2915                                 count++;
2916                                 if ( !first || optype == NOTHING ) {
2917                                     if (!first) first = cur;
2918                                     optype = OP( noper );
2919                                 } else {
2920                                     last = cur;
2921                                 }
2922                             } else {
2923 /*
2924     Currently we do not believe that the trie logic can
2925     handle case insensitive matching properly when the
2926     pattern is not unicode (thus forcing unicode semantics).
2927
2928     If/when this is fixed the following define can be swapped
2929     in below to fully enable trie logic.
2930
2931 #define TRIE_TYPE_IS_SAFE 1
2932
2933 */
2934 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2935
2936                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2937                                     make_trie( pRExC_state,
2938                                             startbranch, first, cur, tail, count,
2939                                             optype, depth+1 );
2940                                 }
2941                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2942 #ifdef NOJUMPTRIE
2943                                      && noper_next == tail
2944 #endif
2945                                 ){
2946                                     count = 1;
2947                                     first = cur;
2948                                     optype = OP( noper );
2949                                 } else {
2950                                     count = 0;
2951                                     first = NULL;
2952                                     optype = 0;
2953                                 }
2954                                 last = NULL;
2955                             }
2956                         }
2957                         DEBUG_OPTIMISE_r({
2958                             regprop(RExC_rx, mysv, cur);
2959                             PerlIO_printf( Perl_debug_log,
2960                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2961                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2962
2963                         });
2964
2965                         if ( last && TRIE_TYPE_IS_SAFE ) {
2966                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2967 #ifdef TRIE_STUDY_OPT
2968                             if ( ((made == MADE_EXACT_TRIE &&
2969                                  startbranch == first)
2970                                  || ( first_non_open == first )) &&
2971                                  depth==0 ) {
2972                                 flags |= SCF_TRIE_RESTUDY;
2973                                 if ( startbranch == first
2974                                      && scan == tail )
2975                                 {
2976                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2977                                 }
2978                             }
2979 #endif
2980                         }
2981                     }
2982
2983                 } /* do trie */
2984
2985             }
2986             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2987                 scan = NEXTOPER(NEXTOPER(scan));
2988             } else                      /* single branch is optimized. */
2989                 scan = NEXTOPER(scan);
2990             continue;
2991         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2992             scan_frame *newframe = NULL;
2993             I32 paren;
2994             regnode *start;
2995             regnode *end;
2996
2997             if (OP(scan) != SUSPEND) {
2998             /* set the pointer */
2999                 if (OP(scan) == GOSUB) {
3000                     paren = ARG(scan);
3001                     RExC_recurse[ARG2L(scan)] = scan;
3002                     start = RExC_open_parens[paren-1];
3003                     end   = RExC_close_parens[paren-1];
3004                 } else {
3005                     paren = 0;
3006                     start = RExC_rxi->program + 1;
3007                     end   = RExC_opend;
3008                 }
3009                 if (!recursed) {
3010                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3011                     SAVEFREEPV(recursed);
3012                 }
3013                 if (!PAREN_TEST(recursed,paren+1)) {
3014                     PAREN_SET(recursed,paren+1);
3015                     Newx(newframe,1,scan_frame);
3016                 } else {
3017                     if (flags & SCF_DO_SUBSTR) {
3018                         SCAN_COMMIT(pRExC_state,data,minlenp);
3019                         data->longest = &(data->longest_float);
3020                     }
3021                     is_inf = is_inf_internal = 1;
3022                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3023                         cl_anything(pRExC_state, data->start_class);
3024                     flags &= ~SCF_DO_STCLASS;
3025                 }
3026             } else {
3027                 Newx(newframe,1,scan_frame);
3028                 paren = stopparen;
3029                 start = scan+2;
3030                 end = regnext(scan);
3031             }
3032             if (newframe) {
3033                 assert(start);
3034                 assert(end);
3035                 SAVEFREEPV(newframe);
3036                 newframe->next = regnext(scan);
3037                 newframe->last = last;
3038                 newframe->stop = stopparen;
3039                 newframe->prev = frame;
3040
3041                 frame = newframe;
3042                 scan =  start;
3043                 stopparen = paren;
3044                 last = end;
3045
3046                 continue;
3047             }
3048         }
3049         else if (OP(scan) == EXACT) {
3050             I32 l = STR_LEN(scan);
3051             UV uc;
3052             if (UTF) {
3053                 const U8 * const s = (U8*)STRING(scan);
3054                 l = utf8_length(s, s + l);
3055                 uc = utf8_to_uvchr(s, NULL);
3056             } else {
3057                 uc = *((U8*)STRING(scan));
3058             }
3059             min += l;
3060             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3061                 /* The code below prefers earlier match for fixed
3062                    offset, later match for variable offset.  */
3063                 if (data->last_end == -1) { /* Update the start info. */
3064                     data->last_start_min = data->pos_min;
3065                     data->last_start_max = is_inf
3066                         ? I32_MAX : data->pos_min + data->pos_delta;
3067                 }
3068                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3069                 if (UTF)
3070                     SvUTF8_on(data->last_found);
3071                 {
3072                     SV * const sv = data->last_found;
3073                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3074                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3075                     if (mg && mg->mg_len >= 0)
3076                         mg->mg_len += utf8_length((U8*)STRING(scan),
3077                                                   (U8*)STRING(scan)+STR_LEN(scan));
3078                 }
3079                 data->last_end = data->pos_min + l;
3080                 data->pos_min += l; /* As in the first entry. */
3081                 data->flags &= ~SF_BEFORE_EOL;
3082             }
3083             if (flags & SCF_DO_STCLASS_AND) {
3084                 /* Check whether it is compatible with what we know already! */
3085                 int compat = 1;
3086
3087
3088                 /* If compatible, we or it in below.  It is compatible if is
3089                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3090                  * it's for a locale.  Even if there isn't unicode semantics
3091                  * here, at runtime there may be because of matching against a
3092                  * utf8 string, so accept a possible false positive for
3093                  * latin1-range folds */
3094                 if (uc >= 0x100 ||
3095                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3096                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3097                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3098                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3099                     )
3100                     compat = 0;
3101                 ANYOF_CLASS_ZERO(data->start_class);
3102                 ANYOF_BITMAP_ZERO(data->start_class);
3103                 if (compat)
3104                     ANYOF_BITMAP_SET(data->start_class, uc);
3105                 data->start_class->flags &= ~ANYOF_EOS;
3106                 if (uc < 0x100)
3107                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3108             }
3109             else if (flags & SCF_DO_STCLASS_OR) {
3110                 /* false positive possible if the class is case-folded */
3111                 if (uc < 0x100)
3112                     ANYOF_BITMAP_SET(data->start_class, uc);
3113                 else
3114                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3115                 data->start_class->flags &= ~ANYOF_EOS;
3116                 cl_and(data->start_class, and_withp);
3117             }
3118             flags &= ~SCF_DO_STCLASS;
3119         }
3120         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3121             I32 l = STR_LEN(scan);
3122             UV uc = *((U8*)STRING(scan));
3123
3124             /* Search for fixed substrings supports EXACT only. */
3125             if (flags & SCF_DO_SUBSTR) {
3126                 assert(data);
3127                 SCAN_COMMIT(pRExC_state, data, minlenp);
3128             }
3129             if (UTF) {
3130                 const U8 * const s = (U8 *)STRING(scan);
3131                 l = utf8_length(s, s + l);
3132                 uc = utf8_to_uvchr(s, NULL);
3133             }
3134             min += l;
3135             if (flags & SCF_DO_SUBSTR)
3136                 data->pos_min += l;
3137             if (flags & SCF_DO_STCLASS_AND) {
3138                 /* Check whether it is compatible with what we know already! */
3139                 int compat = 1;
3140                 if (uc >= 0x100 ||
3141                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3142                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3143                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3144                 {
3145                     compat = 0;
3146                 }
3147                 ANYOF_CLASS_ZERO(data->start_class);
3148                 ANYOF_BITMAP_ZERO(data->start_class);
3149                 if (compat) {
3150                     ANYOF_BITMAP_SET(data->start_class, uc);
3151                     data->start_class->flags &= ~ANYOF_EOS;
3152                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3153                     if (OP(scan) == EXACTFL) {
3154                         data->start_class->flags |= ANYOF_LOCALE;
3155                     }
3156                     else {
3157
3158                         /* Also set the other member of the fold pair.  In case
3159                          * that unicode semantics is called for at runtime, use
3160                          * the full latin1 fold.  (Can't do this for locale,
3161                          * because not known until runtime */
3162                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3163                     }
3164                 }
3165             }
3166             else if (flags & SCF_DO_STCLASS_OR) {
3167                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3168                     /* false positive possible if the class is case-folded.
3169                        Assume that the locale settings are the same... */
3170                     if (uc < 0x100) {
3171                         ANYOF_BITMAP_SET(data->start_class, uc);
3172                         if (OP(scan) != EXACTFL) {
3173
3174                             /* And set the other member of the fold pair, but
3175                              * can't do that in locale because not known until
3176                              * run-time */
3177                             ANYOF_BITMAP_SET(data->start_class,
3178                                              PL_fold_latin1[uc]);
3179                         }
3180                     }
3181                     data->start_class->flags &= ~ANYOF_EOS;
3182                 }
3183                 cl_and(data->start_class, and_withp);
3184             }
3185             flags &= ~SCF_DO_STCLASS;
3186         }
3187         else if (REGNODE_VARIES(OP(scan))) {
3188             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3189             I32 f = flags, pos_before = 0;
3190             regnode * const oscan = scan;
3191             struct regnode_charclass_class this_class;
3192             struct regnode_charclass_class *oclass = NULL;
3193             I32 next_is_eval = 0;
3194
3195             switch (PL_regkind[OP(scan)]) {
3196             case WHILEM:                /* End of (?:...)* . */
3197                 scan = NEXTOPER(scan);
3198                 goto finish;
3199             case PLUS:
3200                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3201                     next = NEXTOPER(scan);
3202                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3203                         mincount = 1;
3204                         maxcount = REG_INFTY;
3205                         next = regnext(scan);
3206                         scan = NEXTOPER(scan);
3207                         goto do_curly;
3208                     }
3209                 }
3210                 if (flags & SCF_DO_SUBSTR)
3211                     data->pos_min++;
3212                 min++;
3213                 /* Fall through. */
3214             case STAR:
3215                 if (flags & SCF_DO_STCLASS) {
3216                     mincount = 0;
3217                     maxcount = REG_INFTY;
3218                     next = regnext(scan);
3219                     scan = NEXTOPER(scan);
3220                     goto do_curly;
3221                 }
3222                 is_inf = is_inf_internal = 1;
3223                 scan = regnext(scan);
3224                 if (flags & SCF_DO_SUBSTR) {
3225                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3226                     data->longest = &(data->longest_float);
3227                 }
3228                 goto optimize_curly_tail;
3229             case CURLY:
3230                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3231                     && (scan->flags == stopparen))
3232                 {
3233                     mincount = 1;
3234                     maxcount = 1;
3235                 } else {
3236                     mincount = ARG1(scan);
3237                     maxcount = ARG2(scan);
3238                 }
3239                 next = regnext(scan);
3240                 if (OP(scan) == CURLYX) {
3241                     I32 lp = (data ? *(data->last_closep) : 0);
3242                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3243                 }
3244                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3245                 next_is_eval = (OP(scan) == EVAL);
3246               do_curly:
3247                 if (flags & SCF_DO_SUBSTR) {
3248                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3249                     pos_before = data->pos_min;
3250                 }
3251                 if (data) {
3252                     fl = data->flags;
3253                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3254                     if (is_inf)
3255                         data->flags |= SF_IS_INF;
3256                 }
3257                 if (flags & SCF_DO_STCLASS) {
3258                     cl_init(pRExC_state, &this_class);
3259                     oclass = data->start_class;
3260                     data->start_class = &this_class;
3261                     f |= SCF_DO_STCLASS_AND;
3262                     f &= ~SCF_DO_STCLASS_OR;
3263                 }
3264                 /* Exclude from super-linear cache processing any {n,m}
3265                    regops for which the combination of input pos and regex
3266                    pos is not enough information to determine if a match
3267                    will be possible.
3268
3269                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3270                    regex pos at the \s*, the prospects for a match depend not
3271                    only on the input position but also on how many (bar\s*)
3272                    repeats into the {4,8} we are. */
3273                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3274                     f &= ~SCF_WHILEM_VISITED_POS;
3275
3276                 /* This will finish on WHILEM, setting scan, or on NULL: */
3277                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3278                                       last, data, stopparen, recursed, NULL,
3279                                       (mincount == 0
3280                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3281
3282                 if (flags & SCF_DO_STCLASS)
3283                     data->start_class = oclass;
3284                 if (mincount == 0 || minnext == 0) {
3285                     if (flags & SCF_DO_STCLASS_OR) {
3286                         cl_or(pRExC_state, data->start_class, &this_class);
3287                     }
3288                     else if (flags & SCF_DO_STCLASS_AND) {
3289                         /* Switch to OR mode: cache the old value of
3290                          * data->start_class */
3291                         INIT_AND_WITHP;
3292                         StructCopy(data->start_class, and_withp,
3293                                    struct regnode_charclass_class);
3294                         flags &= ~SCF_DO_STCLASS_AND;
3295                         StructCopy(&this_class, data->start_class,
3296                                    struct regnode_charclass_class);
3297                         flags |= SCF_DO_STCLASS_OR;
3298                         data->start_class->flags |= ANYOF_EOS;
3299                     }
3300                 } else {                /* Non-zero len */
3301                     if (flags & SCF_DO_STCLASS_OR) {
3302                         cl_or(pRExC_state, data->start_class, &this_class);
3303                         cl_and(data->start_class, and_withp);
3304                     }
3305                     else if (flags & SCF_DO_STCLASS_AND)
3306                         cl_and(data->start_class, &this_class);
3307                     flags &= ~SCF_DO_STCLASS;
3308                 }
3309                 if (!scan)              /* It was not CURLYX, but CURLY. */
3310                     scan = next;
3311                 if ( /* ? quantifier ok, except for (?{ ... }) */
3312                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3313                     && (minnext == 0) && (deltanext == 0)
3314                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3315                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3316                 {
3317                     ckWARNreg(RExC_parse,
3318                               "Quantifier unexpected on zero-length expression");
3319                 }
3320
3321                 min += minnext * mincount;
3322                 is_inf_internal |= ((maxcount == REG_INFTY
3323                                      && (minnext + deltanext) > 0)
3324                                     || deltanext == I32_MAX);
3325                 is_inf |= is_inf_internal;
3326                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3327
3328                 /* Try powerful optimization CURLYX => CURLYN. */
3329                 if (  OP(oscan) == CURLYX && data
3330                       && data->flags & SF_IN_PAR
3331                       && !(data->flags & SF_HAS_EVAL)
3332                       && !deltanext && minnext == 1 ) {
3333                     /* Try to optimize to CURLYN.  */
3334                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3335                     regnode * const nxt1 = nxt;
3336 #ifdef DEBUGGING
3337                     regnode *nxt2;
3338 #endif
3339
3340                     /* Skip open. */
3341                     nxt = regnext(nxt);
3342                     if (!REGNODE_SIMPLE(OP(nxt))
3343                         && !(PL_regkind[OP(nxt)] == EXACT
3344                              && STR_LEN(nxt) == 1))
3345                         goto nogo;
3346 #ifdef DEBUGGING
3347                     nxt2 = nxt;
3348 #endif
3349                     nxt = regnext(nxt);
3350                     if (OP(nxt) != CLOSE)
3351                         goto nogo;
3352                     if (RExC_open_parens) {
3353                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3354                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3355                     }
3356                     /* Now we know that nxt2 is the only contents: */
3357                     oscan->flags = (U8)ARG(nxt);
3358                     OP(oscan) = CURLYN;
3359                     OP(nxt1) = NOTHING; /* was OPEN. */
3360
3361 #ifdef DEBUGGING
3362                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3363                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3364                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3365                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3366                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3367                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3368 #endif
3369                 }
3370               nogo:
3371
3372                 /* Try optimization CURLYX => CURLYM. */
3373                 if (  OP(oscan) == CURLYX && data
3374                       && !(data->flags & SF_HAS_PAR)
3375                       && !(data->flags & SF_HAS_EVAL)
3376                       && !deltanext     /* atom is fixed width */
3377                       && minnext != 0   /* CURLYM can't handle zero width */
3378                 ) {
3379                     /* XXXX How to optimize if data == 0? */
3380                     /* Optimize to a simpler form.  */
3381                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3382                     regnode *nxt2;
3383
3384                     OP(oscan) = CURLYM;
3385                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3386                             && (OP(nxt2) != WHILEM))
3387                         nxt = nxt2;
3388                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3389                     /* Need to optimize away parenths. */
3390                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3391                         /* Set the parenth number.  */
3392                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3393
3394                         oscan->flags = (U8)ARG(nxt);
3395                         if (RExC_open_parens) {
3396                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3397                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3398                         }
3399                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3400                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3401
3402 #ifdef DEBUGGING
3403                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3404                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3405                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3406                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3407 #endif
3408 #if 0
3409                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3410                             regnode *nnxt = regnext(nxt1);
3411                             if (nnxt == nxt) {
3412                                 if (reg_off_by_arg[OP(nxt1)])
3413                                     ARG_SET(nxt1, nxt2 - nxt1);
3414                                 else if (nxt2 - nxt1 < U16_MAX)
3415                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3416                                 else
3417                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3418                             }
3419                             nxt1 = nnxt;
3420                         }
3421 #endif
3422                         /* Optimize again: */
3423                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3424                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3425                     }
3426                     else
3427                         oscan->flags = 0;
3428                 }
3429                 else if ((OP(oscan) == CURLYX)
3430                          && (flags & SCF_WHILEM_VISITED_POS)
3431                          /* See the comment on a similar expression above.
3432                             However, this time it's not a subexpression
3433                             we care about, but the expression itself. */
3434                          && (maxcount == REG_INFTY)
3435                          && data && ++data->whilem_c < 16) {
3436                     /* This stays as CURLYX, we can put the count/of pair. */
3437                     /* Find WHILEM (as in regexec.c) */
3438                     regnode *nxt = oscan + NEXT_OFF(oscan);
3439
3440                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3441                         nxt += ARG(nxt);
3442                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3443                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3444                 }
3445                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3446                     pars++;
3447                 if (flags & SCF_DO_SUBSTR) {
3448                     SV *last_str = NULL;
3449                     int counted = mincount != 0;
3450
3451                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3452 #if defined(SPARC64_GCC_WORKAROUND)
3453                         I32 b = 0;
3454                         STRLEN l = 0;
3455                         const char *s = NULL;
3456                         I32 old = 0;
3457
3458                         if (pos_before >= data->last_start_min)
3459                             b = pos_before;
3460                         else
3461                             b = data->last_start_min;
3462
3463                         l = 0;
3464                         s = SvPV_const(data->last_found, l);
3465                         old = b - data->last_start_min;
3466
3467 #else
3468                         I32 b = pos_before >= data->last_start_min
3469                             ? pos_before : data->last_start_min;
3470                         STRLEN l;
3471                         const char * const s = SvPV_const(data->last_found, l);
3472                         I32 old = b - data->last_start_min;
3473 #endif
3474
3475                         if (UTF)
3476                             old = utf8_hop((U8*)s, old) - (U8*)s;
3477                         l -= old;
3478                         /* Get the added string: */
3479                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3480                         if (deltanext == 0 && pos_before == b) {
3481                             /* What was added is a constant string */
3482                             if (mincount > 1) {
3483                                 SvGROW(last_str, (mincount * l) + 1);
3484                                 repeatcpy(SvPVX(last_str) + l,
3485                                           SvPVX_const(last_str), l, mincount - 1);
3486                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3487                                 /* Add additional parts. */
3488                                 SvCUR_set(data->last_found,
3489                                           SvCUR(data->last_found) - l);
3490                                 sv_catsv(data->last_found, last_str);
3491                                 {
3492                                     SV * sv = data->last_found;
3493                                     MAGIC *mg =
3494                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3495                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3496                                     if (mg && mg->mg_len >= 0)
3497                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3498                                 }
3499                                 data->last_end += l * (mincount - 1);
3500                             }
3501                         } else {
3502                             /* start offset must point into the last copy */
3503                             data->last_start_min += minnext * (mincount - 1);
3504                             data->last_start_max += is_inf ? I32_MAX
3505                                 : (maxcount - 1) * (minnext + data->pos_delta);
3506                         }
3507                     }
3508                     /* It is counted once already... */
3509                     data->pos_min += minnext * (mincount - counted);
3510                     data->pos_delta += - counted * deltanext +
3511                         (minnext + deltanext) * maxcount - minnext * mincount;
3512                     if (mincount != maxcount) {
3513                          /* Cannot extend fixed substrings found inside
3514                             the group.  */
3515                         SCAN_COMMIT(pRExC_state,data,minlenp);
3516                         if (mincount && last_str) {
3517                             SV * const sv = data->last_found;
3518                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3519                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3520
3521                             if (mg)
3522                                 mg->mg_len = -1;
3523                             sv_setsv(sv, last_str);
3524                             data->last_end = data->pos_min;
3525                             data->last_start_min =
3526                                 data->pos_min - CHR_SVLEN(last_str);
3527                             data->last_start_max = is_inf
3528                                 ? I32_MAX
3529                                 : data->pos_min + data->pos_delta
3530                                 - CHR_SVLEN(last_str);
3531                         }
3532                         data->longest = &(data->longest_float);
3533                     }
3534                     SvREFCNT_dec(last_str);
3535                 }
3536                 if (data && (fl & SF_HAS_EVAL))
3537                     data->flags |= SF_HAS_EVAL;
3538               optimize_curly_tail:
3539                 if (OP(oscan) != CURLYX) {
3540                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3541                            && NEXT_OFF(next))
3542                         NEXT_OFF(oscan) += NEXT_OFF(next);
3543                 }
3544                 continue;
3545             default:                    /* REF, ANYOFV, and CLUMP only? */
3546                 if (flags & SCF_DO_SUBSTR) {
3547                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3548                     data->longest = &(data->longest_float);
3549                 }
3550                 is_inf = is_inf_internal = 1;
3551                 if (flags & SCF_DO_STCLASS_OR)
3552                     cl_anything(pRExC_state, data->start_class);
3553                 flags &= ~SCF_DO_STCLASS;
3554                 break;
3555             }
3556         }
3557         else if (OP(scan) == LNBREAK) {
3558             if (flags & SCF_DO_STCLASS) {
3559                 int value = 0;
3560                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3561                 if (flags & SCF_DO_STCLASS_AND) {
3562                     for (value = 0; value < 256; value++)
3563                         if (!is_VERTWS_cp(value))
3564                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3565                 }
3566                 else {
3567                     for (value = 0; value < 256; value++)
3568                         if (is_VERTWS_cp(value))
3569                             ANYOF_BITMAP_SET(data->start_class, value);
3570                 }
3571                 if (flags & SCF_DO_STCLASS_OR)
3572                     cl_and(data->start_class, and_withp);
3573                 flags &= ~SCF_DO_STCLASS;
3574             }
3575             min += 1;
3576             delta += 1;
3577             if (flags & SCF_DO_SUBSTR) {
3578                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3579                 data->pos_min += 1;
3580                 data->pos_delta += 1;
3581                 data->longest = &(data->longest_float);
3582             }
3583         }
3584         else if (OP(scan) == FOLDCHAR) {
3585             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3586             flags &= ~SCF_DO_STCLASS;
3587             min += 1;
3588             delta += d;
3589             if (flags & SCF_DO_SUBSTR) {
3590                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3591                 data->pos_min += 1;
3592                 data->pos_delta += d;
3593                 data->longest = &(data->longest_float);
3594             }
3595         }
3596         else if (REGNODE_SIMPLE(OP(scan))) {
3597             int value = 0;
3598
3599             if (flags & SCF_DO_SUBSTR) {
3600                 SCAN_COMMIT(pRExC_state,data,minlenp);
3601                 data->pos_min++;
3602             }
3603             min++;
3604             if (flags & SCF_DO_STCLASS) {
3605                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3606
3607                 /* Some of the logic below assumes that switching
3608                    locale on will only add false positives. */
3609                 switch (PL_regkind[OP(scan)]) {
3610                 case SANY:
3611                 default:
3612                   do_default:
3613                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3614                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3615                         cl_anything(pRExC_state, data->start_class);
3616                     break;
3617                 case REG_ANY:
3618                     if (OP(scan) == SANY)
3619                         goto do_default;
3620                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3621                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3622                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3623                         cl_anything(pRExC_state, data->start_class);
3624                     }
3625                     if (flags & SCF_DO_STCLASS_AND || !value)
3626                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3627                     break;
3628                 case ANYOF:
3629                     if (flags & SCF_DO_STCLASS_AND)
3630                         cl_and(data->start_class,
3631                                (struct regnode_charclass_class*)scan);
3632                     else
3633                         cl_or(pRExC_state, data->start_class,
3634                               (struct regnode_charclass_class*)scan);
3635                     break;
3636                 case ALNUM:
3637                     if (flags & SCF_DO_STCLASS_AND) {
3638                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3639                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3640                             if (OP(scan) == ALNUMU) {
3641                                 for (value = 0; value < 256; value++) {
3642                                     if (!isWORDCHAR_L1(value)) {
3643                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3644                                     }
3645                                 }
3646                             } else {
3647                                 for (value = 0; value < 256; value++) {
3648                                     if (!isALNUM(value)) {
3649                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3650                                     }
3651                                 }
3652                             }
3653                         }
3654                     }
3655                     else {
3656                         if (data->start_class->flags & ANYOF_LOCALE)
3657                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3658                         else if (OP(scan) == ALNUMU) {
3659                             for (value = 0; value < 256; value++) {
3660                                 if (isWORDCHAR_L1(value)) {
3661                                     ANYOF_BITMAP_SET(data->start_class, value);
3662                                 }
3663                             }
3664                         } else {
3665                             for (value = 0; value < 256; value++) {
3666                                 if (isALNUM(value)) {
3667                                     ANYOF_BITMAP_SET(data->start_class, value);
3668                                 }
3669                             }
3670                         }
3671                     }
3672                     break;
3673                 case NALNUM:
3674                     if (flags & SCF_DO_STCLASS_AND) {
3675                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3676                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3677                             if (OP(scan) == NALNUMU) {
3678                                 for (value = 0; value < 256; value++) {
3679                                     if (isWORDCHAR_L1(value)) {
3680                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3681                                     }
3682                                 }
3683                             } else {
3684                                 for (value = 0; value < 256; value++) {
3685                                     if (isALNUM(value)) {
3686                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3687                                     }
3688                                 }
3689                             }
3690                         }
3691                     }
3692                     else {
3693                         if (data->start_class->flags & ANYOF_LOCALE)
3694                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3695                         else {
3696                             if (OP(scan) == NALNUMU) {
3697                                 for (value = 0; value < 256; value++) {
3698                                     if (! isWORDCHAR_L1(value)) {
3699                                         ANYOF_BITMAP_SET(data->start_class, value);
3700                                     }
3701                                 }
3702                             } else {
3703                                 for (value = 0; value < 256; value++) {
3704                                     if (! isALNUM(value)) {
3705                                         ANYOF_BITMAP_SET(data->start_class, value);
3706                                     }
3707                                 }
3708                             }
3709                         }
3710                     }
3711                     break;
3712                 case SPACE:
3713                     if (flags & SCF_DO_STCLASS_AND) {
3714                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3715                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3716                             if (OP(scan) == SPACEU) {
3717                                 for (value = 0; value < 256; value++) {
3718                                     if (!isSPACE_L1(value)) {
3719                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3720                                     }
3721                                 }
3722                             } else {
3723                                 for (value = 0; value < 256; value++) {
3724                                     if (!isSPACE(value)) {
3725                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3726                                     }
3727                                 }
3728                             }
3729                         }
3730                     }
3731                     else {
3732                         if (data->start_class->flags & ANYOF_LOCALE) {
3733                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3734                         }
3735                         else if (OP(scan) == SPACEU) {
3736                             for (value = 0; value < 256; value++) {
3737                                 if (isSPACE_L1(value)) {
3738                                     ANYOF_BITMAP_SET(data->start_class, value);
3739                                 }
3740                             }
3741                         } else {
3742                             for (value = 0; value < 256; value++) {
3743                                 if (isSPACE(value)) {
3744                                     ANYOF_BITMAP_SET(data->start_class, value);
3745                                 }
3746                             }
3747                         }
3748                     }
3749                     break;
3750                 case NSPACE:
3751                     if (flags & SCF_DO_STCLASS_AND) {
3752                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3753                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3754                             if (OP(scan) == NSPACEU) {
3755                                 for (value = 0; value < 256; value++) {
3756                                     if (isSPACE_L1(value)) {
3757                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3758                                     }
3759                                 }
3760                             } else {
3761                                 for (value = 0; value < 256; value++) {
3762                                     if (isSPACE(value)) {
3763                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3764                                     }
3765                                 }
3766                             }
3767                         }
3768                     }
3769                     else {
3770                         if (data->start_class->flags & ANYOF_LOCALE)
3771                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3772                         else if (OP(scan) == NSPACEU) {
3773                             for (value = 0; value < 256; value++) {
3774                                 if (!isSPACE_L1(value)) {
3775                                     ANYOF_BITMAP_SET(data->start_class, value);
3776                                 }
3777                             }
3778                         }
3779                         else {
3780                             for (value = 0; value < 256; value++) {
3781                                 if (!isSPACE(value)) {
3782                                     ANYOF_BITMAP_SET(data->start_class, value);
3783                                 }
3784                             }
3785                         }
3786                     }
3787                     break;
3788                 case DIGIT:
3789                     if (flags & SCF_DO_STCLASS_AND) {
3790                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3791                         for (value = 0; value < 256; value++)
3792                             if (!isDIGIT(value))
3793                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3794                     }
3795                     else {
3796                         if (data->start_class->flags & ANYOF_LOCALE)
3797                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3798                         else {
3799                             for (value = 0; value < 256; value++)
3800                                 if (isDIGIT(value))
3801                                     ANYOF_BITMAP_SET(data->start_class, value);
3802                         }
3803                     }
3804                     break;
3805                 case NDIGIT:
3806                     if (flags & SCF_DO_STCLASS_AND) {
3807                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3808                         for (value = 0; value < 256; value++)
3809                             if (isDIGIT(value))
3810                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3811                     }
3812                     else {
3813                         if (data->start_class->flags & ANYOF_LOCALE)
3814                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3815                         else {
3816                             for (value = 0; value < 256; value++)
3817                                 if (!isDIGIT(value))
3818                                     ANYOF_BITMAP_SET(data->start_class, value);
3819                         }
3820                     }
3821                     break;
3822                 CASE_SYNST_FNC(VERTWS);
3823                 CASE_SYNST_FNC(HORIZWS);
3824
3825                 }
3826                 if (flags & SCF_DO_STCLASS_OR)
3827                     cl_and(data->start_class, and_withp);
3828                 flags &= ~SCF_DO_STCLASS;
3829             }
3830         }
3831         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3832             data->flags |= (OP(scan) == MEOL
3833                             ? SF_BEFORE_MEOL
3834                             : SF_BEFORE_SEOL);
3835         }
3836         else if (  PL_regkind[OP(scan)] == BRANCHJ
3837                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3838                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3839                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3840             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3841                 || OP(scan) == UNLESSM )
3842             {
3843                 /* Negative Lookahead/lookbehind
3844                    In this case we can't do fixed string optimisation.
3845                 */
3846
3847                 I32 deltanext, minnext, fake = 0;
3848                 regnode *nscan;
3849                 struct regnode_charclass_class intrnl;
3850                 int f = 0;
3851
3852                 data_fake.flags = 0;
3853                 if (data) {
3854                     data_fake.whilem_c = data->whilem_c;
3855                     data_fake.last_closep = data->last_closep;
3856                 }
3857                 else
3858                     data_fake.last_closep = &fake;
3859                 data_fake.pos_delta = delta;
3860                 if ( flags & SCF_DO_STCLASS && !scan->flags
3861                      && OP(scan) == IFMATCH ) { /* Lookahead */
3862                     cl_init(pRExC_state, &intrnl);
3863                     data_fake.start_class = &intrnl;
3864                     f |= SCF_DO_STCLASS_AND;
3865                 }
3866                 if (flags & SCF_WHILEM_VISITED_POS)
3867                     f |= SCF_WHILEM_VISITED_POS;
3868                 next = regnext(scan);
3869                 nscan = NEXTOPER(NEXTOPER(scan));
3870                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3871                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3872                 if (scan->flags) {
3873                     if (deltanext) {
3874                         FAIL("Variable length lookbehind not implemented");
3875                     }
3876                     else if (minnext > (I32)U8_MAX) {
3877                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3878                     }
3879                     scan->flags = (U8)minnext;
3880                 }
3881                 if (data) {
3882                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3883                         pars++;
3884                     if (data_fake.flags & SF_HAS_EVAL)
3885                         data->flags |= SF_HAS_EVAL;
3886                     data->whilem_c = data_fake.whilem_c;
3887                 }
3888                 if (f & SCF_DO_STCLASS_AND) {
3889                     if (flags & SCF_DO_STCLASS_OR) {
3890                         /* OR before, AND after: ideally we would recurse with
3891                          * data_fake to get the AND applied by study of the
3892                          * remainder of the pattern, and then derecurse;
3893                          * *** HACK *** for now just treat as "no information".
3894                          * See [perl #56690].
3895                          */
3896                         cl_init(pRExC_state, data->start_class);
3897                     }  else {
3898                         /* AND before and after: combine and continue */
3899                         const int was = (data->start_class->flags & ANYOF_EOS);
3900
3901                         cl_and(data->start_class, &intrnl);
3902                         if (was)
3903                             data->start_class->flags |= ANYOF_EOS;
3904                     }
3905                 }
3906             }
3907 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3908             else {
3909                 /* Positive Lookahead/lookbehind
3910                    In this case we can do fixed string optimisation,
3911                    but we must be careful about it. Note in the case of
3912                    lookbehind the positions will be offset by the minimum
3913                    length of the pattern, something we won't know about
3914                    until after the recurse.
3915                 */
3916                 I32 deltanext, fake = 0;
3917                 regnode *nscan;
3918                 struct regnode_charclass_class intrnl;
3919                 int f = 0;
3920                 /* We use SAVEFREEPV so that when the full compile
3921                     is finished perl will clean up the allocated
3922                     minlens when it's all done. This way we don't
3923                     have to worry about freeing them when we know
3924                     they wont be used, which would be a pain.
3925                  */
3926                 I32 *minnextp;
3927                 Newx( minnextp, 1, I32 );
3928                 SAVEFREEPV(minnextp);
3929
3930                 if (data) {
3931                     StructCopy(data, &data_fake, scan_data_t);
3932                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3933                         f |= SCF_DO_SUBSTR;
3934                         if (scan->flags)
3935                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3936                         data_fake.last_found=newSVsv(data->last_found);
3937                     }
3938                 }
3939                 else
3940                     data_fake.last_closep = &fake;
3941                 data_fake.flags = 0;
3942                 data_fake.pos_delta = delta;
3943                 if (is_inf)
3944                     data_fake.flags |= SF_IS_INF;
3945                 if ( flags & SCF_DO_STCLASS && !scan->flags
3946                      && OP(scan) == IFMATCH ) { /* Lookahead */
3947                     cl_init(pRExC_state, &intrnl);
3948                     data_fake.start_class = &intrnl;
3949                     f |= SCF_DO_STCLASS_AND;
3950                 }
3951                 if (flags & SCF_WHILEM_VISITED_POS)
3952                     f |= SCF_WHILEM_VISITED_POS;
3953                 next = regnext(scan);
3954                 nscan = NEXTOPER(NEXTOPER(scan));
3955
3956                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3957                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3958                 if (scan->flags) {
3959                     if (deltanext) {
3960                         FAIL("Variable length lookbehind not implemented");
3961                     }
3962                     else if (*minnextp > (I32)U8_MAX) {
3963                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3964                     }
3965                     scan->flags = (U8)*minnextp;
3966                 }
3967
3968                 *minnextp += min;
3969
3970                 if (f & SCF_DO_STCLASS_AND) {
3971                     const int was = (data->start_class->flags & ANYOF_EOS);
3972
3973                     cl_and(data->start_class, &intrnl);
3974                     if (was)
3975                         data->start_class->flags |= ANYOF_EOS;
3976                 }
3977                 if (data) {
3978                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3979                         pars++;
3980                     if (data_fake.flags & SF_HAS_EVAL)
3981                         data->flags |= SF_HAS_EVAL;
3982                     data->whilem_c = data_fake.whilem_c;
3983                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3984                         if (RExC_rx->minlen<*minnextp)
3985                             RExC_rx->minlen=*minnextp;
3986                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3987                         SvREFCNT_dec(data_fake.last_found);
3988
3989                         if ( data_fake.minlen_fixed != minlenp )
3990                         {
3991                             data->offset_fixed= data_fake.offset_fixed;
3992                             data->minlen_fixed= data_fake.minlen_fixed;
3993                             data->lookbehind_fixed+= scan->flags;
3994                         }
3995                         if ( data_fake.minlen_float != minlenp )
3996                         {
3997                             data->minlen_float= data_fake.minlen_float;
3998                             data->offset_float_min=data_fake.offset_float_min;
3999                             data->offset_float_max=data_fake.offset_float_max;
4000                             data->lookbehind_float+= scan->flags;
4001                         }
4002                     }
4003                 }
4004
4005
4006             }
4007 #endif
4008         }
4009         else if (OP(scan) == OPEN) {
4010             if (stopparen != (I32)ARG(scan))
4011                 pars++;
4012         }
4013         else if (OP(scan) == CLOSE) {
4014             if (stopparen == (I32)ARG(scan)) {
4015                 break;
4016             }
4017             if ((I32)ARG(scan) == is_par) {
4018                 next = regnext(scan);
4019
4020                 if ( next && (OP(next) != WHILEM) && next < last)
4021                     is_par = 0;         /* Disable optimization */
4022             }
4023             if (data)
4024                 *(data->last_closep) = ARG(scan);
4025         }
4026         else if (OP(scan) == EVAL) {
4027                 if (data)
4028                     data->flags |= SF_HAS_EVAL;
4029         }
4030         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4031             if (flags & SCF_DO_SUBSTR) {
4032                 SCAN_COMMIT(pRExC_state,data,minlenp);
4033                 flags &= ~SCF_DO_SUBSTR;
4034             }
4035             if (data && OP(scan)==ACCEPT) {
4036                 data->flags |= SCF_SEEN_ACCEPT;
4037                 if (stopmin > min)
4038                     stopmin = min;
4039             }
4040         }
4041         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4042         {
4043                 if (flags & SCF_DO_SUBSTR) {
4044                     SCAN_COMMIT(pRExC_state,data,minlenp);
4045                     data->longest = &(data->longest_float);
4046                 }
4047                 is_inf = is_inf_internal = 1;
4048                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4049                     cl_anything(pRExC_state, data->start_class);
4050                 flags &= ~SCF_DO_STCLASS;
4051         }
4052         else if (OP(scan) == GPOS) {
4053             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4054                 !(delta || is_inf || (data && data->pos_delta)))
4055             {
4056                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4057                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4058                 if (RExC_rx->gofs < (U32)min)
4059                     RExC_rx->gofs = min;
4060             } else {
4061                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4062                 RExC_rx->gofs = 0;
4063             }
4064         }
4065 #ifdef TRIE_STUDY_OPT
4066 #ifdef FULL_TRIE_STUDY
4067         else if (PL_regkind[OP(scan)] == TRIE) {
4068             /* NOTE - There is similar code to this block above for handling
4069                BRANCH nodes on the initial study.  If you change stuff here
4070                check there too. */
4071             regnode *trie_node= scan;
4072             regnode *tail= regnext(scan);
4073             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4074             I32 max1 = 0, min1 = I32_MAX;
4075             struct regnode_charclass_class accum;
4076
4077             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4078                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4079             if (flags & SCF_DO_STCLASS)
4080                 cl_init_zero(pRExC_state, &accum);
4081
4082             if (!trie->jump) {
4083                 min1= trie->minlen;
4084                 max1= trie->maxlen;
4085             } else {
4086                 const regnode *nextbranch= NULL;
4087                 U32 word;
4088
4089                 for ( word=1 ; word <= trie->wordcount ; word++)
4090                 {
4091                     I32 deltanext=0, minnext=0, f = 0, fake;
4092                     struct regnode_charclass_class this_class;
4093
4094                     data_fake.flags = 0;
4095                     if (data) {
4096                         data_fake.whilem_c = data->whilem_c;
4097                         data_fake.last_closep = data->last_closep;
4098                     }
4099                     else
4100                         data_fake.last_closep = &fake;
4101                     data_fake.pos_delta = delta;
4102                     if (flags & SCF_DO_STCLASS) {
4103                         cl_init(pRExC_state, &this_class);
4104                         data_fake.start_class = &this_class;
4105                         f = SCF_DO_STCLASS_AND;
4106                     }
4107                     if (flags & SCF_WHILEM_VISITED_POS)
4108                         f |= SCF_WHILEM_VISITED_POS;
4109
4110                     if (trie->jump[word]) {
4111                         if (!nextbranch)
4112                             nextbranch = trie_node + trie->jump[0];
4113                         scan= trie_node + trie->jump[word];
4114                         /* We go from the jump point to the branch that follows
4115                            it. Note this means we need the vestigal unused branches
4116                            even though they arent otherwise used.
4117                          */
4118                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4119                             &deltanext, (regnode *)nextbranch, &data_fake,
4120                             stopparen, recursed, NULL, f,depth+1);
4121                     }
4122                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4123                         nextbranch= regnext((regnode*)nextbranch);
4124
4125                     if (min1 > (I32)(minnext + trie->minlen))
4126                         min1 = minnext + trie->minlen;
4127                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4128                         max1 = minnext + deltanext + trie->maxlen;
4129                     if (deltanext == I32_MAX)
4130                         is_inf = is_inf_internal = 1;
4131
4132                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4133                         pars++;
4134                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4135                         if ( stopmin > min + min1)
4136                             stopmin = min + min1;
4137                         flags &= ~SCF_DO_SUBSTR;
4138                         if (data)
4139                             data->flags |= SCF_SEEN_ACCEPT;
4140                     }
4141                     if (data) {
4142                         if (data_fake.flags & SF_HAS_EVAL)
4143                             data->flags |= SF_HAS_EVAL;
4144                         data->whilem_c = data_fake.whilem_c;
4145                     }
4146                     if (flags & SCF_DO_STCLASS)
4147                         cl_or(pRExC_state, &accum, &this_class);
4148                 }
4149             }
4150             if (flags & SCF_DO_SUBSTR) {
4151                 data->pos_min += min1;
4152                 data->pos_delta += max1 - min1;
4153                 if (max1 != min1 || is_inf)
4154                     data->longest = &(data->longest_float);
4155             }
4156             min += min1;
4157             delta += max1 - min1;
4158             if (flags & SCF_DO_STCLASS_OR) {
4159                 cl_or(pRExC_state, data->start_class, &accum);
4160                 if (min1) {
4161                     cl_and(data->start_class, and_withp);
4162                     flags &= ~SCF_DO_STCLASS;
4163                 }
4164             }
4165             else if (flags & SCF_DO_STCLASS_AND) {
4166                 if (min1) {
4167                     cl_and(data->start_class, &accum);
4168                     flags &= ~SCF_DO_STCLASS;
4169                 }
4170                 else {
4171                     /* Switch to OR mode: cache the old value of
4172                      * data->start_class */
4173                     INIT_AND_WITHP;
4174                     StructCopy(data->start_class, and_withp,
4175                                struct regnode_charclass_class);
4176                     flags &= ~SCF_DO_STCLASS_AND;
4177                     StructCopy(&accum, data->start_class,
4178                                struct regnode_charclass_class);
4179                     flags |= SCF_DO_STCLASS_OR;
4180                     data->start_class->flags |= ANYOF_EOS;
4181                 }
4182             }
4183             scan= tail;
4184             continue;
4185         }
4186 #else
4187         else if (PL_regkind[OP(scan)] == TRIE) {
4188             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4189             U8*bang=NULL;
4190
4191             min += trie->minlen;
4192             delta += (trie->maxlen - trie->minlen);
4193             flags &= ~SCF_DO_STCLASS; /* xxx */
4194             if (flags & SCF_DO_SUBSTR) {
4195                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4196                 data->pos_min += trie->minlen;
4197                 data->pos_delta += (trie->maxlen - trie->minlen);
4198                 if (trie->maxlen != trie->minlen)
4199                     data->longest = &(data->longest_float);
4200             }
4201             if (trie->jump) /* no more substrings -- for now /grr*/
4202                 flags &= ~SCF_DO_SUBSTR;
4203         }
4204 #endif /* old or new */
4205 #endif /* TRIE_STUDY_OPT */
4206
4207         /* Else: zero-length, ignore. */
4208         scan = regnext(scan);
4209     }
4210     if (frame) {
4211         last = frame->last;
4212         scan = frame->next;
4213         stopparen = frame->stop;
4214         frame = frame->prev;
4215         goto fake_study_recurse;
4216     }
4217
4218   finish:
4219     assert(!frame);
4220     DEBUG_STUDYDATA("pre-fin:",data,depth);
4221
4222     *scanp = scan;
4223     *deltap = is_inf_internal ? I32_MAX : delta;
4224     if (flags & SCF_DO_SUBSTR && is_inf)
4225         data->pos_delta = I32_MAX - data->pos_min;
4226     if (is_par > (I32)U8_MAX)
4227         is_par = 0;
4228     if (is_par && pars==1 && data) {
4229         data->flags |= SF_IN_PAR;
4230         data->flags &= ~SF_HAS_PAR;
4231     }
4232     else if (pars && data) {
4233         data->flags |= SF_HAS_PAR;
4234         data->flags &= ~SF_IN_PAR;
4235     }
4236     if (flags & SCF_DO_STCLASS_OR)
4237         cl_and(data->start_class, and_withp);
4238     if (flags & SCF_TRIE_RESTUDY)
4239         data->flags |=  SCF_TRIE_RESTUDY;
4240
4241     DEBUG_STUDYDATA("post-fin:",data,depth);
4242
4243     return min < stopmin ? min : stopmin;
4244 }
4245
4246 STATIC U32
4247 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4248 {
4249     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4250
4251     PERL_ARGS_ASSERT_ADD_DATA;
4252
4253     Renewc(RExC_rxi->data,
4254            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4255            char, struct reg_data);
4256     if(count)
4257         Renew(RExC_rxi->data->what, count + n, U8);
4258     else
4259         Newx(RExC_rxi->data->what, n, U8);
4260     RExC_rxi->data->count = count + n;
4261     Copy(s, RExC_rxi->data->what + count, n, U8);
4262     return count;
4263 }
4264
4265 /*XXX: todo make this not included in a non debugging perl */
4266 #ifndef PERL_IN_XSUB_RE
4267 void
4268 Perl_reginitcolors(pTHX)
4269 {
4270     dVAR;
4271     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4272     if (s) {
4273         char *t = savepv(s);
4274         int i = 0;
4275         PL_colors[0] = t;
4276         while (++i < 6) {
4277             t = strchr(t, '\t');
4278             if (t) {
4279                 *t = '\0';
4280                 PL_colors[i] = ++t;
4281             }
4282             else
4283                 PL_colors[i] = t = (char *)"";
4284         }
4285     } else {
4286         int i = 0;
4287         while (i < 6)
4288             PL_colors[i++] = (char *)"";
4289     }
4290     PL_colorset = 1;
4291 }
4292 #endif
4293
4294
4295 #ifdef TRIE_STUDY_OPT
4296 #define CHECK_RESTUDY_GOTO                                  \
4297         if (                                                \
4298               (data.flags & SCF_TRIE_RESTUDY)               \
4299               && ! restudied++                              \
4300         )     goto reStudy
4301 #else
4302 #define CHECK_RESTUDY_GOTO
4303 #endif
4304
4305 /*
4306  - pregcomp - compile a regular expression into internal code
4307  *
4308  * We can't allocate space until we know how big the compiled form will be,
4309  * but we can't compile it (and thus know how big it is) until we've got a
4310  * place to put the code.  So we cheat:  we compile it twice, once with code
4311  * generation turned off and size counting turned on, and once "for real".
4312  * This also means that we don't allocate space until we are sure that the
4313  * thing really will compile successfully, and we never have to move the
4314  * code and thus invalidate pointers into it.  (Note that it has to be in
4315  * one piece because free() must be able to free it all.) [NB: not true in perl]
4316  *
4317  * Beware that the optimization-preparation code in here knows about some
4318  * of the structure of the compiled regexp.  [I'll say.]
4319  */
4320
4321
4322
4323 #ifndef PERL_IN_XSUB_RE
4324 #define RE_ENGINE_PTR &PL_core_reg_engine
4325 #else
4326 extern const struct regexp_engine my_reg_engine;
4327 #define RE_ENGINE_PTR &my_reg_engine
4328 #endif
4329
4330 #ifndef PERL_IN_XSUB_RE
4331 REGEXP *
4332 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4333 {
4334     dVAR;
4335     HV * const table = GvHV(PL_hintgv);
4336
4337     PERL_ARGS_ASSERT_PREGCOMP;
4338
4339     /* Dispatch a request to compile a regexp to correct
4340        regexp engine. */
4341     if (table) {
4342         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4343         GET_RE_DEBUG_FLAGS_DECL;
4344         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4345             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4346             DEBUG_COMPILE_r({
4347                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4348                     SvIV(*ptr));
4349             });
4350             return CALLREGCOMP_ENG(eng, pattern, flags);
4351         }
4352     }
4353     return Perl_re_compile(aTHX_ pattern, flags);
4354 }
4355 #endif
4356
4357 REGEXP *
4358 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4359 {
4360     dVAR;
4361     REGEXP *rx;
4362     struct regexp *r;
4363     register regexp_internal *ri;
4364     STRLEN plen;
4365     char  *exp;
4366     char* xend;
4367     regnode *scan;
4368     I32 flags;
4369     I32 minlen = 0;
4370     U32 pm_flags;
4371
4372     /* these are all flags - maybe they should be turned
4373      * into a single int with different bit masks */
4374     I32 sawlookahead = 0;
4375     I32 sawplus = 0;
4376     I32 sawopen = 0;
4377     bool used_setjump = FALSE;
4378
4379     U8 jump_ret = 0;
4380     dJMPENV;
4381     scan_data_t data;
4382     RExC_state_t RExC_state;
4383     RExC_state_t * const pRExC_state = &RExC_state;
4384 #ifdef TRIE_STUDY_OPT
4385     int restudied;
4386     RExC_state_t copyRExC_state;
4387 #endif
4388     GET_RE_DEBUG_FLAGS_DECL;
4389
4390     PERL_ARGS_ASSERT_RE_COMPILE;
4391
4392     DEBUG_r(if (!PL_colorset) reginitcolors());
4393
4394     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4395
4396     /****************** LONG JUMP TARGET HERE***********************/
4397     /* Longjmp back to here if have to switch in midstream to utf8 */
4398     if (! RExC_orig_utf8) {
4399         JMPENV_PUSH(jump_ret);
4400         used_setjump = TRUE;
4401     }
4402
4403     if (jump_ret == 0) {    /* First time through */
4404         exp = SvPV(pattern, plen);
4405         xend = exp + plen;
4406         /* ignore the utf8ness if the pattern is 0 length */
4407         if (plen == 0) {
4408             RExC_utf8 = RExC_orig_utf8 = 0;
4409         }
4410
4411         DEBUG_COMPILE_r({
4412             SV *dsv= sv_newmortal();
4413             RE_PV_QUOTED_DECL(s, RExC_utf8,
4414                 dsv, exp, plen, 60);
4415             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4416                            PL_colors[4],PL_colors[5],s);
4417         });
4418     }
4419     else {  /* longjumped back */
4420         STRLEN len = plen;
4421
4422         /* If the cause for the longjmp was other than changing to utf8, pop
4423          * our own setjmp, and longjmp to the correct handler */
4424         if (jump_ret != UTF8_LONGJMP) {
4425             JMPENV_POP;
4426             JMPENV_JUMP(jump_ret);
4427         }
4428
4429         GET_RE_DEBUG_FLAGS;
4430
4431         /* It's possible to write a regexp in ascii that represents Unicode
4432         codepoints outside of the byte range, such as via \x{100}. If we
4433         detect such a sequence we have to convert the entire pattern to utf8
4434         and then recompile, as our sizing calculation will have been based
4435         on 1 byte == 1 character, but we will need to use utf8 to encode
4436         at least some part of the pattern, and therefore must convert the whole
4437         thing.
4438         -- dmq */
4439         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4440             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4441         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4442         xend = exp + len;
4443         RExC_orig_utf8 = RExC_utf8 = 1;
4444         SAVEFREEPV(exp);
4445     }
4446
4447 #ifdef TRIE_STUDY_OPT
4448     restudied = 0;
4449 #endif
4450
4451     /* Set to use unicode semantics if the pattern is in utf8 and has the
4452      * 'depends' charset specified, as it means unicode when utf8  */
4453     pm_flags = orig_pm_flags;
4454
4455     if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4456         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4457     }
4458
4459     RExC_precomp = exp;
4460     RExC_flags = pm_flags;
4461     RExC_sawback = 0;
4462
4463     RExC_seen = 0;
4464     RExC_in_lookbehind = 0;
4465     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4466     RExC_seen_evals = 0;
4467     RExC_extralen = 0;
4468
4469     /* First pass: determine size, legality. */
4470     RExC_parse = exp;
4471     RExC_start = exp;
4472     RExC_end = xend;
4473     RExC_naughty = 0;
4474     RExC_npar = 1;
4475     RExC_nestroot = 0;
4476     RExC_size = 0L;
4477     RExC_emit = &PL_regdummy;
4478     RExC_whilem_seen = 0;
4479     RExC_open_parens = NULL;
4480     RExC_close_parens = NULL;
4481     RExC_opend = NULL;
4482     RExC_paren_names = NULL;
4483 #ifdef DEBUGGING
4484     RExC_paren_name_list = NULL;
4485 #endif
4486     RExC_recurse = NULL;
4487     RExC_recurse_count = 0;
4488
4489 #if 0 /* REGC() is (currently) a NOP at the first pass.
4490        * Clever compilers notice this and complain. --jhi */
4491     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4492 #endif
4493     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4494     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4495         RExC_precomp = NULL;
4496         return(NULL);
4497     }
4498
4499     /* Here, finished first pass.  Get rid of any added setjmp */
4500     if (used_setjump) {
4501         JMPENV_POP;
4502     }
4503     DEBUG_PARSE_r({
4504         PerlIO_printf(Perl_debug_log,
4505             "Required size %"IVdf" nodes\n"
4506             "Starting second pass (creation)\n",
4507             (IV)RExC_size);
4508         RExC_lastnum=0;
4509         RExC_lastparse=NULL;
4510     });
4511     /* Small enough for pointer-storage convention?
4512        If extralen==0, this means that we will not need long jumps. */
4513     if (RExC_size >= 0x10000L && RExC_extralen)
4514         RExC_size += RExC_extralen;
4515     else
4516         RExC_extralen = 0;
4517     if (RExC_whilem_seen > 15)
4518         RExC_whilem_seen = 15;
4519
4520     /* Allocate space and zero-initialize. Note, the two step process
4521        of zeroing when in debug mode, thus anything assigned has to
4522        happen after that */
4523     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4524     r = (struct regexp*)SvANY(rx);
4525     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4526          char, regexp_internal);
4527     if ( r == NULL || ri == NULL )
4528         FAIL("Regexp out of space");
4529 #ifdef DEBUGGING
4530     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4531     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4532 #else
4533     /* bulk initialize base fields with 0. */
4534     Zero(ri, sizeof(regexp_internal), char);
4535 #endif
4536
4537     /* non-zero initialization begins here */
4538     RXi_SET( r, ri );
4539     r->engine= RE_ENGINE_PTR;
4540     r->extflags = pm_flags;
4541     {
4542         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4543         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4544
4545         /* The caret is output if there are any defaults: if not all the STD
4546          * flags are set, or if no character set specifier is needed */
4547         bool has_default =
4548                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4549                     || ! has_charset);
4550         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4551         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4552                             >> RXf_PMf_STD_PMMOD_SHIFT);
4553         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4554         char *p;
4555         /* Allocate for the worst case, which is all the std flags are turned
4556          * on.  If more precision is desired, we could do a population count of
4557          * the flags set.  This could be done with a small lookup table, or by
4558          * shifting, masking and adding, or even, when available, assembly
4559          * language for a machine-language population count.
4560          * We never output a minus, as all those are defaults, so are
4561          * covered by the caret */
4562         const STRLEN wraplen = plen + has_p + has_runon
4563             + has_default       /* If needs a caret */
4564
4565                 /* If needs a character set specifier */
4566             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4567             + (sizeof(STD_PAT_MODS) - 1)
4568             + (sizeof("(?:)") - 1);
4569
4570         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4571         SvPOK_on(rx);
4572         SvFLAGS(rx) |= SvUTF8(pattern);
4573         *p++='('; *p++='?';
4574
4575         /* If a default, cover it using the caret */
4576         if (has_default) {
4577             *p++= DEFAULT_PAT_MOD;
4578         }
4579         if (has_charset) {
4580             STRLEN len;
4581             const char* const name = get_regex_charset_name(r->extflags, &len);
4582             Copy(name, p, len, char);
4583             p += len;
4584         }
4585         if (has_p)
4586             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4587         {
4588             char ch;
4589             while((ch = *fptr++)) {
4590                 if(reganch & 1)
4591                     *p++ = ch;
4592                 reganch >>= 1;
4593             }
4594         }
4595
4596         *p++ = ':';
4597         Copy(RExC_precomp, p, plen, char);
4598         assert ((RX_WRAPPED(rx) - p) < 16);
4599         r->pre_prefix = p - RX_WRAPPED(rx);
4600         p += plen;
4601         if (has_runon)
4602             *p++ = '\n';
4603         *p++ = ')';
4604         *p = 0;
4605         SvCUR_set(rx, p - SvPVX_const(rx));
4606     }
4607
4608     r->intflags = 0;
4609     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4610
4611     if (RExC_seen & REG_SEEN_RECURSE) {
4612         Newxz(RExC_open_parens, RExC_npar,regnode *);
4613         SAVEFREEPV(RExC_open_parens);
4614         Newxz(RExC_close_parens,RExC_npar,regnode *);
4615         SAVEFREEPV(RExC_close_parens);
4616     }
4617
4618     /* Useful during FAIL. */
4619 #ifdef RE_TRACK_PATTERN_OFFSETS
4620     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4621     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4622                           "%s %"UVuf" bytes for offset annotations.\n",
4623                           ri->u.offsets ? "Got" : "Couldn't get",
4624                           (UV)((2*RExC_size+1) * sizeof(U32))));
4625 #endif
4626     SetProgLen(ri,RExC_size);
4627     RExC_rx_sv = rx;
4628     RExC_rx = r;
4629     RExC_rxi = ri;
4630     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4631
4632     /* Second pass: emit code. */
4633     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4634     RExC_parse = exp;
4635     RExC_end = xend;
4636     RExC_naughty = 0;
4637     RExC_npar = 1;
4638     RExC_emit_start = ri->program;
4639     RExC_emit = ri->program;
4640     RExC_emit_bound = ri->program + RExC_size + 1;
4641
4642     /* Store the count of eval-groups for security checks: */
4643     RExC_rx->seen_evals = RExC_seen_evals;
4644     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4645     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4646         ReREFCNT_dec(rx);
4647         return(NULL);
4648     }
4649     /* XXXX To minimize changes to RE engine we always allocate
4650        3-units-long substrs field. */
4651     Newx(r->substrs, 1, struct reg_substr_data);
4652     if (RExC_recurse_count) {
4653         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4654         SAVEFREEPV(RExC_recurse);
4655     }
4656
4657 reStudy:
4658     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4659     Zero(r->substrs, 1, struct reg_substr_data);
4660
4661 #ifdef TRIE_STUDY_OPT
4662     if (!restudied) {
4663         StructCopy(&zero_scan_data, &data, scan_data_t);
4664         copyRExC_state = RExC_state;
4665     } else {
4666         U32 seen=RExC_seen;
4667         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4668
4669         RExC_state = copyRExC_state;
4670         if (seen & REG_TOP_LEVEL_BRANCHES)
4671             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4672         else
4673             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4674         if (data.last_found) {
4675             SvREFCNT_dec(data.longest_fixed);
4676             SvREFCNT_dec(data.longest_float);
4677             SvREFCNT_dec(data.last_found);
4678         }
4679         StructCopy(&zero_scan_data, &data, scan_data_t);
4680     }
4681 #else
4682     StructCopy(&zero_scan_data, &data, scan_data_t);
4683 #endif
4684
4685     /* Dig out information for optimizations. */
4686     r->extflags = RExC_flags; /* was pm_op */
4687     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4688
4689     if (UTF)
4690         SvUTF8_on(rx);  /* Unicode in it? */
4691     ri->regstclass = NULL;
4692     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4693         r->intflags |= PREGf_NAUGHTY;
4694     scan = ri->program + 1;             /* First BRANCH. */
4695
4696     /* testing for BRANCH here tells us whether there is "must appear"
4697        data in the pattern. If there is then we can use it for optimisations */
4698     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4699         I32 fake;
4700         STRLEN longest_float_length, longest_fixed_length;
4701         struct regnode_charclass_class ch_class; /* pointed to by data */
4702         int stclass_flag;
4703         I32 last_close = 0; /* pointed to by data */
4704         regnode *first= scan;
4705         regnode *first_next= regnext(first);
4706         /*
4707          * Skip introductions and multiplicators >= 1
4708          * so that we can extract the 'meat' of the pattern that must
4709          * match in the large if() sequence following.
4710          * NOTE that EXACT is NOT covered here, as it is normally
4711          * picked up by the optimiser separately.
4712          *
4713          * This is unfortunate as the optimiser isnt handling lookahead
4714          * properly currently.
4715          *
4716          */
4717         while ((OP(first) == OPEN && (sawopen = 1)) ||
4718                /* An OR of *one* alternative - should not happen now. */
4719             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4720             /* for now we can't handle lookbehind IFMATCH*/
4721             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4722             (OP(first) == PLUS) ||
4723             (OP(first) == MINMOD) ||
4724                /* An {n,m} with n>0 */
4725             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4726             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4727         {
4728                 /*
4729                  * the only op that could be a regnode is PLUS, all the rest
4730                  * will be regnode_1 or regnode_2.
4731                  *
4732                  */
4733                 if (OP(first) == PLUS)
4734                     sawplus = 1;
4735                 else
4736                     first += regarglen[OP(first)];
4737
4738                 first = NEXTOPER(first);
4739                 first_next= regnext(first);
4740         }
4741
4742         /* Starting-point info. */
4743       again:
4744         DEBUG_PEEP("first:",first,0);
4745         /* Ignore EXACT as we deal with it later. */
4746         if (PL_regkind[OP(first)] == EXACT) {
4747             if (OP(first) == EXACT)
4748                 NOOP;   /* Empty, get anchored substr later. */
4749             else
4750                 ri->regstclass = first;
4751         }
4752 #ifdef TRIE_STCLASS
4753         else if (PL_regkind[OP(first)] == TRIE &&
4754                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4755         {
4756             regnode *trie_op;
4757             /* this can happen only on restudy */
4758             if ( OP(first) == TRIE ) {
4759                 struct regnode_1 *trieop = (struct regnode_1 *)
4760                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4761                 StructCopy(first,trieop,struct regnode_1);
4762                 trie_op=(regnode *)trieop;
4763             } else {
4764                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4765                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4766                 StructCopy(first,trieop,struct regnode_charclass);
4767                 trie_op=(regnode *)trieop;
4768             }
4769             OP(trie_op)+=2;
4770             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4771             ri->regstclass = trie_op;
4772         }
4773 #endif
4774         else if (REGNODE_SIMPLE(OP(first)))
4775             ri->regstclass = first;
4776         else if (PL_regkind[OP(first)] == BOUND ||
4777                  PL_regkind[OP(first)] == NBOUND)
4778             ri->regstclass = first;
4779         else if (PL_regkind[OP(first)] == BOL) {
4780             r->extflags |= (OP(first) == MBOL
4781                            ? RXf_ANCH_MBOL
4782                            : (OP(first) == SBOL
4783                               ? RXf_ANCH_SBOL
4784                               : RXf_ANCH_BOL));
4785             first = NEXTOPER(first);
4786             goto again;
4787         }
4788         else if (OP(first) == GPOS) {
4789             r->extflags |= RXf_ANCH_GPOS;
4790             first = NEXTOPER(first);
4791             goto again;
4792         }
4793         else if ((!sawopen || !RExC_sawback) &&
4794             (OP(first) == STAR &&
4795             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4796             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4797         {
4798             /* turn .* into ^.* with an implied $*=1 */
4799             const int type =
4800                 (OP(NEXTOPER(first)) == REG_ANY)
4801                     ? RXf_ANCH_MBOL
4802                     : RXf_ANCH_SBOL;
4803             r->extflags |= type;
4804             r->intflags |= PREGf_IMPLICIT;
4805             first = NEXTOPER(first);
4806             goto again;
4807         }
4808         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4809             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4810             /* x+ must match at the 1st pos of run of x's */
4811             r->intflags |= PREGf_SKIP;
4812
4813         /* Scan is after the zeroth branch, first is atomic matcher. */
4814 #ifdef TRIE_STUDY_OPT
4815         DEBUG_PARSE_r(
4816             if (!restudied)
4817                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4818                               (IV)(first - scan + 1))
4819         );
4820 #else
4821         DEBUG_PARSE_r(
4822             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4823                 (IV)(first - scan + 1))
4824         );
4825 #endif
4826
4827
4828         /*
4829         * If there's something expensive in the r.e., find the
4830         * longest literal string that must appear and make it the
4831         * regmust.  Resolve ties in favor of later strings, since
4832         * the regstart check works with the beginning of the r.e.
4833         * and avoiding duplication strengthens checking.  Not a
4834         * strong reason, but sufficient in the absence of others.
4835         * [Now we resolve ties in favor of the earlier string if
4836         * it happens that c_offset_min has been invalidated, since the
4837         * earlier string may buy us something the later one won't.]
4838         */
4839
4840         data.longest_fixed = newSVpvs("");
4841         data.longest_float = newSVpvs("");
4842         data.last_found = newSVpvs("");
4843         data.longest = &(data.longest_fixed);
4844         first = scan;
4845         if (!ri->regstclass) {
4846             cl_init(pRExC_state, &ch_class);
4847             data.start_class = &ch_class;
4848             stclass_flag = SCF_DO_STCLASS_AND;
4849         } else                          /* XXXX Check for BOUND? */
4850             stclass_flag = 0;
4851         data.last_closep = &last_close;
4852
4853         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4854             &data, -1, NULL, NULL,
4855             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4856
4857
4858         CHECK_RESTUDY_GOTO;
4859
4860
4861         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4862              && data.last_start_min == 0 && data.last_end > 0
4863              && !RExC_seen_zerolen
4864              && !(RExC_seen & REG_SEEN_VERBARG)
4865              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4866             r->extflags |= RXf_CHECK_ALL;
4867         scan_commit(pRExC_state, &data,&minlen,0);
4868         SvREFCNT_dec(data.last_found);
4869
4870         /* Note that code very similar to this but for anchored string
4871            follows immediately below, changes may need to be made to both.
4872            Be careful.
4873          */
4874         longest_float_length = CHR_SVLEN(data.longest_float);
4875         if (longest_float_length
4876             || (data.flags & SF_FL_BEFORE_EOL
4877                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4878                     || (RExC_flags & RXf_PMf_MULTILINE))))
4879         {
4880             I32 t,ml;
4881
4882             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4883                 && data.offset_fixed == data.offset_float_min
4884                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4885                     goto remove_float;          /* As in (a)+. */
4886
4887             /* copy the information about the longest float from the reg_scan_data
4888                over to the program. */
4889             if (SvUTF8(data.longest_float)) {
4890                 r->float_utf8 = data.longest_float;
4891                 r->float_substr = NULL;
4892             } else {
4893                 r->float_substr = data.longest_float;
4894                 r->float_utf8 = NULL;
4895             }
4896             /* float_end_shift is how many chars that must be matched that
4897                follow this item. We calculate it ahead of time as once the
4898                lookbehind offset is added in we lose the ability to correctly
4899                calculate it.*/
4900             ml = data.minlen_float ? *(data.minlen_float)
4901                                    : (I32)longest_float_length;
4902             r->float_end_shift = ml - data.offset_float_min
4903                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4904                 + data.lookbehind_float;
4905             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4906             r->float_max_offset = data.offset_float_max;
4907             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4908                 r->float_max_offset -= data.lookbehind_float;
4909
4910             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4911                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4912                            || (RExC_flags & RXf_PMf_MULTILINE)));
4913             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4914         }
4915         else {
4916           remove_float:
4917             r->float_substr = r->float_utf8 = NULL;
4918             SvREFCNT_dec(data.longest_float);
4919             longest_float_length = 0;
4920         }
4921
4922         /* Note that code very similar to this but for floating string
4923            is immediately above, changes may need to be made to both.
4924            Be careful.
4925          */
4926         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4927         if (longest_fixed_length
4928             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4929                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4930                     || (RExC_flags & RXf_PMf_MULTILINE))))
4931         {
4932             I32 t,ml;
4933
4934             /* copy the information about the longest fixed
4935                from the reg_scan_data over to the program. */
4936             if (SvUTF8(data.longest_fixed)) {
4937                 r->anchored_utf8 = data.longest_fixed;
4938                 r->anchored_substr = NULL;
4939             } else {
4940                 r->anchored_substr = data.longest_fixed;
4941                 r->anchored_utf8 = NULL;
4942             }
4943             /* fixed_end_shift is how many chars that must be matched that
4944                follow this item. We calculate it ahead of time as once the
4945                lookbehind offset is added in we lose the ability to correctly
4946                calculate it.*/
4947             ml = data.minlen_fixed ? *(data.minlen_fixed)
4948                                    : (I32)longest_fixed_length;
4949             r->anchored_end_shift = ml - data.offset_fixed
4950                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4951                 + data.lookbehind_fixed;
4952             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4953
4954             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4955                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4956                      || (RExC_flags & RXf_PMf_MULTILINE)));
4957             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4958         }
4959         else {
4960             r->anchored_substr = r->anchored_utf8 = NULL;
4961             SvREFCNT_dec(data.longest_fixed);
4962             longest_fixed_length = 0;
4963         }
4964         if (ri->regstclass
4965             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4966             ri->regstclass = NULL;
4967
4968         /* If the synthetic start class were to ever be used when EOS is set,
4969          * that bit would have to be cleared, as it is shared with another */
4970         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4971             && stclass_flag
4972             && !(data.start_class->flags & ANYOF_EOS)
4973             && !cl_is_anything(data.start_class))
4974         {
4975             const U32 n = add_data(pRExC_state, 1, "f");
4976
4977             Newx(RExC_rxi->data->data[n], 1,
4978                 struct regnode_charclass_class);
4979             StructCopy(data.start_class,
4980                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4981                        struct regnode_charclass_class);
4982             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4983             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4984             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4985                       regprop(r, sv, (regnode*)data.start_class);
4986                       PerlIO_printf(Perl_debug_log,
4987                                     "synthetic stclass \"%s\".\n",
4988                                     SvPVX_const(sv));});
4989         }
4990
4991         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4992         if (longest_fixed_length > longest_float_length) {
4993             r->check_end_shift = r->anchored_end_shift;
4994             r->check_substr = r->anchored_substr;
4995             r->check_utf8 = r->anchored_utf8;
4996             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4997             if (r->extflags & RXf_ANCH_SINGLE)
4998                 r->extflags |= RXf_NOSCAN;
4999         }
5000         else {
5001             r->check_end_shift = r->float_end_shift;
5002             r->check_substr = r->float_substr;
5003             r->check_utf8 = r->float_utf8;
5004             r->check_offset_min = r->float_min_offset;
5005             r->check_offset_max = r->float_max_offset;
5006         }
5007         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5008            This should be changed ASAP!  */
5009         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5010             r->extflags |= RXf_USE_INTUIT;
5011             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5012                 r->extflags |= RXf_INTUIT_TAIL;
5013         }
5014         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5015         if ( (STRLEN)minlen < longest_float_length )
5016             minlen= longest_float_length;
5017         if ( (STRLEN)minlen < longest_fixed_length )
5018             minlen= longest_fixed_length;
5019         */
5020     }
5021     else {
5022         /* Several toplevels. Best we can is to set minlen. */
5023         I32 fake;
5024         struct regnode_charclass_class ch_class;
5025         I32 last_close = 0;
5026
5027         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5028
5029         scan = ri->program + 1;
5030         cl_init(pRExC_state, &ch_class);
5031         data.start_class = &ch_class;
5032         data.last_closep = &last_close;
5033
5034
5035         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5036             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5037
5038         CHECK_RESTUDY_GOTO;
5039
5040         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5041                 = r->float_substr = r->float_utf8 = NULL;
5042
5043         /* If the synthetic start class were to ever be used when EOS is set,
5044          * that bit would have to be cleared, as it is shared with another */
5045         if (!(data.start_class->flags & ANYOF_EOS)
5046             && !cl_is_anything(data.start_class))
5047         {
5048             const U32 n = add_data(pRExC_state, 1, "f");
5049
5050             Newx(RExC_rxi->data->data[n], 1,
5051                 struct regnode_charclass_class);
5052             StructCopy(data.start_class,
5053                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5054                        struct regnode_charclass_class);
5055             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5056             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5057             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5058                       regprop(r, sv, (regnode*)data.start_class);
5059                       PerlIO_printf(Perl_debug_log,
5060                                     "synthetic stclass \"%s\".\n",
5061                                     SvPVX_const(sv));});
5062         }
5063     }
5064
5065     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5066        the "real" pattern. */
5067     DEBUG_OPTIMISE_r({
5068         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5069                       (IV)minlen, (IV)r->minlen);
5070     });
5071     r->minlenret = minlen;
5072     if (r->minlen < minlen)
5073         r->minlen = minlen;
5074
5075     if (RExC_seen & REG_SEEN_GPOS)
5076         r->extflags |= RXf_GPOS_SEEN;
5077     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5078         r->extflags |= RXf_LOOKBEHIND_SEEN;
5079     if (RExC_seen & REG_SEEN_EVAL)
5080         r->extflags |= RXf_EVAL_SEEN;
5081     if (RExC_seen & REG_SEEN_CANY)
5082         r->extflags |= RXf_CANY_SEEN;
5083     if (RExC_seen & REG_SEEN_VERBARG)
5084         r->intflags |= PREGf_VERBARG_SEEN;
5085     if (RExC_seen & REG_SEEN_CUTGROUP)
5086         r->intflags |= PREGf_CUTGROUP_SEEN;
5087     if (RExC_paren_names)
5088         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5089     else
5090         RXp_PAREN_NAMES(r) = NULL;
5091
5092 #ifdef STUPID_PATTERN_CHECKS
5093     if (RX_PRELEN(rx) == 0)
5094         r->extflags |= RXf_NULL;
5095     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5096         /* XXX: this should happen BEFORE we compile */
5097         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5098     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5099         r->extflags |= RXf_WHITE;
5100     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5101         r->extflags |= RXf_START_ONLY;
5102 #else
5103     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5104             /* XXX: this should happen BEFORE we compile */
5105             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5106     else {
5107         regnode *first = ri->program + 1;
5108         U8 fop = OP(first);
5109         U8 nop = OP(NEXTOPER(first));
5110
5111         if (PL_regkind[fop] == NOTHING && nop == END)
5112             r->extflags |= RXf_NULL;
5113         else if (PL_regkind[fop] == BOL && nop == END)
5114             r->extflags |= RXf_START_ONLY;
5115         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5116             r->extflags |= RXf_WHITE;
5117     }
5118 #endif
5119 #ifdef DEBUGGING
5120     if (RExC_paren_names) {
5121         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5122         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5123     } else
5124 #endif
5125         ri->name_list_idx = 0;
5126
5127     if (RExC_recurse_count) {
5128         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5129             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5130             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5131         }
5132     }
5133     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5134     /* assume we don't need to swap parens around before we match */
5135
5136     DEBUG_DUMP_r({
5137         PerlIO_printf(Perl_debug_log,"Final program:\n");
5138         regdump(r);
5139     });
5140 #ifdef RE_TRACK_PATTERN_OFFSETS
5141     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5142         const U32 len = ri->u.offsets[0];
5143         U32 i;
5144         GET_RE_DEBUG_FLAGS_DECL;
5145         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5146         for (i = 1; i <= len; i++) {
5147             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5148                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5149                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5150             }
5151         PerlIO_printf(Perl_debug_log, "\n");
5152     });
5153 #endif
5154     return rx;
5155 }
5156
5157 #undef RE_ENGINE_PTR
5158
5159
5160 SV*
5161 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5162                     const U32 flags)
5163 {
5164     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5165
5166     PERL_UNUSED_ARG(value);
5167
5168     if (flags & RXapif_FETCH) {
5169         return reg_named_buff_fetch(rx, key, flags);
5170     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5171         Perl_croak_no_modify(aTHX);
5172         return NULL;
5173     } else if (flags & RXapif_EXISTS) {
5174         return reg_named_buff_exists(rx, key, flags)
5175             ? &PL_sv_yes
5176             : &PL_sv_no;
5177     } else if (flags & RXapif_REGNAMES) {
5178         return reg_named_buff_all(rx, flags);
5179     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5180         return reg_named_buff_scalar(rx, flags);
5181     } else {
5182         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5183         return NULL;
5184     }
5185 }
5186
5187 SV*
5188 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5189                          const U32 flags)
5190 {
5191     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5192     PERL_UNUSED_ARG(lastkey);
5193
5194     if (flags & RXapif_FIRSTKEY)
5195         return reg_named_buff_firstkey(rx, flags);
5196     else if (flags & RXapif_NEXTKEY)
5197         return reg_named_buff_nextkey(rx, flags);
5198     else {
5199         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5200         return NULL;
5201     }
5202 }
5203
5204 SV*
5205 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5206                           const U32 flags)
5207 {
5208     AV *retarray = NULL;
5209     SV *ret;
5210     struct regexp *const rx = (struct regexp *)SvANY(r);
5211
5212     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5213
5214     if (flags & RXapif_ALL)
5215         retarray=newAV();
5216
5217     if (rx && RXp_PAREN_NAMES(rx)) {
5218         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5219         if (he_str) {
5220             IV i;
5221             SV* sv_dat=HeVAL(he_str);
5222             I32 *nums=(I32*)SvPVX(sv_dat);
5223             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5224                 if ((I32)(rx->nparens) >= nums[i]
5225                     && rx->offs[nums[i]].start != -1
5226                     && rx->offs[nums[i]].end != -1)
5227                 {
5228                     ret = newSVpvs("");
5229                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5230                     if (!retarray)
5231                         return ret;
5232                 } else {
5233                     ret = newSVsv(&PL_sv_undef);
5234                 }
5235                 if (retarray)
5236                     av_push(retarray, ret);
5237             }
5238             if (retarray)
5239                 return newRV_noinc(MUTABLE_SV(retarray));
5240         }
5241     }
5242     return NULL;
5243 }
5244
5245 bool
5246 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5247                            const U32 flags)
5248 {
5249     struct regexp *const rx = (struct regexp *)SvANY(r);
5250
5251     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5252
5253     if (rx && RXp_PAREN_NAMES(rx)) {
5254         if (flags & RXapif_ALL) {
5255             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5256         } else {
5257             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5258             if (sv) {
5259                 SvREFCNT_dec(sv);
5260                 return TRUE;
5261             } else {
5262                 return FALSE;
5263             }
5264         }
5265     } else {
5266         return FALSE;
5267     }
5268 }
5269
5270 SV*
5271 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5272 {
5273     struct regexp *const rx = (struct regexp *)SvANY(r);
5274
5275     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5276
5277     if ( rx && RXp_PAREN_NAMES(rx) ) {
5278         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5279
5280         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5281     } else {
5282         return FALSE;
5283     }
5284 }
5285
5286 SV*
5287 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5288 {
5289     struct regexp *const rx = (struct regexp *)SvANY(r);
5290     GET_RE_DEBUG_FLAGS_DECL;
5291
5292     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5293
5294     if (rx && RXp_PAREN_NAMES(rx)) {
5295         HV *hv = RXp_PAREN_NAMES(rx);
5296         HE *temphe;
5297         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5298             IV i;
5299             IV parno = 0;
5300             SV* sv_dat = HeVAL(temphe);
5301             I32 *nums = (I32*)SvPVX(sv_dat);
5302             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5303                 if ((I32)(rx->lastparen) >= nums[i] &&
5304                     rx->offs[nums[i]].start != -1 &&
5305                     rx->offs[nums[i]].end != -1)
5306                 {
5307                     parno = nums[i];
5308                     break;
5309                 }
5310             }
5311             if (parno || flags & RXapif_ALL) {
5312                 return newSVhek(HeKEY_hek(temphe));
5313             }
5314         }
5315     }
5316     return NULL;
5317 }
5318
5319 SV*
5320 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5321 {
5322     SV *ret;
5323     AV *av;
5324     I32 length;
5325     struct regexp *const rx = (struct regexp *)SvANY(r);
5326
5327     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5328
5329     if (rx && RXp_PAREN_NAMES(rx)) {
5330         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5331             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5332         } else if (flags & RXapif_ONE) {
5333             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5334             av = MUTABLE_AV(SvRV(ret));
5335             length = av_len(av);
5336             SvREFCNT_dec(ret);
5337             return newSViv(length + 1);
5338         } else {
5339             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5340             return NULL;
5341         }
5342     }
5343     return &PL_sv_undef;
5344 }
5345
5346 SV*
5347 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5348 {
5349     struct regexp *const rx = (struct regexp *)SvANY(r);
5350     AV *av = newAV();
5351
5352     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5353
5354     if (rx && RXp_PAREN_NAMES(rx)) {
5355         HV *hv= RXp_PAREN_NAMES(rx);
5356         HE *temphe;
5357         (void)hv_iterinit(hv);
5358         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5359             IV i;
5360             IV parno = 0;
5361             SV* sv_dat = HeVAL(temphe);
5362             I32 *nums = (I32*)SvPVX(sv_dat);
5363             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5364                 if ((I32)(rx->lastparen) >= nums[i] &&
5365                     rx->offs[nums[i]].start != -1 &&
5366                     rx->offs[nums[i]].end != -1)
5367                 {
5368                     parno = nums[i];
5369                     break;
5370                 }
5371             }
5372             if (parno || flags & RXapif_ALL) {
5373                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5374             }
5375         }
5376     }
5377
5378     return newRV_noinc(MUTABLE_SV(av));
5379 }
5380
5381 void
5382 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5383                              SV * const sv)
5384 {
5385     struct regexp *const rx = (struct regexp *)SvANY(r);
5386     char *s = NULL;
5387     I32 i = 0;
5388     I32 s1, t1;
5389
5390     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5391
5392     if (!rx->subbeg) {
5393         sv_setsv(sv,&PL_sv_undef);
5394         return;
5395     }
5396     else
5397     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5398         /* $` */
5399         i = rx->offs[0].start;
5400         s = rx->subbeg;
5401     }
5402     else
5403     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5404         /* $' */
5405         s = rx->subbeg + rx->offs[0].end;
5406         i = rx->sublen - rx->offs[0].end;
5407     }
5408     else
5409     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5410         (s1 = rx->offs[paren].start) != -1 &&
5411         (t1 = rx->offs[paren].end) != -1)
5412     {
5413         /* $& $1 ... */
5414         i = t1 - s1;
5415         s = rx->subbeg + s1;
5416     } else {
5417         sv_setsv(sv,&PL_sv_undef);
5418         return;
5419     }
5420     assert(rx->sublen >= (s - rx->subbeg) + i );
5421     if (i >= 0) {
5422         const int oldtainted = PL_tainted;
5423         TAINT_NOT;
5424         sv_setpvn(sv, s, i);
5425         PL_tainted = oldtainted;
5426         if ( (rx->extflags & RXf_CANY_SEEN)
5427             ? (RXp_MATCH_UTF8(rx)
5428                         && (!i || is_utf8_string((U8*)s, i)))
5429             : (RXp_MATCH_UTF8(rx)) )
5430         {
5431             SvUTF8_on(sv);
5432         }
5433         else
5434             SvUTF8_off(sv);
5435         if (PL_tainting) {
5436             if (RXp_MATCH_TAINTED(rx)) {
5437                 if (SvTYPE(sv) >= SVt_PVMG) {
5438                     MAGIC* const mg = SvMAGIC(sv);
5439                     MAGIC* mgt;
5440                     PL_tainted = 1;
5441                     SvMAGIC_set(sv, mg->mg_moremagic);
5442                     SvTAINT(sv);
5443                     if ((mgt = SvMAGIC(sv))) {
5444                         mg->mg_moremagic = mgt;
5445                         SvMAGIC_set(sv, mg);
5446                     }
5447                 } else {
5448                     PL_tainted = 1;
5449                     SvTAINT(sv);
5450                 }
5451             } else
5452                 SvTAINTED_off(sv);
5453         }
5454     } else {
5455         sv_setsv(sv,&PL_sv_undef);
5456         return;
5457     }
5458 }
5459
5460 void
5461 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5462                                                          SV const * const value)
5463 {
5464     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5465
5466     PERL_UNUSED_ARG(rx);
5467     PERL_UNUSED_ARG(paren);
5468     PERL_UNUSED_ARG(value);
5469
5470     if (!PL_localizing)
5471         Perl_croak_no_modify(aTHX);
5472 }
5473
5474 I32
5475 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5476                               const I32 paren)
5477 {
5478     struct regexp *const rx = (struct regexp *)SvANY(r);
5479     I32 i;
5480     I32 s1, t1;
5481
5482     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5483
5484     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5485         switch (paren) {
5486       /* $` / ${^PREMATCH} */
5487       case RX_BUFF_IDX_PREMATCH:
5488         if (rx->offs[0].start != -1) {
5489                         i = rx->offs[0].start;
5490                         if (i > 0) {
5491                                 s1 = 0;
5492                                 t1 = i;
5493                                 goto getlen;
5494                         }
5495             }
5496         return 0;
5497       /* $' / ${^POSTMATCH} */
5498       case RX_BUFF_IDX_POSTMATCH:
5499             if (rx->offs[0].end != -1) {
5500                         i = rx->sublen - rx->offs[0].end;
5501                         if (i > 0) {
5502                                 s1 = rx->offs[0].end;
5503                                 t1 = rx->sublen;
5504                                 goto getlen;
5505                         }
5506             }
5507         return 0;
5508       /* $& / ${^MATCH}, $1, $2, ... */
5509       default:
5510             if (paren <= (I32)rx->nparens &&
5511             (s1 = rx->offs[paren].start) != -1 &&
5512             (t1 = rx->offs[paren].end) != -1)
5513             {
5514             i = t1 - s1;
5515             goto getlen;
5516         } else {
5517             if (ckWARN(WARN_UNINITIALIZED))
5518                 report_uninit((const SV *)sv);
5519             return 0;
5520         }
5521     }
5522   getlen:
5523     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5524         const char * const s = rx->subbeg + s1;
5525         const U8 *ep;
5526         STRLEN el;
5527
5528         i = t1 - s1;
5529         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5530                         i = el;
5531     }
5532     return i;
5533 }
5534
5535 SV*
5536 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5537 {
5538     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5539         PERL_UNUSED_ARG(rx);
5540         if (0)
5541             return NULL;
5542         else
5543             return newSVpvs("Regexp");
5544 }
5545
5546 /* Scans the name of a named buffer from the pattern.
5547  * If flags is REG_RSN_RETURN_NULL returns null.
5548  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5549  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5550  * to the parsed name as looked up in the RExC_paren_names hash.
5551  * If there is an error throws a vFAIL().. type exception.
5552  */
5553
5554 #define REG_RSN_RETURN_NULL    0
5555 #define REG_RSN_RETURN_NAME    1
5556 #define REG_RSN_RETURN_DATA    2
5557
5558 STATIC SV*
5559 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5560 {
5561     char *name_start = RExC_parse;
5562
5563     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5564
5565     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5566          /* skip IDFIRST by using do...while */
5567         if (UTF)
5568             do {
5569                 RExC_parse += UTF8SKIP(RExC_parse);
5570             } while (isALNUM_utf8((U8*)RExC_parse));
5571         else
5572             do {
5573                 RExC_parse++;
5574             } while (isALNUM(*RExC_parse));
5575     }
5576
5577     if ( flags ) {
5578         SV* sv_name
5579             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5580                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5581         if ( flags == REG_RSN_RETURN_NAME)
5582             return sv_name;
5583         else if (flags==REG_RSN_RETURN_DATA) {
5584             HE *he_str = NULL;
5585             SV *sv_dat = NULL;
5586             if ( ! sv_name )      /* should not happen*/
5587                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5588             if (RExC_paren_names)
5589                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5590             if ( he_str )
5591                 sv_dat = HeVAL(he_str);
5592             if ( ! sv_dat )
5593                 vFAIL("Reference to nonexistent named group");
5594             return sv_dat;
5595         }
5596         else {
5597             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5598         }
5599         /* NOT REACHED */
5600     }
5601     return NULL;
5602 }
5603
5604 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5605     int rem=(int)(RExC_end - RExC_parse);                       \
5606     int cut;                                                    \
5607     int num;                                                    \
5608     int iscut=0;                                                \
5609     if (rem>10) {                                               \
5610         rem=10;                                                 \
5611         iscut=1;                                                \
5612     }                                                           \
5613     cut=10-rem;                                                 \
5614     if (RExC_lastparse!=RExC_parse)                             \
5615         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5616             rem, RExC_parse,                                    \
5617             cut + 4,                                            \
5618             iscut ? "..." : "<"                                 \
5619         );                                                      \
5620     else                                                        \
5621         PerlIO_printf(Perl_debug_log,"%16s","");                \
5622                                                                 \
5623     if (SIZE_ONLY)                                              \
5624        num = RExC_size + 1;                                     \
5625     else                                                        \
5626        num=REG_NODE_NUM(RExC_emit);                             \
5627     if (RExC_lastnum!=num)                                      \
5628        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5629     else                                                        \
5630        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5631     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5632         (int)((depth*2)), "",                                   \
5633         (funcname)                                              \
5634     );                                                          \
5635     RExC_lastnum=num;                                           \
5636     RExC_lastparse=RExC_parse;                                  \
5637 })
5638
5639
5640
5641 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5642     DEBUG_PARSE_MSG((funcname));                            \
5643     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5644 })
5645 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5646     DEBUG_PARSE_MSG((funcname));                            \
5647     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5648 })
5649 /*
5650  - reg - regular expression, i.e. main body or parenthesized thing
5651  *
5652  * Caller must absorb opening parenthesis.
5653  *
5654  * Combining parenthesis handling with the base level of regular expression
5655  * is a trifle forced, but the need to tie the tails of the branches to what
5656  * follows makes it hard to avoid.
5657  */
5658 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5659 #ifdef DEBUGGING
5660 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5661 #else
5662 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5663 #endif
5664
5665 STATIC regnode *
5666 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5667     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5668 {
5669     dVAR;
5670     register regnode *ret;              /* Will be the head of the group. */
5671     register regnode *br;
5672     register regnode *lastbr;
5673     register regnode *ender = NULL;
5674     register I32 parno = 0;
5675     I32 flags;
5676     U32 oregflags = RExC_flags;
5677     bool have_branch = 0;
5678     bool is_open = 0;
5679     I32 freeze_paren = 0;
5680     I32 after_freeze = 0;
5681
5682     /* for (?g), (?gc), and (?o) warnings; warning
5683        about (?c) will warn about (?g) -- japhy    */
5684
5685 #define WASTED_O  0x01
5686 #define WASTED_G  0x02
5687 #define WASTED_C  0x04
5688 #define WASTED_GC (0x02|0x04)
5689     I32 wastedflags = 0x00;
5690
5691     char * parse_start = RExC_parse; /* MJD */
5692     char * const oregcomp_parse = RExC_parse;
5693
5694     GET_RE_DEBUG_FLAGS_DECL;
5695
5696     PERL_ARGS_ASSERT_REG;
5697     DEBUG_PARSE("reg ");
5698
5699     *flagp = 0;                         /* Tentatively. */
5700
5701
5702     /* Make an OPEN node, if parenthesized. */
5703     if (paren) {
5704         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5705             char *start_verb = RExC_parse;
5706             STRLEN verb_len = 0;
5707             char *start_arg = NULL;
5708             unsigned char op = 0;
5709             int argok = 1;
5710             int internal_argval = 0; /* internal_argval is only useful if !argok */
5711             while ( *RExC_parse && *RExC_parse != ')' ) {
5712                 if ( *RExC_parse == ':' ) {
5713                     start_arg = RExC_parse + 1;
5714                     break;
5715                 }
5716                 RExC_parse++;
5717             }
5718             ++start_verb;
5719             verb_len = RExC_parse - start_verb;
5720             if ( start_arg ) {
5721                 RExC_parse++;
5722                 while ( *RExC_parse && *RExC_parse != ')' )
5723                     RExC_parse++;
5724                 if ( *RExC_parse != ')' )
5725                     vFAIL("Unterminated verb pattern argument");
5726                 if ( RExC_parse == start_arg )
5727                     start_arg = NULL;
5728             } else {
5729                 if ( *RExC_parse != ')' )
5730                     vFAIL("Unterminated verb pattern");
5731             }
5732
5733             switch ( *start_verb ) {
5734             case 'A':  /* (*ACCEPT) */
5735                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5736                     op = ACCEPT;
5737                     internal_argval = RExC_nestroot;
5738                 }
5739                 break;
5740             case 'C':  /* (*COMMIT) */
5741                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5742                     op = COMMIT;
5743                 break;
5744             case 'F':  /* (*FAIL) */
5745                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5746                     op = OPFAIL;
5747                     argok = 0;
5748                 }
5749                 break;
5750             case ':':  /* (*:NAME) */
5751             case 'M':  /* (*MARK:NAME) */
5752                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5753                     op = MARKPOINT;
5754                     argok = -1;
5755                 }
5756                 break;
5757             case 'P':  /* (*PRUNE) */
5758                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5759                     op = PRUNE;
5760                 break;
5761             case 'S':   /* (*SKIP) */
5762                 if ( memEQs(start_verb,verb_len,"SKIP") )
5763                     op = SKIP;
5764                 break;
5765             case 'T':  /* (*THEN) */
5766                 /* [19:06] <TimToady> :: is then */
5767                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5768                     op = CUTGROUP;
5769                     RExC_seen |= REG_SEEN_CUTGROUP;
5770                 }
5771                 break;
5772             }
5773             if ( ! op ) {
5774                 RExC_parse++;
5775                 vFAIL3("Unknown verb pattern '%.*s'",
5776                     verb_len, start_verb);
5777             }
5778             if ( argok ) {
5779                 if ( start_arg && internal_argval ) {
5780                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5781                         verb_len, start_verb);
5782                 } else if ( argok < 0 && !start_arg ) {
5783                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5784                         verb_len, start_verb);
5785                 } else {
5786                     ret = reganode(pRExC_state, op, internal_argval);
5787                     if ( ! internal_argval && ! SIZE_ONLY ) {
5788                         if (start_arg) {
5789                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5790                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5791                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5792                             ret->flags = 0;
5793                         } else {
5794                             ret->flags = 1;
5795                         }
5796                     }
5797                 }
5798                 if (!internal_argval)
5799                     RExC_seen |= REG_SEEN_VERBARG;
5800             } else if ( start_arg ) {
5801                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5802                         verb_len, start_verb);
5803             } else {
5804                 ret = reg_node(pRExC_state, op);
5805             }
5806             nextchar(pRExC_state);
5807             return ret;
5808         } else
5809         if (*RExC_parse == '?') { /* (?...) */
5810             bool is_logical = 0;
5811             const char * const seqstart = RExC_parse;
5812             bool has_use_defaults = FALSE;
5813
5814             RExC_parse++;
5815             paren = *RExC_parse++;
5816             ret = NULL;                 /* For look-ahead/behind. */
5817             switch (paren) {
5818
5819             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5820                 paren = *RExC_parse++;
5821                 if ( paren == '<')         /* (?P<...>) named capture */
5822                     goto named_capture;
5823                 else if (paren == '>') {   /* (?P>name) named recursion */
5824                     goto named_recursion;
5825                 }
5826                 else if (paren == '=') {   /* (?P=...)  named backref */
5827                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5828                        you change this make sure you change that */
5829                     char* name_start = RExC_parse;
5830                     U32 num = 0;
5831                     SV *sv_dat = reg_scan_name(pRExC_state,
5832                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5833                     if (RExC_parse == name_start || *RExC_parse != ')')
5834                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5835
5836                     if (!SIZE_ONLY) {
5837                         num = add_data( pRExC_state, 1, "S" );
5838                         RExC_rxi->data->data[num]=(void*)sv_dat;
5839                         SvREFCNT_inc_simple_void(sv_dat);
5840                     }
5841                     RExC_sawback = 1;
5842                     ret = reganode(pRExC_state,
5843                                    ((! FOLD)
5844                                      ? NREF
5845                                      : (UNI_SEMANTICS)
5846                                        ? NREFFU
5847                                        : (LOC)
5848                                          ? NREFFL
5849                                          : NREFF),
5850                                     num);
5851                     *flagp |= HASWIDTH;
5852
5853                     Set_Node_Offset(ret, parse_start+1);
5854                     Set_Node_Cur_Length(ret); /* MJD */
5855
5856                     nextchar(pRExC_state);
5857                     return ret;
5858                 }
5859                 RExC_parse++;
5860                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5861                 /*NOTREACHED*/
5862             case '<':           /* (?<...) */
5863                 if (*RExC_parse == '!')
5864                     paren = ',';
5865                 else if (*RExC_parse != '=')
5866               named_capture:
5867                 {               /* (?<...>) */
5868                     char *name_start;
5869                     SV *svname;
5870                     paren= '>';
5871             case '\'':          /* (?'...') */
5872                     name_start= RExC_parse;
5873                     svname = reg_scan_name(pRExC_state,
5874                         SIZE_ONLY ?  /* reverse test from the others */
5875                         REG_RSN_RETURN_NAME :
5876                         REG_RSN_RETURN_NULL);
5877                     if (RExC_parse == name_start) {
5878                         RExC_parse++;
5879                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5880                         /*NOTREACHED*/
5881                     }
5882                     if (*RExC_parse != paren)
5883                         vFAIL2("Sequence (?%c... not terminated",
5884                             paren=='>' ? '<' : paren);
5885                     if (SIZE_ONLY) {
5886                         HE *he_str;
5887                         SV *sv_dat = NULL;
5888                         if (!svname) /* shouldn't happen */
5889                             Perl_croak(aTHX_
5890                                 "panic: reg_scan_name returned NULL");
5891                         if (!RExC_paren_names) {
5892                             RExC_paren_names= newHV();
5893                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5894 #ifdef DEBUGGING
5895                             RExC_paren_name_list= newAV();
5896                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5897 #endif
5898                         }
5899                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5900                         if ( he_str )
5901                             sv_dat = HeVAL(he_str);
5902                         if ( ! sv_dat ) {
5903                             /* croak baby croak */
5904                             Perl_croak(aTHX_
5905                                 "panic: paren_name hash element allocation failed");
5906                         } else if ( SvPOK(sv_dat) ) {
5907                             /* (?|...) can mean we have dupes so scan to check
5908                                its already been stored. Maybe a flag indicating
5909                                we are inside such a construct would be useful,
5910                                but the arrays are likely to be quite small, so
5911                                for now we punt -- dmq */
5912                             IV count = SvIV(sv_dat);
5913                             I32 *pv = (I32*)SvPVX(sv_dat);
5914                             IV i;
5915                             for ( i = 0 ; i < count ; i++ ) {
5916                                 if ( pv[i] == RExC_npar ) {
5917                                     count = 0;
5918                                     break;
5919                                 }
5920                             }
5921                             if ( count ) {
5922                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5923                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5924                                 pv[count] = RExC_npar;
5925                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5926                             }
5927                         } else {
5928                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5929                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5930                             SvIOK_on(sv_dat);
5931                             SvIV_set(sv_dat, 1);
5932                         }
5933 #ifdef DEBUGGING
5934                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5935                             SvREFCNT_dec(svname);
5936 #endif
5937
5938                         /*sv_dump(sv_dat);*/
5939                     }
5940                     nextchar(pRExC_state);
5941                     paren = 1;
5942                     goto capturing_parens;
5943                 }
5944                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5945                 RExC_in_lookbehind++;
5946                 RExC_parse++;
5947             case '=':           /* (?=...) */
5948                 RExC_seen_zerolen++;
5949                 break;
5950             case '!':           /* (?!...) */
5951                 RExC_seen_zerolen++;
5952                 if (*RExC_parse == ')') {
5953                     ret=reg_node(pRExC_state, OPFAIL);
5954                     nextchar(pRExC_state);
5955                     return ret;
5956                 }
5957                 break;
5958             case '|':           /* (?|...) */
5959                 /* branch reset, behave like a (?:...) except that
5960                    buffers in alternations share the same numbers */
5961                 paren = ':';
5962                 after_freeze = freeze_paren = RExC_npar;
5963                 break;
5964             case ':':           /* (?:...) */
5965             case '>':           /* (?>...) */
5966                 break;
5967             case '$':           /* (?$...) */
5968             case '@':           /* (?@...) */
5969                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5970                 break;
5971             case '#':           /* (?#...) */
5972                 while (*RExC_parse && *RExC_parse != ')')
5973                     RExC_parse++;
5974                 if (*RExC_parse != ')')
5975                     FAIL("Sequence (?#... not terminated");
5976                 nextchar(pRExC_state);
5977                 *flagp = TRYAGAIN;
5978                 return NULL;
5979             case '0' :           /* (?0) */
5980             case 'R' :           /* (?R) */
5981                 if (*RExC_parse != ')')
5982                     FAIL("Sequence (?R) not terminated");
5983                 ret = reg_node(pRExC_state, GOSTART);
5984                 *flagp |= POSTPONED;
5985                 nextchar(pRExC_state);
5986                 return ret;
5987                 /*notreached*/
5988             { /* named and numeric backreferences */
5989                 I32 num;
5990             case '&':            /* (?&NAME) */
5991                 parse_start = RExC_parse - 1;
5992               named_recursion:
5993                 {
5994                     SV *sv_dat = reg_scan_name(pRExC_state,
5995                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5996                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5997                 }
5998                 goto gen_recurse_regop;
5999                 /* NOT REACHED */
6000             case '+':
6001                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6002                     RExC_parse++;
6003                     vFAIL("Illegal pattern");
6004                 }
6005                 goto parse_recursion;
6006                 /* NOT REACHED*/
6007             case '-': /* (?-1) */
6008                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6009                     RExC_parse--; /* rewind to let it be handled later */
6010                     goto parse_flags;
6011                 }
6012                 /*FALLTHROUGH */
6013             case '1': case '2': case '3': case '4': /* (?1) */
6014             case '5': case '6': case '7': case '8': case '9':
6015                 RExC_parse--;
6016               parse_recursion:
6017                 num = atoi(RExC_parse);
6018                 parse_start = RExC_parse - 1; /* MJD */
6019                 if (*RExC_parse == '-')
6020                     RExC_parse++;
6021                 while (isDIGIT(*RExC_parse))
6022                         RExC_parse++;
6023                 if (*RExC_parse!=')')
6024                     vFAIL("Expecting close bracket");
6025
6026               gen_recurse_regop:
6027                 if ( paren == '-' ) {
6028                     /*
6029                     Diagram of capture buffer numbering.
6030                     Top line is the normal capture buffer numbers
6031                     Bottom line is the negative indexing as from
6032                     the X (the (?-2))
6033
6034                     +   1 2    3 4 5 X          6 7
6035                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6036                     -   5 4    3 2 1 X          x x
6037
6038                     */
6039                     num = RExC_npar + num;
6040                     if (num < 1)  {
6041                         RExC_parse++;
6042                         vFAIL("Reference to nonexistent group");
6043                     }
6044                 } else if ( paren == '+' ) {
6045                     num = RExC_npar + num - 1;
6046                 }
6047
6048                 ret = reganode(pRExC_state, GOSUB, num);
6049                 if (!SIZE_ONLY) {
6050                     if (num > (I32)RExC_rx->nparens) {
6051                         RExC_parse++;
6052                         vFAIL("Reference to nonexistent group");
6053                     }
6054                     ARG2L_SET( ret, RExC_recurse_count++);
6055                     RExC_emit++;
6056                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6057                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6058                 } else {
6059                     RExC_size++;
6060                 }
6061                 RExC_seen |= REG_SEEN_RECURSE;
6062                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6063                 Set_Node_Offset(ret, parse_start); /* MJD */
6064
6065                 *flagp |= POSTPONED;
6066                 nextchar(pRExC_state);
6067                 return ret;
6068             } /* named and numeric backreferences */
6069             /* NOT REACHED */
6070
6071             case '?':           /* (??...) */
6072                 is_logical = 1;
6073                 if (*RExC_parse != '{') {
6074                     RExC_parse++;
6075                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6076                     /*NOTREACHED*/
6077                 }
6078                 *flagp |= POSTPONED;
6079                 paren = *RExC_parse++;
6080                 /* FALL THROUGH */
6081             case '{':           /* (?{...}) */
6082             {
6083                 I32 count = 1;
6084                 U32 n = 0;
6085                 char c;
6086                 char *s = RExC_parse;
6087
6088                 RExC_seen_zerolen++;
6089                 RExC_seen |= REG_SEEN_EVAL;
6090                 while (count && (c = *RExC_parse)) {
6091                     if (c == '\\') {
6092                         if (RExC_parse[1])
6093                             RExC_parse++;
6094                     }
6095                     else if (c == '{')
6096                         count++;
6097                     else if (c == '}')
6098                         count--;
6099                     RExC_parse++;
6100                 }
6101                 if (*RExC_parse != ')') {
6102                     RExC_parse = s;
6103                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6104                 }
6105                 if (!SIZE_ONLY) {
6106                     PAD *pad;
6107                     OP_4tree *sop, *rop;
6108                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6109
6110                     ENTER;
6111                     Perl_save_re_context(aTHX);
6112                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6113                     sop->op_private |= OPpREFCOUNTED;
6114                     /* re_dup will OpREFCNT_inc */
6115                     OpREFCNT_set(sop, 1);
6116                     LEAVE;
6117
6118                     n = add_data(pRExC_state, 3, "nop");
6119                     RExC_rxi->data->data[n] = (void*)rop;
6120                     RExC_rxi->data->data[n+1] = (void*)sop;
6121                     RExC_rxi->data->data[n+2] = (void*)pad;
6122                     SvREFCNT_dec(sv);
6123                 }
6124                 else {                                          /* First pass */
6125                     if (PL_reginterp_cnt < ++RExC_seen_evals
6126                         && IN_PERL_RUNTIME)
6127                         /* No compiled RE interpolated, has runtime
6128                            components ===> unsafe.  */
6129                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6130                     if (PL_tainting && PL_tainted)
6131                         FAIL("Eval-group in insecure regular expression");
6132 #if PERL_VERSION > 8
6133                     if (IN_PERL_COMPILETIME)
6134                         PL_cv_has_eval = 1;
6135 #endif
6136                 }
6137
6138                 nextchar(pRExC_state);
6139                 if (is_logical) {
6140                     ret = reg_node(pRExC_state, LOGICAL);
6141                     if (!SIZE_ONLY)
6142                         ret->flags = 2;
6143                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6144                     /* deal with the length of this later - MJD */
6145                     return ret;
6146                 }
6147                 ret = reganode(pRExC_state, EVAL, n);
6148                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6149                 Set_Node_Offset(ret, parse_start);
6150                 return ret;
6151             }
6152             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6153             {
6154                 int is_define= 0;
6155                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6156                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6157                         || RExC_parse[1] == '<'
6158                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6159                         I32 flag;
6160
6161                         ret = reg_node(pRExC_state, LOGICAL);
6162                         if (!SIZE_ONLY)
6163                             ret->flags = 1;
6164                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6165                         goto insert_if;
6166                     }
6167                 }
6168                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6169                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6170                 {
6171                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6172                     char *name_start= RExC_parse++;
6173                     U32 num = 0;
6174                     SV *sv_dat=reg_scan_name(pRExC_state,
6175                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6176                     if (RExC_parse == name_start || *RExC_parse != ch)
6177                         vFAIL2("Sequence (?(%c... not terminated",
6178                             (ch == '>' ? '<' : ch));
6179                     RExC_parse++;
6180                     if (!SIZE_ONLY) {
6181                         num = add_data( pRExC_state, 1, "S" );
6182                         RExC_rxi->data->data[num]=(void*)sv_dat;
6183                         SvREFCNT_inc_simple_void(sv_dat);
6184                     }
6185                     ret = reganode(pRExC_state,NGROUPP,num);
6186                     goto insert_if_check_paren;
6187                 }
6188                 else if (RExC_parse[0] == 'D' &&
6189                          RExC_parse[1] == 'E' &&
6190                          RExC_parse[2] == 'F' &&
6191                          RExC_parse[3] == 'I' &&
6192                          RExC_parse[4] == 'N' &&
6193                          RExC_parse[5] == 'E')
6194                 {
6195                     ret = reganode(pRExC_state,DEFINEP,0);
6196                     RExC_parse +=6 ;
6197                     is_define = 1;
6198                     goto insert_if_check_paren;
6199                 }
6200                 else if (RExC_parse[0] == 'R') {
6201                     RExC_parse++;
6202                     parno = 0;
6203                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6204                         parno = atoi(RExC_parse++);
6205                         while (isDIGIT(*RExC_parse))
6206                             RExC_parse++;
6207                     } else if (RExC_parse[0] == '&') {
6208                         SV *sv_dat;
6209                         RExC_parse++;
6210                         sv_dat = reg_scan_name(pRExC_state,
6211                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6212                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6213                     }
6214                     ret = reganode(pRExC_state,INSUBP,parno);
6215                     goto insert_if_check_paren;
6216                 }
6217                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6218                     /* (?(1)...) */
6219                     char c;
6220                     parno = atoi(RExC_parse++);
6221
6222                     while (isDIGIT(*RExC_parse))
6223                         RExC_parse++;
6224                     ret = reganode(pRExC_state, GROUPP, parno);
6225
6226                  insert_if_check_paren:
6227                     if ((c = *nextchar(pRExC_state)) != ')')
6228                         vFAIL("Switch condition not recognized");
6229                   insert_if:
6230                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6231                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6232                     if (br == NULL)
6233                         br = reganode(pRExC_state, LONGJMP, 0);
6234                     else
6235                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6236                     c = *nextchar(pRExC_state);
6237                     if (flags&HASWIDTH)
6238                         *flagp |= HASWIDTH;
6239                     if (c == '|') {
6240                         if (is_define)
6241                             vFAIL("(?(DEFINE)....) does not allow branches");
6242                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6243                         regbranch(pRExC_state, &flags, 1,depth+1);
6244                         REGTAIL(pRExC_state, ret, lastbr);
6245                         if (flags&HASWIDTH)
6246                             *flagp |= HASWIDTH;
6247                         c = *nextchar(pRExC_state);
6248                     }
6249                     else
6250                         lastbr = NULL;
6251                     if (c != ')')
6252                         vFAIL("Switch (?(condition)... contains too many branches");
6253                     ender = reg_node(pRExC_state, TAIL);
6254                     REGTAIL(pRExC_state, br, ender);
6255                     if (lastbr) {
6256                         REGTAIL(pRExC_state, lastbr, ender);
6257                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6258                     }
6259                     else
6260                         REGTAIL(pRExC_state, ret, ender);
6261                     RExC_size++; /* XXX WHY do we need this?!!
6262                                     For large programs it seems to be required
6263                                     but I can't figure out why. -- dmq*/
6264                     return ret;
6265                 }
6266                 else {
6267                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6268                 }
6269             }
6270             case 0:
6271                 RExC_parse--; /* for vFAIL to print correctly */
6272                 vFAIL("Sequence (? incomplete");
6273                 break;
6274             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6275                                        that follow */
6276                 has_use_defaults = TRUE;
6277                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6278                 if (RExC_utf8) {    /* But the default for a utf8 pattern is
6279                                        unicode semantics */
6280                     set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
6281                 }
6282                 goto parse_flags;
6283             default:
6284                 --RExC_parse;
6285                 parse_flags:      /* (?i) */
6286             {
6287                 U32 posflags = 0, negflags = 0;
6288                 U32 *flagsp = &posflags;
6289                 bool has_charset_modifier = 0;
6290                 regex_charset cs = REGEX_DEPENDS_CHARSET;
6291
6292                 while (*RExC_parse) {
6293                     /* && strchr("iogcmsx", *RExC_parse) */
6294                     /* (?g), (?gc) and (?o) are useless here
6295                        and must be globally applied -- japhy */
6296                     switch (*RExC_parse) {
6297                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6298                     case LOCALE_PAT_MOD:
6299                         if (has_charset_modifier || flagsp == &negflags) {
6300                             goto fail_modifiers;
6301                         }
6302                         cs = REGEX_LOCALE_CHARSET;
6303                         has_charset_modifier = 1;
6304                         break;
6305                     case UNICODE_PAT_MOD:
6306                         if (has_charset_modifier || flagsp == &negflags) {
6307                             goto fail_modifiers;
6308                         }
6309                         cs = REGEX_UNICODE_CHARSET;
6310                         has_charset_modifier = 1;
6311                         break;
6312                     case ASCII_RESTRICT_PAT_MOD:
6313                         if (has_charset_modifier || flagsp == &negflags) {
6314                             goto fail_modifiers;
6315                         }
6316                         cs = REGEX_ASCII_RESTRICTED_CHARSET;
6317                         has_charset_modifier = 1;
6318                         break;
6319                     case DEPENDS_PAT_MOD:
6320                         if (has_use_defaults
6321                             || has_charset_modifier
6322                             || flagsp == &negflags)
6323                         {
6324                             goto fail_modifiers;
6325                         }
6326
6327                         /* The dual charset means unicode semantics if the
6328                          * pattern (or target, not known until runtime) are
6329                          * utf8 */
6330                         cs = (RExC_utf8)
6331                              ? REGEX_UNICODE_CHARSET
6332                              : REGEX_DEPENDS_CHARSET;
6333                         has_charset_modifier = 1;
6334                         break;
6335                     case ONCE_PAT_MOD: /* 'o' */
6336                     case GLOBAL_PAT_MOD: /* 'g' */
6337                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6338                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6339                             if (! (wastedflags & wflagbit) ) {
6340                                 wastedflags |= wflagbit;
6341                                 vWARN5(
6342                                     RExC_parse + 1,
6343                                     "Useless (%s%c) - %suse /%c modifier",
6344                                     flagsp == &negflags ? "?-" : "?",
6345                                     *RExC_parse,
6346                                     flagsp == &negflags ? "don't " : "",
6347                                     *RExC_parse
6348                                 );
6349                             }
6350                         }
6351                         break;
6352
6353                     case CONTINUE_PAT_MOD: /* 'c' */
6354                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6355                             if (! (wastedflags & WASTED_C) ) {
6356                                 wastedflags |= WASTED_GC;
6357                                 vWARN3(
6358                                     RExC_parse + 1,
6359                                     "Useless (%sc) - %suse /gc modifier",
6360                                     flagsp == &negflags ? "?-" : "?",
6361                                     flagsp == &negflags ? "don't " : ""
6362                                 );
6363                             }
6364                         }
6365                         break;
6366                     case KEEPCOPY_PAT_MOD: /* 'p' */
6367                         if (flagsp == &negflags) {
6368                             if (SIZE_ONLY)
6369                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6370                         } else {
6371                             *flagsp |= RXf_PMf_KEEPCOPY;
6372                         }
6373                         break;
6374                     case '-':
6375                         /* A flag is a default iff it is following a minus, so
6376                          * if there is a minus, it means will be trying to
6377                          * re-specify a default which is an error */
6378                         if (has_use_defaults || flagsp == &negflags) {
6379             fail_modifiers:
6380                             RExC_parse++;
6381                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6382                             /*NOTREACHED*/
6383                         }
6384                         flagsp = &negflags;
6385                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6386                         break;
6387                     case ':':
6388                         paren = ':';
6389                         /*FALLTHROUGH*/
6390                     case ')':
6391                         RExC_flags |= posflags;
6392                         RExC_flags &= ~negflags;
6393                         set_regex_charset(&RExC_flags, cs);
6394                         if (paren != ':') {
6395                             oregflags |= posflags;
6396                             oregflags &= ~negflags;
6397                             set_regex_charset(&oregflags, cs);
6398                         }
6399                         nextchar(pRExC_state);
6400                         if (paren != ':') {
6401                             *flagp = TRYAGAIN;
6402                             return NULL;
6403                         } else {
6404                             ret = NULL;
6405                             goto parse_rest;
6406                         }
6407                         /*NOTREACHED*/
6408                     default:
6409                         RExC_parse++;
6410                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6411                         /*NOTREACHED*/
6412                     }
6413                     ++RExC_parse;
6414                 }
6415             }} /* one for the default block, one for the switch */
6416         }
6417         else {                  /* (...) */
6418           capturing_parens:
6419             parno = RExC_npar;
6420             RExC_npar++;
6421
6422             ret = reganode(pRExC_state, OPEN, parno);
6423             if (!SIZE_ONLY ){
6424                 if (!RExC_nestroot)
6425                     RExC_nestroot = parno;
6426                 if (RExC_seen & REG_SEEN_RECURSE
6427                     && !RExC_open_parens[parno-1])
6428                 {
6429                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6430                         "Setting open paren #%"IVdf" to %d\n",
6431                         (IV)parno, REG_NODE_NUM(ret)));
6432                     RExC_open_parens[parno-1]= ret;
6433                 }
6434             }
6435             Set_Node_Length(ret, 1); /* MJD */
6436             Set_Node_Offset(ret, RExC_parse); /* MJD */
6437             is_open = 1;
6438         }
6439     }
6440     else                        /* ! paren */
6441         ret = NULL;
6442
6443    parse_rest:
6444     /* Pick up the branches, linking them together. */
6445     parse_start = RExC_parse;   /* MJD */
6446     br = regbranch(pRExC_state, &flags, 1,depth+1);
6447
6448     if (freeze_paren) {
6449         if (RExC_npar > after_freeze)
6450             after_freeze = RExC_npar;
6451         RExC_npar = freeze_paren;
6452     }
6453
6454     /*     branch_len = (paren != 0); */
6455
6456     if (br == NULL)
6457         return(NULL);
6458     if (*RExC_parse == '|') {
6459         if (!SIZE_ONLY && RExC_extralen) {
6460             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6461         }
6462         else {                  /* MJD */
6463             reginsert(pRExC_state, BRANCH, br, depth+1);
6464             Set_Node_Length(br, paren != 0);
6465             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6466         }
6467         have_branch = 1;
6468         if (SIZE_ONLY)
6469             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6470     }
6471     else if (paren == ':') {
6472         *flagp |= flags&SIMPLE;
6473     }
6474     if (is_open) {                              /* Starts with OPEN. */
6475         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6476     }
6477     else if (paren != '?')              /* Not Conditional */
6478         ret = br;
6479     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6480     lastbr = br;
6481     while (*RExC_parse == '|') {
6482         if (!SIZE_ONLY && RExC_extralen) {
6483             ender = reganode(pRExC_state, LONGJMP,0);
6484             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6485         }
6486         if (SIZE_ONLY)
6487             RExC_extralen += 2;         /* Account for LONGJMP. */
6488         nextchar(pRExC_state);
6489         if (freeze_paren) {
6490             if (RExC_npar > after_freeze)
6491                 after_freeze = RExC_npar;
6492             RExC_npar = freeze_paren;
6493         }
6494         br = regbranch(pRExC_state, &flags, 0, depth+1);
6495
6496         if (br == NULL)
6497             return(NULL);
6498         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6499         lastbr = br;
6500         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6501     }
6502
6503     if (have_branch || paren != ':') {
6504         /* Make a closing node, and hook it on the end. */
6505         switch (paren) {
6506         case ':':
6507             ender = reg_node(pRExC_state, TAIL);
6508             break;
6509         case 1:
6510             ender = reganode(pRExC_state, CLOSE, parno);
6511             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6512                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6513                         "Setting close paren #%"IVdf" to %d\n",
6514                         (IV)parno, REG_NODE_NUM(ender)));
6515                 RExC_close_parens[parno-1]= ender;
6516                 if (RExC_nestroot == parno)
6517                     RExC_nestroot = 0;
6518             }
6519             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6520             Set_Node_Length(ender,1); /* MJD */
6521             break;
6522         case '<':
6523         case ',':
6524         case '=':
6525         case '!':
6526             *flagp &= ~HASWIDTH;
6527             /* FALL THROUGH */
6528         case '>':
6529             ender = reg_node(pRExC_state, SUCCEED);
6530             break;
6531         case 0:
6532             ender = reg_node(pRExC_state, END);
6533             if (!SIZE_ONLY) {
6534                 assert(!RExC_opend); /* there can only be one! */
6535                 RExC_opend = ender;
6536             }
6537             break;
6538         }
6539         REGTAIL(pRExC_state, lastbr, ender);
6540
6541         if (have_branch && !SIZE_ONLY) {
6542             if (depth==1)
6543                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6544
6545             /* Hook the tails of the branches to the closing node. */
6546             for (br = ret; br; br = regnext(br)) {
6547                 const U8 op = PL_regkind[OP(br)];
6548                 if (op == BRANCH) {
6549                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6550                 }
6551                 else if (op == BRANCHJ) {
6552                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6553                 }
6554             }
6555         }
6556     }
6557
6558     {
6559         const char *p;
6560         static const char parens[] = "=!<,>";
6561
6562         if (paren && (p = strchr(parens, paren))) {
6563             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6564             int flag = (p - parens) > 1;
6565
6566             if (paren == '>')
6567                 node = SUSPEND, flag = 0;
6568             reginsert(pRExC_state, node,ret, depth+1);
6569             Set_Node_Cur_Length(ret);
6570             Set_Node_Offset(ret, parse_start + 1);
6571             ret->flags = flag;
6572             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6573         }
6574     }
6575
6576     /* Check for proper termination. */
6577     if (paren) {
6578         RExC_flags = oregflags;
6579         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6580             RExC_parse = oregcomp_parse;
6581             vFAIL("Unmatched (");
6582         }
6583     }
6584     else if (!paren && RExC_parse < RExC_end) {
6585         if (*RExC_parse == ')') {
6586             RExC_parse++;
6587             vFAIL("Unmatched )");
6588         }
6589         else
6590             FAIL("Junk on end of regexp");      /* "Can't happen". */
6591         /* NOTREACHED */
6592     }
6593
6594     if (RExC_in_lookbehind) {
6595         RExC_in_lookbehind--;
6596     }
6597     if (after_freeze)
6598         RExC_npar = after_freeze;
6599     return(ret);
6600 }
6601
6602 /*
6603  - regbranch - one alternative of an | operator
6604  *
6605  * Implements the concatenation operator.
6606  */
6607 STATIC regnode *
6608 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6609 {
6610     dVAR;
6611     register regnode *ret;
6612     register regnode *chain = NULL;
6613     register regnode *latest;
6614     I32 flags = 0, c = 0;
6615     GET_RE_DEBUG_FLAGS_DECL;
6616
6617     PERL_ARGS_ASSERT_REGBRANCH;
6618
6619     DEBUG_PARSE("brnc");
6620
6621     if (first)
6622         ret = NULL;
6623     else {
6624         if (!SIZE_ONLY && RExC_extralen)
6625             ret = reganode(pRExC_state, BRANCHJ,0);
6626         else {
6627             ret = reg_node(pRExC_state, BRANCH);
6628             Set_Node_Length(ret, 1);
6629         }
6630     }
6631
6632     if (!first && SIZE_ONLY)
6633         RExC_extralen += 1;                     /* BRANCHJ */
6634
6635     *flagp = WORST;                     /* Tentatively. */
6636
6637     RExC_parse--;
6638     nextchar(pRExC_state);
6639     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6640         flags &= ~TRYAGAIN;
6641         latest = regpiece(pRExC_state, &flags,depth+1);
6642         if (latest == NULL) {
6643             if (flags & TRYAGAIN)
6644                 continue;
6645             return(NULL);
6646         }
6647         else if (ret == NULL)
6648             ret = latest;
6649         *flagp |= flags&(HASWIDTH|POSTPONED);
6650         if (chain == NULL)      /* First piece. */
6651             *flagp |= flags&SPSTART;
6652         else {
6653             RExC_naughty++;
6654             REGTAIL(pRExC_state, chain, latest);
6655         }
6656         chain = latest;
6657         c++;
6658     }
6659     if (chain == NULL) {        /* Loop ran zero times. */
6660         chain = reg_node(pRExC_state, NOTHING);
6661         if (ret == NULL)
6662             ret = chain;
6663     }
6664     if (c == 1) {
6665         *flagp |= flags&SIMPLE;
6666     }
6667
6668     return ret;
6669 }
6670
6671 /*
6672  - regpiece - something followed by possible [*+?]
6673  *
6674  * Note that the branching code sequences used for ? and the general cases
6675  * of * and + are somewhat optimized:  they use the same NOTHING node as
6676  * both the endmarker for their branch list and the body of the last branch.
6677  * It might seem that this node could be dispensed with entirely, but the
6678  * endmarker role is not redundant.
6679  */
6680 STATIC regnode *
6681 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6682 {
6683     dVAR;
6684     register regnode *ret;
6685     register char op;
6686     register char *next;
6687     I32 flags;
6688     const char * const origparse = RExC_parse;
6689     I32 min;
6690     I32 max = REG_INFTY;
6691     char *parse_start;
6692     const char *maxpos = NULL;
6693     GET_RE_DEBUG_FLAGS_DECL;
6694
6695     PERL_ARGS_ASSERT_REGPIECE;
6696
6697     DEBUG_PARSE("piec");
6698
6699     ret = regatom(pRExC_state, &flags,depth+1);
6700     if (ret == NULL) {
6701         if (flags & TRYAGAIN)
6702             *flagp |= TRYAGAIN;
6703         return(NULL);
6704     }
6705
6706     op = *RExC_parse;
6707
6708     if (op == '{' && regcurly(RExC_parse)) {
6709         maxpos = NULL;
6710         parse_start = RExC_parse; /* MJD */
6711         next = RExC_parse + 1;
6712         while (isDIGIT(*next) || *next == ',') {
6713             if (*next == ',') {
6714                 if (maxpos)
6715                     break;
6716                 else
6717                     maxpos = next;
6718             }
6719             next++;
6720         }
6721         if (*next == '}') {             /* got one */
6722             if (!maxpos)
6723                 maxpos = next;
6724             RExC_parse++;
6725             min = atoi(RExC_parse);
6726             if (*maxpos == ',')
6727                 maxpos++;
6728             else
6729                 maxpos = RExC_parse;
6730             max = atoi(maxpos);
6731             if (!max && *maxpos != '0')
6732                 max = REG_INFTY;                /* meaning "infinity" */
6733             else if (max >= REG_INFTY)
6734                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6735             RExC_parse = next;
6736             nextchar(pRExC_state);
6737
6738         do_curly:
6739             if ((flags&SIMPLE)) {
6740                 RExC_naughty += 2 + RExC_naughty / 2;
6741                 reginsert(pRExC_state, CURLY, ret, depth+1);
6742                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6743                 Set_Node_Cur_Length(ret);
6744             }
6745             else {
6746                 regnode * const w = reg_node(pRExC_state, WHILEM);
6747
6748                 w->flags = 0;
6749                 REGTAIL(pRExC_state, ret, w);
6750                 if (!SIZE_ONLY && RExC_extralen) {
6751                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6752                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6753                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6754                 }
6755                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6756                                 /* MJD hk */
6757                 Set_Node_Offset(ret, parse_start+1);
6758                 Set_Node_Length(ret,
6759                                 op == '{' ? (RExC_parse - parse_start) : 1);
6760
6761                 if (!SIZE_ONLY && RExC_extralen)
6762                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6763                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6764                 if (SIZE_ONLY)
6765                     RExC_whilem_seen++, RExC_extralen += 3;
6766                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6767             }
6768             ret->flags = 0;
6769
6770             if (min > 0)
6771                 *flagp = WORST;
6772             if (max > 0)
6773                 *flagp |= HASWIDTH;
6774             if (max < min)
6775                 vFAIL("Can't do {n,m} with n > m");
6776             if (!SIZE_ONLY) {
6777                 ARG1_SET(ret, (U16)min);
6778                 ARG2_SET(ret, (U16)max);
6779             }
6780
6781             goto nest_check;
6782         }
6783     }
6784
6785     if (!ISMULT1(op)) {
6786         *flagp = flags;
6787         return(ret);
6788     }
6789
6790 #if 0                           /* Now runtime fix should be reliable. */
6791
6792     /* if this is reinstated, don't forget to put this back into perldiag:
6793
6794             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6795
6796            (F) The part of the regexp subject to either the * or + quantifier
6797            could match an empty string. The {#} shows in the regular
6798            expression about where the problem was discovered.
6799
6800     */
6801
6802     if (!(flags&HASWIDTH) && op != '?')
6803       vFAIL("Regexp *+ operand could be empty");
6804 #endif
6805
6806     parse_start = RExC_parse;
6807     nextchar(pRExC_state);
6808
6809     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6810
6811     if (op == '*' && (flags&SIMPLE)) {
6812         reginsert(pRExC_state, STAR, ret, depth+1);
6813         ret->flags = 0;
6814         RExC_naughty += 4;
6815     }
6816     else if (op == '*') {
6817         min = 0;
6818         goto do_curly;
6819     }
6820     else if (op == '+' && (flags&SIMPLE)) {
6821         reginsert(pRExC_state, PLUS, ret, depth+1);
6822         ret->flags = 0;
6823         RExC_naughty += 3;
6824     }
6825     else if (op == '+') {
6826         min = 1;
6827         goto do_curly;
6828     }
6829     else if (op == '?') {
6830         min = 0; max = 1;
6831         goto do_curly;
6832     }
6833   nest_check:
6834     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6835         ckWARN3reg(RExC_parse,
6836                    "%.*s matches null string many times",
6837                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6838                    origparse);
6839     }
6840
6841     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6842         nextchar(pRExC_state);
6843         reginsert(pRExC_state, MINMOD, ret, depth+1);
6844         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6845     }
6846 #ifndef REG_ALLOW_MINMOD_SUSPEND
6847     else
6848 #endif
6849     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6850         regnode *ender;
6851         nextchar(pRExC_state);
6852         ender = reg_node(pRExC_state, SUCCEED);
6853         REGTAIL(pRExC_state, ret, ender);
6854         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6855         ret->flags = 0;
6856         ender = reg_node(pRExC_state, TAIL);
6857         REGTAIL(pRExC_state, ret, ender);
6858         /*ret= ender;*/
6859     }
6860
6861     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6862         RExC_parse++;
6863         vFAIL("Nested quantifiers");
6864     }
6865
6866     return(ret);
6867 }
6868
6869
6870 /* reg_namedseq(pRExC_state,UVp)
6871
6872    This is expected to be called by a parser routine that has
6873    recognized '\N' and needs to handle the rest. RExC_parse is
6874    expected to point at the first char following the N at the time
6875    of the call.
6876
6877    The \N may be inside (indicated by valuep not being NULL) or outside a
6878    character class.
6879
6880    \N may begin either a named sequence, or if outside a character class, mean
6881    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6882    attempted to decide which, and in the case of a named sequence converted it
6883    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6884    where c1... are the characters in the sequence.  For single-quoted regexes,
6885    the tokenizer passes the \N sequence through unchanged; this code will not
6886    attempt to determine this nor expand those.  The net effect is that if the
6887    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6888    signals that this \N occurrence means to match a non-newline.
6889
6890    Only the \N{U+...} form should occur in a character class, for the same
6891    reason that '.' inside a character class means to just match a period: it
6892    just doesn't make sense.
6893
6894    If valuep is non-null then it is assumed that we are parsing inside
6895    of a charclass definition and the first codepoint in the resolved
6896    string is returned via *valuep and the routine will return NULL.
6897    In this mode if a multichar string is returned from the charnames
6898    handler, a warning will be issued, and only the first char in the
6899    sequence will be examined. If the string returned is zero length
6900    then the value of *valuep is undefined and NON-NULL will
6901    be returned to indicate failure. (This will NOT be a valid pointer
6902    to a regnode.)
6903
6904    If valuep is null then it is assumed that we are parsing normal text and a
6905    new EXACT node is inserted into the program containing the resolved string,
6906    and a pointer to the new node is returned.  But if the string is zero length
6907    a NOTHING node is emitted instead.
6908
6909    On success RExC_parse is set to the char following the endbrace.
6910    Parsing failures will generate a fatal error via vFAIL(...)
6911  */
6912 STATIC regnode *
6913 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6914 {
6915     char * endbrace;    /* '}' following the name */
6916     regnode *ret = NULL;
6917 #ifdef DEBUGGING
6918     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6919 #endif
6920     char* p;
6921
6922     GET_RE_DEBUG_FLAGS_DECL;
6923
6924     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6925
6926     GET_RE_DEBUG_FLAGS;
6927
6928     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6929      * modifier.  The other meaning does not */
6930     p = (RExC_flags & RXf_PMf_EXTENDED)
6931         ? regwhite( pRExC_state, RExC_parse )
6932         : RExC_parse;
6933
6934     /* Disambiguate between \N meaning a named character versus \N meaning
6935      * [^\n].  The former is assumed when it can't be the latter. */
6936     if (*p != '{' || regcurly(p)) {
6937         RExC_parse = p;
6938         if (valuep) {
6939             /* no bare \N in a charclass */
6940             vFAIL("\\N in a character class must be a named character: \\N{...}");
6941         }
6942         nextchar(pRExC_state);
6943         ret = reg_node(pRExC_state, REG_ANY);
6944         *flagp |= HASWIDTH|SIMPLE;
6945         RExC_naughty++;
6946         RExC_parse--;
6947         Set_Node_Length(ret, 1); /* MJD */
6948         return ret;
6949     }
6950
6951     /* Here, we have decided it should be a named sequence */
6952
6953     /* The test above made sure that the next real character is a '{', but
6954      * under the /x modifier, it could be separated by space (or a comment and
6955      * \n) and this is not allowed (for consistency with \x{...} and the
6956      * tokenizer handling of \N{NAME}). */
6957     if (*RExC_parse != '{') {
6958         vFAIL("Missing braces on \\N{}");
6959     }
6960
6961     RExC_parse++;       /* Skip past the '{' */
6962
6963     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6964         || ! (endbrace == RExC_parse            /* nothing between the {} */
6965               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6966                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6967     {
6968         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6969         vFAIL("\\N{NAME} must be resolved by the lexer");
6970     }
6971
6972     if (endbrace == RExC_parse) {   /* empty: \N{} */
6973         if (! valuep) {
6974             RExC_parse = endbrace + 1;
6975             return reg_node(pRExC_state,NOTHING);
6976         }
6977
6978         if (SIZE_ONLY) {
6979             ckWARNreg(RExC_parse,
6980                     "Ignoring zero length \\N{} in character class"
6981             );
6982             RExC_parse = endbrace + 1;
6983         }
6984         *valuep = 0;
6985         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6986     }
6987
6988     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6989     RExC_parse += 2;    /* Skip past the 'U+' */
6990
6991     if (valuep) {   /* In a bracketed char class */
6992         /* We only pay attention to the first char of
6993         multichar strings being returned. I kinda wonder
6994         if this makes sense as it does change the behaviour
6995         from earlier versions, OTOH that behaviour was broken
6996         as well. XXX Solution is to recharacterize as
6997         [rest-of-class]|multi1|multi2... */
6998
6999         STRLEN length_of_hex;
7000         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7001             | PERL_SCAN_DISALLOW_PREFIX
7002             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7003
7004         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7005         if (endchar < endbrace) {
7006             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7007         }
7008
7009         length_of_hex = (STRLEN)(endchar - RExC_parse);
7010         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7011
7012         /* The tokenizer should have guaranteed validity, but it's possible to
7013          * bypass it by using single quoting, so check */
7014         if (length_of_hex == 0
7015             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7016         {
7017             RExC_parse += length_of_hex;        /* Includes all the valid */
7018             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7019                             ? UTF8SKIP(RExC_parse)
7020                             : 1;
7021             /* Guard against malformed utf8 */
7022             if (RExC_parse >= endchar) RExC_parse = endchar;
7023             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7024         }
7025
7026         RExC_parse = endbrace + 1;
7027         if (endchar == endbrace) return NULL;
7028
7029         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7030     }
7031     else {      /* Not a char class */
7032         char *s;            /* String to put in generated EXACT node */
7033         STRLEN len = 0;     /* Its current byte length */
7034         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7035                                stream */
7036
7037         ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7038                                                    : (LOC)
7039                                                       ? EXACTFL
7040                                                       : UNI_SEMANTICS
7041                                                         ? EXACTFU
7042                                                         : EXACTF));
7043         s= STRING(ret);
7044
7045         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7046          * the input which is of the form now 'c1.c2.c3...}' until find the
7047          * ending brace or exceed length 255.  The characters that exceed this
7048          * limit are dropped.  The limit could be relaxed should it become
7049          * desirable by reparsing this as (?:\N{NAME}), so could generate
7050          * multiple EXACT nodes, as is done for just regular input.  But this
7051          * is primarily a named character, and not intended to be a huge long
7052          * string, so 255 bytes should be good enough */
7053         while (1) {
7054             STRLEN length_of_hex;
7055             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7056                             | PERL_SCAN_DISALLOW_PREFIX
7057                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7058             UV cp;  /* Ord of current character */
7059
7060             /* Code points are separated by dots.  If none, there is only one
7061              * code point, and is terminated by the brace */
7062             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7063
7064             /* The values are Unicode even on EBCDIC machines */
7065             length_of_hex = (STRLEN)(endchar - RExC_parse);
7066             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7067             if ( length_of_hex == 0
7068                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7069             {
7070                 RExC_parse += length_of_hex;        /* Includes all the valid */
7071                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7072                                 ? UTF8SKIP(RExC_parse)
7073                                 : 1;
7074                 /* Guard against malformed utf8 */
7075                 if (RExC_parse >= endchar) RExC_parse = endchar;
7076                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7077             }
7078
7079             if (! FOLD) {       /* Not folding, just append to the string */
7080                 STRLEN unilen;
7081
7082                 /* Quit before adding this character if would exceed limit */
7083                 if (len + UNISKIP(cp) > U8_MAX) break;
7084
7085                 unilen = reguni(pRExC_state, cp, s);
7086                 if (unilen > 0) {
7087                     s   += unilen;
7088                     len += unilen;
7089                 }
7090             } else {    /* Folding, output the folded equivalent */
7091                 STRLEN foldlen,numlen;
7092                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7093                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7094
7095                 /* Quit before exceeding size limit */
7096                 if (len + foldlen > U8_MAX) break;
7097
7098                 for (foldbuf = tmpbuf;
7099                     foldlen;
7100                     foldlen -= numlen)
7101                 {
7102                     cp = utf8_to_uvchr(foldbuf, &numlen);
7103                     if (numlen > 0) {
7104                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7105                         s       += unilen;
7106                         len     += unilen;
7107                         /* In EBCDIC the numlen and unilen can differ. */
7108                         foldbuf += numlen;
7109                         if (numlen >= foldlen)
7110                             break;
7111                     }
7112                     else
7113                         break; /* "Can't happen." */
7114                 }
7115             }
7116
7117             /* Point to the beginning of the next character in the sequence. */
7118             RExC_parse = endchar + 1;
7119
7120             /* Quit if no more characters */
7121             if (RExC_parse >= endbrace) break;
7122         }
7123
7124
7125         if (SIZE_ONLY) {
7126             if (RExC_parse < endbrace) {
7127                 ckWARNreg(RExC_parse - 1,
7128                           "Using just the first characters returned by \\N{}");
7129             }
7130
7131             RExC_size += STR_SZ(len);
7132         } else {
7133             STR_LEN(ret) = len;
7134             RExC_emit += STR_SZ(len);
7135         }
7136
7137         RExC_parse = endbrace + 1;
7138
7139         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7140                                with malformed in t/re/pat_advanced.t */
7141         RExC_parse --;
7142         Set_Node_Cur_Length(ret); /* MJD */
7143         nextchar(pRExC_state);
7144     }
7145
7146     return ret;
7147 }
7148
7149
7150 /*
7151  * reg_recode
7152  *
7153  * It returns the code point in utf8 for the value in *encp.
7154  *    value: a code value in the source encoding
7155  *    encp:  a pointer to an Encode object
7156  *
7157  * If the result from Encode is not a single character,
7158  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7159  */
7160 STATIC UV
7161 S_reg_recode(pTHX_ const char value, SV **encp)
7162 {
7163     STRLEN numlen = 1;
7164     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7165     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7166     const STRLEN newlen = SvCUR(sv);
7167     UV uv = UNICODE_REPLACEMENT;
7168
7169     PERL_ARGS_ASSERT_REG_RECODE;
7170
7171     if (newlen)
7172         uv = SvUTF8(sv)
7173              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7174              : *(U8*)s;
7175
7176     if (!newlen || numlen != newlen) {
7177         uv = UNICODE_REPLACEMENT;
7178         *encp = NULL;
7179     }
7180     return uv;
7181 }
7182
7183
7184 /*
7185  - regatom - the lowest level
7186
7187    Try to identify anything special at the start of the pattern. If there
7188    is, then handle it as required. This may involve generating a single regop,
7189    such as for an assertion; or it may involve recursing, such as to
7190    handle a () structure.
7191
7192    If the string doesn't start with something special then we gobble up
7193    as much literal text as we can.
7194
7195    Once we have been able to handle whatever type of thing started the
7196    sequence, we return.
7197
7198    Note: we have to be careful with escapes, as they can be both literal
7199    and special, and in the case of \10 and friends can either, depending
7200    on context. Specifically there are two separate switches for handling
7201    escape sequences, with the one for handling literal escapes requiring
7202    a dummy entry for all of the special escapes that are actually handled
7203    by the other.
7204 */
7205
7206 STATIC regnode *
7207 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7208 {
7209     dVAR;
7210     register regnode *ret = NULL;
7211     I32 flags;
7212     char *parse_start = RExC_parse;
7213     U8 op;
7214     GET_RE_DEBUG_FLAGS_DECL;
7215     DEBUG_PARSE("atom");
7216     *flagp = WORST;             /* Tentatively. */
7217
7218     PERL_ARGS_ASSERT_REGATOM;
7219
7220 tryagain:
7221     switch ((U8)*RExC_parse) {
7222     case '^':
7223         RExC_seen_zerolen++;
7224         nextchar(pRExC_state);
7225         if (RExC_flags & RXf_PMf_MULTILINE)
7226             ret = reg_node(pRExC_state, MBOL);
7227         else if (RExC_flags & RXf_PMf_SINGLELINE)
7228             ret = reg_node(pRExC_state, SBOL);
7229         else
7230             ret = reg_node(pRExC_state, BOL);
7231         Set_Node_Length(ret, 1); /* MJD */
7232         break;
7233     case '$':
7234         nextchar(pRExC_state);
7235         if (*RExC_parse)
7236             RExC_seen_zerolen++;
7237         if (RExC_flags & RXf_PMf_MULTILINE)
7238             ret = reg_node(pRExC_state, MEOL);
7239         else if (RExC_flags & RXf_PMf_SINGLELINE)
7240             ret = reg_node(pRExC_state, SEOL);
7241         else
7242             ret = reg_node(pRExC_state, EOL);
7243         Set_Node_Length(ret, 1); /* MJD */
7244         break;
7245     case '.':
7246         nextchar(pRExC_state);
7247         if (RExC_flags & RXf_PMf_SINGLELINE)
7248             ret = reg_node(pRExC_state, SANY);
7249         else
7250             ret = reg_node(pRExC_state, REG_ANY);
7251         *flagp |= HASWIDTH|SIMPLE;
7252         RExC_naughty++;
7253         Set_Node_Length(ret, 1); /* MJD */
7254         break;
7255     case '[':
7256     {
7257         char * const oregcomp_parse = ++RExC_parse;
7258         ret = regclass(pRExC_state,depth+1);
7259         if (*RExC_parse != ']') {
7260             RExC_parse = oregcomp_parse;
7261             vFAIL("Unmatched [");
7262         }
7263         nextchar(pRExC_state);
7264         *flagp |= HASWIDTH|SIMPLE;
7265         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7266         break;
7267     }
7268     case '(':
7269         nextchar(pRExC_state);
7270         ret = reg(pRExC_state, 1, &flags,depth+1);
7271         if (ret == NULL) {
7272                 if (flags & TRYAGAIN) {
7273                     if (RExC_parse == RExC_end) {
7274                          /* Make parent create an empty node if needed. */
7275                         *flagp |= TRYAGAIN;
7276                         return(NULL);
7277                     }
7278                     goto tryagain;
7279                 }
7280                 return(NULL);
7281         }
7282         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7283         break;
7284     case '|':
7285     case ')':
7286         if (flags & TRYAGAIN) {
7287             *flagp |= TRYAGAIN;
7288             return NULL;
7289         }
7290         vFAIL("Internal urp");
7291                                 /* Supposed to be caught earlier. */
7292         break;
7293     case '{':
7294         if (!regcurly(RExC_parse)) {
7295             RExC_parse++;
7296             goto defchar;
7297         }
7298         /* FALL THROUGH */
7299     case '?':
7300     case '+':
7301     case '*':
7302         RExC_parse++;
7303         vFAIL("Quantifier follows nothing");
7304         break;
7305     case LATIN_SMALL_LETTER_SHARP_S:
7306     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7307     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7308 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7309 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
7310     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7311 #endif
7312         do_foldchar:
7313         if (!LOC && FOLD) {
7314             U32 len,cp;
7315             len=0; /* silence a spurious compiler warning */
7316             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7317                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7318                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7319                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7320                 Set_Node_Length(ret, 1); /* MJD */
7321                 nextchar(pRExC_state); /* kill whitespace under /x */
7322                 return ret;
7323             }
7324         }
7325         goto outer_default;
7326     case '\\':
7327         /* Special Escapes
7328
7329            This switch handles escape sequences that resolve to some kind
7330            of special regop and not to literal text. Escape sequnces that
7331            resolve to literal text are handled below in the switch marked
7332            "Literal Escapes".
7333
7334            Every entry in this switch *must* have a corresponding entry
7335            in the literal escape switch. However, the opposite is not
7336            required, as the default for this switch is to jump to the
7337            literal text handling code.
7338         */
7339         switch ((U8)*++RExC_parse) {
7340         case LATIN_SMALL_LETTER_SHARP_S:
7341         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7342         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7343                    goto do_foldchar;
7344         /* Special Escapes */
7345         case 'A':
7346             RExC_seen_zerolen++;
7347             ret = reg_node(pRExC_state, SBOL);
7348             *flagp |= SIMPLE;
7349             goto finish_meta_pat;
7350         case 'G':
7351             ret = reg_node(pRExC_state, GPOS);
7352             RExC_seen |= REG_SEEN_GPOS;
7353             *flagp |= SIMPLE;
7354             goto finish_meta_pat;
7355         case 'K':
7356             RExC_seen_zerolen++;
7357             ret = reg_node(pRExC_state, KEEPS);
7358             *flagp |= SIMPLE;
7359             /* XXX:dmq : disabling in-place substitution seems to
7360              * be necessary here to avoid cases of memory corruption, as
7361              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7362              */
7363             RExC_seen |= REG_SEEN_LOOKBEHIND;
7364             goto finish_meta_pat;
7365         case 'Z':
7366             ret = reg_node(pRExC_state, SEOL);
7367             *flagp |= SIMPLE;
7368             RExC_seen_zerolen++;                /* Do not optimize RE away */
7369             goto finish_meta_pat;
7370         case 'z':
7371             ret = reg_node(pRExC_state, EOS);
7372             *flagp |= SIMPLE;
7373             RExC_seen_zerolen++;                /* Do not optimize RE away */
7374             goto finish_meta_pat;
7375         case 'C':
7376             ret = reg_node(pRExC_state, CANY);
7377             RExC_seen |= REG_SEEN_CANY;
7378             *flagp |= HASWIDTH|SIMPLE;
7379             goto finish_meta_pat;
7380         case 'X':
7381             ret = reg_node(pRExC_state, CLUMP);
7382             *flagp |= HASWIDTH;
7383             goto finish_meta_pat;
7384         case 'w':
7385             switch (get_regex_charset(RExC_flags)) {
7386                 case REGEX_LOCALE_CHARSET:
7387                     op = ALNUML;
7388                     break;
7389                 case REGEX_UNICODE_CHARSET:
7390                     op = ALNUMU;
7391                     break;
7392                 case REGEX_ASCII_RESTRICTED_CHARSET:
7393                     op = ALNUMA;
7394                     break;
7395                 case REGEX_DEPENDS_CHARSET:
7396                     op = ALNUM;
7397                     break;
7398                 default:
7399                     goto bad_charset;
7400             }
7401             ret = reg_node(pRExC_state, op);
7402             *flagp |= HASWIDTH|SIMPLE;
7403             goto finish_meta_pat;
7404         case 'W':
7405             switch (get_regex_charset(RExC_flags)) {
7406                 case REGEX_LOCALE_CHARSET:
7407                     op = NALNUML;
7408                     break;
7409                 case REGEX_UNICODE_CHARSET:
7410                     op = NALNUMU;
7411                     break;
7412                 case REGEX_ASCII_RESTRICTED_CHARSET:
7413                     op = NALNUMA;
7414                     break;
7415                 case REGEX_DEPENDS_CHARSET:
7416                     op = NALNUM;
7417                     break;
7418                 default:
7419                     goto bad_charset;
7420             }
7421             ret = reg_node(pRExC_state, op);
7422             *flagp |= HASWIDTH|SIMPLE;
7423             goto finish_meta_pat;
7424         case 'b':
7425             RExC_seen_zerolen++;
7426             RExC_seen |= REG_SEEN_LOOKBEHIND;
7427             switch (get_regex_charset(RExC_flags)) {
7428                 case REGEX_LOCALE_CHARSET:
7429                     op = BOUNDL;
7430                     break;
7431                 case REGEX_UNICODE_CHARSET:
7432                     op = BOUNDU;
7433                     break;
7434                 case REGEX_ASCII_RESTRICTED_CHARSET:
7435                     op = BOUNDA;
7436                     break;
7437                 case REGEX_DEPENDS_CHARSET:
7438                     op = BOUND;
7439                     break;
7440                 default:
7441                     goto bad_charset;
7442             }
7443             ret = reg_node(pRExC_state, op);
7444             FLAGS(ret) = get_regex_charset(RExC_flags);
7445             *flagp |= SIMPLE;
7446             goto finish_meta_pat;
7447         case 'B':
7448             RExC_seen_zerolen++;
7449             RExC_seen |= REG_SEEN_LOOKBEHIND;
7450             switch (get_regex_charset(RExC_flags)) {
7451                 case REGEX_LOCALE_CHARSET:
7452                     op = NBOUNDL;
7453                     break;
7454                 case REGEX_UNICODE_CHARSET:
7455                     op = NBOUNDU;
7456                     break;
7457                 case REGEX_ASCII_RESTRICTED_CHARSET:
7458                     op = NBOUNDA;
7459                     break;
7460                 case REGEX_DEPENDS_CHARSET:
7461                     op = NBOUND;
7462                     break;
7463                 default:
7464                     goto bad_charset;
7465             }
7466             ret = reg_node(pRExC_state, op);
7467             FLAGS(ret) = get_regex_charset(RExC_flags);
7468             *flagp |= SIMPLE;
7469             goto finish_meta_pat;
7470         case 's':
7471             switch (get_regex_charset(RExC_flags)) {
7472                 case REGEX_LOCALE_CHARSET:
7473                     op = SPACEL;
7474                     break;
7475                 case REGEX_UNICODE_CHARSET:
7476                     op = SPACEU;
7477                     break;
7478                 case REGEX_ASCII_RESTRICTED_CHARSET:
7479                     op = SPACEA;
7480                     break;
7481                 case REGEX_DEPENDS_CHARSET:
7482                     op = SPACE;
7483                     break;
7484                 default:
7485                     goto bad_charset;
7486             }
7487             ret = reg_node(pRExC_state, op);
7488             *flagp |= HASWIDTH|SIMPLE;
7489             goto finish_meta_pat;
7490         case 'S':
7491             switch (get_regex_charset(RExC_flags)) {
7492                 case REGEX_LOCALE_CHARSET:
7493                     op = NSPACEL;
7494                     break;
7495                 case REGEX_UNICODE_CHARSET:
7496                     op = NSPACEU;
7497                     break;
7498                 case REGEX_ASCII_RESTRICTED_CHARSET:
7499                     op = NSPACEA;
7500                     break;
7501                 case REGEX_DEPENDS_CHARSET:
7502                     op = NSPACE;
7503                     break;
7504                 default:
7505                     goto bad_charset;
7506             }
7507             ret = reg_node(pRExC_state, op);
7508             *flagp |= HASWIDTH|SIMPLE;
7509             goto finish_meta_pat;
7510         case 'd':
7511             switch (get_regex_charset(RExC_flags)) {
7512                 case REGEX_LOCALE_CHARSET:
7513                     op = DIGITL;
7514                     break;
7515                 case REGEX_ASCII_RESTRICTED_CHARSET:
7516                     op = DIGITA;
7517                     break;
7518                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
7519                 case REGEX_UNICODE_CHARSET:
7520                     op = DIGIT;
7521                     break;
7522                 default:
7523                     goto bad_charset;
7524             }
7525             ret = reg_node(pRExC_state, op);
7526             *flagp |= HASWIDTH|SIMPLE;
7527             goto finish_meta_pat;
7528         case 'D':
7529             switch (get_regex_charset(RExC_flags)) {
7530                 case REGEX_LOCALE_CHARSET:
7531                     op = NDIGITL;
7532                     break;
7533                 case REGEX_ASCII_RESTRICTED_CHARSET:
7534                     op = NDIGITA;
7535                     break;
7536                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
7537                 case REGEX_UNICODE_CHARSET:
7538                     op = NDIGIT;
7539                     break;
7540                 default:
7541                     goto bad_charset;
7542             }
7543             ret = reg_node(pRExC_state, op);
7544             *flagp |= HASWIDTH|SIMPLE;
7545             goto finish_meta_pat;
7546         case 'R':
7547             ret = reg_node(pRExC_state, LNBREAK);
7548             *flagp |= HASWIDTH|SIMPLE;
7549             goto finish_meta_pat;
7550         case 'h':
7551             ret = reg_node(pRExC_state, HORIZWS);
7552             *flagp |= HASWIDTH|SIMPLE;
7553             goto finish_meta_pat;
7554         case 'H':
7555             ret = reg_node(pRExC_state, NHORIZWS);
7556             *flagp |= HASWIDTH|SIMPLE;
7557             goto finish_meta_pat;
7558         case 'v':
7559             ret = reg_node(pRExC_state, VERTWS);
7560             *flagp |= HASWIDTH|SIMPLE;
7561             goto finish_meta_pat;
7562         case 'V':
7563             ret = reg_node(pRExC_state, NVERTWS);
7564             *flagp |= HASWIDTH|SIMPLE;
7565          finish_meta_pat:
7566             nextchar(pRExC_state);
7567             Set_Node_Length(ret, 2); /* MJD */
7568             break;
7569         case 'p':
7570         case 'P':
7571             {
7572                 char* const oldregxend = RExC_end;
7573 #ifdef DEBUGGING
7574                 char* parse_start = RExC_parse - 2;
7575 #endif
7576
7577                 if (RExC_parse[1] == '{') {
7578                   /* a lovely hack--pretend we saw [\pX] instead */
7579                     RExC_end = strchr(RExC_parse, '}');
7580                     if (!RExC_end) {
7581                         const U8 c = (U8)*RExC_parse;
7582                         RExC_parse += 2;
7583                         RExC_end = oldregxend;
7584                         vFAIL2("Missing right brace on \\%c{}", c);
7585                     }
7586                     RExC_end++;
7587                 }
7588                 else {
7589                     RExC_end = RExC_parse + 2;
7590                     if (RExC_end > oldregxend)
7591                         RExC_end = oldregxend;
7592                 }
7593                 RExC_parse--;
7594
7595                 ret = regclass(pRExC_state,depth+1);
7596
7597                 RExC_end = oldregxend;
7598                 RExC_parse--;
7599
7600                 Set_Node_Offset(ret, parse_start + 2);
7601                 Set_Node_Cur_Length(ret);
7602                 nextchar(pRExC_state);
7603                 *flagp |= HASWIDTH|SIMPLE;
7604             }
7605             break;
7606         case 'N':
7607             /* Handle \N and \N{NAME} here and not below because it can be
7608             multicharacter. join_exact() will join them up later on.
7609             Also this makes sure that things like /\N{BLAH}+/ and
7610             \N{BLAH} being multi char Just Happen. dmq*/
7611             ++RExC_parse;
7612             ret= reg_namedseq(pRExC_state, NULL, flagp);
7613             break;
7614         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7615         parse_named_seq:
7616         {
7617             char ch= RExC_parse[1];
7618             if (ch != '<' && ch != '\'' && ch != '{') {
7619                 RExC_parse++;
7620                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7621             } else {
7622                 /* this pretty much dupes the code for (?P=...) in reg(), if
7623                    you change this make sure you change that */
7624                 char* name_start = (RExC_parse += 2);
7625                 U32 num = 0;
7626                 SV *sv_dat = reg_scan_name(pRExC_state,
7627                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7628                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7629                 if (RExC_parse == name_start || *RExC_parse != ch)
7630                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7631
7632                 if (!SIZE_ONLY) {
7633                     num = add_data( pRExC_state, 1, "S" );
7634                     RExC_rxi->data->data[num]=(void*)sv_dat;
7635                     SvREFCNT_inc_simple_void(sv_dat);
7636                 }
7637
7638                 RExC_sawback = 1;
7639                 ret = reganode(pRExC_state,
7640                                ((! FOLD)
7641                                  ? NREF
7642                                  : (AT_LEAST_UNI_SEMANTICS)
7643                                    ? NREFFU
7644                                    : (LOC)
7645                                      ? NREFFL
7646                                      : NREFF),
7647                                 num);
7648                 *flagp |= HASWIDTH;
7649
7650                 /* override incorrect value set in reganode MJD */
7651                 Set_Node_Offset(ret, parse_start+1);
7652                 Set_Node_Cur_Length(ret); /* MJD */
7653                 nextchar(pRExC_state);
7654
7655             }
7656             break;
7657         }
7658         case 'g':
7659         case '1': case '2': case '3': case '4':
7660         case '5': case '6': case '7': case '8': case '9':
7661             {
7662                 I32 num;
7663                 bool isg = *RExC_parse == 'g';
7664                 bool isrel = 0;
7665                 bool hasbrace = 0;
7666                 if (isg) {
7667                     RExC_parse++;
7668                     if (*RExC_parse == '{') {
7669                         RExC_parse++;
7670                         hasbrace = 1;
7671                     }
7672                     if (*RExC_parse == '-') {
7673                         RExC_parse++;
7674                         isrel = 1;
7675                     }
7676                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7677                         if (isrel) RExC_parse--;
7678                         RExC_parse -= 2;
7679                         goto parse_named_seq;
7680                 }   }
7681                 num = atoi(RExC_parse);
7682                 if (isg && num == 0)
7683                     vFAIL("Reference to invalid group 0");
7684                 if (isrel) {
7685                     num = RExC_npar - num;
7686                     if (num < 1)
7687                         vFAIL("Reference to nonexistent or unclosed group");
7688                 }
7689                 if (!isg && num > 9 && num >= RExC_npar)
7690                     goto defchar;
7691                 else {
7692                     char * const parse_start = RExC_parse - 1; /* MJD */
7693                     while (isDIGIT(*RExC_parse))
7694                         RExC_parse++;
7695                     if (parse_start == RExC_parse - 1)
7696                         vFAIL("Unterminated \\g... pattern");
7697                     if (hasbrace) {
7698                         if (*RExC_parse != '}')
7699                             vFAIL("Unterminated \\g{...} pattern");
7700                         RExC_parse++;
7701                     }
7702                     if (!SIZE_ONLY) {
7703                         if (num > (I32)RExC_rx->nparens)
7704                             vFAIL("Reference to nonexistent group");
7705                     }
7706                     RExC_sawback = 1;
7707                     ret = reganode(pRExC_state,
7708                                    ((! FOLD)
7709                                      ? REF
7710                                      : (AT_LEAST_UNI_SEMANTICS)
7711                                        ? REFFU
7712                                        : (LOC)
7713                                          ? REFFL
7714                                          : REFF),
7715                                     num);
7716                     *flagp |= HASWIDTH;
7717
7718                     /* override incorrect value set in reganode MJD */
7719                     Set_Node_Offset(ret, parse_start+1);
7720                     Set_Node_Cur_Length(ret); /* MJD */
7721                     RExC_parse--;
7722                     nextchar(pRExC_state);
7723                 }
7724             }
7725             break;
7726         case '\0':
7727             if (RExC_parse >= RExC_end)
7728                 FAIL("Trailing \\");
7729             /* FALL THROUGH */
7730         default:
7731             /* Do not generate "unrecognized" warnings here, we fall
7732                back into the quick-grab loop below */
7733             parse_start--;
7734             goto defchar;
7735         }
7736         break;
7737
7738     case '#':
7739         if (RExC_flags & RXf_PMf_EXTENDED) {
7740             if ( reg_skipcomment( pRExC_state ) )
7741                 goto tryagain;
7742         }
7743         /* FALL THROUGH */
7744
7745     default:
7746         outer_default:{
7747             register STRLEN len;
7748             register UV ender;
7749             register char *p;
7750             char *s;
7751             STRLEN foldlen;
7752             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7753
7754             parse_start = RExC_parse - 1;
7755
7756             RExC_parse++;
7757
7758         defchar:
7759             ender = 0;
7760             ret = reg_node(pRExC_state,
7761                            (U8) ((! FOLD) ? EXACT
7762                                           : (LOC)
7763                                              ? EXACTFL
7764                                              : (AT_LEAST_UNI_SEMANTICS)
7765                                                ? EXACTFU
7766                                                : EXACTF)
7767                     );
7768             s = STRING(ret);
7769             for (len = 0, p = RExC_parse - 1;
7770               len < 127 && p < RExC_end;
7771               len++)
7772             {
7773                 char * const oldp = p;
7774
7775                 if (RExC_flags & RXf_PMf_EXTENDED)
7776                     p = regwhite( pRExC_state, p );
7777                 switch ((U8)*p) {
7778                 case LATIN_SMALL_LETTER_SHARP_S:
7779                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7780                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7781                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7782                                 goto normal_default;
7783                 case '^':
7784                 case '$':
7785                 case '.':
7786                 case '[':
7787                 case '(':
7788                 case ')':
7789                 case '|':
7790                     goto loopdone;
7791                 case '\\':
7792                     /* Literal Escapes Switch
7793
7794                        This switch is meant to handle escape sequences that
7795                        resolve to a literal character.
7796
7797                        Every escape sequence that represents something
7798                        else, like an assertion or a char class, is handled
7799                        in the switch marked 'Special Escapes' above in this
7800                        routine, but also has an entry here as anything that
7801                        isn't explicitly mentioned here will be treated as
7802                        an unescaped equivalent literal.
7803                     */
7804
7805                     switch ((U8)*++p) {
7806                     /* These are all the special escapes. */
7807                     case LATIN_SMALL_LETTER_SHARP_S:
7808                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7809                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7810                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7811                                 goto normal_default;
7812                     case 'A':             /* Start assertion */
7813                     case 'b': case 'B':   /* Word-boundary assertion*/
7814                     case 'C':             /* Single char !DANGEROUS! */
7815                     case 'd': case 'D':   /* digit class */
7816                     case 'g': case 'G':   /* generic-backref, pos assertion */
7817                     case 'h': case 'H':   /* HORIZWS */
7818                     case 'k': case 'K':   /* named backref, keep marker */
7819                     case 'N':             /* named char sequence */
7820                     case 'p': case 'P':   /* Unicode property */
7821                               case 'R':   /* LNBREAK */
7822                     case 's': case 'S':   /* space class */
7823                     case 'v': case 'V':   /* VERTWS */
7824                     case 'w': case 'W':   /* word class */
7825                     case 'X':             /* eXtended Unicode "combining character sequence" */
7826                     case 'z': case 'Z':   /* End of line/string assertion */
7827                         --p;
7828                         goto loopdone;
7829
7830                     /* Anything after here is an escape that resolves to a
7831                        literal. (Except digits, which may or may not)
7832                      */
7833                     case 'n':
7834                         ender = '\n';
7835                         p++;
7836                         break;
7837                     case 'r':
7838                         ender = '\r';
7839                         p++;
7840                         break;
7841                     case 't':
7842                         ender = '\t';
7843                         p++;
7844                         break;
7845                     case 'f':
7846                         ender = '\f';
7847                         p++;
7848                         break;
7849                     case 'e':
7850                           ender = ASCII_TO_NATIVE('\033');
7851                         p++;
7852                         break;
7853                     case 'a':
7854                           ender = ASCII_TO_NATIVE('\007');
7855                         p++;
7856                         break;
7857                     case 'o':
7858                         {
7859                             STRLEN brace_len = len;
7860                             UV result;
7861                             const char* error_msg;
7862
7863                             bool valid = grok_bslash_o(p,
7864                                                        &result,
7865                                                        &brace_len,
7866                                                        &error_msg,
7867                                                        1);
7868                             p += brace_len;
7869                             if (! valid) {
7870                                 RExC_parse = p; /* going to die anyway; point
7871                                                    to exact spot of failure */
7872                                 vFAIL(error_msg);
7873                             }
7874                             else
7875                             {
7876                                 ender = result;
7877                             }
7878                             if (PL_encoding && ender < 0x100) {
7879                                 goto recode_encoding;
7880                             }
7881                             if (ender > 0xff) {
7882                                 REQUIRE_UTF8;
7883                             }
7884                             break;
7885                         }
7886                     case 'x':
7887                         if (*++p == '{') {
7888                             char* const e = strchr(p, '}');
7889
7890                             if (!e) {
7891                                 RExC_parse = p + 1;
7892                                 vFAIL("Missing right brace on \\x{}");
7893                             }
7894                             else {
7895                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7896                                     | PERL_SCAN_DISALLOW_PREFIX;
7897                                 STRLEN numlen = e - p - 1;
7898                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7899                                 if (ender > 0xff)
7900                                     REQUIRE_UTF8;
7901                                 p = e + 1;
7902                             }
7903                         }
7904                         else {
7905                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7906                             STRLEN numlen = 2;
7907                             ender = grok_hex(p, &numlen, &flags, NULL);
7908                             p += numlen;
7909                         }
7910                         if (PL_encoding && ender < 0x100)
7911                             goto recode_encoding;
7912                         break;
7913                     case 'c':
7914                         p++;
7915                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7916                         break;
7917                     case '0': case '1': case '2': case '3':case '4':
7918                     case '5': case '6': case '7': case '8':case '9':
7919                         if (*p == '0' ||
7920                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7921                         {
7922                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7923                             STRLEN numlen = 3;
7924                             ender = grok_oct(p, &numlen, &flags, NULL);
7925                             if (ender > 0xff) {
7926                                 REQUIRE_UTF8;
7927                             }
7928                             p += numlen;
7929                         }
7930                         else {
7931                             --p;
7932                             goto loopdone;
7933                         }
7934                         if (PL_encoding && ender < 0x100)
7935                             goto recode_encoding;
7936                         break;
7937                     recode_encoding:
7938                         {
7939                             SV* enc = PL_encoding;
7940                             ender = reg_recode((const char)(U8)ender, &enc);
7941                             if (!enc && SIZE_ONLY)
7942                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7943                             REQUIRE_UTF8;
7944                         }
7945                         break;
7946                     case '\0':
7947                         if (p >= RExC_end)
7948                             FAIL("Trailing \\");
7949                         /* FALL THROUGH */
7950                     default:
7951                         if (!SIZE_ONLY&& isALPHA(*p))
7952                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7953                         goto normal_default;
7954                     }
7955                     break;
7956                 default:
7957                   normal_default:
7958                     if (UTF8_IS_START(*p) && UTF) {
7959                         STRLEN numlen;
7960                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7961                                                &numlen, UTF8_ALLOW_DEFAULT);
7962                         p += numlen;
7963                     }
7964                     else
7965                         ender = *p++;
7966                     break;
7967                 }
7968                 if ( RExC_flags & RXf_PMf_EXTENDED)
7969                     p = regwhite( pRExC_state, p );
7970                 if (UTF && FOLD) {
7971                     /* Prime the casefolded buffer. */
7972                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7973                 }
7974                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7975                     if (len)
7976                         p = oldp;
7977                     else if (UTF) {
7978                          if (FOLD) {
7979                               /* Emit all the Unicode characters. */
7980                               STRLEN numlen;
7981                               for (foldbuf = tmpbuf;
7982                                    foldlen;
7983                                    foldlen -= numlen) {
7984                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7985                                    if (numlen > 0) {
7986                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7987                                         s       += unilen;
7988                                         len     += unilen;
7989                                         /* In EBCDIC the numlen
7990                                          * and unilen can differ. */
7991                                         foldbuf += numlen;
7992                                         if (numlen >= foldlen)
7993                                              break;
7994                                    }
7995                                    else
7996                                         break; /* "Can't happen." */
7997                               }
7998                          }
7999                          else {
8000                               const STRLEN unilen = reguni(pRExC_state, ender, s);
8001                               if (unilen > 0) {
8002                                    s   += unilen;
8003                                    len += unilen;
8004                               }
8005                          }
8006                     }
8007                     else {
8008                         len++;
8009                         REGC((char)ender, s++);
8010                     }
8011                     break;
8012                 }
8013                 if (UTF) {
8014                      if (FOLD) {
8015                           /* Emit all the Unicode characters. */
8016                           STRLEN numlen;
8017                           for (foldbuf = tmpbuf;
8018                                foldlen;
8019                                foldlen -= numlen) {
8020                                ender = utf8_to_uvchr(foldbuf, &numlen);
8021                                if (numlen > 0) {
8022                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
8023                                     len     += unilen;
8024                                     s       += unilen;
8025                                     /* In EBCDIC the numlen
8026                                      * and unilen can differ. */
8027                                     foldbuf += numlen;
8028                                     if (numlen >= foldlen)
8029                                          break;
8030                                }
8031                                else
8032                                     break;
8033                           }
8034                      }
8035                      else {
8036                           const STRLEN unilen = reguni(pRExC_state, ender, s);
8037                           if (unilen > 0) {
8038                                s   += unilen;
8039                                len += unilen;
8040                           }
8041                      }
8042                      len--;
8043                 }
8044                 else
8045                     REGC((char)ender, s++);
8046             }
8047         loopdone:
8048             RExC_parse = p - 1;
8049             Set_Node_Cur_Length(ret); /* MJD */
8050             nextchar(pRExC_state);
8051             {
8052                 /* len is STRLEN which is unsigned, need to copy to signed */
8053                 IV iv = len;
8054                 if (iv < 0)
8055                     vFAIL("Internal disaster");
8056             }
8057             if (len > 0)
8058                 *flagp |= HASWIDTH;
8059             if (len == 1 && UNI_IS_INVARIANT(ender))
8060                 *flagp |= SIMPLE;
8061
8062             if (SIZE_ONLY)
8063                 RExC_size += STR_SZ(len);
8064             else {
8065                 STR_LEN(ret) = len;
8066                 RExC_emit += STR_SZ(len);
8067             }
8068         }
8069         break;
8070     }
8071
8072     return(ret);
8073
8074 /* Jumped to when an unrecognized character set is encountered */
8075 bad_charset:
8076     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8077     return(NULL);
8078 }
8079
8080 STATIC char *
8081 S_regwhite( RExC_state_t *pRExC_state, char *p )
8082 {
8083     const char *e = RExC_end;
8084
8085     PERL_ARGS_ASSERT_REGWHITE;
8086
8087     while (p < e) {
8088         if (isSPACE(*p))
8089             ++p;
8090         else if (*p == '#') {
8091             bool ended = 0;
8092             do {
8093                 if (*p++ == '\n') {
8094                     ended = 1;
8095                     break;
8096                 }
8097             } while (p < e);
8098             if (!ended)
8099                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8100         }
8101         else
8102             break;
8103     }
8104     return p;
8105 }
8106
8107 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8108    Character classes ([:foo:]) can also be negated ([:^foo:]).
8109    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8110    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8111    but trigger failures because they are currently unimplemented. */
8112
8113 #define POSIXCC_DONE(c)   ((c) == ':')
8114 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8115 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8116
8117 STATIC I32
8118 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8119 {
8120     dVAR;
8121     I32 namedclass = OOB_NAMEDCLASS;
8122
8123     PERL_ARGS_ASSERT_REGPPOSIXCC;
8124
8125     if (value == '[' && RExC_parse + 1 < RExC_end &&
8126         /* I smell either [: or [= or [. -- POSIX has been here, right? */
8127         POSIXCC(UCHARAT(RExC_parse))) {
8128         const char c = UCHARAT(RExC_parse);
8129         char* const s = RExC_parse++;
8130
8131         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8132             RExC_parse++;
8133         if (RExC_parse == RExC_end)
8134             /* Grandfather lone [:, [=, [. */
8135             RExC_parse = s;
8136         else {
8137             const char* const t = RExC_parse++; /* skip over the c */
8138             assert(*t == c);
8139
8140             if (UCHARAT(RExC_parse) == ']') {
8141                 const char *posixcc = s + 1;
8142                 RExC_parse++; /* skip over the ending ] */
8143
8144                 if (*s == ':') {
8145                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8146                     const I32 skip = t - posixcc;
8147
8148                     /* Initially switch on the length of the name.  */
8149                     switch (skip) {
8150                     case 4:
8151                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8152                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8153                         break;
8154                     case 5:
8155                         /* Names all of length 5.  */
8156                         /* alnum alpha ascii blank cntrl digit graph lower
8157                            print punct space upper  */
8158                         /* Offset 4 gives the best switch position.  */
8159                         switch (posixcc[4]) {
8160                         case 'a':
8161                             if (memEQ(posixcc, "alph", 4)) /* alpha */
8162                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8163                             break;
8164                         case 'e':
8165                             if (memEQ(posixcc, "spac", 4)) /* space */
8166                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8167                             break;
8168                         case 'h':
8169                             if (memEQ(posixcc, "grap", 4)) /* graph */
8170                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8171                             break;
8172                         case 'i':
8173                             if (memEQ(posixcc, "asci", 4)) /* ascii */
8174                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8175                             break;
8176                         case 'k':
8177                             if (memEQ(posixcc, "blan", 4)) /* blank */
8178                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8179                             break;
8180                         case 'l':
8181                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8182                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8183                             break;
8184                         case 'm':
8185                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
8186                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8187                             break;
8188                         case 'r':
8189                             if (memEQ(posixcc, "lowe", 4)) /* lower */
8190                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8191                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
8192                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8193                             break;
8194                         case 't':
8195                             if (memEQ(posixcc, "digi", 4)) /* digit */
8196                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8197                             else if (memEQ(posixcc, "prin", 4)) /* print */
8198                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8199                             else if (memEQ(posixcc, "punc", 4)) /* punct */
8200                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8201                             break;
8202                         }
8203                         break;
8204                     case 6:
8205                         if (memEQ(posixcc, "xdigit", 6))
8206                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8207                         break;
8208                     }
8209
8210                     if (namedclass == OOB_NAMEDCLASS)
8211                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8212                                       t - s - 1, s + 1);
8213                     assert (posixcc[skip] == ':');
8214                     assert (posixcc[skip+1] == ']');
8215                 } else if (!SIZE_ONLY) {
8216                     /* [[=foo=]] and [[.foo.]] are still future. */
8217
8218                     /* adjust RExC_parse so the warning shows after
8219                        the class closes */
8220                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8221                         RExC_parse++;
8222                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8223                 }
8224             } else {
8225                 /* Maternal grandfather:
8226                  * "[:" ending in ":" but not in ":]" */
8227                 RExC_parse = s;
8228             }
8229         }
8230     }
8231
8232     return namedclass;
8233 }
8234
8235 STATIC void
8236 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8237 {
8238     dVAR;
8239
8240     PERL_ARGS_ASSERT_CHECKPOSIXCC;
8241
8242     if (POSIXCC(UCHARAT(RExC_parse))) {
8243         const char *s = RExC_parse;
8244         const char  c = *s++;
8245
8246         while (isALNUM(*s))
8247             s++;
8248         if (*s && c == *s && s[1] == ']') {
8249             ckWARN3reg(s+2,
8250                        "POSIX syntax [%c %c] belongs inside character classes",
8251                        c, c);
8252
8253             /* [[=foo=]] and [[.foo.]] are still future. */
8254             if (POSIXCC_NOTYET(c)) {
8255                 /* adjust RExC_parse so the error shows after
8256                    the class closes */
8257                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8258                     NOOP;
8259                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8260             }
8261         }
8262     }
8263 }
8264
8265 /* No locale test, and always Unicode semantics */
8266 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
8267 ANYOF_##NAME:                                                                  \
8268         for (value = 0; value < 256; value++)                                  \
8269             if (TEST)                                                          \
8270             stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
8271     yesno = '+';                                                               \
8272     what = WORD;                                                               \
8273     break;                                                                     \
8274 case ANYOF_N##NAME:                                                            \
8275         for (value = 0; value < 256; value++)                                  \
8276             if (!TEST)                                                         \
8277             stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
8278     yesno = '!';                                                               \
8279     what = WORD;                                                               \
8280     break
8281
8282 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8283  * there are two tests passed in, to use depending on that. There aren't any
8284  * cases where the label is different from the name, so no need for that
8285  * parameter */
8286 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD)                                       \
8287 ANYOF_##NAME:                                                                  \
8288     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
8289     else if (UNI_SEMANTICS) {                                                  \
8290         for (value = 0; value < 256; value++) {                                \
8291             if (TEST_8) stored +=                                              \
8292                       S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
8293         }                                                                      \
8294     }                                                                          \
8295     else {                                                                     \
8296         for (value = 0; value < 128; value++) {                                \
8297             if (TEST_7) stored +=                                              \
8298                 S_set_regclass_bit(aTHX_ pRExC_state, ret,                     \
8299                                    (U8) UNI_TO_NATIVE(value));                 \
8300         }                                                                      \
8301     }                                                                          \
8302     yesno = '+';                                                               \
8303     what = WORD;                                                               \
8304     break;                                                                     \
8305 case ANYOF_N##NAME:                                                            \
8306     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
8307     else if (UNI_SEMANTICS) {                                                  \
8308         for (value = 0; value < 256; value++) {                                \
8309             if (! TEST_8) stored +=                                            \
8310                     S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
8311         }                                                                      \
8312     }                                                                          \
8313     else {                                                                     \
8314         for (value = 0; value < 128; value++) {                                \
8315             if (! TEST_7) stored +=                                            \
8316                     S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
8317         }                                                                      \
8318         if (ASCII_RESTRICTED) {                                                \
8319             for (value = 128; value < 256; value++) {                          \
8320              stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8321             }                                                                  \
8322             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8;                  \
8323         }                                                                      \
8324         else {                                                                 \
8325             /* For a non-ut8 target string with DEPENDS semantics, all above   \
8326              * ASCII Latin1 code points match the complement of any of the     \
8327              * classes.  But in utf8, they have their Unicode semantics, so    \
8328              * can't just set them in the bitmap, or else regexec.c will think \
8329              * they matched when they shouldn't. */                            \
8330             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;          \
8331         }                                                                      \
8332     }                                                                          \
8333     yesno = '!';                                                               \
8334     what = WORD;                                                               \
8335     break
8336
8337 /*
8338    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8339    so that it is possible to override the option here without having to
8340    rebuild the entire core. as we are required to do if we change regcomp.h
8341    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8342 */
8343 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8344 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8345 #endif
8346
8347 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8348 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8349 #else
8350 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8351 #endif
8352
8353 STATIC U8
8354 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8355 {
8356
8357     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8358      * Locale folding is done at run-time, so this function should not be
8359      * called for nodes that are for locales.
8360      *
8361      * This function simply sets the bit corresponding to the fold of the input
8362      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
8363      * 'F' is 'f'.
8364      *
8365      * It also sets any necessary flags, and returns the number of bits that
8366      * actually changed from 0 to 1 */
8367
8368     U8 stored = 0;
8369     U8 fold;
8370
8371     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
8372                            : PL_fold[value];
8373
8374     /* It assumes the bit for 'value' has already been set */
8375     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8376         ANYOF_BITMAP_SET(node, fold);
8377         stored++;
8378     }
8379     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8380         || (! UNI_SEMANTICS
8381             && ! isASCII(value)
8382             && PL_fold_latin1[value] != value))
8383     {   /* A character that has a fold outside of Latin1 matches outside the
8384            bitmap, but only when the target string is utf8.  Similarly when we
8385            don't have unicode semantics for the above ASCII Latin-1 characters,
8386            and they have a fold, they should match if the target is utf8, and
8387            not otherwise */
8388         ANYOF_FLAGS(node) |= ANYOF_UTF8;
8389     }
8390
8391     return stored;
8392 }
8393
8394
8395 PERL_STATIC_INLINE U8
8396 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8397 {
8398     /* This inline function sets a bit in the bitmap if not already set, and if
8399      * appropriate, its fold, returning the number of bits that actually
8400      * changed from 0 to 1 */
8401
8402     U8 stored;
8403
8404     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
8405         return 0;
8406     }
8407
8408     ANYOF_BITMAP_SET(node, value);
8409     stored = 1;
8410
8411     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
8412         stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8413     }
8414
8415     return stored;
8416 }
8417
8418 /*
8419    parse a class specification and produce either an ANYOF node that
8420    matches the pattern or if the pattern matches a single char only and
8421    that char is < 256 and we are case insensitive then we produce an
8422    EXACT node instead.
8423 */
8424
8425 STATIC regnode *
8426 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8427 {
8428     dVAR;
8429     register UV nextvalue;
8430     register IV prevvalue = OOB_UNICODE;
8431     register IV range = 0;
8432     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8433     register regnode *ret;
8434     STRLEN numlen;
8435     IV namedclass;
8436     char *rangebegin = NULL;
8437     bool need_class = 0;
8438     SV *listsv = NULL;
8439     UV n;
8440     AV* unicode_alternate  = NULL;
8441 #ifdef EBCDIC
8442     UV literal_endpoint = 0;
8443 #endif
8444     UV stored = 0;  /* how many chars stored in the bitmap */
8445
8446     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8447         case we need to change the emitted regop to an EXACT. */
8448     const char * orig_parse = RExC_parse;
8449     GET_RE_DEBUG_FLAGS_DECL;
8450
8451     PERL_ARGS_ASSERT_REGCLASS;
8452 #ifndef DEBUGGING
8453     PERL_UNUSED_ARG(depth);
8454 #endif
8455
8456     DEBUG_PARSE("clas");
8457
8458     /* Assume we are going to generate an ANYOF node. */
8459     ret = reganode(pRExC_state, ANYOF, 0);
8460
8461     if (!SIZE_ONLY)
8462         ANYOF_FLAGS(ret) = 0;
8463
8464     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8465         RExC_naughty++;
8466         RExC_parse++;
8467         if (!SIZE_ONLY)
8468             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8469     }
8470
8471     if (SIZE_ONLY) {
8472         RExC_size += ANYOF_SKIP;
8473 #ifdef ANYOF_ADD_LOC_SKIP
8474         if (LOC) {
8475             RExC_size += ANYOF_ADD_LOC_SKIP;
8476         }
8477 #endif
8478         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8479     }
8480     else {
8481         RExC_emit += ANYOF_SKIP;
8482         if (LOC) {
8483             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8484 #ifdef ANYOF_ADD_LOC_SKIP
8485             RExC_emit += ANYOF_ADD_LOC_SKIP;
8486 #endif
8487         }
8488         ANYOF_BITMAP_ZERO(ret);
8489         listsv = newSVpvs("# comment\n");
8490     }
8491
8492     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8493
8494     if (!SIZE_ONLY && POSIXCC(nextvalue))
8495         checkposixcc(pRExC_state);
8496
8497     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8498     if (UCHARAT(RExC_parse) == ']')
8499         goto charclassloop;
8500
8501 parseit:
8502     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8503
8504     charclassloop:
8505
8506         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8507
8508         if (!range)
8509             rangebegin = RExC_parse;
8510         if (UTF) {
8511             value = utf8n_to_uvchr((U8*)RExC_parse,
8512                                    RExC_end - RExC_parse,
8513                                    &numlen, UTF8_ALLOW_DEFAULT);
8514             RExC_parse += numlen;
8515         }
8516         else
8517             value = UCHARAT(RExC_parse++);
8518
8519         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8520         if (value == '[' && POSIXCC(nextvalue))
8521             namedclass = regpposixcc(pRExC_state, value);
8522         else if (value == '\\') {
8523             if (UTF) {
8524                 value = utf8n_to_uvchr((U8*)RExC_parse,
8525                                    RExC_end - RExC_parse,
8526                                    &numlen, UTF8_ALLOW_DEFAULT);
8527                 RExC_parse += numlen;
8528             }
8529             else
8530                 value = UCHARAT(RExC_parse++);
8531             /* Some compilers cannot handle switching on 64-bit integer
8532              * values, therefore value cannot be an UV.  Yes, this will
8533              * be a problem later if we want switch on Unicode.
8534              * A similar issue a little bit later when switching on
8535              * namedclass. --jhi */
8536             switch ((I32)value) {
8537             case 'w':   namedclass = ANYOF_ALNUM;       break;
8538             case 'W':   namedclass = ANYOF_NALNUM;      break;
8539             case 's':   namedclass = ANYOF_SPACE;       break;
8540             case 'S':   namedclass = ANYOF_NSPACE;      break;
8541             case 'd':   namedclass = ANYOF_DIGIT;       break;
8542             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8543             case 'v':   namedclass = ANYOF_VERTWS;      break;
8544             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8545             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8546             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8547             case 'N':  /* Handle \N{NAME} in class */
8548                 {
8549                     /* We only pay attention to the first char of
8550                     multichar strings being returned. I kinda wonder
8551                     if this makes sense as it does change the behaviour
8552                     from earlier versions, OTOH that behaviour was broken
8553                     as well. */
8554                     UV v; /* value is register so we cant & it /grrr */
8555                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8556                         goto parseit;
8557                     }
8558                     value= v;
8559                 }
8560                 break;
8561             case 'p':
8562             case 'P':
8563                 {
8564                 char *e;
8565                 if (RExC_parse >= RExC_end)
8566                     vFAIL2("Empty \\%c{}", (U8)value);
8567                 if (*RExC_parse == '{') {
8568                     const U8 c = (U8)value;
8569                     e = strchr(RExC_parse++, '}');
8570                     if (!e)
8571                         vFAIL2("Missing right brace on \\%c{}", c);
8572                     while (isSPACE(UCHARAT(RExC_parse)))
8573                         RExC_parse++;
8574                     if (e == RExC_parse)
8575                         vFAIL2("Empty \\%c{}", c);
8576                     n = e - RExC_parse;
8577                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8578                         n--;
8579                 }
8580                 else {
8581                     e = RExC_parse;
8582                     n = 1;
8583                 }
8584                 if (!SIZE_ONLY) {
8585                     if (UCHARAT(RExC_parse) == '^') {
8586                          RExC_parse++;
8587                          n--;
8588                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8589                          while (isSPACE(UCHARAT(RExC_parse))) {
8590                               RExC_parse++;
8591                               n--;
8592                          }
8593                     }
8594                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8595                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8596                 }
8597                 RExC_parse = e + 1;
8598
8599                 /* The \p could match something in the Latin1 range, hence
8600                  * something that isn't utf8 */
8601                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
8602                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8603                 }
8604                 break;
8605             case 'n':   value = '\n';                   break;
8606             case 'r':   value = '\r';                   break;
8607             case 't':   value = '\t';                   break;
8608             case 'f':   value = '\f';                   break;
8609             case 'b':   value = '\b';                   break;
8610             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8611             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8612             case 'o':
8613                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8614                 {
8615                     const char* error_msg;
8616                     bool valid = grok_bslash_o(RExC_parse,
8617                                                &value,
8618                                                &numlen,
8619                                                &error_msg,
8620                                                SIZE_ONLY);
8621                     RExC_parse += numlen;
8622                     if (! valid) {
8623                         vFAIL(error_msg);
8624                     }
8625                 }
8626                 if (PL_encoding && value < 0x100) {
8627                     goto recode_encoding;
8628                 }
8629                 break;
8630             case 'x':
8631                 if (*RExC_parse == '{') {
8632                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8633                         | PERL_SCAN_DISALLOW_PREFIX;
8634                     char * const e = strchr(RExC_parse++, '}');
8635                     if (!e)
8636                         vFAIL("Missing right brace on \\x{}");
8637
8638                     numlen = e - RExC_parse;
8639                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8640                     RExC_parse = e + 1;
8641                 }
8642                 else {
8643                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8644                     numlen = 2;
8645                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8646                     RExC_parse += numlen;
8647                 }
8648                 if (PL_encoding && value < 0x100)
8649                     goto recode_encoding;
8650                 break;
8651             case 'c':
8652                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8653                 break;
8654             case '0': case '1': case '2': case '3': case '4':
8655             case '5': case '6': case '7':
8656                 {
8657                     /* Take 1-3 octal digits */
8658                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8659                     numlen = 3;
8660                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8661                     RExC_parse += numlen;
8662                     if (PL_encoding && value < 0x100)
8663                         goto recode_encoding;
8664                     break;
8665                 }
8666             recode_encoding:
8667                 {
8668                     SV* enc = PL_encoding;
8669                     value = reg_recode((const char)(U8)value, &enc);
8670                     if (!enc && SIZE_ONLY)
8671                         ckWARNreg(RExC_parse,
8672                                   "Invalid escape in the specified encoding");
8673                     break;
8674                 }
8675             default:
8676                 /* Allow \_ to not give an error */
8677                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8678                     ckWARN2reg(RExC_parse,
8679                                "Unrecognized escape \\%c in character class passed through",
8680                                (int)value);
8681                 }
8682                 break;
8683             }
8684         } /* end of \blah */
8685 #ifdef EBCDIC
8686         else
8687             literal_endpoint++;
8688 #endif
8689
8690         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8691
8692             /* What matches in a locale is not known until runtime, so need to
8693              * (one time per class) allocate extra space to pass to regexec.
8694              * The space will contain a bit for each named class that is to be
8695              * matched against.  This isn't needed for \p{} and pseudo-classes,
8696              * as they are not affected by locale, and hence are dealt with
8697              * separately */
8698             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8699                 need_class = 1;
8700                 if (SIZE_ONLY) {
8701 #ifdef ANYOF_CLASS_ADD_SKIP
8702                     RExC_size += ANYOF_CLASS_ADD_SKIP;
8703 #endif
8704                 }
8705                 else {
8706 #ifdef ANYOF_CLASS_ADD_SKIP
8707                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
8708 #endif
8709                     ANYOF_CLASS_ZERO(ret);
8710                 }
8711                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8712             }
8713
8714             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
8715              * literal */
8716             if (range) {
8717                 if (!SIZE_ONLY) {
8718                     const int w =
8719                         RExC_parse >= rangebegin ?
8720                         RExC_parse - rangebegin : 0;
8721                     ckWARN4reg(RExC_parse,
8722                                "False [] range \"%*.*s\"",
8723                                w, w, rangebegin);
8724
8725                     if (prevvalue < 256) {
8726                         stored +=
8727                          S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
8728                         stored +=
8729                          S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8730                     }
8731                     else {
8732                         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8733                         Perl_sv_catpvf(aTHX_ listsv,
8734                            "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8735                     }
8736                 }
8737
8738                 range = 0; /* this was not a true range */
8739             }
8740
8741
8742
8743             if (!SIZE_ONLY) {
8744                 const char *what = NULL;
8745                 char yesno = 0;
8746
8747                 /* Possible truncation here but in some 64-bit environments
8748                  * the compiler gets heartburn about switch on 64-bit values.
8749                  * A similar issue a little earlier when switching on value.
8750                  * --jhi */
8751                 switch ((I32)namedclass) {
8752
8753                 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8754                 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8755                 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8756                 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8757                 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8758                 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8759                 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8760                 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8761                 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8762                 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8763 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8764                 /* \s, \w match all unicode if utf8. */
8765                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8766                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8767 #else
8768                 /* \s, \w match ascii and locale only */
8769                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8770                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8771 #endif
8772                 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8773                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8774                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8775                 case ANYOF_ASCII:
8776                     if (LOC)
8777                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8778                     else {
8779                         for (value = 0; value < 128; value++)
8780                             stored +=
8781                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8782                     }
8783                     yesno = '+';
8784                     what = NULL;        /* Doesn't match outside ascii, so
8785                                            don't want to add +utf8:: */
8786                     break;
8787                 case ANYOF_NASCII:
8788                     if (LOC)
8789                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8790                     else {
8791                         for (value = 128; value < 256; value++)
8792                             stored +=
8793                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8794                     }
8795                     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
8796                     yesno = '!';
8797                     what = "ASCII";
8798                     break;
8799                 case ANYOF_DIGIT:
8800                     if (LOC)
8801                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8802                     else {
8803                         /* consecutive digits assumed */
8804                         for (value = '0'; value <= '9'; value++)
8805                             stored +=
8806                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8807                     }
8808                     yesno = '+';
8809                     what = POSIX_CC_UNI_NAME("Digit");
8810                     break;
8811                 case ANYOF_NDIGIT:
8812                     if (LOC)
8813                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8814                     else {
8815                         /* consecutive digits assumed */
8816                         for (value = 0; value < '0'; value++)
8817                             stored +=
8818                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8819                         for (value = '9' + 1; value < 256; value++)
8820                             stored +=
8821                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8822                     }
8823                     yesno = '!';
8824                     what = POSIX_CC_UNI_NAME("Digit");
8825                     if (ASCII_RESTRICTED ) {
8826                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
8827                     }
8828                     break;
8829                 case ANYOF_MAX:
8830                     /* this is to handle \p and \P */
8831                     break;
8832                 default:
8833                     vFAIL("Invalid [::] class");
8834                     break;
8835                 }
8836                 if (what && ! (ASCII_RESTRICTED)) {
8837                     /* Strings such as "+utf8::isWord\n" */
8838                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8839                     ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8840                 }
8841
8842                 continue;
8843             }
8844         } /* end of namedclass \blah */
8845
8846         if (range) {
8847             if (prevvalue > (IV)value) /* b-a */ {
8848                 const int w = RExC_parse - rangebegin;
8849                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8850                 range = 0; /* not a valid range */
8851             }
8852         }
8853         else {
8854             prevvalue = value; /* save the beginning of the range */
8855             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8856                 RExC_parse[1] != ']') {
8857                 RExC_parse++;
8858
8859                 /* a bad range like \w-, [:word:]- ? */
8860                 if (namedclass > OOB_NAMEDCLASS) {
8861                     if (ckWARN(WARN_REGEXP)) {
8862                         const int w =
8863                             RExC_parse >= rangebegin ?
8864                             RExC_parse - rangebegin : 0;
8865                         vWARN4(RExC_parse,
8866                                "False [] range \"%*.*s\"",
8867                                w, w, rangebegin);
8868                     }
8869                     if (!SIZE_ONLY)
8870                         stored +=
8871                             S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8872                 } else
8873                     range = 1;  /* yeah, it's a range! */
8874                 continue;       /* but do it the next time */
8875             }
8876         }
8877
8878         /* now is the next time */
8879         if (!SIZE_ONLY) {
8880             if (prevvalue < 256) {
8881                 const IV ceilvalue = value < 256 ? value : 255;
8882                 IV i;
8883 #ifdef EBCDIC
8884                 /* In EBCDIC [\x89-\x91] should include
8885                  * the \x8e but [i-j] should not. */
8886                 if (literal_endpoint == 2 &&
8887                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8888                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8889                 {
8890                     if (isLOWER(prevvalue)) {
8891                         for (i = prevvalue; i <= ceilvalue; i++)
8892                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8893                                 stored +=
8894                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8895                             }
8896                     } else {
8897                         for (i = prevvalue; i <= ceilvalue; i++)
8898                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8899                                 stored +=
8900                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8901                             }
8902                     }
8903                 }
8904                 else
8905 #endif
8906                       for (i = prevvalue; i <= ceilvalue; i++) {
8907                         stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8908                       }
8909           }
8910           if (value > 255 || UTF) {
8911                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8912                 const UV natvalue      = NATIVE_TO_UNI(value);
8913
8914                 /* If the code point requires utf8 to represent, and we are not
8915                  * folding, it can't match unless the target is in utf8.  Only
8916                  * a few code points above 255 fold to below it, so XXX an
8917                  * optimization would be to know which ones and set the flag
8918                  * appropriately. */
8919                 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8920                                     ? ANYOF_NONBITMAP
8921                                     : ANYOF_UTF8;
8922                 if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
8923
8924                     /* The \t sets the whole range */
8925                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8926                                    prevnatvalue, natvalue);
8927
8928                     /* Currently, we don't look at every value in the range.
8929                      * Therefore we have to assume the worst case: that if
8930                      * folding, it will match more than one character.  But in
8931                      * lookbehind patterns, can only be single character
8932                      * length, so disallow those folds */
8933                     if (FOLD && ! RExC_in_lookbehind) {
8934                       OP(ret) = ANYOFV;
8935                     }
8936                 }
8937                 else if (prevnatvalue == natvalue) {
8938                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8939                     if (FOLD) {
8940                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8941                          STRLEN foldlen;
8942                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8943
8944 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8945                          if (RExC_precomp[0] == ':' &&
8946                              RExC_precomp[1] == '[' &&
8947                              (f == 0xDF || f == 0x92)) {
8948                              f = NATIVE_TO_UNI(f);
8949                         }
8950 #endif
8951                          /* If folding and foldable and a single
8952                           * character, insert also the folded version
8953                           * to the charclass. */
8954                          if (f != value) {
8955 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8956                              if ((RExC_precomp[0] == ':' &&
8957                                   RExC_precomp[1] == '[' &&
8958                                   (f == 0xA2 &&
8959                                    (value == 0xFB05 || value == 0xFB06))) ?
8960                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8961                                  foldlen == (STRLEN)UNISKIP(f) )
8962 #else
8963                               if (foldlen == (STRLEN)UNISKIP(f))
8964 #endif
8965                                   Perl_sv_catpvf(aTHX_ listsv,
8966                                                  "%04"UVxf"\n", f);
8967                               else if (! RExC_in_lookbehind) {
8968                                   /* Any multicharacter foldings
8969                                    * (disallowed in lookbehind patterns)
8970                                    * require the following transform:
8971                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8972                                    * where E folds into "pq" and F folds
8973                                    * into "rst", all other characters
8974                                    * fold to single characters.  We save
8975                                    * away these multicharacter foldings,
8976                                    * to be later saved as part of the
8977                                    * additional "s" data. */
8978                                   SV *sv;
8979
8980                                   if (!unicode_alternate)
8981                                       unicode_alternate = newAV();
8982                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8983                                                      TRUE);
8984                                   av_push(unicode_alternate, sv);
8985                                   OP(ret) = ANYOFV;
8986                               }
8987                          }
8988
8989                          /* If folding and the value is one of the Greek
8990                           * sigmas insert a few more sigmas to make the
8991                           * folding rules of the sigmas to work right.
8992                           * Note that not all the possible combinations
8993                           * are handled here: some of them are handled
8994                           * by the standard folding rules, and some of
8995                           * them (literal or EXACTF cases) are handled
8996                           * during runtime in regexec.c:S_find_byclass(). */
8997                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8998                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8999                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
9000                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9001                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
9002                          }
9003                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
9004                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9005                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
9006                     }
9007                 }
9008             }
9009 #ifdef EBCDIC
9010             literal_endpoint = 0;
9011 #endif
9012         }
9013
9014         range = 0; /* this range (if it was one) is done now */
9015     }
9016
9017
9018
9019     if (SIZE_ONLY)
9020         return ret;
9021     /****** !SIZE_ONLY AFTER HERE *********/
9022
9023     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
9024      * set the FOLD flag yet, so this this does optimize those.  It doesn't
9025      * optimize locale.  Doing so perhaps could be done as long as there is
9026      * nothing like \w in it; some thought also would have to be given to the
9027      * interaction with above 0x100 chars */
9028     if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
9029         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
9030             ANYOF_BITMAP(ret)[value] ^= 0xFF;
9031         stored = 256 - stored;
9032
9033         /* The inversion means that everything above 255 is matched; and at the
9034          * same time we clear the invert flag */
9035         ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
9036     }
9037
9038     if (FOLD) {
9039         SV *sv;
9040
9041         /* This is the one character in the bitmap that needs special handling
9042          * under non-locale folding, as it folds to two characters 'ss'.  This
9043          * happens if it is set and not inverting, or isn't set and are
9044          * inverting (disallowed in lookbehind patterns because they can't be
9045          * variable length) */
9046         if (! LOC
9047             && ! RExC_in_lookbehind
9048             && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
9049                 ^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
9050         {
9051             OP(ret) = ANYOFV;   /* Can match more than a single char */
9052
9053             /* Under Unicode semantics), it can do this when the target string
9054              * isn't in utf8 */
9055             if (UNI_SEMANTICS) {
9056                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9057             }
9058
9059             if (!unicode_alternate) {
9060                 unicode_alternate = newAV();
9061             }
9062             sv = newSVpvn_utf8("ss", 2, TRUE);
9063             av_push(unicode_alternate, sv);
9064         }
9065
9066         /* Folding in the bitmap is taken care of above, but not for locale
9067          * (for which we have to wait to see what folding is in effect at
9068          * runtime), and for things not in the bitmap.  Set run-time fold flag
9069          * for these */
9070         if ((LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
9071             ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
9072         }
9073     }
9074
9075     /* A single character class can be "optimized" into an EXACTish node.
9076      * Note that since we don't currently count how many characters there are
9077      * outside the bitmap, we are XXX missing optimization possibilities for
9078      * them.  This optimization can't happen unless this is a truly single
9079      * character class, which means that it can't be an inversion into a
9080      * many-character class, and there must be no possibility of there being
9081      * things outside the bitmap.  'stored' (only) for locales doesn't include
9082      * \w, etc, so have to make a special test that they aren't present
9083      *
9084      * Similarly A 2-character class of the very special form like [bB] can be
9085      * optimized into an EXACTFish node, but only for non-locales, and for
9086      * characters which only have the two folds; so things like 'fF' and 'Ii'
9087      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
9088      * FI'. */
9089     if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
9090         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9091                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
9092             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9093                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
9094                                  /* If the latest code point has a fold whose
9095                                   * bit is set, it must be the only other one */
9096                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
9097                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
9098     {
9099         /* Note that the information needed to decide to do this optimization
9100          * is not currently available until the 2nd pass, and that the actually
9101          * used EXACTish node takes less space than the calculated ANYOF node,
9102          * and hence the amount of space calculated in the first pass is larger
9103          * than actually used, so this optimization doesn't gain us any space.
9104          * But an EXACT node is faster than an ANYOF node, and can be combined
9105          * with any adjacent EXACT nodes later by the optimizer for further
9106          * gains.  The speed of executing an EXACTF is similar to an ANYOF
9107          * node, so the optimization advantage comes from the ability to join
9108          * it to adjacent EXACT nodes */
9109
9110         const char * cur_parse= RExC_parse;
9111         U8 op;
9112         RExC_emit = (regnode *)orig_emit;
9113         RExC_parse = (char *)orig_parse;
9114
9115         if (stored == 1) {
9116
9117             /* A locale node with one point can be folded; all the other cases
9118              * with folding will have two points, since we calculate them above
9119              */
9120             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
9121                  op = EXACTFL;
9122             }
9123             else {
9124                 op = EXACT;
9125             }
9126         }   /* else 2 chars in the bit map: the folds of each other */
9127         else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
9128
9129             /* To join adjacent nodes, they must be the exact EXACTish type.
9130              * Try to use the most likely type, by using EXACTFU if the regex
9131              * calls for them, or is required because the character is
9132              * non-ASCII */
9133             op = EXACTFU;
9134         }
9135         else {    /* Otherwise, more likely to be EXACTF type */
9136             op = EXACTF;
9137         }
9138
9139         ret = reg_node(pRExC_state, op);
9140         RExC_parse = (char *)cur_parse;
9141         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
9142             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9143             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9144             STR_LEN(ret)= 2;
9145             RExC_emit += STR_SZ(2);
9146         }
9147         else {
9148             *STRING(ret)= (char)value;
9149             STR_LEN(ret)= 1;
9150             RExC_emit += STR_SZ(1);
9151         }
9152         SvREFCNT_dec(listsv);
9153         return ret;
9154     }
9155
9156     {
9157         AV * const av = newAV();
9158         SV *rv;
9159         /* The 0th element stores the character class description
9160          * in its textual form: used later (regexec.c:Perl_regclass_swash())
9161          * to initialize the appropriate swash (which gets stored in
9162          * the 1st element), and also useful for dumping the regnode.
9163          * The 2nd element stores the multicharacter foldings,
9164          * used later (regexec.c:S_reginclass()). */
9165         av_store(av, 0, listsv);
9166         av_store(av, 1, NULL);
9167         av_store(av, 2, MUTABLE_SV(unicode_alternate));
9168         rv = newRV_noinc(MUTABLE_SV(av));
9169         n = add_data(pRExC_state, 1, "s");
9170         RExC_rxi->data->data[n] = (void*)rv;
9171         ARG_SET(ret, n);
9172     }
9173     return ret;
9174 }
9175 #undef _C_C_T_
9176
9177
9178 /* reg_skipcomment()
9179
9180    Absorbs an /x style # comments from the input stream.
9181    Returns true if there is more text remaining in the stream.
9182    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9183    terminates the pattern without including a newline.
9184
9185    Note its the callers responsibility to ensure that we are
9186    actually in /x mode
9187
9188 */
9189
9190 STATIC bool
9191 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9192 {
9193     bool ended = 0;
9194
9195     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9196
9197     while (RExC_parse < RExC_end)
9198         if (*RExC_parse++ == '\n') {
9199             ended = 1;
9200             break;
9201         }
9202     if (!ended) {
9203         /* we ran off the end of the pattern without ending
9204            the comment, so we have to add an \n when wrapping */
9205         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9206         return 0;
9207     } else
9208         return 1;
9209 }
9210
9211 /* nextchar()
9212
9213    Advances the parse position, and optionally absorbs
9214    "whitespace" from the inputstream.
9215
9216    Without /x "whitespace" means (?#...) style comments only,
9217    with /x this means (?#...) and # comments and whitespace proper.
9218
9219    Returns the RExC_parse point from BEFORE the scan occurs.
9220
9221    This is the /x friendly way of saying RExC_parse++.
9222 */
9223
9224 STATIC char*
9225 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
9226 {
9227     char* const retval = RExC_parse++;
9228
9229     PERL_ARGS_ASSERT_NEXTCHAR;
9230
9231     for (;;) {
9232         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9233                 RExC_parse[2] == '#') {
9234             while (*RExC_parse != ')') {
9235                 if (RExC_parse == RExC_end)
9236                     FAIL("Sequence (?#... not terminated");
9237                 RExC_parse++;
9238             }
9239             RExC_parse++;
9240             continue;
9241         }
9242         if (RExC_flags & RXf_PMf_EXTENDED) {
9243             if (isSPACE(*RExC_parse)) {
9244                 RExC_parse++;
9245                 continue;
9246             }
9247             else if (*RExC_parse == '#') {
9248                 if ( reg_skipcomment( pRExC_state ) )
9249                     continue;
9250             }
9251         }
9252         return retval;
9253     }
9254 }
9255
9256 /*
9257 - reg_node - emit a node
9258 */
9259 STATIC regnode *                        /* Location. */
9260 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
9261 {
9262     dVAR;
9263     register regnode *ptr;
9264     regnode * const ret = RExC_emit;
9265     GET_RE_DEBUG_FLAGS_DECL;
9266
9267     PERL_ARGS_ASSERT_REG_NODE;
9268
9269     if (SIZE_ONLY) {
9270         SIZE_ALIGN(RExC_size);
9271         RExC_size += 1;
9272         return(ret);
9273     }
9274     if (RExC_emit >= RExC_emit_bound)
9275         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9276
9277     NODE_ALIGN_FILL(ret);
9278     ptr = ret;
9279     FILL_ADVANCE_NODE(ptr, op);
9280     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 1);
9281 #ifdef RE_TRACK_PATTERN_OFFSETS
9282     if (RExC_offsets) {         /* MJD */
9283         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
9284               "reg_node", __LINE__,
9285               PL_reg_name[op],
9286               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9287                 ? "Overwriting end of array!\n" : "OK",
9288               (UV)(RExC_emit - RExC_emit_start),
9289               (UV)(RExC_parse - RExC_start),
9290               (UV)RExC_offsets[0]));
9291         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
9292     }
9293 #endif
9294     RExC_emit = ptr;
9295     return(ret);
9296 }
9297
9298 /*
9299 - reganode - emit a node with an argument
9300 */
9301 STATIC regnode *                        /* Location. */
9302 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
9303 {
9304     dVAR;
9305     register regnode *ptr;
9306     regnode * const ret = RExC_emit;
9307     GET_RE_DEBUG_FLAGS_DECL;
9308
9309     PERL_ARGS_ASSERT_REGANODE;
9310
9311     if (SIZE_ONLY) {
9312         SIZE_ALIGN(RExC_size);
9313         RExC_size += 2;
9314         /*
9315            We can't do this:
9316
9317            assert(2==regarglen[op]+1);
9318
9319            Anything larger than this has to allocate the extra amount.
9320            If we changed this to be:
9321
9322            RExC_size += (1 + regarglen[op]);
9323
9324            then it wouldn't matter. Its not clear what side effect
9325            might come from that so its not done so far.
9326            -- dmq
9327         */
9328         return(ret);
9329     }
9330     if (RExC_emit >= RExC_emit_bound)
9331         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9332
9333     NODE_ALIGN_FILL(ret);
9334     ptr = ret;
9335     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
9336     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 2);
9337 #ifdef RE_TRACK_PATTERN_OFFSETS
9338     if (RExC_offsets) {         /* MJD */
9339         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9340               "reganode",
9341               __LINE__,
9342               PL_reg_name[op],
9343               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
9344               "Overwriting end of array!\n" : "OK",
9345               (UV)(RExC_emit - RExC_emit_start),
9346               (UV)(RExC_parse - RExC_start),
9347               (UV)RExC_offsets[0]));
9348         Set_Cur_Node_Offset;
9349     }
9350 #endif
9351     RExC_emit = ptr;
9352     return(ret);
9353 }
9354
9355 /*
9356 - reguni - emit (if appropriate) a Unicode character
9357 */
9358 STATIC STRLEN
9359 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
9360 {
9361     dVAR;
9362
9363     PERL_ARGS_ASSERT_REGUNI;
9364
9365     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
9366 }
9367
9368 /*
9369 - reginsert - insert an operator in front of already-emitted operand
9370 *
9371 * Means relocating the operand.
9372 */
9373 STATIC void
9374 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
9375 {
9376     dVAR;
9377     register regnode *src;
9378     register regnode *dst;
9379     register regnode *place;
9380     const int offset = regarglen[(U8)op];
9381     const int size = NODE_STEP_REGNODE + offset;
9382     GET_RE_DEBUG_FLAGS_DECL;
9383
9384     PERL_ARGS_ASSERT_REGINSERT;
9385     PERL_UNUSED_ARG(depth);
9386 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
9387     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9388     if (SIZE_ONLY) {
9389         RExC_size += size;
9390         return;
9391     }
9392
9393     src = RExC_emit;
9394     RExC_emit += size;
9395     dst = RExC_emit;
9396     if (RExC_open_parens) {
9397         int paren;
9398         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9399         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9400             if ( RExC_open_parens[paren] >= opnd ) {
9401                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9402                 RExC_open_parens[paren] += size;
9403             } else {
9404                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9405             }
9406             if ( RExC_close_parens[paren] >= opnd ) {
9407                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9408                 RExC_close_parens[paren] += size;
9409             } else {
9410                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9411             }
9412         }
9413     }
9414
9415     while (src > opnd) {
9416         StructCopy(--src, --dst, regnode);
9417 #ifdef RE_TRACK_PATTERN_OFFSETS
9418         if (RExC_offsets) {     /* MJD 20010112 */
9419             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9420                   "reg_insert",
9421                   __LINE__,
9422                   PL_reg_name[op],
9423                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9424                     ? "Overwriting end of array!\n" : "OK",
9425                   (UV)(src - RExC_emit_start),
9426                   (UV)(dst - RExC_emit_start),
9427                   (UV)RExC_offsets[0]));
9428             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9429             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9430         }
9431 #endif
9432     }
9433
9434
9435     place = opnd;               /* Op node, where operand used to be. */
9436 #ifdef RE_TRACK_PATTERN_OFFSETS
9437     if (RExC_offsets) {         /* MJD */
9438         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9439               "reginsert",
9440               __LINE__,
9441               PL_reg_name[op],
9442               (UV)(place - RExC_emit_start) > RExC_offsets[0]
9443               ? "Overwriting end of array!\n" : "OK",
9444               (UV)(place - RExC_emit_start),
9445               (UV)(RExC_parse - RExC_start),
9446               (UV)RExC_offsets[0]));
9447         Set_Node_Offset(place, RExC_parse);
9448         Set_Node_Length(place, 1);
9449     }
9450 #endif
9451     src = NEXTOPER(place);
9452     FILL_ADVANCE_NODE(place, op);
9453     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (place) - 1);
9454     Zero(src, offset, regnode);
9455 }
9456
9457 /*
9458 - regtail - set the next-pointer at the end of a node chain of p to val.
9459 - SEE ALSO: regtail_study
9460 */
9461 /* TODO: All three parms should be const */
9462 STATIC void
9463 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9464 {
9465     dVAR;
9466     register regnode *scan;
9467     GET_RE_DEBUG_FLAGS_DECL;
9468
9469     PERL_ARGS_ASSERT_REGTAIL;
9470 #ifndef DEBUGGING
9471     PERL_UNUSED_ARG(depth);
9472 #endif
9473
9474     if (SIZE_ONLY)
9475         return;
9476
9477     /* Find last node. */
9478     scan = p;
9479     for (;;) {
9480         regnode * const temp = regnext(scan);
9481         DEBUG_PARSE_r({
9482             SV * const mysv=sv_newmortal();
9483             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9484             regprop(RExC_rx, mysv, scan);
9485             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9486                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9487                     (temp == NULL ? "->" : ""),
9488                     (temp == NULL ? PL_reg_name[OP(val)] : "")
9489             );
9490         });
9491         if (temp == NULL)
9492             break;
9493         scan = temp;
9494     }
9495
9496     if (reg_off_by_arg[OP(scan)]) {
9497         ARG_SET(scan, val - scan);
9498     }
9499     else {
9500         NEXT_OFF(scan) = val - scan;
9501     }
9502 }
9503
9504 #ifdef DEBUGGING
9505 /*
9506 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9507 - Look for optimizable sequences at the same time.
9508 - currently only looks for EXACT chains.
9509
9510 This is experimental code. The idea is to use this routine to perform
9511 in place optimizations on branches and groups as they are constructed,
9512 with the long term intention of removing optimization from study_chunk so
9513 that it is purely analytical.
9514
9515 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9516 to control which is which.
9517
9518 */
9519 /* TODO: All four parms should be const */
9520
9521 STATIC U8
9522 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9523 {
9524     dVAR;
9525     register regnode *scan;
9526     U8 exact = PSEUDO;
9527 #ifdef EXPERIMENTAL_INPLACESCAN
9528     I32 min = 0;
9529 #endif
9530     GET_RE_DEBUG_FLAGS_DECL;
9531
9532     PERL_ARGS_ASSERT_REGTAIL_STUDY;
9533
9534
9535     if (SIZE_ONLY)
9536         return exact;
9537
9538     /* Find last node. */
9539
9540     scan = p;
9541     for (;;) {
9542         regnode * const temp = regnext(scan);
9543 #ifdef EXPERIMENTAL_INPLACESCAN
9544         if (PL_regkind[OP(scan)] == EXACT)
9545             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9546                 return EXACT;
9547 #endif
9548         if ( exact ) {
9549             switch (OP(scan)) {
9550                 case EXACT:
9551                 case EXACTF:
9552                 case EXACTFU:
9553                 case EXACTFL:
9554                         if( exact == PSEUDO )
9555                             exact= OP(scan);
9556                         else if ( exact != OP(scan) )
9557                             exact= 0;
9558                 case NOTHING:
9559                     break;
9560                 default:
9561                     exact= 0;
9562             }
9563         }
9564         DEBUG_PARSE_r({
9565             SV * const mysv=sv_newmortal();
9566             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9567             regprop(RExC_rx, mysv, scan);
9568             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9569                 SvPV_nolen_const(mysv),
9570                 REG_NODE_NUM(scan),
9571                 PL_reg_name[exact]);
9572         });
9573         if (temp == NULL)
9574             break;
9575         scan = temp;
9576     }
9577     DEBUG_PARSE_r({
9578         SV * const mysv_val=sv_newmortal();
9579         DEBUG_PARSE_MSG("");
9580         regprop(RExC_rx, mysv_val, val);
9581         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9582                       SvPV_nolen_const(mysv_val),
9583                       (IV)REG_NODE_NUM(val),
9584                       (IV)(val - scan)
9585         );
9586     });
9587     if (reg_off_by_arg[OP(scan)]) {
9588         ARG_SET(scan, val - scan);
9589     }
9590     else {
9591         NEXT_OFF(scan) = val - scan;
9592     }
9593
9594     return exact;
9595 }
9596 #endif
9597
9598 /*
9599  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9600  */
9601 #ifdef DEBUGGING
9602 static void
9603 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9604 {
9605     int bit;
9606     int set=0;
9607     regex_charset cs;
9608
9609     for (bit=0; bit<32; bit++) {
9610         if (flags & (1<<bit)) {
9611             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
9612                 continue;
9613             }
9614             if (!set++ && lead)
9615                 PerlIO_printf(Perl_debug_log, "%s",lead);
9616             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9617         }
9618     }
9619     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
9620             if (!set++ && lead) {
9621                 PerlIO_printf(Perl_debug_log, "%s",lead);
9622             }
9623             switch (cs) {
9624                 case REGEX_UNICODE_CHARSET:
9625                     PerlIO_printf(Perl_debug_log, "UNICODE");
9626                     break;
9627                 case REGEX_LOCALE_CHARSET:
9628                     PerlIO_printf(Perl_debug_log, "LOCALE");
9629                     break;
9630                 case REGEX_ASCII_RESTRICTED_CHARSET:
9631                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
9632                     break;
9633                 default:
9634                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
9635                     break;
9636             }
9637     }
9638     if (lead)  {
9639         if (set)
9640             PerlIO_printf(Perl_debug_log, "\n");
9641         else
9642             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9643     }
9644 }
9645 #endif
9646
9647 void
9648 Perl_regdump(pTHX_ const regexp *r)
9649 {
9650 #ifdef DEBUGGING
9651     dVAR;
9652     SV * const sv = sv_newmortal();
9653     SV *dsv= sv_newmortal();
9654     RXi_GET_DECL(r,ri);
9655     GET_RE_DEBUG_FLAGS_DECL;
9656
9657     PERL_ARGS_ASSERT_REGDUMP;
9658
9659     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9660
9661     /* Header fields of interest. */
9662     if (r->anchored_substr) {
9663         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9664             RE_SV_DUMPLEN(r->anchored_substr), 30);
9665         PerlIO_printf(Perl_debug_log,
9666                       "anchored %s%s at %"IVdf" ",
9667                       s, RE_SV_TAIL(r->anchored_substr),
9668                       (IV)r->anchored_offset);
9669     } else if (r->anchored_utf8) {
9670         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9671             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9672         PerlIO_printf(Perl_debug_log,
9673                       "anchored utf8 %s%s at %"IVdf" ",
9674                       s, RE_SV_TAIL(r->anchored_utf8),
9675                       (IV)r->anchored_offset);
9676     }
9677     if (r->float_substr) {
9678         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9679             RE_SV_DUMPLEN(r->float_substr), 30);
9680         PerlIO_printf(Perl_debug_log,
9681                       "floating %s%s at %"IVdf"..%"UVuf" ",
9682                       s, RE_SV_TAIL(r->float_substr),
9683                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9684     } else if (r->float_utf8) {
9685         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9686             RE_SV_DUMPLEN(r->float_utf8), 30);
9687         PerlIO_printf(Perl_debug_log,
9688                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9689                       s, RE_SV_TAIL(r->float_utf8),
9690                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9691     }
9692     if (r->check_substr || r->check_utf8)
9693         PerlIO_printf(Perl_debug_log,
9694                       (const char *)
9695                       (r->check_substr == r->float_substr
9696                        && r->check_utf8 == r->float_utf8
9697                        ? "(checking floating" : "(checking anchored"));
9698     if (r->extflags & RXf_NOSCAN)
9699         PerlIO_printf(Perl_debug_log, " noscan");
9700     if (r->extflags & RXf_CHECK_ALL)
9701         PerlIO_printf(Perl_debug_log, " isall");
9702     if (r->check_substr || r->check_utf8)
9703         PerlIO_printf(Perl_debug_log, ") ");
9704
9705     if (ri->regstclass) {
9706         regprop(r, sv, ri->regstclass);
9707         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9708     }
9709     if (r->extflags & RXf_ANCH) {
9710         PerlIO_printf(Perl_debug_log, "anchored");
9711         if (r->extflags & RXf_ANCH_BOL)
9712             PerlIO_printf(Perl_debug_log, "(BOL)");
9713         if (r->extflags & RXf_ANCH_MBOL)
9714             PerlIO_printf(Perl_debug_log, "(MBOL)");
9715         if (r->extflags & RXf_ANCH_SBOL)
9716             PerlIO_printf(Perl_debug_log, "(SBOL)");
9717         if (r->extflags & RXf_ANCH_GPOS)
9718             PerlIO_printf(Perl_debug_log, "(GPOS)");
9719         PerlIO_putc(Perl_debug_log, ' ');
9720     }
9721     if (r->extflags & RXf_GPOS_SEEN)
9722         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9723     if (r->intflags & PREGf_SKIP)
9724         PerlIO_printf(Perl_debug_log, "plus ");
9725     if (r->intflags & PREGf_IMPLICIT)
9726         PerlIO_printf(Perl_debug_log, "implicit ");
9727     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9728     if (r->extflags & RXf_EVAL_SEEN)
9729         PerlIO_printf(Perl_debug_log, "with eval ");
9730     PerlIO_printf(Perl_debug_log, "\n");
9731     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9732 #else
9733     PERL_ARGS_ASSERT_REGDUMP;
9734     PERL_UNUSED_CONTEXT;
9735     PERL_UNUSED_ARG(r);
9736 #endif  /* DEBUGGING */
9737 }
9738
9739 /*
9740 - regprop - printable representation of opcode
9741 */
9742 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9743 STMT_START { \
9744         if (do_sep) {                           \
9745             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9746             if (flags & ANYOF_INVERT)           \
9747                 /*make sure the invert info is in each */ \
9748                 sv_catpvs(sv, "^");             \
9749             do_sep = 0;                         \
9750         }                                       \
9751 } STMT_END
9752
9753 void
9754 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9755 {
9756 #ifdef DEBUGGING
9757     dVAR;
9758     register int k;
9759     RXi_GET_DECL(prog,progi);
9760     GET_RE_DEBUG_FLAGS_DECL;
9761
9762     PERL_ARGS_ASSERT_REGPROP;
9763
9764     sv_setpvs(sv, "");
9765
9766     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9767         /* It would be nice to FAIL() here, but this may be called from
9768            regexec.c, and it would be hard to supply pRExC_state. */
9769         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9770     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9771
9772     k = PL_regkind[OP(o)];
9773
9774     if (k == EXACT) {
9775         sv_catpvs(sv, " ");
9776         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9777          * is a crude hack but it may be the best for now since
9778          * we have no flag "this EXACTish node was UTF-8"
9779          * --jhi */
9780         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9781                   PERL_PV_ESCAPE_UNI_DETECT |
9782                   PERL_PV_ESCAPE_NONASCII   |
9783                   PERL_PV_PRETTY_ELLIPSES   |
9784                   PERL_PV_PRETTY_LTGT       |
9785                   PERL_PV_PRETTY_NOCLEAR
9786                   );
9787     } else if (k == TRIE) {
9788         /* print the details of the trie in dumpuntil instead, as
9789          * progi->data isn't available here */
9790         const char op = OP(o);
9791         const U32 n = ARG(o);
9792         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9793                (reg_ac_data *)progi->data->data[n] :
9794                NULL;
9795         const reg_trie_data * const trie
9796             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9797
9798         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9799         DEBUG_TRIE_COMPILE_r(
9800             Perl_sv_catpvf(aTHX_ sv,
9801                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9802                 (UV)trie->startstate,
9803                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9804                 (UV)trie->wordcount,
9805                 (UV)trie->minlen,
9806                 (UV)trie->maxlen,
9807                 (UV)TRIE_CHARCOUNT(trie),
9808                 (UV)trie->uniquecharcount
9809             )
9810         );
9811         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9812             int i;
9813             int rangestart = -1;
9814             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9815             sv_catpvs(sv, "[");
9816             for (i = 0; i <= 256; i++) {
9817                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9818                     if (rangestart == -1)
9819                         rangestart = i;
9820                 } else if (rangestart != -1) {
9821                     if (i <= rangestart + 3)
9822                         for (; rangestart < i; rangestart++)
9823                             put_byte(sv, rangestart);
9824                     else {
9825                         put_byte(sv, rangestart);
9826                         sv_catpvs(sv, "-");
9827                         put_byte(sv, i - 1);
9828                     }
9829                     rangestart = -1;
9830                 }
9831             }
9832             sv_catpvs(sv, "]");
9833         }
9834
9835     } else if (k == CURLY) {
9836         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9837             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9838         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9839     }
9840     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9841         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9842     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9843         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9844         if ( RXp_PAREN_NAMES(prog) ) {
9845             if ( k != REF || (OP(o) < NREF)) {
9846                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9847                 SV **name= av_fetch(list, ARG(o), 0 );
9848                 if (name)
9849                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9850             }
9851             else {
9852                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9853                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9854                 I32 *nums=(I32*)SvPVX(sv_dat);
9855                 SV **name= av_fetch(list, nums[0], 0 );
9856                 I32 n;
9857                 if (name) {
9858                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9859                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9860                                     (n ? "," : ""), (IV)nums[n]);
9861                     }
9862                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9863                 }
9864             }
9865         }
9866     } else if (k == GOSUB)
9867         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9868     else if (k == VERB) {
9869         if (!o->flags)
9870             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9871                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9872     } else if (k == LOGICAL)
9873         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9874     else if (k == FOLDCHAR)
9875         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9876     else if (k == ANYOF) {
9877         int i, rangestart = -1;
9878         const U8 flags = ANYOF_FLAGS(o);
9879         int do_sep = 0;
9880
9881         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9882         static const char * const anyofs[] = {
9883             "\\w",
9884             "\\W",
9885             "\\s",
9886             "\\S",
9887             "\\d",
9888             "\\D",
9889             "[:alnum:]",
9890             "[:^alnum:]",
9891             "[:alpha:]",
9892             "[:^alpha:]",
9893             "[:ascii:]",
9894             "[:^ascii:]",
9895             "[:cntrl:]",
9896             "[:^cntrl:]",
9897             "[:graph:]",
9898             "[:^graph:]",
9899             "[:lower:]",
9900             "[:^lower:]",
9901             "[:print:]",
9902             "[:^print:]",
9903             "[:punct:]",
9904             "[:^punct:]",
9905             "[:upper:]",
9906             "[:^upper:]",
9907             "[:xdigit:]",
9908             "[:^xdigit:]",
9909             "[:space:]",
9910             "[:^space:]",
9911             "[:blank:]",
9912             "[:^blank:]"
9913         };
9914
9915         if (flags & ANYOF_LOCALE)
9916             sv_catpvs(sv, "{loc}");
9917         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
9918             sv_catpvs(sv, "{i}");
9919         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9920         if (flags & ANYOF_INVERT)
9921             sv_catpvs(sv, "^");
9922
9923         /* output what the standard cp 0-255 bitmap matches */
9924         for (i = 0; i <= 256; i++) {
9925             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9926                 if (rangestart == -1)
9927                     rangestart = i;
9928             } else if (rangestart != -1) {
9929                 if (i <= rangestart + 3)
9930                     for (; rangestart < i; rangestart++)
9931                         put_byte(sv, rangestart);
9932                 else {
9933                     put_byte(sv, rangestart);
9934                     sv_catpvs(sv, "-");
9935                     put_byte(sv, i - 1);
9936                 }
9937                 do_sep = 1;
9938                 rangestart = -1;
9939             }
9940         }
9941
9942         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9943         /* output any special charclass tests (used entirely under use locale) */
9944         if (ANYOF_CLASS_TEST_ANY_SET(o))
9945             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9946                 if (ANYOF_CLASS_TEST(o,i)) {
9947                     sv_catpv(sv, anyofs[i]);
9948                     do_sep = 1;
9949                 }
9950
9951         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9952
9953         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
9954             sv_catpvs(sv, "{non-utf8-latin1-all}");
9955         }
9956
9957         /* output information about the unicode matching */
9958         if (flags & ANYOF_UNICODE_ALL)
9959             sv_catpvs(sv, "{unicode_all}");
9960         else if (flags & ANYOF_UTF8)
9961             sv_catpvs(sv, "{unicode}");
9962         if (flags & ANYOF_NONBITMAP_NON_UTF8)
9963             sv_catpvs(sv, "{outside bitmap}");
9964
9965         {
9966             SV *lv;
9967             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9968
9969             if (lv) {
9970                 if (sw) {
9971                     U8 s[UTF8_MAXBYTES_CASE+1];
9972
9973                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9974                         uvchr_to_utf8(s, i);
9975
9976                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9977                             if (rangestart == -1)
9978                                 rangestart = i;
9979                         } else if (rangestart != -1) {
9980                             if (i <= rangestart + 3)
9981                                 for (; rangestart < i; rangestart++) {
9982                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9983                                     U8 *p;
9984                                     for(p = s; p < e; p++)
9985                                         put_byte(sv, *p);
9986                                 }
9987                             else {
9988                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9989                                 U8 *p;
9990                                 for (p = s; p < e; p++)
9991                                     put_byte(sv, *p);
9992                                 sv_catpvs(sv, "-");
9993                                 e = uvchr_to_utf8(s, i-1);
9994                                 for (p = s; p < e; p++)
9995                                     put_byte(sv, *p);
9996                                 }
9997                                 rangestart = -1;
9998                             }
9999                         }
10000
10001                     sv_catpvs(sv, "..."); /* et cetera */
10002                 }
10003
10004                 {
10005                     char *s = savesvpv(lv);
10006                     char * const origs = s;
10007
10008                     while (*s && *s != '\n')
10009                         s++;
10010
10011                     if (*s == '\n') {
10012                         const char * const t = ++s;
10013
10014                         while (*s) {
10015                             if (*s == '\n')
10016                                 *s = ' ';
10017                             s++;
10018                         }
10019                         if (s[-1] == ' ')
10020                             s[-1] = 0;
10021
10022                         sv_catpv(sv, t);
10023                     }
10024
10025                     Safefree(origs);
10026                 }
10027             }
10028         }
10029
10030         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
10031     }
10032     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
10033         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
10034 #else
10035     PERL_UNUSED_CONTEXT;
10036     PERL_UNUSED_ARG(sv);
10037     PERL_UNUSED_ARG(o);
10038     PERL_UNUSED_ARG(prog);
10039 #endif  /* DEBUGGING */
10040 }
10041
10042 SV *
10043 Perl_re_intuit_string(pTHX_ REGEXP * const r)
10044 {                               /* Assume that RE_INTUIT is set */
10045     dVAR;
10046     struct regexp *const prog = (struct regexp *)SvANY(r);
10047     GET_RE_DEBUG_FLAGS_DECL;
10048
10049     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
10050     PERL_UNUSED_CONTEXT;
10051
10052     DEBUG_COMPILE_r(
10053         {
10054             const char * const s = SvPV_nolen_const(prog->check_substr
10055                       ? prog->check_substr : prog->check_utf8);
10056
10057             if (!PL_colorset) reginitcolors();
10058             PerlIO_printf(Perl_debug_log,
10059                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
10060                       PL_colors[4],
10061                       prog->check_substr ? "" : "utf8 ",
10062                       PL_colors[5],PL_colors[0],
10063                       s,
10064                       PL_colors[1],
10065                       (strlen(s) > 60 ? "..." : ""));
10066         } );
10067
10068     return prog->check_substr ? prog->check_substr : prog->check_utf8;
10069 }
10070
10071 /*
10072    pregfree()
10073
10074    handles refcounting and freeing the perl core regexp structure. When
10075    it is necessary to actually free the structure the first thing it
10076    does is call the 'free' method of the regexp_engine associated to
10077    the regexp, allowing the handling of the void *pprivate; member
10078    first. (This routine is not overridable by extensions, which is why
10079    the extensions free is called first.)
10080
10081    See regdupe and regdupe_internal if you change anything here.
10082 */
10083 #ifndef PERL_IN_XSUB_RE
10084 void
10085 Perl_pregfree(pTHX_ REGEXP *r)
10086 {
10087     SvREFCNT_dec(r);
10088 }
10089
10090 void
10091 Perl_pregfree2(pTHX_ REGEXP *rx)
10092 {
10093     dVAR;
10094     struct regexp *const r = (struct regexp *)SvANY(rx);
10095     GET_RE_DEBUG_FLAGS_DECL;
10096
10097     PERL_ARGS_ASSERT_PREGFREE2;
10098
10099     if (r->mother_re) {
10100         ReREFCNT_dec(r->mother_re);
10101     } else {
10102         CALLREGFREE_PVT(rx); /* free the private data */
10103         SvREFCNT_dec(RXp_PAREN_NAMES(r));
10104     }
10105     if (r->substrs) {
10106         SvREFCNT_dec(r->anchored_substr);
10107         SvREFCNT_dec(r->anchored_utf8);
10108         SvREFCNT_dec(r->float_substr);
10109         SvREFCNT_dec(r->float_utf8);
10110         Safefree(r->substrs);
10111     }
10112     RX_MATCH_COPY_FREE(rx);
10113 #ifdef PERL_OLD_COPY_ON_WRITE
10114     SvREFCNT_dec(r->saved_copy);
10115 #endif
10116     Safefree(r->offs);
10117 }
10118
10119 /*  reg_temp_copy()
10120
10121     This is a hacky workaround to the structural issue of match results
10122     being stored in the regexp structure which is in turn stored in
10123     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
10124     could be PL_curpm in multiple contexts, and could require multiple
10125     result sets being associated with the pattern simultaneously, such
10126     as when doing a recursive match with (??{$qr})
10127
10128     The solution is to make a lightweight copy of the regexp structure
10129     when a qr// is returned from the code executed by (??{$qr}) this
10130     lightweight copy doesn't actually own any of its data except for
10131     the starp/end and the actual regexp structure itself.
10132
10133 */
10134
10135
10136 REGEXP *
10137 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
10138 {
10139     struct regexp *ret;
10140     struct regexp *const r = (struct regexp *)SvANY(rx);
10141     register const I32 npar = r->nparens+1;
10142
10143     PERL_ARGS_ASSERT_REG_TEMP_COPY;
10144
10145     if (!ret_x)
10146         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
10147     ret = (struct regexp *)SvANY(ret_x);
10148
10149     (void)ReREFCNT_inc(rx);
10150     /* We can take advantage of the existing "copied buffer" mechanism in SVs
10151        by pointing directly at the buffer, but flagging that the allocated
10152        space in the copy is zero. As we've just done a struct copy, it's now
10153        a case of zero-ing that, rather than copying the current length.  */
10154     SvPV_set(ret_x, RX_WRAPPED(rx));
10155     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
10156     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
10157            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
10158     SvLEN_set(ret_x, 0);
10159     SvSTASH_set(ret_x, NULL);
10160     SvMAGIC_set(ret_x, NULL);
10161     Newx(ret->offs, npar, regexp_paren_pair);
10162     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10163     if (r->substrs) {
10164         Newx(ret->substrs, 1, struct reg_substr_data);
10165         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10166
10167         SvREFCNT_inc_void(ret->anchored_substr);
10168         SvREFCNT_inc_void(ret->anchored_utf8);
10169         SvREFCNT_inc_void(ret->float_substr);
10170         SvREFCNT_inc_void(ret->float_utf8);
10171
10172         /* check_substr and check_utf8, if non-NULL, point to either their
10173            anchored or float namesakes, and don't hold a second reference.  */
10174     }
10175     RX_MATCH_COPIED_off(ret_x);
10176 #ifdef PERL_OLD_COPY_ON_WRITE
10177     ret->saved_copy = NULL;
10178 #endif
10179     ret->mother_re = rx;
10180
10181     return ret_x;
10182 }
10183 #endif
10184
10185 /* regfree_internal()
10186
10187    Free the private data in a regexp. This is overloadable by
10188    extensions. Perl takes care of the regexp structure in pregfree(),
10189    this covers the *pprivate pointer which technically perl doesn't
10190    know about, however of course we have to handle the
10191    regexp_internal structure when no extension is in use.
10192
10193    Note this is called before freeing anything in the regexp
10194    structure.
10195  */
10196
10197 void
10198 Perl_regfree_internal(pTHX_ REGEXP * const rx)
10199 {
10200     dVAR;
10201     struct regexp *const r = (struct regexp *)SvANY(rx);
10202     RXi_GET_DECL(r,ri);
10203     GET_RE_DEBUG_FLAGS_DECL;
10204
10205     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10206
10207     DEBUG_COMPILE_r({
10208         if (!PL_colorset)
10209             reginitcolors();
10210         {
10211             SV *dsv= sv_newmortal();
10212             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
10213                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
10214             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10215                 PL_colors[4],PL_colors[5],s);
10216         }
10217     });
10218 #ifdef RE_TRACK_PATTERN_OFFSETS
10219     if (ri->u.offsets)
10220         Safefree(ri->u.offsets);             /* 20010421 MJD */
10221 #endif
10222     if (ri->data) {
10223         int n = ri->data->count;
10224         PAD* new_comppad = NULL;
10225         PAD* old_comppad;
10226         PADOFFSET refcnt;
10227
10228         while (--n >= 0) {
10229           /* If you add a ->what type here, update the comment in regcomp.h */
10230             switch (ri->data->what[n]) {
10231             case 'a':
10232             case 's':
10233             case 'S':
10234             case 'u':
10235                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
10236                 break;
10237             case 'f':
10238                 Safefree(ri->data->data[n]);
10239                 break;
10240             case 'p':
10241                 new_comppad = MUTABLE_AV(ri->data->data[n]);
10242                 break;
10243             case 'o':
10244                 if (new_comppad == NULL)
10245                     Perl_croak(aTHX_ "panic: pregfree comppad");
10246                 PAD_SAVE_LOCAL(old_comppad,
10247                     /* Watch out for global destruction's random ordering. */
10248                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
10249                 );
10250                 OP_REFCNT_LOCK;
10251                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
10252                 OP_REFCNT_UNLOCK;
10253                 if (!refcnt)
10254                     op_free((OP_4tree*)ri->data->data[n]);
10255
10256                 PAD_RESTORE_LOCAL(old_comppad);
10257                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
10258                 new_comppad = NULL;
10259                 break;
10260             case 'n':
10261                 break;
10262             case 'T':
10263                 { /* Aho Corasick add-on structure for a trie node.
10264                      Used in stclass optimization only */
10265                     U32 refcount;
10266                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
10267                     OP_REFCNT_LOCK;
10268                     refcount = --aho->refcount;
10269                     OP_REFCNT_UNLOCK;
10270                     if ( !refcount ) {
10271                         PerlMemShared_free(aho->states);
10272                         PerlMemShared_free(aho->fail);
10273                          /* do this last!!!! */
10274                         PerlMemShared_free(ri->data->data[n]);
10275                         PerlMemShared_free(ri->regstclass);
10276                     }
10277                 }
10278                 break;
10279             case 't':
10280                 {
10281                     /* trie structure. */
10282                     U32 refcount;
10283                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
10284                     OP_REFCNT_LOCK;
10285                     refcount = --trie->refcount;
10286                     OP_REFCNT_UNLOCK;
10287                     if ( !refcount ) {
10288                         PerlMemShared_free(trie->charmap);
10289                         PerlMemShared_free(trie->states);
10290                         PerlMemShared_free(trie->trans);
10291                         if (trie->bitmap)
10292                             PerlMemShared_free(trie->bitmap);
10293                         if (trie->jump)
10294                             PerlMemShared_free(trie->jump);
10295                         PerlMemShared_free(trie->wordinfo);
10296                         /* do this last!!!! */
10297                         PerlMemShared_free(ri->data->data[n]);
10298                     }
10299                 }
10300                 break;
10301             default:
10302                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
10303             }
10304         }
10305         Safefree(ri->data->what);
10306         Safefree(ri->data);
10307     }
10308
10309     Safefree(ri);
10310 }
10311
10312 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10313 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
10314 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
10315
10316 /*
10317    re_dup - duplicate a regexp.
10318
10319    This routine is expected to clone a given regexp structure. It is only
10320    compiled under USE_ITHREADS.
10321
10322    After all of the core data stored in struct regexp is duplicated
10323    the regexp_engine.dupe method is used to copy any private data
10324    stored in the *pprivate pointer. This allows extensions to handle
10325    any duplication it needs to do.
10326
10327    See pregfree() and regfree_internal() if you change anything here.
10328 */
10329 #if defined(USE_ITHREADS)
10330 #ifndef PERL_IN_XSUB_RE
10331 void
10332 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
10333 {
10334     dVAR;
10335     I32 npar;
10336     const struct regexp *r = (const struct regexp *)SvANY(sstr);
10337     struct regexp *ret = (struct regexp *)SvANY(dstr);
10338
10339     PERL_ARGS_ASSERT_RE_DUP_GUTS;
10340
10341     npar = r->nparens+1;
10342     Newx(ret->offs, npar, regexp_paren_pair);
10343     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10344     if(ret->swap) {
10345         /* no need to copy these */
10346         Newx(ret->swap, npar, regexp_paren_pair);
10347     }
10348
10349     if (ret->substrs) {
10350         /* Do it this way to avoid reading from *r after the StructCopy().
10351            That way, if any of the sv_dup_inc()s dislodge *r from the L1
10352            cache, it doesn't matter.  */
10353         const bool anchored = r->check_substr
10354             ? r->check_substr == r->anchored_substr
10355             : r->check_utf8 == r->anchored_utf8;
10356         Newx(ret->substrs, 1, struct reg_substr_data);
10357         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10358
10359         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10360         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10361         ret->float_substr = sv_dup_inc(ret->float_substr, param);
10362         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
10363
10364         /* check_substr and check_utf8, if non-NULL, point to either their
10365            anchored or float namesakes, and don't hold a second reference.  */
10366
10367         if (ret->check_substr) {
10368             if (anchored) {
10369                 assert(r->check_utf8 == r->anchored_utf8);
10370                 ret->check_substr = ret->anchored_substr;
10371                 ret->check_utf8 = ret->anchored_utf8;
10372             } else {
10373                 assert(r->check_substr == r->float_substr);
10374                 assert(r->check_utf8 == r->float_utf8);
10375                 ret->check_substr = ret->float_substr;
10376                 ret->check_utf8 = ret->float_utf8;
10377             }
10378         } else if (ret->check_utf8) {
10379             if (anchored) {
10380                 ret->check_utf8 = ret->anchored_utf8;
10381             } else {
10382                 ret->check_utf8 = ret->float_utf8;
10383             }
10384         }
10385     }
10386
10387     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
10388
10389     if (ret->pprivate)
10390         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
10391
10392     if (RX_MATCH_COPIED(dstr))
10393         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
10394     else
10395         ret->subbeg = NULL;
10396 #ifdef PERL_OLD_COPY_ON_WRITE
10397     ret->saved_copy = NULL;
10398 #endif
10399
10400     if (ret->mother_re) {
10401         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10402             /* Our storage points directly to our mother regexp, but that's
10403                1: a buffer in a different thread
10404                2: something we no longer hold a reference on
10405                so we need to copy it locally.  */
10406             /* Note we need to sue SvCUR() on our mother_re, because it, in
10407                turn, may well be pointing to its own mother_re.  */
10408             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10409                                    SvCUR(ret->mother_re)+1));
10410             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10411         }
10412         ret->mother_re      = NULL;
10413     }
10414     ret->gofs = 0;
10415 }
10416 #endif /* PERL_IN_XSUB_RE */
10417
10418 /*
10419    regdupe_internal()
10420
10421    This is the internal complement to regdupe() which is used to copy
10422    the structure pointed to by the *pprivate pointer in the regexp.
10423    This is the core version of the extension overridable cloning hook.
10424    The regexp structure being duplicated will be copied by perl prior
10425    to this and will be provided as the regexp *r argument, however
10426    with the /old/ structures pprivate pointer value. Thus this routine
10427    may override any copying normally done by perl.
10428
10429    It returns a pointer to the new regexp_internal structure.
10430 */
10431
10432 void *
10433 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10434 {
10435     dVAR;
10436     struct regexp *const r = (struct regexp *)SvANY(rx);
10437     regexp_internal *reti;
10438     int len, npar;
10439     RXi_GET_DECL(r,ri);
10440
10441     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10442
10443     npar = r->nparens+1;
10444     len = ProgLen(ri);
10445
10446     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10447     Copy(ri->program, reti->program, len+1, regnode);
10448
10449
10450     reti->regstclass = NULL;
10451
10452     if (ri->data) {
10453         struct reg_data *d;
10454         const int count = ri->data->count;
10455         int i;
10456
10457         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10458                 char, struct reg_data);
10459         Newx(d->what, count, U8);
10460
10461         d->count = count;
10462         for (i = 0; i < count; i++) {
10463             d->what[i] = ri->data->what[i];
10464             switch (d->what[i]) {
10465                 /* legal options are one of: sSfpontTua
10466                    see also regcomp.h and pregfree() */
10467             case 'a': /* actually an AV, but the dup function is identical.  */
10468             case 's':
10469             case 'S':
10470             case 'p': /* actually an AV, but the dup function is identical.  */
10471             case 'u': /* actually an HV, but the dup function is identical.  */
10472                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10473                 break;
10474             case 'f':
10475                 /* This is cheating. */
10476                 Newx(d->data[i], 1, struct regnode_charclass_class);
10477                 StructCopy(ri->data->data[i], d->data[i],
10478                             struct regnode_charclass_class);
10479                 reti->regstclass = (regnode*)d->data[i];
10480                 break;
10481             case 'o':
10482                 /* Compiled op trees are readonly and in shared memory,
10483                    and can thus be shared without duplication. */
10484                 OP_REFCNT_LOCK;
10485                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10486                 OP_REFCNT_UNLOCK;
10487                 break;
10488             case 'T':
10489                 /* Trie stclasses are readonly and can thus be shared
10490                  * without duplication. We free the stclass in pregfree
10491                  * when the corresponding reg_ac_data struct is freed.
10492                  */
10493                 reti->regstclass= ri->regstclass;
10494                 /* Fall through */
10495             case 't':
10496                 OP_REFCNT_LOCK;
10497                 ((reg_trie_data*)ri->data->data[i])->refcount++;
10498                 OP_REFCNT_UNLOCK;
10499                 /* Fall through */
10500             case 'n':
10501                 d->data[i] = ri->data->data[i];
10502                 break;
10503             default:
10504                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10505             }
10506         }
10507
10508         reti->data = d;
10509     }
10510     else
10511         reti->data = NULL;
10512
10513     reti->name_list_idx = ri->name_list_idx;
10514
10515 #ifdef RE_TRACK_PATTERN_OFFSETS
10516     if (ri->u.offsets) {
10517         Newx(reti->u.offsets, 2*len+1, U32);
10518         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10519     }
10520 #else
10521     SetProgLen(reti,len);
10522 #endif
10523
10524     return (void*)reti;
10525 }
10526
10527 #endif    /* USE_ITHREADS */
10528
10529 #ifndef PERL_IN_XSUB_RE
10530
10531 /*
10532  - regnext - dig the "next" pointer out of a node
10533  */
10534 regnode *
10535 Perl_regnext(pTHX_ register regnode *p)
10536 {
10537     dVAR;
10538     register I32 offset;
10539
10540     if (!p)
10541         return(NULL);
10542
10543     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
10544         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10545     }
10546
10547     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10548     if (offset == 0)
10549         return(NULL);
10550
10551     return(p+offset);
10552 }
10553 #endif
10554
10555 STATIC void
10556 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10557 {
10558     va_list args;
10559     STRLEN l1 = strlen(pat1);
10560     STRLEN l2 = strlen(pat2);
10561     char buf[512];
10562     SV *msv;
10563     const char *message;
10564
10565     PERL_ARGS_ASSERT_RE_CROAK2;
10566
10567     if (l1 > 510)
10568         l1 = 510;
10569     if (l1 + l2 > 510)
10570         l2 = 510 - l1;
10571     Copy(pat1, buf, l1 , char);
10572     Copy(pat2, buf + l1, l2 , char);
10573     buf[l1 + l2] = '\n';
10574     buf[l1 + l2 + 1] = '\0';
10575 #ifdef I_STDARG
10576     /* ANSI variant takes additional second argument */
10577     va_start(args, pat2);
10578 #else
10579     va_start(args);
10580 #endif
10581     msv = vmess(buf, &args);
10582     va_end(args);
10583     message = SvPV_const(msv,l1);
10584     if (l1 > 512)
10585         l1 = 512;
10586     Copy(message, buf, l1 , char);
10587     buf[l1-1] = '\0';                   /* Overwrite \n */
10588     Perl_croak(aTHX_ "%s", buf);
10589 }
10590
10591 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
10592
10593 #ifndef PERL_IN_XSUB_RE
10594 void
10595 Perl_save_re_context(pTHX)
10596 {
10597     dVAR;
10598
10599     struct re_save_state *state;
10600
10601     SAVEVPTR(PL_curcop);
10602     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10603
10604     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10605     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10606     SSPUSHUV(SAVEt_RE_STATE);
10607
10608     Copy(&PL_reg_state, state, 1, struct re_save_state);
10609
10610     PL_reg_start_tmp = 0;
10611     PL_reg_start_tmpl = 0;
10612     PL_reg_oldsaved = NULL;
10613     PL_reg_oldsavedlen = 0;
10614     PL_reg_maxiter = 0;
10615     PL_reg_leftiter = 0;
10616     PL_reg_poscache = NULL;
10617     PL_reg_poscache_size = 0;
10618 #ifdef PERL_OLD_COPY_ON_WRITE
10619     PL_nrs = NULL;
10620 #endif
10621
10622     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10623     if (PL_curpm) {
10624         const REGEXP * const rx = PM_GETRE(PL_curpm);
10625         if (rx) {
10626             U32 i;
10627             for (i = 1; i <= RX_NPARENS(rx); i++) {
10628                 char digits[TYPE_CHARS(long)];
10629                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10630                 GV *const *const gvp
10631                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10632
10633                 if (gvp) {
10634                     GV * const gv = *gvp;
10635                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10636                         save_scalar(gv);
10637                 }
10638             }
10639         }
10640     }
10641 }
10642 #endif
10643
10644 static void
10645 clear_re(pTHX_ void *r)
10646 {
10647     dVAR;
10648     ReREFCNT_dec((REGEXP *)r);
10649 }
10650
10651 #ifdef DEBUGGING
10652
10653 STATIC void
10654 S_put_byte(pTHX_ SV *sv, int c)
10655 {
10656     PERL_ARGS_ASSERT_PUT_BYTE;
10657
10658     /* Our definition of isPRINT() ignores locales, so only bytes that are
10659        not part of UTF-8 are considered printable. I assume that the same
10660        holds for UTF-EBCDIC.
10661        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10662        which Wikipedia says:
10663
10664        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10665        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10666        identical, to the ASCII delete (DEL) or rubout control character.
10667        ) So the old condition can be simplified to !isPRINT(c)  */
10668     if (!isPRINT(c)) {
10669         if (c < 256) {
10670             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10671         }
10672         else {
10673             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10674         }
10675     }
10676     else {
10677         const char string = c;
10678         if (c == '-' || c == ']' || c == '\\' || c == '^')
10679             sv_catpvs(sv, "\\");
10680         sv_catpvn(sv, &string, 1);
10681     }
10682 }
10683
10684
10685 #define CLEAR_OPTSTART \
10686     if (optstart) STMT_START { \
10687             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10688             optstart=NULL; \
10689     } STMT_END
10690
10691 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10692
10693 STATIC const regnode *
10694 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10695             const regnode *last, const regnode *plast,
10696             SV* sv, I32 indent, U32 depth)
10697 {
10698     dVAR;
10699     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10700     register const regnode *next;
10701     const regnode *optstart= NULL;
10702
10703     RXi_GET_DECL(r,ri);
10704     GET_RE_DEBUG_FLAGS_DECL;
10705
10706     PERL_ARGS_ASSERT_DUMPUNTIL;
10707
10708 #ifdef DEBUG_DUMPUNTIL
10709     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10710         last ? last-start : 0,plast ? plast-start : 0);
10711 #endif
10712
10713     if (plast && plast < last)
10714         last= plast;
10715
10716     while (PL_regkind[op] != END && (!last || node < last)) {
10717         /* While that wasn't END last time... */
10718         NODE_ALIGN(node);
10719         op = OP(node);
10720         if (op == CLOSE || op == WHILEM)
10721             indent--;
10722         next = regnext((regnode *)node);
10723
10724         /* Where, what. */
10725         if (OP(node) == OPTIMIZED) {
10726             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10727                 optstart = node;
10728             else
10729                 goto after_print;
10730         } else
10731             CLEAR_OPTSTART;
10732
10733         regprop(r, sv, node);
10734         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10735                       (int)(2*indent + 1), "", SvPVX_const(sv));
10736
10737         if (OP(node) != OPTIMIZED) {
10738             if (next == NULL)           /* Next ptr. */
10739                 PerlIO_printf(Perl_debug_log, " (0)");
10740             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10741                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10742             else
10743                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10744             (void)PerlIO_putc(Perl_debug_log, '\n');
10745         }
10746
10747       after_print:
10748         if (PL_regkind[(U8)op] == BRANCHJ) {
10749             assert(next);
10750             {
10751                 register const regnode *nnode = (OP(next) == LONGJMP
10752                                              ? regnext((regnode *)next)
10753                                              : next);
10754                 if (last && nnode > last)
10755                     nnode = last;
10756                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10757             }
10758         }
10759         else if (PL_regkind[(U8)op] == BRANCH) {
10760             assert(next);
10761             DUMPUNTIL(NEXTOPER(node), next);
10762         }
10763         else if ( PL_regkind[(U8)op]  == TRIE ) {
10764             const regnode *this_trie = node;
10765             const char op = OP(node);
10766             const U32 n = ARG(node);
10767             const reg_ac_data * const ac = op>=AHOCORASICK ?
10768                (reg_ac_data *)ri->data->data[n] :
10769                NULL;
10770             const reg_trie_data * const trie =
10771                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10772 #ifdef DEBUGGING
10773             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10774 #endif
10775             const regnode *nextbranch= NULL;
10776             I32 word_idx;
10777             sv_setpvs(sv, "");
10778             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10779                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10780
10781                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10782                    (int)(2*(indent+3)), "",
10783                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10784                             PL_colors[0], PL_colors[1],
10785                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10786                             PERL_PV_PRETTY_ELLIPSES    |
10787                             PERL_PV_PRETTY_LTGT
10788                             )
10789                             : "???"
10790                 );
10791                 if (trie->jump) {
10792                     U16 dist= trie->jump[word_idx+1];
10793                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10794                                   (UV)((dist ? this_trie + dist : next) - start));
10795                     if (dist) {
10796                         if (!nextbranch)
10797                             nextbranch= this_trie + trie->jump[0];
10798                         DUMPUNTIL(this_trie + dist, nextbranch);
10799                     }
10800                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10801                         nextbranch= regnext((regnode *)nextbranch);
10802                 } else {
10803                     PerlIO_printf(Perl_debug_log, "\n");
10804                 }
10805             }
10806             if (last && next > last)
10807                 node= last;
10808             else
10809                 node= next;
10810         }
10811         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10812             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10813                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10814         }
10815         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10816             assert(next);
10817             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10818         }
10819         else if ( op == PLUS || op == STAR) {
10820             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10821         }
10822         else if (PL_regkind[(U8)op] == ANYOF) {
10823             /* arglen 1 + class block */
10824             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
10825                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10826             node = NEXTOPER(node);
10827         }
10828         else if (PL_regkind[(U8)op] == EXACT) {
10829             /* Literal string, where present. */
10830             node += NODE_SZ_STR(node) - 1;
10831             node = NEXTOPER(node);
10832         }
10833         else {
10834             node = NEXTOPER(node);
10835             node += regarglen[(U8)op];
10836         }
10837         if (op == CURLYX || op == OPEN)
10838             indent++;
10839     }
10840     CLEAR_OPTSTART;
10841 #ifdef DEBUG_DUMPUNTIL
10842     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10843 #endif
10844     return node;
10845 }
10846
10847 #endif  /* DEBUGGING */
10848
10849 /*
10850  * Local variables:
10851  * c-indentation-style: bsd
10852  * c-basic-offset: 4
10853  * indent-tabs-mode: t
10854  * End:
10855  *
10856  * ex: set ts=8 sts=4 sw=4 noet:
10857  */