src/5013010/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     I32         in_lookbehind;
 145 #if ADD_TO_REGEXEC
 146     char        *starttry;              /* -Dr: where regtry was called. */
 147 #define RExC_starttry   (pRExC_state->starttry)
 148 #endif
 149 #ifdef DEBUGGING
 150     const char  *lastparse;
 151     I32         lastnum;
 152     AV          *paren_name_list;       /* idx -> name */
 153 #define RExC_lastparse  (pRExC_state->lastparse)
 154 #define RExC_lastnum    (pRExC_state->lastnum)
 155 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 156 #endif
 157 } RExC_state_t;
 158
 159 #define RExC_flags      (pRExC_state->flags)
 160 #define RExC_precomp    (pRExC_state->precomp)
 161 #define RExC_rx_sv      (pRExC_state->rx_sv)
 162 #define RExC_rx         (pRExC_state->rx)
 163 #define RExC_rxi        (pRExC_state->rxi)
 164 #define RExC_start      (pRExC_state->start)
 165 #define RExC_end        (pRExC_state->end)
 166 #define RExC_parse      (pRExC_state->parse)
 167 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 168 #ifdef RE_TRACK_PATTERN_OFFSETS
 169 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 170 #endif
 171 #define RExC_emit       (pRExC_state->emit)
 172 #define RExC_emit_start (pRExC_state->emit_start)
 173 #define RExC_emit_bound (pRExC_state->emit_bound)
 174 #define RExC_naughty    (pRExC_state->naughty)
 175 #define RExC_sawback    (pRExC_state->sawback)
 176 #define RExC_seen       (pRExC_state->seen)
 177 #define RExC_size       (pRExC_state->size)
 178 #define RExC_npar       (pRExC_state->npar)
 179 #define RExC_nestroot   (pRExC_state->nestroot)
 180 #define RExC_extralen   (pRExC_state->extralen)
 181 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 182 #define RExC_seen_evals (pRExC_state->seen_evals)
 183 #define RExC_utf8       (pRExC_state->utf8)
 184 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 185 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 186 #define RExC_open_parens        (pRExC_state->open_parens)
 187 #define RExC_close_parens       (pRExC_state->close_parens)
 188 #define RExC_opend      (pRExC_state->opend)
 189 #define RExC_paren_names        (pRExC_state->paren_names)
 190 #define RExC_recurse    (pRExC_state->recurse)
 191 #define RExC_recurse_count      (pRExC_state->recurse_count)
 192 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 193
 194
 195 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 196 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 197         ((*s) == '{' && regcurly(s)))
 198
 199 #ifdef SPSTART
 200 #undef SPSTART          /* dratted cpp namespace... */
 201 #endif
 202 /*
 203  * Flags to be passed up and down.
 204  */
 205 #define WORST           0       /* Worst case. */
 206 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 207
 208 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 209  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 210 #define SIMPLE          0x02
 211 #define SPSTART         0x04    /* Starts with * or +. */
 212 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 213 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 214
 215 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 216
 217 /* whether trie related optimizations are enabled */
 218 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 219 #define TRIE_STUDY_OPT
 220 #define FULL_TRIE_STUDY
 221 #define TRIE_STCLASS
 222 #endif
 223
 224
 225
 226 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 227 #define PBITVAL(paren) (1 << ((paren) & 7))
 228 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 229 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 230 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 231
 232 /* If not already in utf8, do a longjmp back to the beginning */
 233 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 234 #define REQUIRE_UTF8    STMT_START {                                       \
 235                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 236                         } STMT_END
 237
 238 /* About scan_data_t.
 239
 240   During optimisation we recurse through the regexp program performing
 241   various inplace (keyhole style) optimisations. In addition study_chunk
 242   and scan_commit populate this data structure with information about
 243   what strings MUST appear in the pattern. We look for the longest
 244   string that must appear at a fixed location, and we look for the
 245   longest string that may appear at a floating location. So for instance
 246   in the pattern:
 247
 248     /FOO[xX]A.*B[xX]BAR/
 249
 250   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 251   strings (because they follow a .* construct). study_chunk will identify
 252   both FOO and BAR as being the longest fixed and floating strings respectively.
 253
 254   The strings can be composites, for instance
 255
 256      /(f)(o)(o)/
 257
 258   will result in a composite fixed substring 'foo'.
 259
 260   For each string some basic information is maintained:
 261
 262   - offset or min_offset
 263     This is the position the string must appear at, or not before.
 264     It also implicitly (when combined with minlenp) tells us how many
 265     characters must match before the string we are searching for.
 266     Likewise when combined with minlenp and the length of the string it
 267     tells us how many characters must appear after the string we have
 268     found.
 269
 270   - max_offset
 271     Only used for floating strings. This is the rightmost point that
 272     the string can appear at. If set to I32 max it indicates that the
 273     string can occur infinitely far to the right.
 274
 275   - minlenp
 276     A pointer to the minimum length of the pattern that the string
 277     was found inside. This is important as in the case of positive
 278     lookahead or positive lookbehind we can have multiple patterns
 279     involved. Consider
 280
 281     /(?=FOO).*F/
 282
 283     The minimum length of the pattern overall is 3, the minimum length
 284     of the lookahead part is 3, but the minimum length of the part that
 285     will actually match is 1. So 'FOO's minimum length is 3, but the
 286     minimum length for the F is 1. This is important as the minimum length
 287     is used to determine offsets in front of and behind the string being
 288     looked for.  Since strings can be composites this is the length of the
 289     pattern at the time it was committed with a scan_commit. Note that
 290     the length is calculated by study_chunk, so that the minimum lengths
 291     are not known until the full pattern has been compiled, thus the
 292     pointer to the value.
 293
 294   - lookbehind
 295
 296     In the case of lookbehind the string being searched for can be
 297     offset past the start point of the final matching string.
 298     If this value was just blithely removed from the min_offset it would
 299     invalidate some of the calculations for how many chars must match
 300     before or after (as they are derived from min_offset and minlen and
 301     the length of the string being searched for).
 302     When the final pattern is compiled and the data is moved from the
 303     scan_data_t structure into the regexp structure the information
 304     about lookbehind is factored in, with the information that would
 305     have been lost precalculated in the end_shift field for the
 306     associated string.
 307
 308   The fields pos_min and pos_delta are used to store the minimum offset
 309   and the delta to the maximum offset at the current point in the pattern.
 310
 311 */
 312
 313 typedef struct scan_data_t {
 314     /*I32 len_min;      unused */
 315     /*I32 len_delta;    unused */
 316     I32 pos_min;
 317     I32 pos_delta;
 318     SV *last_found;
 319     I32 last_end;           /* min value, <0 unless valid. */
 320     I32 last_start_min;
 321     I32 last_start_max;
 322     SV **longest;           /* Either &l_fixed, or &l_float. */
 323     SV *longest_fixed;      /* longest fixed string found in pattern */
 324     I32 offset_fixed;       /* offset where it starts */
 325     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 326     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 327     SV *longest_float;      /* longest floating string found in pattern */
 328     I32 offset_float_min;   /* earliest point in string it can appear */
 329     I32 offset_float_max;   /* latest point in string it can appear */
 330     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 331     I32 lookbehind_float;   /* is the position of the string modified by LB */
 332     I32 flags;
 333     I32 whilem_c;
 334     I32 *last_closep;
 335     struct regnode_charclass_class *start_class;
 336 } scan_data_t;
 337
 338 /*
 339  * Forward declarations for pregcomp()'s friends.
 340  */
 341
 342 static const scan_data_t zero_scan_data =
 343   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 344
 345 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 346 #define SF_BEFORE_SEOL          0x0001
 347 #define SF_BEFORE_MEOL          0x0002
 348 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 349 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 350
 351 #ifdef NO_UNARY_PLUS
 352 #  define SF_FIX_SHIFT_EOL      (0+2)
 353 #  define SF_FL_SHIFT_EOL               (0+4)
 354 #else
 355 #  define SF_FIX_SHIFT_EOL      (+2)
 356 #  define SF_FL_SHIFT_EOL               (+4)
 357 #endif
 358
 359 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 360 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 361
 362 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 363 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 364 #define SF_IS_INF               0x0040
 365 #define SF_HAS_PAR              0x0080
 366 #define SF_IN_PAR               0x0100
 367 #define SF_HAS_EVAL             0x0200
 368 #define SCF_DO_SUBSTR           0x0400
 369 #define SCF_DO_STCLASS_AND      0x0800
 370 #define SCF_DO_STCLASS_OR       0x1000
 371 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 372 #define SCF_WHILEM_VISITED_POS  0x2000
 373
 374 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 375 #define SCF_SEEN_ACCEPT         0x8000
 376
 377 #define UTF cBOOL(RExC_utf8)
 378 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 379 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 380 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 381 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 382 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 383 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 384 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 385
 386 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 387
 388 #define OOB_UNICODE             12345678
 389 #define OOB_NAMEDCLASS          -1
 390
 391 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 392 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 393
 394
 395 /* length of regex to show in messages that don't mark a position within */
 396 #define RegexLengthToShowInErrorMessages 127
 397
 398 /*
 399  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 400  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 401  * op/pragma/warn/regcomp.
 402  */
 403 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 404 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 405
 406 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 407
 408 /*
 409  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 410  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 411  * "...".
 412  */
 413 #define _FAIL(code) STMT_START {                                        \
 414     const char *ellipses = "";                                          \
 415     IV len = RExC_end - RExC_precomp;                                   \
 416                                                                         \
 417     if (!SIZE_ONLY)                                                     \
 418         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 419     if (len > RegexLengthToShowInErrorMessages) {                       \
 420         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 421         len = RegexLengthToShowInErrorMessages - 10;                    \
 422         ellipses = "...";                                               \
 423     }                                                                   \
 424     code;                                                               \
 425 } STMT_END
 426
 427 #define FAIL(msg) _FAIL(                            \
 428     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 429             msg, (int)len, RExC_precomp, ellipses))
 430
 431 #define FAIL2(msg,arg) _FAIL(                       \
 432     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 433             arg, (int)len, RExC_precomp, ellipses))
 434
 435 /*
 436  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 437  */
 438 #define Simple_vFAIL(m) STMT_START {                                    \
 439     const IV offset = RExC_parse - RExC_precomp;                        \
 440     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 441             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 442 } STMT_END
 443
 444 /*
 445  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 446  */
 447 #define vFAIL(m) STMT_START {                           \
 448     if (!SIZE_ONLY)                                     \
 449         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 450     Simple_vFAIL(m);                                    \
 451 } STMT_END
 452
 453 /*
 454  * Like Simple_vFAIL(), but accepts two arguments.
 455  */
 456 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 457     const IV offset = RExC_parse - RExC_precomp;                        \
 458     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 459             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 460 } STMT_END
 461
 462 /*
 463  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 464  */
 465 #define vFAIL2(m,a1) STMT_START {                       \
 466     if (!SIZE_ONLY)                                     \
 467         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 468     Simple_vFAIL2(m, a1);                               \
 469 } STMT_END
 470
 471
 472 /*
 473  * Like Simple_vFAIL(), but accepts three arguments.
 474  */
 475 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 476     const IV offset = RExC_parse - RExC_precomp;                \
 477     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 478             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 479 } STMT_END
 480
 481 /*
 482  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 483  */
 484 #define vFAIL3(m,a1,a2) STMT_START {                    \
 485     if (!SIZE_ONLY)                                     \
 486         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 487     Simple_vFAIL3(m, a1, a2);                           \
 488 } STMT_END
 489
 490 /*
 491  * Like Simple_vFAIL(), but accepts four arguments.
 492  */
 493 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 494     const IV offset = RExC_parse - RExC_precomp;                \
 495     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 496             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 497 } STMT_END
 498
 499 #define ckWARNreg(loc,m) STMT_START {                                   \
 500     const IV offset = loc - RExC_precomp;                               \
 501     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 502             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 503 } STMT_END
 504
 505 #define ckWARNregdep(loc,m) STMT_START {                                \
 506     const IV offset = loc - RExC_precomp;                               \
 507     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 508             m REPORT_LOCATION,                                          \
 509             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 510 } STMT_END
 511
 512 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 515             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 516 } STMT_END
 517
 518 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 521             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 522 } STMT_END
 523
 524 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 527             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 528 } STMT_END
 529
 530 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 533             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 534 } STMT_END
 535
 536 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 537     const IV offset = loc - RExC_precomp;                               \
 538     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 539             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 540 } STMT_END
 541
 542 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 543     const IV offset = loc - RExC_precomp;                               \
 544     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 545             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 546 } STMT_END
 547
 548
 549 /* Allow for side effects in s */
 550 #define REGC(c,s) STMT_START {                  \
 551     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 552 } STMT_END
 553
 554 /* Macros for recording node offsets.   20001227 mjd@plover.com
 555  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 556  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 557  * Element 0 holds the number n.
 558  * Position is 1 indexed.
 559  */
 560 #ifndef RE_TRACK_PATTERN_OFFSETS
 561 #define Set_Node_Offset_To_R(node,byte)
 562 #define Set_Node_Offset(node,byte)
 563 #define Set_Cur_Node_Offset
 564 #define Set_Node_Length_To_R(node,len)
 565 #define Set_Node_Length(node,len)
 566 #define Set_Node_Cur_Length(node)
 567 #define Node_Offset(n)
 568 #define Node_Length(n)
 569 #define Set_Node_Offset_Length(node,offset,len)
 570 #define ProgLen(ri) ri->u.proglen
 571 #define SetProgLen(ri,x) ri->u.proglen = x
 572 #else
 573 #define ProgLen(ri) ri->u.offsets[0]
 574 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 575 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 576     if (! SIZE_ONLY) {                                                  \
 577         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 578                     __LINE__, (int)(node), (int)(byte)));               \
 579         if((node) < 0) {                                                \
 580             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 581         } else {                                                        \
 582             RExC_offsets[2*(node)-1] = (byte);                          \
 583         }                                                               \
 584     }                                                                   \
 585 } STMT_END
 586
 587 #define Set_Node_Offset(node,byte) \
 588     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 589 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 590
 591 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 592     if (! SIZE_ONLY) {                                                  \
 593         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 594                 __LINE__, (int)(node), (int)(len)));                    \
 595         if((node) < 0) {                                                \
 596             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 597         } else {                                                        \
 598             RExC_offsets[2*(node)] = (len);                             \
 599         }                                                               \
 600     }                                                                   \
 601 } STMT_END
 602
 603 #define Set_Node_Length(node,len) \
 604     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 605 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 606 #define Set_Node_Cur_Length(node) \
 607     Set_Node_Length(node, RExC_parse - parse_start)
 608
 609 /* Get offsets and lengths */
 610 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 611 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 612
 613 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 614     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 615     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 616 } STMT_END
 617 #endif
 618
 619 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 620 #define EXPERIMENTAL_INPLACESCAN
 621 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 622
 623 #define DEBUG_STUDYDATA(str,data,depth)                              \
 624 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 625     PerlIO_printf(Perl_debug_log,                                    \
 626         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 627         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 628         (int)(depth)*2, "",                                          \
 629         (IV)((data)->pos_min),                                       \
 630         (IV)((data)->pos_delta),                                     \
 631         (UV)((data)->flags),                                         \
 632         (IV)((data)->whilem_c),                                      \
 633         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 634         is_inf ? "INF " : ""                                         \
 635     );                                                               \
 636     if ((data)->last_found)                                          \
 637         PerlIO_printf(Perl_debug_log,                                \
 638             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 639             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 640             SvPVX_const((data)->last_found),                         \
 641             (IV)((data)->last_end),                                  \
 642             (IV)((data)->last_start_min),                            \
 643             (IV)((data)->last_start_max),                            \
 644             ((data)->longest &&                                      \
 645              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 646             SvPVX_const((data)->longest_fixed),                      \
 647             (IV)((data)->offset_fixed),                              \
 648             ((data)->longest &&                                      \
 649              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 650             SvPVX_const((data)->longest_float),                      \
 651             (IV)((data)->offset_float_min),                          \
 652             (IV)((data)->offset_float_max)                           \
 653         );                                                           \
 654     PerlIO_printf(Perl_debug_log,"\n");                              \
 655 });
 656
 657 static void clear_re(pTHX_ void *r);
 658
 659 /* Mark that we cannot extend a found fixed substring at this point.
 660    Update the longest found anchored substring and the longest found
 661    floating substrings if needed. */
 662
 663 STATIC void
 664 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 665 {
 666     const STRLEN l = CHR_SVLEN(data->last_found);
 667     const STRLEN old_l = CHR_SVLEN(*data->longest);
 668     GET_RE_DEBUG_FLAGS_DECL;
 669
 670     PERL_ARGS_ASSERT_SCAN_COMMIT;
 671
 672     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 673         SvSetMagicSV(*data->longest, data->last_found);
 674         if (*data->longest == data->longest_fixed) {
 675             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 676             if (data->flags & SF_BEFORE_EOL)
 677                 data->flags
 678                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 679             else
 680                 data->flags &= ~SF_FIX_BEFORE_EOL;
 681             data->minlen_fixed=minlenp;
 682             data->lookbehind_fixed=0;
 683         }
 684         else { /* *data->longest == data->longest_float */
 685             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 686             data->offset_float_max = (l
 687                                       ? data->last_start_max
 688                                       : data->pos_min + data->pos_delta);
 689             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 690                 data->offset_float_max = I32_MAX;
 691             if (data->flags & SF_BEFORE_EOL)
 692                 data->flags
 693                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 694             else
 695                 data->flags &= ~SF_FL_BEFORE_EOL;
 696             data->minlen_float=minlenp;
 697             data->lookbehind_float=0;
 698         }
 699     }
 700     SvCUR_set(data->last_found, 0);
 701     {
 702         SV * const sv = data->last_found;
 703         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 704             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 705             if (mg)
 706                 mg->mg_len = 0;
 707         }
 708     }
 709     data->last_end = -1;
 710     data->flags &= ~SF_BEFORE_EOL;
 711     DEBUG_STUDYDATA("commit: ",data,0);
 712 }
 713
 714 /* Can match anything (initialization) */
 715 STATIC void
 716 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 717 {
 718     PERL_ARGS_ASSERT_CL_ANYTHING;
 719
 720     ANYOF_CLASS_ZERO(cl);
 721     ANYOF_BITMAP_SETALL(cl);
 722     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 723     if (LOC)
 724         cl->flags |= ANYOF_LOCALE;
 725 }
 726
 727 /* Can match anything (initialization) */
 728 STATIC int
 729 S_cl_is_anything(const struct regnode_charclass_class *cl)
 730 {
 731     int value;
 732
 733     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 734
 735     for (value = 0; value <= ANYOF_MAX; value += 2)
 736         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 737             return 1;
 738     if (!(cl->flags & ANYOF_UNICODE_ALL))
 739         return 0;
 740     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 741         return 0;
 742     return 1;
 743 }
 744
 745 /* Can match anything (initialization) */
 746 STATIC void
 747 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 748 {
 749     PERL_ARGS_ASSERT_CL_INIT;
 750
 751     Zero(cl, 1, struct regnode_charclass_class);
 752     cl->type = ANYOF;
 753     cl_anything(pRExC_state, cl);
 754 }
 755
 756 STATIC void
 757 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 758 {
 759     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 760
 761     Zero(cl, 1, struct regnode_charclass_class);
 762     cl->type = ANYOF;
 763     cl_anything(pRExC_state, cl);
 764     if (LOC)
 765         cl->flags |= ANYOF_LOCALE;
 766 }
 767
 768 /* 'And' a given class with another one.  Can create false positives */
 769 /* We assume that cl is not inverted */
 770 STATIC void
 771 S_cl_and(struct regnode_charclass_class *cl,
 772         const struct regnode_charclass_class *and_with)
 773 {
 774     PERL_ARGS_ASSERT_CL_AND;
 775
 776     assert(and_with->type == ANYOF);
 777
 778     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 779         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 780         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 781         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 782         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 783         int i;
 784
 785         if (and_with->flags & ANYOF_INVERT)
 786             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 787                 cl->bitmap[i] &= ~and_with->bitmap[i];
 788         else
 789             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 790                 cl->bitmap[i] &= and_with->bitmap[i];
 791     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 792     if (!(and_with->flags & ANYOF_EOS))
 793         cl->flags &= ~ANYOF_EOS;
 794
 795     if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
 796         cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
 797     if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
 798         cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 799
 800     if (cl->flags & ANYOF_UNICODE_ALL
 801         && and_with->flags & ANYOF_NONBITMAP
 802         && !(and_with->flags & ANYOF_INVERT))
 803     {
 804         if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 805             cl->flags &= ~ANYOF_UNICODE_ALL;
 806         }
 807         cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
 808                                                            only the one(s)
 809                                                            actually set */
 810         ARG_SET(cl, ARG(and_with));
 811     }
 812     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 813         !(and_with->flags & ANYOF_INVERT))
 814         cl->flags &= ~ANYOF_UNICODE_ALL;
 815     if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
 816         !(and_with->flags & ANYOF_INVERT))
 817         cl->flags &= ~ANYOF_NONBITMAP;
 818 }
 819
 820 /* 'OR' a given class with another one.  Can create false positives */
 821 /* We assume that cl is not inverted */
 822 STATIC void
 823 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 824 {
 825     PERL_ARGS_ASSERT_CL_OR;
 826
 827     if (or_with->flags & ANYOF_INVERT) {
 828         /* We do not use
 829          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 830          *   <= (B1 | !B2) | (CL1 | !CL2)
 831          * which is wasteful if CL2 is small, but we ignore CL2:
 832          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 833          * XXXX Can we handle case-fold?  Unclear:
 834          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 835          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 836          */
 837         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 838              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 839              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 840             int i;
 841
 842             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 843                 cl->bitmap[i] |= ~or_with->bitmap[i];
 844         } /* XXXX: logic is complicated otherwise */
 845         else {
 846             cl_anything(pRExC_state, cl);
 847         }
 848     } else {
 849         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 850         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 851              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 852                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 853             int i;
 854
 855             /* OR char bitmap and class bitmap separately */
 856             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 857                 cl->bitmap[i] |= or_with->bitmap[i];
 858             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 859                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 860                     cl->classflags[i] |= or_with->classflags[i];
 861                 cl->flags |= ANYOF_CLASS;
 862             }
 863         }
 864         else { /* XXXX: logic is complicated, leave it along for a moment. */
 865             cl_anything(pRExC_state, cl);
 866         }
 867     }
 868     if (or_with->flags & ANYOF_EOS)
 869         cl->flags |= ANYOF_EOS;
 870     if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
 871         cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
 872
 873     if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 874         cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
 875
 876     /* If both nodes match something outside the bitmap, but what they match
 877      * outside is not the same pointer, and hence not easily compared, give up
 878      * and allow the start class to match everything outside the bitmap */
 879     if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
 880         ARG(cl) != ARG(or_with)) {
 881         cl->flags |= ANYOF_UNICODE_ALL;
 882     }
 883
 884     if (or_with->flags & ANYOF_UNICODE_ALL) {
 885         cl->flags |= ANYOF_UNICODE_ALL;
 886     }
 887 }
 888
 889 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 890 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 891 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 892 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 893
 894
 895 #ifdef DEBUGGING
 896 /*
 897    dump_trie(trie,widecharmap,revcharmap)
 898    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 899    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 900
 901    These routines dump out a trie in a somewhat readable format.
 902    The _interim_ variants are used for debugging the interim
 903    tables that are used to generate the final compressed
 904    representation which is what dump_trie expects.
 905
 906    Part of the reason for their existence is to provide a form
 907    of documentation as to how the different representations function.
 908
 909 */
 910
 911 /*
 912   Dumps the final compressed table form of the trie to Perl_debug_log.
 913   Used for debugging make_trie().
 914 */
 915
 916 STATIC void
 917 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 918             AV *revcharmap, U32 depth)
 919 {
 920     U32 state;
 921     SV *sv=sv_newmortal();
 922     int colwidth= widecharmap ? 6 : 4;
 923     U16 word;
 924     GET_RE_DEBUG_FLAGS_DECL;
 925
 926     PERL_ARGS_ASSERT_DUMP_TRIE;
 927
 928     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 929         (int)depth * 2 + 2,"",
 930         "Match","Base","Ofs" );
 931
 932     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 933         SV ** const tmp = av_fetch( revcharmap, state, 0);
 934         if ( tmp ) {
 935             PerlIO_printf( Perl_debug_log, "%*s",
 936                 colwidth,
 937                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 938                             PL_colors[0], PL_colors[1],
 939                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 940                             PERL_PV_ESCAPE_FIRSTCHAR
 941                 )
 942             );
 943         }
 944     }
 945     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 946         (int)depth * 2 + 2,"");
 947
 948     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 949         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 950     PerlIO_printf( Perl_debug_log, "\n");
 951
 952     for( state = 1 ; state < trie->statecount ; state++ ) {
 953         const U32 base = trie->states[ state ].trans.base;
 954
 955         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 956
 957         if ( trie->states[ state ].wordnum ) {
 958             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 959         } else {
 960             PerlIO_printf( Perl_debug_log, "%6s", "" );
 961         }
 962
 963         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 964
 965         if ( base ) {
 966             U32 ofs = 0;
 967
 968             while( ( base + ofs  < trie->uniquecharcount ) ||
 969                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 970                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 971                     ofs++;
 972
 973             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 974
 975             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 976                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 977                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 978                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 979                 {
 980                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 981                     colwidth,
 982                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 983                 } else {
 984                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 985                 }
 986             }
 987
 988             PerlIO_printf( Perl_debug_log, "]");
 989
 990         }
 991         PerlIO_printf( Perl_debug_log, "\n" );
 992     }
 993     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 994     for (word=1; word <= trie->wordcount; word++) {
 995         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 996             (int)word, (int)(trie->wordinfo[word].prev),
 997             (int)(trie->wordinfo[word].len));
 998     }
 999     PerlIO_printf(Perl_debug_log, "\n" );
1000 }
1001 /*
1002   Dumps a fully constructed but uncompressed trie in list form.
1003   List tries normally only are used for construction when the number of
1004   possible chars (trie->uniquecharcount) is very high.
1005   Used for debugging make_trie().
1006 */
1007 STATIC void
1008 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1009                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1010                          U32 depth)
1011 {
1012     U32 state;
1013     SV *sv=sv_newmortal();
1014     int colwidth= widecharmap ? 6 : 4;
1015     GET_RE_DEBUG_FLAGS_DECL;
1016
1017     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1018
1019     /* print out the table precompression.  */
1020     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1021         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1022         "------:-----+-----------------\n" );
1023
1024     for( state=1 ; state < next_alloc ; state ++ ) {
1025         U16 charid;
1026
1027         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1028             (int)depth * 2 + 2,"", (UV)state  );
1029         if ( ! trie->states[ state ].wordnum ) {
1030             PerlIO_printf( Perl_debug_log, "%5s| ","");
1031         } else {
1032             PerlIO_printf( Perl_debug_log, "W%4x| ",
1033                 trie->states[ state ].wordnum
1034             );
1035         }
1036         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1037             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1038             if ( tmp ) {
1039                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1040                     colwidth,
1041                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1042                             PL_colors[0], PL_colors[1],
1043                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1044                             PERL_PV_ESCAPE_FIRSTCHAR
1045                     ) ,
1046                     TRIE_LIST_ITEM(state,charid).forid,
1047                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1048                 );
1049                 if (!(charid % 10))
1050                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1051                         (int)((depth * 2) + 14), "");
1052             }
1053         }
1054         PerlIO_printf( Perl_debug_log, "\n");
1055     }
1056 }
1057
1058 /*
1059   Dumps a fully constructed but uncompressed trie in table form.
1060   This is the normal DFA style state transition table, with a few
1061   twists to facilitate compression later.
1062   Used for debugging make_trie().
1063 */
1064 STATIC void
1065 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1066                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1067                           U32 depth)
1068 {
1069     U32 state;
1070     U16 charid;
1071     SV *sv=sv_newmortal();
1072     int colwidth= widecharmap ? 6 : 4;
1073     GET_RE_DEBUG_FLAGS_DECL;
1074
1075     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1076
1077     /*
1078        print out the table precompression so that we can do a visual check
1079        that they are identical.
1080      */
1081
1082     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1083
1084     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1085         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1086         if ( tmp ) {
1087             PerlIO_printf( Perl_debug_log, "%*s",
1088                 colwidth,
1089                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1090                             PL_colors[0], PL_colors[1],
1091                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1092                             PERL_PV_ESCAPE_FIRSTCHAR
1093                 )
1094             );
1095         }
1096     }
1097
1098     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1099
1100     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1101         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1102     }
1103
1104     PerlIO_printf( Perl_debug_log, "\n" );
1105
1106     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1107
1108         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1109             (int)depth * 2 + 2,"",
1110             (UV)TRIE_NODENUM( state ) );
1111
1112         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1113             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1114             if (v)
1115                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1116             else
1117                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1118         }
1119         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1120             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1121         } else {
1122             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1123             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1124         }
1125     }
1126 }
1127
1128 #endif
1129
1130
1131 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1132   startbranch: the first branch in the whole branch sequence
1133   first      : start branch of sequence of branch-exact nodes.
1134                May be the same as startbranch
1135   last       : Thing following the last branch.
1136                May be the same as tail.
1137   tail       : item following the branch sequence
1138   count      : words in the sequence
1139   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1140   depth      : indent depth
1141
1142 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1143
1144 A trie is an N'ary tree where the branches are determined by digital
1145 decomposition of the key. IE, at the root node you look up the 1st character and
1146 follow that branch repeat until you find the end of the branches. Nodes can be
1147 marked as "accepting" meaning they represent a complete word. Eg:
1148
1149   /he|she|his|hers/
1150
1151 would convert into the following structure. Numbers represent states, letters
1152 following numbers represent valid transitions on the letter from that state, if
1153 the number is in square brackets it represents an accepting state, otherwise it
1154 will be in parenthesis.
1155
1156       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1157       |    |
1158       |   (2)
1159       |    |
1160      (1)   +-i->(6)-+-s->[7]
1161       |
1162       +-s->(3)-+-h->(4)-+-e->[5]
1163
1164       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1165
1166 This shows that when matching against the string 'hers' we will begin at state 1
1167 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1168 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1169 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1170 single traverse. We store a mapping from accepting to state to which word was
1171 matched, and then when we have multiple possibilities we try to complete the
1172 rest of the regex in the order in which they occured in the alternation.
1173
1174 The only prior NFA like behaviour that would be changed by the TRIE support is
1175 the silent ignoring of duplicate alternations which are of the form:
1176
1177  / (DUPE|DUPE) X? (?{ ... }) Y /x
1178
1179 Thus EVAL blocks following a trie may be called a different number of times with
1180 and without the optimisation. With the optimisations dupes will be silently
1181 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1182 the following demonstrates:
1183
1184  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1185
1186 which prints out 'word' three times, but
1187
1188  'words'=~/(word|word|word)(?{ print $1 })S/
1189
1190 which doesnt print it out at all. This is due to other optimisations kicking in.
1191
1192 Example of what happens on a structural level:
1193
1194 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1195
1196    1: CURLYM[1] {1,32767}(18)
1197    5:   BRANCH(8)
1198    6:     EXACT <ac>(16)
1199    8:   BRANCH(11)
1200    9:     EXACT <ad>(16)
1201   11:   BRANCH(14)
1202   12:     EXACT <ab>(16)
1203   16:   SUCCEED(0)
1204   17:   NOTHING(18)
1205   18: END(0)
1206
1207 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1208 and should turn into:
1209
1210    1: CURLYM[1] {1,32767}(18)
1211    5:   TRIE(16)
1212         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1213           <ac>
1214           <ad>
1215           <ab>
1216   16:   SUCCEED(0)
1217   17:   NOTHING(18)
1218   18: END(0)
1219
1220 Cases where tail != last would be like /(?foo|bar)baz/:
1221
1222    1: BRANCH(4)
1223    2:   EXACT <foo>(8)
1224    4: BRANCH(7)
1225    5:   EXACT <bar>(8)
1226    7: TAIL(8)
1227    8: EXACT <baz>(10)
1228   10: END(0)
1229
1230 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1231 and would end up looking like:
1232
1233     1: TRIE(8)
1234       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1235         <foo>
1236         <bar>
1237    7: TAIL(8)
1238    8: EXACT <baz>(10)
1239   10: END(0)
1240
1241     d = uvuni_to_utf8_flags(d, uv, 0);
1242
1243 is the recommended Unicode-aware way of saying
1244
1245     *(d++) = uv;
1246 */
1247
1248 #define TRIE_STORE_REVCHAR                                                 \
1249     STMT_START {                                                           \
1250         if (UTF) {                                                         \
1251             SV *zlopp = newSV(2);                                          \
1252             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1253             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1254             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1255             SvPOK_on(zlopp);                                               \
1256             SvUTF8_on(zlopp);                                              \
1257             av_push(revcharmap, zlopp);                                    \
1258         } else {                                                           \
1259             char ooooff = (char)uvc;                                               \
1260             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1261         }                                                                  \
1262         } STMT_END
1263
1264 #define TRIE_READ_CHAR STMT_START {                                           \
1265     wordlen++;                                                                \
1266     if ( UTF ) {                                                              \
1267         if ( folder ) {                                                       \
1268             if ( foldlen > 0 ) {                                              \
1269                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1270                foldlen -= len;                                                \
1271                scan += len;                                                   \
1272                len = 0;                                                       \
1273             } else {                                                          \
1274                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1275                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1276                 foldlen -= UNISKIP( uvc );                                    \
1277                 scan = foldbuf + UNISKIP( uvc );                              \
1278             }                                                                 \
1279         } else {                                                              \
1280             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1281         }                                                                     \
1282     } else {                                                                  \
1283         uvc = (U32)*uc;                                                       \
1284         len = 1;                                                              \
1285     }                                                                         \
1286 } STMT_END
1287
1288
1289
1290 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1291     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1292         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1293         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1294     }                                                           \
1295     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1296     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1297     TRIE_LIST_CUR( state )++;                                   \
1298 } STMT_END
1299
1300 #define TRIE_LIST_NEW(state) STMT_START {                       \
1301     Newxz( trie->states[ state ].trans.list,               \
1302         4, reg_trie_trans_le );                                 \
1303      TRIE_LIST_CUR( state ) = 1;                                \
1304      TRIE_LIST_LEN( state ) = 4;                                \
1305 } STMT_END
1306
1307 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1308     U16 dupe= trie->states[ state ].wordnum;                    \
1309     regnode * const noper_next = regnext( noper );              \
1310                                                                 \
1311     DEBUG_r({                                                   \
1312         /* store the word for dumping */                        \
1313         SV* tmp;                                                \
1314         if (OP(noper) != NOTHING)                               \
1315             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1316         else                                                    \
1317             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1318         av_push( trie_words, tmp );                             \
1319     });                                                         \
1320                                                                 \
1321     curword++;                                                  \
1322     trie->wordinfo[curword].prev   = 0;                         \
1323     trie->wordinfo[curword].len    = wordlen;                   \
1324     trie->wordinfo[curword].accept = state;                     \
1325                                                                 \
1326     if ( noper_next < tail ) {                                  \
1327         if (!trie->jump)                                        \
1328             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1329         trie->jump[curword] = (U16)(noper_next - convert);      \
1330         if (!jumper)                                            \
1331             jumper = noper_next;                                \
1332         if (!nextbranch)                                        \
1333             nextbranch= regnext(cur);                           \
1334     }                                                           \
1335                                                                 \
1336     if ( dupe ) {                                               \
1337         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1338         /* chain, so that when the bits of chain are later    */\
1339         /* linked together, the dups appear in the chain      */\
1340         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1341         trie->wordinfo[dupe].prev = curword;                    \
1342     } else {                                                    \
1343         /* we haven't inserted this word yet.                */ \
1344         trie->states[ state ].wordnum = curword;                \
1345     }                                                           \
1346 } STMT_END
1347
1348
1349 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1350      ( ( base + charid >=  ucharcount                                   \
1351          && base + charid < ubound                                      \
1352          && state == trie->trans[ base - ucharcount + charid ].check    \
1353          && trie->trans[ base - ucharcount + charid ].next )            \
1354            ? trie->trans[ base - ucharcount + charid ].next             \
1355            : ( state==1 ? special : 0 )                                 \
1356       )
1357
1358 #define MADE_TRIE       1
1359 #define MADE_JUMP_TRIE  2
1360 #define MADE_EXACT_TRIE 4
1361
1362 STATIC I32
1363 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1364 {
1365     dVAR;
1366     /* first pass, loop through and scan words */
1367     reg_trie_data *trie;
1368     HV *widecharmap = NULL;
1369     AV *revcharmap = newAV();
1370     regnode *cur;
1371     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1372     STRLEN len = 0;
1373     UV uvc = 0;
1374     U16 curword = 0;
1375     U32 next_alloc = 0;
1376     regnode *jumper = NULL;
1377     regnode *nextbranch = NULL;
1378     regnode *convert = NULL;
1379     U32 *prev_states; /* temp array mapping each state to previous one */
1380     /* we just use folder as a flag in utf8 */
1381     const U8 * folder = NULL;
1382
1383 #ifdef DEBUGGING
1384     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1385     AV *trie_words = NULL;
1386     /* along with revcharmap, this only used during construction but both are
1387      * useful during debugging so we store them in the struct when debugging.
1388      */
1389 #else
1390     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1391     STRLEN trie_charcount=0;
1392 #endif
1393     SV *re_trie_maxbuff;
1394     GET_RE_DEBUG_FLAGS_DECL;
1395
1396     PERL_ARGS_ASSERT_MAKE_TRIE;
1397 #ifndef DEBUGGING
1398     PERL_UNUSED_ARG(depth);
1399 #endif
1400
1401     switch (flags) {
1402         case EXACTFA:
1403         case EXACTFU: folder = PL_fold_latin1; break;
1404         case EXACTF:  folder = PL_fold; break;
1405         case EXACTFL: folder = PL_fold_locale; break;
1406     }
1407
1408     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1409     trie->refcount = 1;
1410     trie->startstate = 1;
1411     trie->wordcount = word_count;
1412     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1413     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1414     if (!(UTF && folder))
1415         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1416     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1417                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1418
1419     DEBUG_r({
1420         trie_words = newAV();
1421     });
1422
1423     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1424     if (!SvIOK(re_trie_maxbuff)) {
1425         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1426     }
1427     DEBUG_OPTIMISE_r({
1428                 PerlIO_printf( Perl_debug_log,
1429                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1430                   (int)depth * 2 + 2, "",
1431                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1432                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1433                   (int)depth);
1434     });
1435
1436    /* Find the node we are going to overwrite */
1437     if ( first == startbranch && OP( last ) != BRANCH ) {
1438         /* whole branch chain */
1439         convert = first;
1440     } else {
1441         /* branch sub-chain */
1442         convert = NEXTOPER( first );
1443     }
1444
1445     /*  -- First loop and Setup --
1446
1447        We first traverse the branches and scan each word to determine if it
1448        contains widechars, and how many unique chars there are, this is
1449        important as we have to build a table with at least as many columns as we
1450        have unique chars.
1451
1452        We use an array of integers to represent the character codes 0..255
1453        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1454        native representation of the character value as the key and IV's for the
1455        coded index.
1456
1457        *TODO* If we keep track of how many times each character is used we can
1458        remap the columns so that the table compression later on is more
1459        efficient in terms of memory by ensuring the most common value is in the
1460        middle and the least common are on the outside.  IMO this would be better
1461        than a most to least common mapping as theres a decent chance the most
1462        common letter will share a node with the least common, meaning the node
1463        will not be compressible. With a middle is most common approach the worst
1464        case is when we have the least common nodes twice.
1465
1466      */
1467
1468     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1469         regnode * const noper = NEXTOPER( cur );
1470         const U8 *uc = (U8*)STRING( noper );
1471         const U8 * const e  = uc + STR_LEN( noper );
1472         STRLEN foldlen = 0;
1473         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1474         const U8 *scan = (U8*)NULL;
1475         U32 wordlen      = 0;         /* required init */
1476         STRLEN chars = 0;
1477         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1478
1479         if (OP(noper) == NOTHING) {
1480             trie->minlen= 0;
1481             continue;
1482         }
1483         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1484             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1485                                           regardless of encoding */
1486
1487         for ( ; uc < e ; uc += len ) {
1488             TRIE_CHARCOUNT(trie)++;
1489             TRIE_READ_CHAR;
1490             chars++;
1491             if ( uvc < 256 ) {
1492                 if ( !trie->charmap[ uvc ] ) {
1493                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1494                     if ( folder )
1495                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1496                     TRIE_STORE_REVCHAR;
1497                 }
1498                 if ( set_bit ) {
1499                     /* store the codepoint in the bitmap, and its folded
1500                      * equivalent. */
1501                     TRIE_BITMAP_SET(trie,uvc);
1502
1503                     /* store the folded codepoint */
1504                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1505
1506                     if ( !UTF ) {
1507                         /* store first byte of utf8 representation of
1508                            variant codepoints */
1509                         if (! UNI_IS_INVARIANT(uvc)) {
1510                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1511                         }
1512                     }
1513                     set_bit = 0; /* We've done our bit :-) */
1514                 }
1515             } else {
1516                 SV** svpp;
1517                 if ( !widecharmap )
1518                     widecharmap = newHV();
1519
1520                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1521
1522                 if ( !svpp )
1523                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1524
1525                 if ( !SvTRUE( *svpp ) ) {
1526                     sv_setiv( *svpp, ++trie->uniquecharcount );
1527                     TRIE_STORE_REVCHAR;
1528                 }
1529             }
1530         }
1531         if( cur == first ) {
1532             trie->minlen=chars;
1533             trie->maxlen=chars;
1534         } else if (chars < trie->minlen) {
1535             trie->minlen=chars;
1536         } else if (chars > trie->maxlen) {
1537             trie->maxlen=chars;
1538         }
1539
1540     } /* end first pass */
1541     DEBUG_TRIE_COMPILE_r(
1542         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1543                 (int)depth * 2 + 2,"",
1544                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1545                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1546                 (int)trie->minlen, (int)trie->maxlen )
1547     );
1548
1549     /*
1550         We now know what we are dealing with in terms of unique chars and
1551         string sizes so we can calculate how much memory a naive
1552         representation using a flat table  will take. If it's over a reasonable
1553         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1554         conservative but potentially much slower representation using an array
1555         of lists.
1556
1557         At the end we convert both representations into the same compressed
1558         form that will be used in regexec.c for matching with. The latter
1559         is a form that cannot be used to construct with but has memory
1560         properties similar to the list form and access properties similar
1561         to the table form making it both suitable for fast searches and
1562         small enough that its feasable to store for the duration of a program.
1563
1564         See the comment in the code where the compressed table is produced
1565         inplace from the flat tabe representation for an explanation of how
1566         the compression works.
1567
1568     */
1569
1570
1571     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1572     prev_states[1] = 0;
1573
1574     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1575         /*
1576             Second Pass -- Array Of Lists Representation
1577
1578             Each state will be represented by a list of charid:state records
1579             (reg_trie_trans_le) the first such element holds the CUR and LEN
1580             points of the allocated array. (See defines above).
1581
1582             We build the initial structure using the lists, and then convert
1583             it into the compressed table form which allows faster lookups
1584             (but cant be modified once converted).
1585         */
1586
1587         STRLEN transcount = 1;
1588
1589         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1590             "%*sCompiling trie using list compiler\n",
1591             (int)depth * 2 + 2, ""));
1592
1593         trie->states = (reg_trie_state *)
1594             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1595                                   sizeof(reg_trie_state) );
1596         TRIE_LIST_NEW(1);
1597         next_alloc = 2;
1598
1599         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1600
1601             regnode * const noper = NEXTOPER( cur );
1602             U8 *uc           = (U8*)STRING( noper );
1603             const U8 * const e = uc + STR_LEN( noper );
1604             U32 state        = 1;         /* required init */
1605             U16 charid       = 0;         /* sanity init */
1606             U8 *scan         = (U8*)NULL; /* sanity init */
1607             STRLEN foldlen   = 0;         /* required init */
1608             U32 wordlen      = 0;         /* required init */
1609             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1610
1611             if (OP(noper) != NOTHING) {
1612                 for ( ; uc < e ; uc += len ) {
1613
1614                     TRIE_READ_CHAR;
1615
1616                     if ( uvc < 256 ) {
1617                         charid = trie->charmap[ uvc ];
1618                     } else {
1619                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1620                         if ( !svpp ) {
1621                             charid = 0;
1622                         } else {
1623                             charid=(U16)SvIV( *svpp );
1624                         }
1625                     }
1626                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1627                     if ( charid ) {
1628
1629                         U16 check;
1630                         U32 newstate = 0;
1631
1632                         charid--;
1633                         if ( !trie->states[ state ].trans.list ) {
1634                             TRIE_LIST_NEW( state );
1635                         }
1636                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1637                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1638                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1639                                 break;
1640                             }
1641                         }
1642                         if ( ! newstate ) {
1643                             newstate = next_alloc++;
1644                             prev_states[newstate] = state;
1645                             TRIE_LIST_PUSH( state, charid, newstate );
1646                             transcount++;
1647                         }
1648                         state = newstate;
1649                     } else {
1650                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1651                     }
1652                 }
1653             }
1654             TRIE_HANDLE_WORD(state);
1655
1656         } /* end second pass */
1657
1658         /* next alloc is the NEXT state to be allocated */
1659         trie->statecount = next_alloc;
1660         trie->states = (reg_trie_state *)
1661             PerlMemShared_realloc( trie->states,
1662                                    next_alloc
1663                                    * sizeof(reg_trie_state) );
1664
1665         /* and now dump it out before we compress it */
1666         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1667                                                          revcharmap, next_alloc,
1668                                                          depth+1)
1669         );
1670
1671         trie->trans = (reg_trie_trans *)
1672             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1673         {
1674             U32 state;
1675             U32 tp = 0;
1676             U32 zp = 0;
1677
1678
1679             for( state=1 ; state < next_alloc ; state ++ ) {
1680                 U32 base=0;
1681
1682                 /*
1683                 DEBUG_TRIE_COMPILE_MORE_r(
1684                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1685                 );
1686                 */
1687
1688                 if (trie->states[state].trans.list) {
1689                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1690                     U16 maxid=minid;
1691                     U16 idx;
1692
1693                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1694                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1695                         if ( forid < minid ) {
1696                             minid=forid;
1697                         } else if ( forid > maxid ) {
1698                             maxid=forid;
1699                         }
1700                     }
1701                     if ( transcount < tp + maxid - minid + 1) {
1702                         transcount *= 2;
1703                         trie->trans = (reg_trie_trans *)
1704                             PerlMemShared_realloc( trie->trans,
1705                                                      transcount
1706                                                      * sizeof(reg_trie_trans) );
1707                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1708                     }
1709                     base = trie->uniquecharcount + tp - minid;
1710                     if ( maxid == minid ) {
1711                         U32 set = 0;
1712                         for ( ; zp < tp ; zp++ ) {
1713                             if ( ! trie->trans[ zp ].next ) {
1714                                 base = trie->uniquecharcount + zp - minid;
1715                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1716                                 trie->trans[ zp ].check = state;
1717                                 set = 1;
1718                                 break;
1719                             }
1720                         }
1721                         if ( !set ) {
1722                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1723                             trie->trans[ tp ].check = state;
1724                             tp++;
1725                             zp = tp;
1726                         }
1727                     } else {
1728                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1729                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1730                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1731                             trie->trans[ tid ].check = state;
1732                         }
1733                         tp += ( maxid - minid + 1 );
1734                     }
1735                     Safefree(trie->states[ state ].trans.list);
1736                 }
1737                 /*
1738                 DEBUG_TRIE_COMPILE_MORE_r(
1739                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1740                 );
1741                 */
1742                 trie->states[ state ].trans.base=base;
1743             }
1744             trie->lasttrans = tp + 1;
1745         }
1746     } else {
1747         /*
1748            Second Pass -- Flat Table Representation.
1749
1750            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1751            We know that we will need Charcount+1 trans at most to store the data
1752            (one row per char at worst case) So we preallocate both structures
1753            assuming worst case.
1754
1755            We then construct the trie using only the .next slots of the entry
1756            structs.
1757
1758            We use the .check field of the first entry of the node temporarily to
1759            make compression both faster and easier by keeping track of how many non
1760            zero fields are in the node.
1761
1762            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1763            transition.
1764
1765            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1766            number representing the first entry of the node, and state as a
1767            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1768            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1769            are 2 entrys per node. eg:
1770
1771              A B       A B
1772           1. 2 4    1. 3 7
1773           2. 0 3    3. 0 5
1774           3. 0 0    5. 0 0
1775           4. 0 0    7. 0 0
1776
1777            The table is internally in the right hand, idx form. However as we also
1778            have to deal with the states array which is indexed by nodenum we have to
1779            use TRIE_NODENUM() to convert.
1780
1781         */
1782         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1783             "%*sCompiling trie using table compiler\n",
1784             (int)depth * 2 + 2, ""));
1785
1786         trie->trans = (reg_trie_trans *)
1787             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1788                                   * trie->uniquecharcount + 1,
1789                                   sizeof(reg_trie_trans) );
1790         trie->states = (reg_trie_state *)
1791             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1792                                   sizeof(reg_trie_state) );
1793         next_alloc = trie->uniquecharcount + 1;
1794
1795
1796         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1797
1798             regnode * const noper   = NEXTOPER( cur );
1799             const U8 *uc     = (U8*)STRING( noper );
1800             const U8 * const e = uc + STR_LEN( noper );
1801
1802             U32 state        = 1;         /* required init */
1803
1804             U16 charid       = 0;         /* sanity init */
1805             U32 accept_state = 0;         /* sanity init */
1806             U8 *scan         = (U8*)NULL; /* sanity init */
1807
1808             STRLEN foldlen   = 0;         /* required init */
1809             U32 wordlen      = 0;         /* required init */
1810             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1811
1812             if ( OP(noper) != NOTHING ) {
1813                 for ( ; uc < e ; uc += len ) {
1814
1815                     TRIE_READ_CHAR;
1816
1817                     if ( uvc < 256 ) {
1818                         charid = trie->charmap[ uvc ];
1819                     } else {
1820                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1821                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1822                     }
1823                     if ( charid ) {
1824                         charid--;
1825                         if ( !trie->trans[ state + charid ].next ) {
1826                             trie->trans[ state + charid ].next = next_alloc;
1827                             trie->trans[ state ].check++;
1828                             prev_states[TRIE_NODENUM(next_alloc)]
1829                                     = TRIE_NODENUM(state);
1830                             next_alloc += trie->uniquecharcount;
1831                         }
1832                         state = trie->trans[ state + charid ].next;
1833                     } else {
1834                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1835                     }
1836                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1837                 }
1838             }
1839             accept_state = TRIE_NODENUM( state );
1840             TRIE_HANDLE_WORD(accept_state);
1841
1842         } /* end second pass */
1843
1844         /* and now dump it out before we compress it */
1845         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1846                                                           revcharmap,
1847                                                           next_alloc, depth+1));
1848
1849         {
1850         /*
1851            * Inplace compress the table.*
1852
1853            For sparse data sets the table constructed by the trie algorithm will
1854            be mostly 0/FAIL transitions or to put it another way mostly empty.
1855            (Note that leaf nodes will not contain any transitions.)
1856
1857            This algorithm compresses the tables by eliminating most such
1858            transitions, at the cost of a modest bit of extra work during lookup:
1859
1860            - Each states[] entry contains a .base field which indicates the
1861            index in the state[] array wheres its transition data is stored.
1862
1863            - If .base is 0 there are no valid transitions from that node.
1864
1865            - If .base is nonzero then charid is added to it to find an entry in
1866            the trans array.
1867
1868            -If trans[states[state].base+charid].check!=state then the
1869            transition is taken to be a 0/Fail transition. Thus if there are fail
1870            transitions at the front of the node then the .base offset will point
1871            somewhere inside the previous nodes data (or maybe even into a node
1872            even earlier), but the .check field determines if the transition is
1873            valid.
1874
1875            XXX - wrong maybe?
1876            The following process inplace converts the table to the compressed
1877            table: We first do not compress the root node 1,and mark all its
1878            .check pointers as 1 and set its .base pointer as 1 as well. This
1879            allows us to do a DFA construction from the compressed table later,
1880            and ensures that any .base pointers we calculate later are greater
1881            than 0.
1882
1883            - We set 'pos' to indicate the first entry of the second node.
1884
1885            - We then iterate over the columns of the node, finding the first and
1886            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1887            and set the .check pointers accordingly, and advance pos
1888            appropriately and repreat for the next node. Note that when we copy
1889            the next pointers we have to convert them from the original
1890            NODEIDX form to NODENUM form as the former is not valid post
1891            compression.
1892
1893            - If a node has no transitions used we mark its base as 0 and do not
1894            advance the pos pointer.
1895
1896            - If a node only has one transition we use a second pointer into the
1897            structure to fill in allocated fail transitions from other states.
1898            This pointer is independent of the main pointer and scans forward
1899            looking for null transitions that are allocated to a state. When it
1900            finds one it writes the single transition into the "hole".  If the
1901            pointer doesnt find one the single transition is appended as normal.
1902
1903            - Once compressed we can Renew/realloc the structures to release the
1904            excess space.
1905
1906            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1907            specifically Fig 3.47 and the associated pseudocode.
1908
1909            demq
1910         */
1911         const U32 laststate = TRIE_NODENUM( next_alloc );
1912         U32 state, charid;
1913         U32 pos = 0, zp=0;
1914         trie->statecount = laststate;
1915
1916         for ( state = 1 ; state < laststate ; state++ ) {
1917             U8 flag = 0;
1918             const U32 stateidx = TRIE_NODEIDX( state );
1919             const U32 o_used = trie->trans[ stateidx ].check;
1920             U32 used = trie->trans[ stateidx ].check;
1921             trie->trans[ stateidx ].check = 0;
1922
1923             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1924                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1925                     if ( trie->trans[ stateidx + charid ].next ) {
1926                         if (o_used == 1) {
1927                             for ( ; zp < pos ; zp++ ) {
1928                                 if ( ! trie->trans[ zp ].next ) {
1929                                     break;
1930                                 }
1931                             }
1932                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1933                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1934                             trie->trans[ zp ].check = state;
1935                             if ( ++zp > pos ) pos = zp;
1936                             break;
1937                         }
1938                         used--;
1939                     }
1940                     if ( !flag ) {
1941                         flag = 1;
1942                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1943                     }
1944                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1945                     trie->trans[ pos ].check = state;
1946                     pos++;
1947                 }
1948             }
1949         }
1950         trie->lasttrans = pos + 1;
1951         trie->states = (reg_trie_state *)
1952             PerlMemShared_realloc( trie->states, laststate
1953                                    * sizeof(reg_trie_state) );
1954         DEBUG_TRIE_COMPILE_MORE_r(
1955                 PerlIO_printf( Perl_debug_log,
1956                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1957                     (int)depth * 2 + 2,"",
1958                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1959                     (IV)next_alloc,
1960                     (IV)pos,
1961                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1962             );
1963
1964         } /* end table compress */
1965     }
1966     DEBUG_TRIE_COMPILE_MORE_r(
1967             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1968                 (int)depth * 2 + 2, "",
1969                 (UV)trie->statecount,
1970                 (UV)trie->lasttrans)
1971     );
1972     /* resize the trans array to remove unused space */
1973     trie->trans = (reg_trie_trans *)
1974         PerlMemShared_realloc( trie->trans, trie->lasttrans
1975                                * sizeof(reg_trie_trans) );
1976
1977     {   /* Modify the program and insert the new TRIE node */
1978         U8 nodetype =(U8)(flags & 0xFF);
1979         char *str=NULL;
1980
1981 #ifdef DEBUGGING
1982         regnode *optimize = NULL;
1983 #ifdef RE_TRACK_PATTERN_OFFSETS
1984
1985         U32 mjd_offset = 0;
1986         U32 mjd_nodelen = 0;
1987 #endif /* RE_TRACK_PATTERN_OFFSETS */
1988 #endif /* DEBUGGING */
1989         /*
1990            This means we convert either the first branch or the first Exact,
1991            depending on whether the thing following (in 'last') is a branch
1992            or not and whther first is the startbranch (ie is it a sub part of
1993            the alternation or is it the whole thing.)
1994            Assuming its a sub part we convert the EXACT otherwise we convert
1995            the whole branch sequence, including the first.
1996          */
1997         /* Find the node we are going to overwrite */
1998         if ( first != startbranch || OP( last ) == BRANCH ) {
1999             /* branch sub-chain */
2000             NEXT_OFF( first ) = (U16)(last - first);
2001 #ifdef RE_TRACK_PATTERN_OFFSETS
2002             DEBUG_r({
2003                 mjd_offset= Node_Offset((convert));
2004                 mjd_nodelen= Node_Length((convert));
2005             });
2006 #endif
2007             /* whole branch chain */
2008         }
2009 #ifdef RE_TRACK_PATTERN_OFFSETS
2010         else {
2011             DEBUG_r({
2012                 const  regnode *nop = NEXTOPER( convert );
2013                 mjd_offset= Node_Offset((nop));
2014                 mjd_nodelen= Node_Length((nop));
2015             });
2016         }
2017         DEBUG_OPTIMISE_r(
2018             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2019                 (int)depth * 2 + 2, "",
2020                 (UV)mjd_offset, (UV)mjd_nodelen)
2021         );
2022 #endif
2023         /* But first we check to see if there is a common prefix we can
2024            split out as an EXACT and put in front of the TRIE node.  */
2025         trie->startstate= 1;
2026         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2027             U32 state;
2028             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2029                 U32 ofs = 0;
2030                 I32 idx = -1;
2031                 U32 count = 0;
2032                 const U32 base = trie->states[ state ].trans.base;
2033
2034                 if ( trie->states[state].wordnum )
2035                         count = 1;
2036
2037                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2038                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2039                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2040                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2041                     {
2042                         if ( ++count > 1 ) {
2043                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2044                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2045                             if ( state == 1 ) break;
2046                             if ( count == 2 ) {
2047                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2048                                 DEBUG_OPTIMISE_r(
2049                                     PerlIO_printf(Perl_debug_log,
2050                                         "%*sNew Start State=%"UVuf" Class: [",
2051                                         (int)depth * 2 + 2, "",
2052                                         (UV)state));
2053                                 if (idx >= 0) {
2054                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2055                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2056
2057                                     TRIE_BITMAP_SET(trie,*ch);
2058                                     if ( folder )
2059                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2060                                     DEBUG_OPTIMISE_r(
2061                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2062                                     );
2063                                 }
2064                             }
2065                             TRIE_BITMAP_SET(trie,*ch);
2066                             if ( folder )
2067                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2068                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2069                         }
2070                         idx = ofs;
2071                     }
2072                 }
2073                 if ( count == 1 ) {
2074                     SV **tmp = av_fetch( revcharmap, idx, 0);
2075                     STRLEN len;
2076                     char *ch = SvPV( *tmp, len );
2077                     DEBUG_OPTIMISE_r({
2078                         SV *sv=sv_newmortal();
2079                         PerlIO_printf( Perl_debug_log,
2080                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2081                             (int)depth * 2 + 2, "",
2082                             (UV)state, (UV)idx,
2083                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2084                                 PL_colors[0], PL_colors[1],
2085                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2086                                 PERL_PV_ESCAPE_FIRSTCHAR
2087                             )
2088                         );
2089                     });
2090                     if ( state==1 ) {
2091                         OP( convert ) = nodetype;
2092                         str=STRING(convert);
2093                         STR_LEN(convert)=0;
2094                     }
2095                     STR_LEN(convert) += len;
2096                     while (len--)
2097                         *str++ = *ch++;
2098                 } else {
2099 #ifdef DEBUGGING
2100                     if (state>1)
2101                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2102 #endif
2103                     break;
2104                 }
2105             }
2106             trie->prefixlen = (state-1);
2107             if (str) {
2108                 regnode *n = convert+NODE_SZ_STR(convert);
2109                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2110                 trie->startstate = state;
2111                 trie->minlen -= (state - 1);
2112                 trie->maxlen -= (state - 1);
2113 #ifdef DEBUGGING
2114                /* At least the UNICOS C compiler choked on this
2115                 * being argument to DEBUG_r(), so let's just have
2116                 * it right here. */
2117                if (
2118 #ifdef PERL_EXT_RE_BUILD
2119                    1
2120 #else
2121                    DEBUG_r_TEST
2122 #endif
2123                    ) {
2124                    regnode *fix = convert;
2125                    U32 word = trie->wordcount;
2126                    mjd_nodelen++;
2127                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2128                    while( ++fix < n ) {
2129                        Set_Node_Offset_Length(fix, 0, 0);
2130                    }
2131                    while (word--) {
2132                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2133                        if (tmp) {
2134                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2135                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2136                            else
2137                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2138                        }
2139                    }
2140                }
2141 #endif
2142                 if (trie->maxlen) {
2143                     convert = n;
2144                 } else {
2145                     NEXT_OFF(convert) = (U16)(tail - convert);
2146                     DEBUG_r(optimize= n);
2147                 }
2148             }
2149         }
2150         if (!jumper)
2151             jumper = last;
2152         if ( trie->maxlen ) {
2153             NEXT_OFF( convert ) = (U16)(tail - convert);
2154             ARG_SET( convert, data_slot );
2155             /* Store the offset to the first unabsorbed branch in
2156                jump[0], which is otherwise unused by the jump logic.
2157                We use this when dumping a trie and during optimisation. */
2158             if (trie->jump)
2159                 trie->jump[0] = (U16)(nextbranch - convert);
2160
2161             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2162              *   and there is a bitmap
2163              *   and the first "jump target" node we found leaves enough room
2164              * then convert the TRIE node into a TRIEC node, with the bitmap
2165              * embedded inline in the opcode - this is hypothetically faster.
2166              */
2167             if ( !trie->states[trie->startstate].wordnum
2168                  && trie->bitmap
2169                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2170             {
2171                 OP( convert ) = TRIEC;
2172                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2173                 PerlMemShared_free(trie->bitmap);
2174                 trie->bitmap= NULL;
2175             } else
2176                 OP( convert ) = TRIE;
2177
2178             /* store the type in the flags */
2179             convert->flags = nodetype;
2180             DEBUG_r({
2181             optimize = convert
2182                       + NODE_STEP_REGNODE
2183                       + regarglen[ OP( convert ) ];
2184             });
2185             /* XXX We really should free up the resource in trie now,
2186                    as we won't use them - (which resources?) dmq */
2187         }
2188         /* needed for dumping*/
2189         DEBUG_r(if (optimize) {
2190             regnode *opt = convert;
2191
2192             while ( ++opt < optimize) {
2193                 Set_Node_Offset_Length(opt,0,0);
2194             }
2195             /*
2196                 Try to clean up some of the debris left after the
2197                 optimisation.
2198              */
2199             while( optimize < jumper ) {
2200                 mjd_nodelen += Node_Length((optimize));
2201                 OP( optimize ) = OPTIMIZED;
2202                 Set_Node_Offset_Length(optimize,0,0);
2203                 optimize++;
2204             }
2205             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2206         });
2207     } /* end node insert */
2208
2209     /*  Finish populating the prev field of the wordinfo array.  Walk back
2210      *  from each accept state until we find another accept state, and if
2211      *  so, point the first word's .prev field at the second word. If the
2212      *  second already has a .prev field set, stop now. This will be the
2213      *  case either if we've already processed that word's accept state,
2214      *  or that state had multiple words, and the overspill words were
2215      *  already linked up earlier.
2216      */
2217     {
2218         U16 word;
2219         U32 state;
2220         U16 prev;
2221
2222         for (word=1; word <= trie->wordcount; word++) {
2223             prev = 0;
2224             if (trie->wordinfo[word].prev)
2225                 continue;
2226             state = trie->wordinfo[word].accept;
2227             while (state) {
2228                 state = prev_states[state];
2229                 if (!state)
2230                     break;
2231                 prev = trie->states[state].wordnum;
2232                 if (prev)
2233                     break;
2234             }
2235             trie->wordinfo[word].prev = prev;
2236         }
2237         Safefree(prev_states);
2238     }
2239
2240
2241     /* and now dump out the compressed format */
2242     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2243
2244     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2245 #ifdef DEBUGGING
2246     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2247     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2248 #else
2249     SvREFCNT_dec(revcharmap);
2250 #endif
2251     return trie->jump
2252            ? MADE_JUMP_TRIE
2253            : trie->startstate>1
2254              ? MADE_EXACT_TRIE
2255              : MADE_TRIE;
2256 }
2257
2258 STATIC void
2259 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2260 {
2261 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2262
2263    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2264    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2265    ISBN 0-201-10088-6
2266
2267    We find the fail state for each state in the trie, this state is the longest proper
2268    suffix of the current state's 'word' that is also a proper prefix of another word in our
2269    trie. State 1 represents the word '' and is thus the default fail state. This allows
2270    the DFA not to have to restart after its tried and failed a word at a given point, it
2271    simply continues as though it had been matching the other word in the first place.
2272    Consider
2273       'abcdgu'=~/abcdefg|cdgu/
2274    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2275    fail, which would bring us to the state representing 'd' in the second word where we would
2276    try 'g' and succeed, proceeding to match 'cdgu'.
2277  */
2278  /* add a fail transition */
2279     const U32 trie_offset = ARG(source);
2280     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2281     U32 *q;
2282     const U32 ucharcount = trie->uniquecharcount;
2283     const U32 numstates = trie->statecount;
2284     const U32 ubound = trie->lasttrans + ucharcount;
2285     U32 q_read = 0;
2286     U32 q_write = 0;
2287     U32 charid;
2288     U32 base = trie->states[ 1 ].trans.base;
2289     U32 *fail;
2290     reg_ac_data *aho;
2291     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2292     GET_RE_DEBUG_FLAGS_DECL;
2293
2294     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2295 #ifndef DEBUGGING
2296     PERL_UNUSED_ARG(depth);
2297 #endif
2298
2299
2300     ARG_SET( stclass, data_slot );
2301     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2302     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2303     aho->trie=trie_offset;
2304     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2305     Copy( trie->states, aho->states, numstates, reg_trie_state );
2306     Newxz( q, numstates, U32);
2307     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2308     aho->refcount = 1;
2309     fail = aho->fail;
2310     /* initialize fail[0..1] to be 1 so that we always have
2311        a valid final fail state */
2312     fail[ 0 ] = fail[ 1 ] = 1;
2313
2314     for ( charid = 0; charid < ucharcount ; charid++ ) {
2315         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2316         if ( newstate ) {
2317             q[ q_write ] = newstate;
2318             /* set to point at the root */
2319             fail[ q[ q_write++ ] ]=1;
2320         }
2321     }
2322     while ( q_read < q_write) {
2323         const U32 cur = q[ q_read++ % numstates ];
2324         base = trie->states[ cur ].trans.base;
2325
2326         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2327             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2328             if (ch_state) {
2329                 U32 fail_state = cur;
2330                 U32 fail_base;
2331                 do {
2332                     fail_state = fail[ fail_state ];
2333                     fail_base = aho->states[ fail_state ].trans.base;
2334                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2335
2336                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2337                 fail[ ch_state ] = fail_state;
2338                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2339                 {
2340                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2341                 }
2342                 q[ q_write++ % numstates] = ch_state;
2343             }
2344         }
2345     }
2346     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2347        when we fail in state 1, this allows us to use the
2348        charclass scan to find a valid start char. This is based on the principle
2349        that theres a good chance the string being searched contains lots of stuff
2350        that cant be a start char.
2351      */
2352     fail[ 0 ] = fail[ 1 ] = 0;
2353     DEBUG_TRIE_COMPILE_r({
2354         PerlIO_printf(Perl_debug_log,
2355                       "%*sStclass Failtable (%"UVuf" states): 0",
2356                       (int)(depth * 2), "", (UV)numstates
2357         );
2358         for( q_read=1; q_read<numstates; q_read++ ) {
2359             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2360         }
2361         PerlIO_printf(Perl_debug_log, "\n");
2362     });
2363     Safefree(q);
2364     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2365 }
2366
2367
2368 /*
2369  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2370  * These need to be revisited when a newer toolchain becomes available.
2371  */
2372 #if defined(__sparc64__) && defined(__GNUC__)
2373 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2374 #       undef  SPARC64_GCC_WORKAROUND
2375 #       define SPARC64_GCC_WORKAROUND 1
2376 #   endif
2377 #endif
2378
2379 #define DEBUG_PEEP(str,scan,depth) \
2380     DEBUG_OPTIMISE_r({if (scan){ \
2381        SV * const mysv=sv_newmortal(); \
2382        regnode *Next = regnext(scan); \
2383        regprop(RExC_rx, mysv, scan); \
2384        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2385        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2386        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2387    }});
2388
2389
2390
2391
2392
2393 #define JOIN_EXACT(scan,min,flags) \
2394     if (PL_regkind[OP(scan)] == EXACT) \
2395         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2396
2397 STATIC U32
2398 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2399     /* Merge several consecutive EXACTish nodes into one. */
2400     regnode *n = regnext(scan);
2401     U32 stringok = 1;
2402     regnode *next = scan + NODE_SZ_STR(scan);
2403     U32 merged = 0;
2404     U32 stopnow = 0;
2405 #ifdef DEBUGGING
2406     regnode *stop = scan;
2407     GET_RE_DEBUG_FLAGS_DECL;
2408 #else
2409     PERL_UNUSED_ARG(depth);
2410 #endif
2411
2412     PERL_ARGS_ASSERT_JOIN_EXACT;
2413 #ifndef EXPERIMENTAL_INPLACESCAN
2414     PERL_UNUSED_ARG(flags);
2415     PERL_UNUSED_ARG(val);
2416 #endif
2417     DEBUG_PEEP("join",scan,depth);
2418
2419     /* Skip NOTHING, merge EXACT*. */
2420     while (n &&
2421            ( PL_regkind[OP(n)] == NOTHING ||
2422              (stringok && (OP(n) == OP(scan))))
2423            && NEXT_OFF(n)
2424            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2425
2426         if (OP(n) == TAIL || n > next)
2427             stringok = 0;
2428         if (PL_regkind[OP(n)] == NOTHING) {
2429             DEBUG_PEEP("skip:",n,depth);
2430             NEXT_OFF(scan) += NEXT_OFF(n);
2431             next = n + NODE_STEP_REGNODE;
2432 #ifdef DEBUGGING
2433             if (stringok)
2434                 stop = n;
2435 #endif
2436             n = regnext(n);
2437         }
2438         else if (stringok) {
2439             const unsigned int oldl = STR_LEN(scan);
2440             regnode * const nnext = regnext(n);
2441
2442             DEBUG_PEEP("merg",n,depth);
2443
2444             merged++;
2445             if (oldl + STR_LEN(n) > U8_MAX)
2446                 break;
2447             NEXT_OFF(scan) += NEXT_OFF(n);
2448             STR_LEN(scan) += STR_LEN(n);
2449             next = n + NODE_SZ_STR(n);
2450             /* Now we can overwrite *n : */
2451             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2452 #ifdef DEBUGGING
2453             stop = next - 1;
2454 #endif
2455             n = nnext;
2456             if (stopnow) break;
2457         }
2458
2459 #ifdef EXPERIMENTAL_INPLACESCAN
2460         if (flags && !NEXT_OFF(n)) {
2461             DEBUG_PEEP("atch", val, depth);
2462             if (reg_off_by_arg[OP(n)]) {
2463                 ARG_SET(n, val - n);
2464             }
2465             else {
2466                 NEXT_OFF(n) = val - n;
2467             }
2468             stopnow = 1;
2469         }
2470 #endif
2471     }
2472 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2473 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2474 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2475 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2476
2477     if (UTF
2478         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2479         && ( STR_LEN(scan) >= 6 ) )
2480     {
2481     /*
2482     Two problematic code points in Unicode casefolding of EXACT nodes:
2483
2484     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2485     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2486
2487     which casefold to
2488
2489     Unicode                      UTF-8
2490
2491     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2492     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2493
2494     This means that in case-insensitive matching (or "loose matching",
2495     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2496     length of the above casefolded versions) can match a target string
2497     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2498     This would rather mess up the minimum length computation.
2499
2500     What we'll do is to look for the tail four bytes, and then peek
2501     at the preceding two bytes to see whether we need to decrease
2502     the minimum length by four (six minus two).
2503
2504     Thanks to the design of UTF-8, there cannot be false matches:
2505     A sequence of valid UTF-8 bytes cannot be a subsequence of
2506     another valid sequence of UTF-8 bytes.
2507
2508     */
2509          char * const s0 = STRING(scan), *s, *t;
2510          char * const s1 = s0 + STR_LEN(scan) - 1;
2511          char * const s2 = s1 - 4;
2512 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2513          const char t0[] = "\xaf\x49\xaf\x42";
2514 #else
2515          const char t0[] = "\xcc\x88\xcc\x81";
2516 #endif
2517          const char * const t1 = t0 + 3;
2518
2519          for (s = s0 + 2;
2520               s < s2 && (t = ninstr(s, s1, t0, t1));
2521               s = t + 4) {
2522 #ifdef EBCDIC
2523               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2524                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2525 #else
2526               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2527                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2528 #endif
2529                    *min -= 4;
2530          }
2531     }
2532
2533 #ifdef DEBUGGING
2534     /* Allow dumping */
2535     n = scan + NODE_SZ_STR(scan);
2536     while (n <= stop) {
2537         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2538             OP(n) = OPTIMIZED;
2539             NEXT_OFF(n) = 0;
2540         }
2541         n++;
2542     }
2543 #endif
2544     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2545     return stopnow;
2546 }
2547
2548 /* REx optimizer.  Converts nodes into quicker variants "in place".
2549    Finds fixed substrings.  */
2550
2551 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2552    to the position after last scanned or to NULL. */
2553
2554 #define INIT_AND_WITHP \
2555     assert(!and_withp); \
2556     Newx(and_withp,1,struct regnode_charclass_class); \
2557     SAVEFREEPV(and_withp)
2558
2559 /* this is a chain of data about sub patterns we are processing that
2560    need to be handled separately/specially in study_chunk. Its so
2561    we can simulate recursion without losing state.  */
2562 struct scan_frame;
2563 typedef struct scan_frame {
2564     regnode *last;  /* last node to process in this frame */
2565     regnode *next;  /* next node to process when last is reached */
2566     struct scan_frame *prev; /*previous frame*/
2567     I32 stop; /* what stopparen do we use */
2568 } scan_frame;
2569
2570
2571 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2572
2573 #define CASE_SYNST_FNC(nAmE)                                       \
2574 case nAmE:                                                         \
2575     if (flags & SCF_DO_STCLASS_AND) {                              \
2576             for (value = 0; value < 256; value++)                  \
2577                 if (!is_ ## nAmE ## _cp(value))                       \
2578                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2579     }                                                              \
2580     else {                                                         \
2581             for (value = 0; value < 256; value++)                  \
2582                 if (is_ ## nAmE ## _cp(value))                        \
2583                     ANYOF_BITMAP_SET(data->start_class, value);    \
2584     }                                                              \
2585     break;                                                         \
2586 case N ## nAmE:                                                    \
2587     if (flags & SCF_DO_STCLASS_AND) {                              \
2588             for (value = 0; value < 256; value++)                   \
2589                 if (is_ ## nAmE ## _cp(value))                         \
2590                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2591     }                                                               \
2592     else {                                                          \
2593             for (value = 0; value < 256; value++)                   \
2594                 if (!is_ ## nAmE ## _cp(value))                        \
2595                     ANYOF_BITMAP_SET(data->start_class, value);     \
2596     }                                                               \
2597     break
2598
2599
2600
2601 STATIC I32
2602 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2603                         I32 *minlenp, I32 *deltap,
2604                         regnode *last,
2605                         scan_data_t *data,
2606                         I32 stopparen,
2607                         U8* recursed,
2608                         struct regnode_charclass_class *and_withp,
2609                         U32 flags, U32 depth)
2610                         /* scanp: Start here (read-write). */
2611                         /* deltap: Write maxlen-minlen here. */
2612                         /* last: Stop before this one. */
2613                         /* data: string data about the pattern */
2614                         /* stopparen: treat close N as END */
2615                         /* recursed: which subroutines have we recursed into */
2616                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2617 {
2618     dVAR;
2619     I32 min = 0, pars = 0, code;
2620     regnode *scan = *scanp, *next;
2621     I32 delta = 0;
2622     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2623     int is_inf_internal = 0;            /* The studied chunk is infinite */
2624     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2625     scan_data_t data_fake;
2626     SV *re_trie_maxbuff = NULL;
2627     regnode *first_non_open = scan;
2628     I32 stopmin = I32_MAX;
2629     scan_frame *frame = NULL;
2630     GET_RE_DEBUG_FLAGS_DECL;
2631
2632     PERL_ARGS_ASSERT_STUDY_CHUNK;
2633
2634 #ifdef DEBUGGING
2635     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2636 #endif
2637
2638     if ( depth == 0 ) {
2639         while (first_non_open && OP(first_non_open) == OPEN)
2640             first_non_open=regnext(first_non_open);
2641     }
2642
2643
2644   fake_study_recurse:
2645     while ( scan && OP(scan) != END && scan < last ){
2646         /* Peephole optimizer: */
2647         DEBUG_STUDYDATA("Peep:", data,depth);
2648         DEBUG_PEEP("Peep",scan,depth);
2649         JOIN_EXACT(scan,&min,0);
2650
2651         /* Follow the next-chain of the current node and optimize
2652            away all the NOTHINGs from it.  */
2653         if (OP(scan) != CURLYX) {
2654             const int max = (reg_off_by_arg[OP(scan)]
2655                        ? I32_MAX
2656                        /* I32 may be smaller than U16 on CRAYs! */
2657                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2658             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2659             int noff;
2660             regnode *n = scan;
2661
2662             /* Skip NOTHING and LONGJMP. */
2663             while ((n = regnext(n))
2664                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2665                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2666                    && off + noff < max)
2667                 off += noff;
2668             if (reg_off_by_arg[OP(scan)])
2669                 ARG(scan) = off;
2670             else
2671                 NEXT_OFF(scan) = off;
2672         }
2673
2674
2675
2676         /* The principal pseudo-switch.  Cannot be a switch, since we
2677            look into several different things.  */
2678         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2679                    || OP(scan) == IFTHEN) {
2680             next = regnext(scan);
2681             code = OP(scan);
2682             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2683
2684             if (OP(next) == code || code == IFTHEN) {
2685                 /* NOTE - There is similar code to this block below for handling
2686                    TRIE nodes on a re-study.  If you change stuff here check there
2687                    too. */
2688                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2689                 struct regnode_charclass_class accum;
2690                 regnode * const startbranch=scan;
2691
2692                 if (flags & SCF_DO_SUBSTR)
2693                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2694                 if (flags & SCF_DO_STCLASS)
2695                     cl_init_zero(pRExC_state, &accum);
2696
2697                 while (OP(scan) == code) {
2698                     I32 deltanext, minnext, f = 0, fake;
2699                     struct regnode_charclass_class this_class;
2700
2701                     num++;
2702                     data_fake.flags = 0;
2703                     if (data) {
2704                         data_fake.whilem_c = data->whilem_c;
2705                         data_fake.last_closep = data->last_closep;
2706                     }
2707                     else
2708                         data_fake.last_closep = &fake;
2709
2710                     data_fake.pos_delta = delta;
2711                     next = regnext(scan);
2712                     scan = NEXTOPER(scan);
2713                     if (code != BRANCH)
2714                         scan = NEXTOPER(scan);
2715                     if (flags & SCF_DO_STCLASS) {
2716                         cl_init(pRExC_state, &this_class);
2717                         data_fake.start_class = &this_class;
2718                         f = SCF_DO_STCLASS_AND;
2719                     }
2720                     if (flags & SCF_WHILEM_VISITED_POS)
2721                         f |= SCF_WHILEM_VISITED_POS;
2722
2723                     /* we suppose the run is continuous, last=next...*/
2724                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2725                                           next, &data_fake,
2726                                           stopparen, recursed, NULL, f,depth+1);
2727                     if (min1 > minnext)
2728                         min1 = minnext;
2729                     if (max1 < minnext + deltanext)
2730                         max1 = minnext + deltanext;
2731                     if (deltanext == I32_MAX)
2732                         is_inf = is_inf_internal = 1;
2733                     scan = next;
2734                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2735                         pars++;
2736                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2737                         if ( stopmin > minnext)
2738                             stopmin = min + min1;
2739                         flags &= ~SCF_DO_SUBSTR;
2740                         if (data)
2741                             data->flags |= SCF_SEEN_ACCEPT;
2742                     }
2743                     if (data) {
2744                         if (data_fake.flags & SF_HAS_EVAL)
2745                             data->flags |= SF_HAS_EVAL;
2746                         data->whilem_c = data_fake.whilem_c;
2747                     }
2748                     if (flags & SCF_DO_STCLASS)
2749                         cl_or(pRExC_state, &accum, &this_class);
2750                 }
2751                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2752                     min1 = 0;
2753                 if (flags & SCF_DO_SUBSTR) {
2754                     data->pos_min += min1;
2755                     data->pos_delta += max1 - min1;
2756                     if (max1 != min1 || is_inf)
2757                         data->longest = &(data->longest_float);
2758                 }
2759                 min += min1;
2760                 delta += max1 - min1;
2761                 if (flags & SCF_DO_STCLASS_OR) {
2762                     cl_or(pRExC_state, data->start_class, &accum);
2763                     if (min1) {
2764                         cl_and(data->start_class, and_withp);
2765                         flags &= ~SCF_DO_STCLASS;
2766                     }
2767                 }
2768                 else if (flags & SCF_DO_STCLASS_AND) {
2769                     if (min1) {
2770                         cl_and(data->start_class, &accum);
2771                         flags &= ~SCF_DO_STCLASS;
2772                     }
2773                     else {
2774                         /* Switch to OR mode: cache the old value of
2775                          * data->start_class */
2776                         INIT_AND_WITHP;
2777                         StructCopy(data->start_class, and_withp,
2778                                    struct regnode_charclass_class);
2779                         flags &= ~SCF_DO_STCLASS_AND;
2780                         StructCopy(&accum, data->start_class,
2781                                    struct regnode_charclass_class);
2782                         flags |= SCF_DO_STCLASS_OR;
2783                         data->start_class->flags |= ANYOF_EOS;
2784                     }
2785                 }
2786
2787                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2788                 /* demq.
2789
2790                    Assuming this was/is a branch we are dealing with: 'scan' now
2791                    points at the item that follows the branch sequence, whatever
2792                    it is. We now start at the beginning of the sequence and look
2793                    for subsequences of
2794
2795                    BRANCH->EXACT=>x1
2796                    BRANCH->EXACT=>x2
2797                    tail
2798
2799                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2800
2801                    If we can find such a subsequence we need to turn the first
2802                    element into a trie and then add the subsequent branch exact
2803                    strings to the trie.
2804
2805                    We have two cases
2806
2807                      1. patterns where the whole set of branches can be converted.
2808
2809                      2. patterns where only a subset can be converted.
2810
2811                    In case 1 we can replace the whole set with a single regop
2812                    for the trie. In case 2 we need to keep the start and end
2813                    branches so
2814
2815                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2816                      becomes BRANCH TRIE; BRANCH X;
2817
2818                   There is an additional case, that being where there is a
2819                   common prefix, which gets split out into an EXACT like node
2820                   preceding the TRIE node.
2821
2822                   If x(1..n)==tail then we can do a simple trie, if not we make
2823                   a "jump" trie, such that when we match the appropriate word
2824                   we "jump" to the appropriate tail node. Essentially we turn
2825                   a nested if into a case structure of sorts.
2826
2827                 */
2828
2829                     int made=0;
2830                     if (!re_trie_maxbuff) {
2831                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2832                         if (!SvIOK(re_trie_maxbuff))
2833                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2834                     }
2835                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2836                         regnode *cur;
2837                         regnode *first = (regnode *)NULL;
2838                         regnode *last = (regnode *)NULL;
2839                         regnode *tail = scan;
2840                         U8 optype = 0;
2841                         U32 count=0;
2842
2843 #ifdef DEBUGGING
2844                         SV * const mysv = sv_newmortal();       /* for dumping */
2845 #endif
2846                         /* var tail is used because there may be a TAIL
2847                            regop in the way. Ie, the exacts will point to the
2848                            thing following the TAIL, but the last branch will
2849                            point at the TAIL. So we advance tail. If we
2850                            have nested (?:) we may have to move through several
2851                            tails.
2852                          */
2853
2854                         while ( OP( tail ) == TAIL ) {
2855                             /* this is the TAIL generated by (?:) */
2856                             tail = regnext( tail );
2857                         }
2858
2859
2860                         DEBUG_OPTIMISE_r({
2861                             regprop(RExC_rx, mysv, tail );
2862                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2863                                 (int)depth * 2 + 2, "",
2864                                 "Looking for TRIE'able sequences. Tail node is: ",
2865                                 SvPV_nolen_const( mysv )
2866                             );
2867                         });
2868
2869                         /*
2870
2871                            step through the branches, cur represents each
2872                            branch, noper is the first thing to be matched
2873                            as part of that branch and noper_next is the
2874                            regnext() of that node. if noper is an EXACT
2875                            and noper_next is the same as scan (our current
2876                            position in the regex) then the EXACT branch is
2877                            a possible optimization target. Once we have
2878                            two or more consecutive such branches we can
2879                            create a trie of the EXACT's contents and stich
2880                            it in place. If the sequence represents all of
2881                            the branches we eliminate the whole thing and
2882                            replace it with a single TRIE. If it is a
2883                            subsequence then we need to stitch it in. This
2884                            means the first branch has to remain, and needs
2885                            to be repointed at the item on the branch chain
2886                            following the last branch optimized. This could
2887                            be either a BRANCH, in which case the
2888                            subsequence is internal, or it could be the
2889                            item following the branch sequence in which
2890                            case the subsequence is at the end.
2891
2892                         */
2893
2894                         /* dont use tail as the end marker for this traverse */
2895                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2896                             regnode * const noper = NEXTOPER( cur );
2897 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2898                             regnode * const noper_next = regnext( noper );
2899 #endif
2900
2901                             DEBUG_OPTIMISE_r({
2902                                 regprop(RExC_rx, mysv, cur);
2903                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2904                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2905
2906                                 regprop(RExC_rx, mysv, noper);
2907                                 PerlIO_printf( Perl_debug_log, " -> %s",
2908                                     SvPV_nolen_const(mysv));
2909
2910                                 if ( noper_next ) {
2911                                   regprop(RExC_rx, mysv, noper_next );
2912                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2913                                     SvPV_nolen_const(mysv));
2914                                 }
2915                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2916                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2917                             });
2918                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2919                                          : PL_regkind[ OP( noper ) ] == EXACT )
2920                                   || OP(noper) == NOTHING )
2921 #ifdef NOJUMPTRIE
2922                                   && noper_next == tail
2923 #endif
2924                                   && count < U16_MAX)
2925                             {
2926                                 count++;
2927                                 if ( !first || optype == NOTHING ) {
2928                                     if (!first) first = cur;
2929                                     optype = OP( noper );
2930                                 } else {
2931                                     last = cur;
2932                                 }
2933                             } else {
2934 /*
2935     Currently we do not believe that the trie logic can
2936     handle case insensitive matching properly when the
2937     pattern is not unicode (thus forcing unicode semantics).
2938
2939     If/when this is fixed the following define can be swapped
2940     in below to fully enable trie logic.
2941
2942 #define TRIE_TYPE_IS_SAFE 1
2943
2944 */
2945 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2946
2947                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2948                                     make_trie( pRExC_state,
2949                                             startbranch, first, cur, tail, count,
2950                                             optype, depth+1 );
2951                                 }
2952                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2953 #ifdef NOJUMPTRIE
2954                                      && noper_next == tail
2955 #endif
2956                                 ){
2957                                     count = 1;
2958                                     first = cur;
2959                                     optype = OP( noper );
2960                                 } else {
2961                                     count = 0;
2962                                     first = NULL;
2963                                     optype = 0;
2964                                 }
2965                                 last = NULL;
2966                             }
2967                         }
2968                         DEBUG_OPTIMISE_r({
2969                             regprop(RExC_rx, mysv, cur);
2970                             PerlIO_printf( Perl_debug_log,
2971                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2972                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2973
2974                         });
2975
2976                         if ( last && TRIE_TYPE_IS_SAFE ) {
2977                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2978 #ifdef TRIE_STUDY_OPT
2979                             if ( ((made == MADE_EXACT_TRIE &&
2980                                  startbranch == first)
2981                                  || ( first_non_open == first )) &&
2982                                  depth==0 ) {
2983                                 flags |= SCF_TRIE_RESTUDY;
2984                                 if ( startbranch == first
2985                                      && scan == tail )
2986                                 {
2987                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2988                                 }
2989                             }
2990 #endif
2991                         }
2992                     }
2993
2994                 } /* do trie */
2995
2996             }
2997             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2998                 scan = NEXTOPER(NEXTOPER(scan));
2999             } else                      /* single branch is optimized. */
3000                 scan = NEXTOPER(scan);
3001             continue;
3002         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3003             scan_frame *newframe = NULL;
3004             I32 paren;
3005             regnode *start;
3006             regnode *end;
3007
3008             if (OP(scan) != SUSPEND) {
3009             /* set the pointer */
3010                 if (OP(scan) == GOSUB) {
3011                     paren = ARG(scan);
3012                     RExC_recurse[ARG2L(scan)] = scan;
3013                     start = RExC_open_parens[paren-1];
3014                     end   = RExC_close_parens[paren-1];
3015                 } else {
3016                     paren = 0;
3017                     start = RExC_rxi->program + 1;
3018                     end   = RExC_opend;
3019                 }
3020                 if (!recursed) {
3021                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3022                     SAVEFREEPV(recursed);
3023                 }
3024                 if (!PAREN_TEST(recursed,paren+1)) {
3025                     PAREN_SET(recursed,paren+1);
3026                     Newx(newframe,1,scan_frame);
3027                 } else {
3028                     if (flags & SCF_DO_SUBSTR) {
3029                         SCAN_COMMIT(pRExC_state,data,minlenp);
3030                         data->longest = &(data->longest_float);
3031                     }
3032                     is_inf = is_inf_internal = 1;
3033                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3034                         cl_anything(pRExC_state, data->start_class);
3035                     flags &= ~SCF_DO_STCLASS;
3036                 }
3037             } else {
3038                 Newx(newframe,1,scan_frame);
3039                 paren = stopparen;
3040                 start = scan+2;
3041                 end = regnext(scan);
3042             }
3043             if (newframe) {
3044                 assert(start);
3045                 assert(end);
3046                 SAVEFREEPV(newframe);
3047                 newframe->next = regnext(scan);
3048                 newframe->last = last;
3049                 newframe->stop = stopparen;
3050                 newframe->prev = frame;
3051
3052                 frame = newframe;
3053                 scan =  start;
3054                 stopparen = paren;
3055                 last = end;
3056
3057                 continue;
3058             }
3059         }
3060         else if (OP(scan) == EXACT) {
3061             I32 l = STR_LEN(scan);
3062             UV uc;
3063             if (UTF) {
3064                 const U8 * const s = (U8*)STRING(scan);
3065                 l = utf8_length(s, s + l);
3066                 uc = utf8_to_uvchr(s, NULL);
3067             } else {
3068                 uc = *((U8*)STRING(scan));
3069             }
3070             min += l;
3071             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3072                 /* The code below prefers earlier match for fixed
3073                    offset, later match for variable offset.  */
3074                 if (data->last_end == -1) { /* Update the start info. */
3075                     data->last_start_min = data->pos_min;
3076                     data->last_start_max = is_inf
3077                         ? I32_MAX : data->pos_min + data->pos_delta;
3078                 }
3079                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3080                 if (UTF)
3081                     SvUTF8_on(data->last_found);
3082                 {
3083                     SV * const sv = data->last_found;
3084                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3085                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3086                     if (mg && mg->mg_len >= 0)
3087                         mg->mg_len += utf8_length((U8*)STRING(scan),
3088                                                   (U8*)STRING(scan)+STR_LEN(scan));
3089                 }
3090                 data->last_end = data->pos_min + l;
3091                 data->pos_min += l; /* As in the first entry. */
3092                 data->flags &= ~SF_BEFORE_EOL;
3093             }
3094             if (flags & SCF_DO_STCLASS_AND) {
3095                 /* Check whether it is compatible with what we know already! */
3096                 int compat = 1;
3097
3098
3099                 /* If compatible, we or it in below.  It is compatible if is
3100                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3101                  * it's for a locale.  Even if there isn't unicode semantics
3102                  * here, at runtime there may be because of matching against a
3103                  * utf8 string, so accept a possible false positive for
3104                  * latin1-range folds */
3105                 if (uc >= 0x100 ||
3106                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3107                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3108                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3109                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3110                     )
3111                 {
3112                     compat = 0;
3113                 }
3114                 ANYOF_CLASS_ZERO(data->start_class);
3115                 ANYOF_BITMAP_ZERO(data->start_class);
3116                 if (compat)
3117                     ANYOF_BITMAP_SET(data->start_class, uc);
3118                 else if (uc >= 0x100) {
3119                     int i;
3120
3121                     /* Some Unicode code points fold to the Latin1 range; as
3122                      * XXX temporary code, instead of figuring out if this is
3123                      * one, just assume it is and set all the start class bits
3124                      * that could be some such above 255 code point's fold
3125                      * which will generate fals positives.  As the code
3126                      * elsewhere that does compute the fold settles down, it
3127                      * can be extracted out and re-used here */
3128                     for (i = 0; i < 256; i++){
3129                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3130                             ANYOF_BITMAP_SET(data->start_class, i);
3131                         }
3132                     }
3133                 }
3134                 data->start_class->flags &= ~ANYOF_EOS;
3135                 if (uc < 0x100)
3136                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3137             }
3138             else if (flags & SCF_DO_STCLASS_OR) {
3139                 /* false positive possible if the class is case-folded */
3140                 if (uc < 0x100)
3141                     ANYOF_BITMAP_SET(data->start_class, uc);
3142                 else
3143                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3144                 data->start_class->flags &= ~ANYOF_EOS;
3145                 cl_and(data->start_class, and_withp);
3146             }
3147             flags &= ~SCF_DO_STCLASS;
3148         }
3149         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3150             I32 l = STR_LEN(scan);
3151             UV uc = *((U8*)STRING(scan));
3152
3153             /* Search for fixed substrings supports EXACT only. */
3154             if (flags & SCF_DO_SUBSTR) {
3155                 assert(data);
3156                 SCAN_COMMIT(pRExC_state, data, minlenp);
3157             }
3158             if (UTF) {
3159                 const U8 * const s = (U8 *)STRING(scan);
3160                 l = utf8_length(s, s + l);
3161                 uc = utf8_to_uvchr(s, NULL);
3162             }
3163             min += l;
3164             if (flags & SCF_DO_SUBSTR)
3165                 data->pos_min += l;
3166             if (flags & SCF_DO_STCLASS_AND) {
3167                 /* Check whether it is compatible with what we know already! */
3168                 int compat = 1;
3169                 if (uc >= 0x100 ||
3170                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3171                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3172                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3173                 {
3174                     compat = 0;
3175                 }
3176                 ANYOF_CLASS_ZERO(data->start_class);
3177                 ANYOF_BITMAP_ZERO(data->start_class);
3178                 if (compat) {
3179                     ANYOF_BITMAP_SET(data->start_class, uc);
3180                     data->start_class->flags &= ~ANYOF_EOS;
3181                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3182                     if (OP(scan) == EXACTFL) {
3183                         data->start_class->flags |= ANYOF_LOCALE;
3184                     }
3185                     else {
3186
3187                         /* Also set the other member of the fold pair.  In case
3188                          * that unicode semantics is called for at runtime, use
3189                          * the full latin1 fold.  (Can't do this for locale,
3190                          * because not known until runtime */
3191                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3192                     }
3193                 }
3194                 else if (uc >= 0x100) {
3195                     int i;
3196                     for (i = 0; i < 256; i++){
3197                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3198                             ANYOF_BITMAP_SET(data->start_class, i);
3199                         }
3200                     }
3201                 }
3202             }
3203             else if (flags & SCF_DO_STCLASS_OR) {
3204                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3205                     /* false positive possible if the class is case-folded.
3206                        Assume that the locale settings are the same... */
3207                     if (uc < 0x100) {
3208                         ANYOF_BITMAP_SET(data->start_class, uc);
3209                         if (OP(scan) != EXACTFL) {
3210
3211                             /* And set the other member of the fold pair, but
3212                              * can't do that in locale because not known until
3213                              * run-time */
3214                             ANYOF_BITMAP_SET(data->start_class,
3215                                              PL_fold_latin1[uc]);
3216                         }
3217                     }
3218                     data->start_class->flags &= ~ANYOF_EOS;
3219                 }
3220                 cl_and(data->start_class, and_withp);
3221             }
3222             flags &= ~SCF_DO_STCLASS;
3223         }
3224         else if (REGNODE_VARIES(OP(scan))) {
3225             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3226             I32 f = flags, pos_before = 0;
3227             regnode * const oscan = scan;
3228             struct regnode_charclass_class this_class;
3229             struct regnode_charclass_class *oclass = NULL;
3230             I32 next_is_eval = 0;
3231
3232             switch (PL_regkind[OP(scan)]) {
3233             case WHILEM:                /* End of (?:...)* . */
3234                 scan = NEXTOPER(scan);
3235                 goto finish;
3236             case PLUS:
3237                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3238                     next = NEXTOPER(scan);
3239                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3240                         mincount = 1;
3241                         maxcount = REG_INFTY;
3242                         next = regnext(scan);
3243                         scan = NEXTOPER(scan);
3244                         goto do_curly;
3245                     }
3246                 }
3247                 if (flags & SCF_DO_SUBSTR)
3248                     data->pos_min++;
3249                 min++;
3250                 /* Fall through. */
3251             case STAR:
3252                 if (flags & SCF_DO_STCLASS) {
3253                     mincount = 0;
3254                     maxcount = REG_INFTY;
3255                     next = regnext(scan);
3256                     scan = NEXTOPER(scan);
3257                     goto do_curly;
3258                 }
3259                 is_inf = is_inf_internal = 1;
3260                 scan = regnext(scan);
3261                 if (flags & SCF_DO_SUBSTR) {
3262                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3263                     data->longest = &(data->longest_float);
3264                 }
3265                 goto optimize_curly_tail;
3266             case CURLY:
3267                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3268                     && (scan->flags == stopparen))
3269                 {
3270                     mincount = 1;
3271                     maxcount = 1;
3272                 } else {
3273                     mincount = ARG1(scan);
3274                     maxcount = ARG2(scan);
3275                 }
3276                 next = regnext(scan);
3277                 if (OP(scan) == CURLYX) {
3278                     I32 lp = (data ? *(data->last_closep) : 0);
3279                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3280                 }
3281                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3282                 next_is_eval = (OP(scan) == EVAL);
3283               do_curly:
3284                 if (flags & SCF_DO_SUBSTR) {
3285                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3286                     pos_before = data->pos_min;
3287                 }
3288                 if (data) {
3289                     fl = data->flags;
3290                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3291                     if (is_inf)
3292                         data->flags |= SF_IS_INF;
3293                 }
3294                 if (flags & SCF_DO_STCLASS) {
3295                     cl_init(pRExC_state, &this_class);
3296                     oclass = data->start_class;
3297                     data->start_class = &this_class;
3298                     f |= SCF_DO_STCLASS_AND;
3299                     f &= ~SCF_DO_STCLASS_OR;
3300                 }
3301                 /* Exclude from super-linear cache processing any {n,m}
3302                    regops for which the combination of input pos and regex
3303                    pos is not enough information to determine if a match
3304                    will be possible.
3305
3306                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3307                    regex pos at the \s*, the prospects for a match depend not
3308                    only on the input position but also on how many (bar\s*)
3309                    repeats into the {4,8} we are. */
3310                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3311                     f &= ~SCF_WHILEM_VISITED_POS;
3312
3313                 /* This will finish on WHILEM, setting scan, or on NULL: */
3314                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3315                                       last, data, stopparen, recursed, NULL,
3316                                       (mincount == 0
3317                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3318
3319                 if (flags & SCF_DO_STCLASS)
3320                     data->start_class = oclass;
3321                 if (mincount == 0 || minnext == 0) {
3322                     if (flags & SCF_DO_STCLASS_OR) {
3323                         cl_or(pRExC_state, data->start_class, &this_class);
3324                     }
3325                     else if (flags & SCF_DO_STCLASS_AND) {
3326                         /* Switch to OR mode: cache the old value of
3327                          * data->start_class */
3328                         INIT_AND_WITHP;
3329                         StructCopy(data->start_class, and_withp,
3330                                    struct regnode_charclass_class);
3331                         flags &= ~SCF_DO_STCLASS_AND;
3332                         StructCopy(&this_class, data->start_class,
3333                                    struct regnode_charclass_class);
3334                         flags |= SCF_DO_STCLASS_OR;
3335                         data->start_class->flags |= ANYOF_EOS;
3336                     }
3337                 } else {                /* Non-zero len */
3338                     if (flags & SCF_DO_STCLASS_OR) {
3339                         cl_or(pRExC_state, data->start_class, &this_class);
3340                         cl_and(data->start_class, and_withp);
3341                     }
3342                     else if (flags & SCF_DO_STCLASS_AND)
3343                         cl_and(data->start_class, &this_class);
3344                     flags &= ~SCF_DO_STCLASS;
3345                 }
3346                 if (!scan)              /* It was not CURLYX, but CURLY. */
3347                     scan = next;
3348                 if ( /* ? quantifier ok, except for (?{ ... }) */
3349                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3350                     && (minnext == 0) && (deltanext == 0)
3351                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3352                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3353                 {
3354                     ckWARNreg(RExC_parse,
3355                               "Quantifier unexpected on zero-length expression");
3356                 }
3357
3358                 min += minnext * mincount;
3359                 is_inf_internal |= ((maxcount == REG_INFTY
3360                                      && (minnext + deltanext) > 0)
3361                                     || deltanext == I32_MAX);
3362                 is_inf |= is_inf_internal;
3363                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3364
3365                 /* Try powerful optimization CURLYX => CURLYN. */
3366                 if (  OP(oscan) == CURLYX && data
3367                       && data->flags & SF_IN_PAR
3368                       && !(data->flags & SF_HAS_EVAL)
3369                       && !deltanext && minnext == 1 ) {
3370                     /* Try to optimize to CURLYN.  */
3371                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3372                     regnode * const nxt1 = nxt;
3373 #ifdef DEBUGGING
3374                     regnode *nxt2;
3375 #endif
3376
3377                     /* Skip open. */
3378                     nxt = regnext(nxt);
3379                     if (!REGNODE_SIMPLE(OP(nxt))
3380                         && !(PL_regkind[OP(nxt)] == EXACT
3381                              && STR_LEN(nxt) == 1))
3382                         goto nogo;
3383 #ifdef DEBUGGING
3384                     nxt2 = nxt;
3385 #endif
3386                     nxt = regnext(nxt);
3387                     if (OP(nxt) != CLOSE)
3388                         goto nogo;
3389                     if (RExC_open_parens) {
3390                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3391                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3392                     }
3393                     /* Now we know that nxt2 is the only contents: */
3394                     oscan->flags = (U8)ARG(nxt);
3395                     OP(oscan) = CURLYN;
3396                     OP(nxt1) = NOTHING; /* was OPEN. */
3397
3398 #ifdef DEBUGGING
3399                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3400                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3401                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3402                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3403                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3404                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3405 #endif
3406                 }
3407               nogo:
3408
3409                 /* Try optimization CURLYX => CURLYM. */
3410                 if (  OP(oscan) == CURLYX && data
3411                       && !(data->flags & SF_HAS_PAR)
3412                       && !(data->flags & SF_HAS_EVAL)
3413                       && !deltanext     /* atom is fixed width */
3414                       && minnext != 0   /* CURLYM can't handle zero width */
3415                 ) {
3416                     /* XXXX How to optimize if data == 0? */
3417                     /* Optimize to a simpler form.  */
3418                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3419                     regnode *nxt2;
3420
3421                     OP(oscan) = CURLYM;
3422                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3423                             && (OP(nxt2) != WHILEM))
3424                         nxt = nxt2;
3425                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3426                     /* Need to optimize away parenths. */
3427                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3428                         /* Set the parenth number.  */
3429                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3430
3431                         oscan->flags = (U8)ARG(nxt);
3432                         if (RExC_open_parens) {
3433                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3434                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3435                         }
3436                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3437                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3438
3439 #ifdef DEBUGGING
3440                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3441                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3442                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3443                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3444 #endif
3445 #if 0
3446                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3447                             regnode *nnxt = regnext(nxt1);
3448                             if (nnxt == nxt) {
3449                                 if (reg_off_by_arg[OP(nxt1)])
3450                                     ARG_SET(nxt1, nxt2 - nxt1);
3451                                 else if (nxt2 - nxt1 < U16_MAX)
3452                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3453                                 else
3454                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3455                             }
3456                             nxt1 = nnxt;
3457                         }
3458 #endif
3459                         /* Optimize again: */
3460                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3461                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3462                     }
3463                     else
3464                         oscan->flags = 0;
3465                 }
3466                 else if ((OP(oscan) == CURLYX)
3467                          && (flags & SCF_WHILEM_VISITED_POS)
3468                          /* See the comment on a similar expression above.
3469                             However, this time it's not a subexpression
3470                             we care about, but the expression itself. */
3471                          && (maxcount == REG_INFTY)
3472                          && data && ++data->whilem_c < 16) {
3473                     /* This stays as CURLYX, we can put the count/of pair. */
3474                     /* Find WHILEM (as in regexec.c) */
3475                     regnode *nxt = oscan + NEXT_OFF(oscan);
3476
3477                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3478                         nxt += ARG(nxt);
3479                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3480                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3481                 }
3482                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3483                     pars++;
3484                 if (flags & SCF_DO_SUBSTR) {
3485                     SV *last_str = NULL;
3486                     int counted = mincount != 0;
3487
3488                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3489 #if defined(SPARC64_GCC_WORKAROUND)
3490                         I32 b = 0;
3491                         STRLEN l = 0;
3492                         const char *s = NULL;
3493                         I32 old = 0;
3494
3495                         if (pos_before >= data->last_start_min)
3496                             b = pos_before;
3497                         else
3498                             b = data->last_start_min;
3499
3500                         l = 0;
3501                         s = SvPV_const(data->last_found, l);
3502                         old = b - data->last_start_min;
3503
3504 #else
3505                         I32 b = pos_before >= data->last_start_min
3506                             ? pos_before : data->last_start_min;
3507                         STRLEN l;
3508                         const char * const s = SvPV_const(data->last_found, l);
3509                         I32 old = b - data->last_start_min;
3510 #endif
3511
3512                         if (UTF)
3513                             old = utf8_hop((U8*)s, old) - (U8*)s;
3514                         l -= old;
3515                         /* Get the added string: */
3516                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3517                         if (deltanext == 0 && pos_before == b) {
3518                             /* What was added is a constant string */
3519                             if (mincount > 1) {
3520                                 SvGROW(last_str, (mincount * l) + 1);
3521                                 repeatcpy(SvPVX(last_str) + l,
3522                                           SvPVX_const(last_str), l, mincount - 1);
3523                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3524                                 /* Add additional parts. */
3525                                 SvCUR_set(data->last_found,
3526                                           SvCUR(data->last_found) - l);
3527                                 sv_catsv(data->last_found, last_str);
3528                                 {
3529                                     SV * sv = data->last_found;
3530                                     MAGIC *mg =
3531                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3532                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3533                                     if (mg && mg->mg_len >= 0)
3534                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3535                                 }
3536                                 data->last_end += l * (mincount - 1);
3537                             }
3538                         } else {
3539                             /* start offset must point into the last copy */
3540                             data->last_start_min += minnext * (mincount - 1);
3541                             data->last_start_max += is_inf ? I32_MAX
3542                                 : (maxcount - 1) * (minnext + data->pos_delta);
3543                         }
3544                     }
3545                     /* It is counted once already... */
3546                     data->pos_min += minnext * (mincount - counted);
3547                     data->pos_delta += - counted * deltanext +
3548                         (minnext + deltanext) * maxcount - minnext * mincount;
3549                     if (mincount != maxcount) {
3550                          /* Cannot extend fixed substrings found inside
3551                             the group.  */
3552                         SCAN_COMMIT(pRExC_state,data,minlenp);
3553                         if (mincount && last_str) {
3554                             SV * const sv = data->last_found;
3555                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3556                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3557
3558                             if (mg)
3559                                 mg->mg_len = -1;
3560                             sv_setsv(sv, last_str);
3561                             data->last_end = data->pos_min;
3562                             data->last_start_min =
3563                                 data->pos_min - CHR_SVLEN(last_str);
3564                             data->last_start_max = is_inf
3565                                 ? I32_MAX
3566                                 : data->pos_min + data->pos_delta
3567                                 - CHR_SVLEN(last_str);
3568                         }
3569                         data->longest = &(data->longest_float);
3570                     }
3571                     SvREFCNT_dec(last_str);
3572                 }
3573                 if (data && (fl & SF_HAS_EVAL))
3574                     data->flags |= SF_HAS_EVAL;
3575               optimize_curly_tail:
3576                 if (OP(oscan) != CURLYX) {
3577                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3578                            && NEXT_OFF(next))
3579                         NEXT_OFF(oscan) += NEXT_OFF(next);
3580                 }
3581                 continue;
3582             default:                    /* REF, ANYOFV, and CLUMP only? */
3583                 if (flags & SCF_DO_SUBSTR) {
3584                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3585                     data->longest = &(data->longest_float);
3586                 }
3587                 is_inf = is_inf_internal = 1;
3588                 if (flags & SCF_DO_STCLASS_OR)
3589                     cl_anything(pRExC_state, data->start_class);
3590                 flags &= ~SCF_DO_STCLASS;
3591                 break;
3592             }
3593         }
3594         else if (OP(scan) == LNBREAK) {
3595             if (flags & SCF_DO_STCLASS) {
3596                 int value = 0;
3597                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3598                 if (flags & SCF_DO_STCLASS_AND) {
3599                     for (value = 0; value < 256; value++)
3600                         if (!is_VERTWS_cp(value))
3601                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3602                 }
3603                 else {
3604                     for (value = 0; value < 256; value++)
3605                         if (is_VERTWS_cp(value))
3606                             ANYOF_BITMAP_SET(data->start_class, value);
3607                 }
3608                 if (flags & SCF_DO_STCLASS_OR)
3609                     cl_and(data->start_class, and_withp);
3610                 flags &= ~SCF_DO_STCLASS;
3611             }
3612             min += 1;
3613             delta += 1;
3614             if (flags & SCF_DO_SUBSTR) {
3615                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3616                 data->pos_min += 1;
3617                 data->pos_delta += 1;
3618                 data->longest = &(data->longest_float);
3619             }
3620         }
3621         else if (OP(scan) == FOLDCHAR) {
3622             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3623             flags &= ~SCF_DO_STCLASS;
3624             min += 1;
3625             delta += d;
3626             if (flags & SCF_DO_SUBSTR) {
3627                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3628                 data->pos_min += 1;
3629                 data->pos_delta += d;
3630                 data->longest = &(data->longest_float);
3631             }
3632         }
3633         else if (REGNODE_SIMPLE(OP(scan))) {
3634             int value = 0;
3635
3636             if (flags & SCF_DO_SUBSTR) {
3637                 SCAN_COMMIT(pRExC_state,data,minlenp);
3638                 data->pos_min++;
3639             }
3640             min++;
3641             if (flags & SCF_DO_STCLASS) {
3642                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3643
3644                 /* Some of the logic below assumes that switching
3645                    locale on will only add false positives. */
3646                 switch (PL_regkind[OP(scan)]) {
3647                 case SANY:
3648                 default:
3649                   do_default:
3650                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3651                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3652                         cl_anything(pRExC_state, data->start_class);
3653                     break;
3654                 case REG_ANY:
3655                     if (OP(scan) == SANY)
3656                         goto do_default;
3657                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3658                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3659                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3660                         cl_anything(pRExC_state, data->start_class);
3661                     }
3662                     if (flags & SCF_DO_STCLASS_AND || !value)
3663                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3664                     break;
3665                 case ANYOF:
3666                     if (flags & SCF_DO_STCLASS_AND)
3667                         cl_and(data->start_class,
3668                                (struct regnode_charclass_class*)scan);
3669                     else
3670                         cl_or(pRExC_state, data->start_class,
3671                               (struct regnode_charclass_class*)scan);
3672                     break;
3673                 case ALNUM:
3674                     if (flags & SCF_DO_STCLASS_AND) {
3675                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3676                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3677                             if (OP(scan) == ALNUMU) {
3678                                 for (value = 0; value < 256; value++) {
3679                                     if (!isWORDCHAR_L1(value)) {
3680                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3681                                     }
3682                                 }
3683                             } else {
3684                                 for (value = 0; value < 256; value++) {
3685                                     if (!isALNUM(value)) {
3686                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3687                                     }
3688                                 }
3689                             }
3690                         }
3691                     }
3692                     else {
3693                         if (data->start_class->flags & ANYOF_LOCALE)
3694                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3695                         else if (OP(scan) == ALNUMU) {
3696                             for (value = 0; value < 256; value++) {
3697                                 if (isWORDCHAR_L1(value)) {
3698                                     ANYOF_BITMAP_SET(data->start_class, value);
3699                                 }
3700                             }
3701                         } else {
3702                             for (value = 0; value < 256; value++) {
3703                                 if (isALNUM(value)) {
3704                                     ANYOF_BITMAP_SET(data->start_class, value);
3705                                 }
3706                             }
3707                         }
3708                     }
3709                     break;
3710                 case NALNUM:
3711                     if (flags & SCF_DO_STCLASS_AND) {
3712                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3713                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3714                             if (OP(scan) == NALNUMU) {
3715                                 for (value = 0; value < 256; value++) {
3716                                     if (isWORDCHAR_L1(value)) {
3717                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3718                                     }
3719                                 }
3720                             } else {
3721                                 for (value = 0; value < 256; value++) {
3722                                     if (isALNUM(value)) {
3723                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3724                                     }
3725                                 }
3726                             }
3727                         }
3728                     }
3729                     else {
3730                         if (data->start_class->flags & ANYOF_LOCALE)
3731                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3732                         else {
3733                             if (OP(scan) == NALNUMU) {
3734                                 for (value = 0; value < 256; value++) {
3735                                     if (! isWORDCHAR_L1(value)) {
3736                                         ANYOF_BITMAP_SET(data->start_class, value);
3737                                     }
3738                                 }
3739                             } else {
3740                                 for (value = 0; value < 256; value++) {
3741                                     if (! isALNUM(value)) {
3742                                         ANYOF_BITMAP_SET(data->start_class, value);
3743                                     }
3744                                 }
3745                             }
3746                         }
3747                     }
3748                     break;
3749                 case SPACE:
3750                     if (flags & SCF_DO_STCLASS_AND) {
3751                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3752                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3753                             if (OP(scan) == SPACEU) {
3754                                 for (value = 0; value < 256; value++) {
3755                                     if (!isSPACE_L1(value)) {
3756                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3757                                     }
3758                                 }
3759                             } else {
3760                                 for (value = 0; value < 256; value++) {
3761                                     if (!isSPACE(value)) {
3762                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3763                                     }
3764                                 }
3765                             }
3766                         }
3767                     }
3768                     else {
3769                         if (data->start_class->flags & ANYOF_LOCALE) {
3770                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3771                         }
3772                         else if (OP(scan) == SPACEU) {
3773                             for (value = 0; value < 256; value++) {
3774                                 if (isSPACE_L1(value)) {
3775                                     ANYOF_BITMAP_SET(data->start_class, value);
3776                                 }
3777                             }
3778                         } else {
3779                             for (value = 0; value < 256; value++) {
3780                                 if (isSPACE(value)) {
3781                                     ANYOF_BITMAP_SET(data->start_class, value);
3782                                 }
3783                             }
3784                         }
3785                     }
3786                     break;
3787                 case NSPACE:
3788                     if (flags & SCF_DO_STCLASS_AND) {
3789                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3790                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3791                             if (OP(scan) == NSPACEU) {
3792                                 for (value = 0; value < 256; value++) {
3793                                     if (isSPACE_L1(value)) {
3794                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3795                                     }
3796                                 }
3797                             } else {
3798                                 for (value = 0; value < 256; value++) {
3799                                     if (isSPACE(value)) {
3800                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3801                                     }
3802                                 }
3803                             }
3804                         }
3805                     }
3806                     else {
3807                         if (data->start_class->flags & ANYOF_LOCALE)
3808                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3809                         else if (OP(scan) == NSPACEU) {
3810                             for (value = 0; value < 256; value++) {
3811                                 if (!isSPACE_L1(value)) {
3812                                     ANYOF_BITMAP_SET(data->start_class, value);
3813                                 }
3814                             }
3815                         }
3816                         else {
3817                             for (value = 0; value < 256; value++) {
3818                                 if (!isSPACE(value)) {
3819                                     ANYOF_BITMAP_SET(data->start_class, value);
3820                                 }
3821                             }
3822                         }
3823                     }
3824                     break;
3825                 case DIGIT:
3826                     if (flags & SCF_DO_STCLASS_AND) {
3827                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3828                         for (value = 0; value < 256; value++)
3829                             if (!isDIGIT(value))
3830                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3831                     }
3832                     else {
3833                         if (data->start_class->flags & ANYOF_LOCALE)
3834                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3835                         else {
3836                             for (value = 0; value < 256; value++)
3837                                 if (isDIGIT(value))
3838                                     ANYOF_BITMAP_SET(data->start_class, value);
3839                         }
3840                     }
3841                     break;
3842                 case NDIGIT:
3843                     if (flags & SCF_DO_STCLASS_AND) {
3844                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3845                         for (value = 0; value < 256; value++)
3846                             if (isDIGIT(value))
3847                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3848                     }
3849                     else {
3850                         if (data->start_class->flags & ANYOF_LOCALE)
3851                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3852                         else {
3853                             for (value = 0; value < 256; value++)
3854                                 if (!isDIGIT(value))
3855                                     ANYOF_BITMAP_SET(data->start_class, value);
3856                         }
3857                     }
3858                     break;
3859                 CASE_SYNST_FNC(VERTWS);
3860                 CASE_SYNST_FNC(HORIZWS);
3861
3862                 }
3863                 if (flags & SCF_DO_STCLASS_OR)
3864                     cl_and(data->start_class, and_withp);
3865                 flags &= ~SCF_DO_STCLASS;
3866             }
3867         }
3868         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3869             data->flags |= (OP(scan) == MEOL
3870                             ? SF_BEFORE_MEOL
3871                             : SF_BEFORE_SEOL);
3872         }
3873         else if (  PL_regkind[OP(scan)] == BRANCHJ
3874                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3875                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3876                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3877             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3878                 || OP(scan) == UNLESSM )
3879             {
3880                 /* Negative Lookahead/lookbehind
3881                    In this case we can't do fixed string optimisation.
3882                 */
3883
3884                 I32 deltanext, minnext, fake = 0;
3885                 regnode *nscan;
3886                 struct regnode_charclass_class intrnl;
3887                 int f = 0;
3888
3889                 data_fake.flags = 0;
3890                 if (data) {
3891                     data_fake.whilem_c = data->whilem_c;
3892                     data_fake.last_closep = data->last_closep;
3893                 }
3894                 else
3895                     data_fake.last_closep = &fake;
3896                 data_fake.pos_delta = delta;
3897                 if ( flags & SCF_DO_STCLASS && !scan->flags
3898                      && OP(scan) == IFMATCH ) { /* Lookahead */
3899                     cl_init(pRExC_state, &intrnl);
3900                     data_fake.start_class = &intrnl;
3901                     f |= SCF_DO_STCLASS_AND;
3902                 }
3903                 if (flags & SCF_WHILEM_VISITED_POS)
3904                     f |= SCF_WHILEM_VISITED_POS;
3905                 next = regnext(scan);
3906                 nscan = NEXTOPER(NEXTOPER(scan));
3907                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3908                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3909                 if (scan->flags) {
3910                     if (deltanext) {
3911                         FAIL("Variable length lookbehind not implemented");
3912                     }
3913                     else if (minnext > (I32)U8_MAX) {
3914                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3915                     }
3916                     scan->flags = (U8)minnext;
3917                 }
3918                 if (data) {
3919                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3920                         pars++;
3921                     if (data_fake.flags & SF_HAS_EVAL)
3922                         data->flags |= SF_HAS_EVAL;
3923                     data->whilem_c = data_fake.whilem_c;
3924                 }
3925                 if (f & SCF_DO_STCLASS_AND) {
3926                     if (flags & SCF_DO_STCLASS_OR) {
3927                         /* OR before, AND after: ideally we would recurse with
3928                          * data_fake to get the AND applied by study of the
3929                          * remainder of the pattern, and then derecurse;
3930                          * *** HACK *** for now just treat as "no information".
3931                          * See [perl #56690].
3932                          */
3933                         cl_init(pRExC_state, data->start_class);
3934                     }  else {
3935                         /* AND before and after: combine and continue */
3936                         const int was = (data->start_class->flags & ANYOF_EOS);
3937
3938                         cl_and(data->start_class, &intrnl);
3939                         if (was)
3940                             data->start_class->flags |= ANYOF_EOS;
3941                     }
3942                 }
3943             }
3944 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3945             else {
3946                 /* Positive Lookahead/lookbehind
3947                    In this case we can do fixed string optimisation,
3948                    but we must be careful about it. Note in the case of
3949                    lookbehind the positions will be offset by the minimum
3950                    length of the pattern, something we won't know about
3951                    until after the recurse.
3952                 */
3953                 I32 deltanext, fake = 0;
3954                 regnode *nscan;
3955                 struct regnode_charclass_class intrnl;
3956                 int f = 0;
3957                 /* We use SAVEFREEPV so that when the full compile
3958                     is finished perl will clean up the allocated
3959                     minlens when it's all done. This way we don't
3960                     have to worry about freeing them when we know
3961                     they wont be used, which would be a pain.
3962                  */
3963                 I32 *minnextp;
3964                 Newx( minnextp, 1, I32 );
3965                 SAVEFREEPV(minnextp);
3966
3967                 if (data) {
3968                     StructCopy(data, &data_fake, scan_data_t);
3969                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3970                         f |= SCF_DO_SUBSTR;
3971                         if (scan->flags)
3972                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3973                         data_fake.last_found=newSVsv(data->last_found);
3974                     }
3975                 }
3976                 else
3977                     data_fake.last_closep = &fake;
3978                 data_fake.flags = 0;
3979                 data_fake.pos_delta = delta;
3980                 if (is_inf)
3981                     data_fake.flags |= SF_IS_INF;
3982                 if ( flags & SCF_DO_STCLASS && !scan->flags
3983                      && OP(scan) == IFMATCH ) { /* Lookahead */
3984                     cl_init(pRExC_state, &intrnl);
3985                     data_fake.start_class = &intrnl;
3986                     f |= SCF_DO_STCLASS_AND;
3987                 }
3988                 if (flags & SCF_WHILEM_VISITED_POS)
3989                     f |= SCF_WHILEM_VISITED_POS;
3990                 next = regnext(scan);
3991                 nscan = NEXTOPER(NEXTOPER(scan));
3992
3993                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3994                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3995                 if (scan->flags) {
3996                     if (deltanext) {
3997                         FAIL("Variable length lookbehind not implemented");
3998                     }
3999                     else if (*minnextp > (I32)U8_MAX) {
4000                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4001                     }
4002                     scan->flags = (U8)*minnextp;
4003                 }
4004
4005                 *minnextp += min;
4006
4007                 if (f & SCF_DO_STCLASS_AND) {
4008                     const int was = (data->start_class->flags & ANYOF_EOS);
4009
4010                     cl_and(data->start_class, &intrnl);
4011                     if (was)
4012                         data->start_class->flags |= ANYOF_EOS;
4013                 }
4014                 if (data) {
4015                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4016                         pars++;
4017                     if (data_fake.flags & SF_HAS_EVAL)
4018                         data->flags |= SF_HAS_EVAL;
4019                     data->whilem_c = data_fake.whilem_c;
4020                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4021                         if (RExC_rx->minlen<*minnextp)
4022                             RExC_rx->minlen=*minnextp;
4023                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4024                         SvREFCNT_dec(data_fake.last_found);
4025
4026                         if ( data_fake.minlen_fixed != minlenp )
4027                         {
4028                             data->offset_fixed= data_fake.offset_fixed;
4029                             data->minlen_fixed= data_fake.minlen_fixed;
4030                             data->lookbehind_fixed+= scan->flags;
4031                         }
4032                         if ( data_fake.minlen_float != minlenp )
4033                         {
4034                             data->minlen_float= data_fake.minlen_float;
4035                             data->offset_float_min=data_fake.offset_float_min;
4036                             data->offset_float_max=data_fake.offset_float_max;
4037                             data->lookbehind_float+= scan->flags;
4038                         }
4039                     }
4040                 }
4041
4042
4043             }
4044 #endif
4045         }
4046         else if (OP(scan) == OPEN) {
4047             if (stopparen != (I32)ARG(scan))
4048                 pars++;
4049         }
4050         else if (OP(scan) == CLOSE) {
4051             if (stopparen == (I32)ARG(scan)) {
4052                 break;
4053             }
4054             if ((I32)ARG(scan) == is_par) {
4055                 next = regnext(scan);
4056
4057                 if ( next && (OP(next) != WHILEM) && next < last)
4058                     is_par = 0;         /* Disable optimization */
4059             }
4060             if (data)
4061                 *(data->last_closep) = ARG(scan);
4062         }
4063         else if (OP(scan) == EVAL) {
4064                 if (data)
4065                     data->flags |= SF_HAS_EVAL;
4066         }
4067         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4068             if (flags & SCF_DO_SUBSTR) {
4069                 SCAN_COMMIT(pRExC_state,data,minlenp);
4070                 flags &= ~SCF_DO_SUBSTR;
4071             }
4072             if (data && OP(scan)==ACCEPT) {
4073                 data->flags |= SCF_SEEN_ACCEPT;
4074                 if (stopmin > min)
4075                     stopmin = min;
4076             }
4077         }
4078         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4079         {
4080                 if (flags & SCF_DO_SUBSTR) {
4081                     SCAN_COMMIT(pRExC_state,data,minlenp);
4082                     data->longest = &(data->longest_float);
4083                 }
4084                 is_inf = is_inf_internal = 1;
4085                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4086                     cl_anything(pRExC_state, data->start_class);
4087                 flags &= ~SCF_DO_STCLASS;
4088         }
4089         else if (OP(scan) == GPOS) {
4090             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4091                 !(delta || is_inf || (data && data->pos_delta)))
4092             {
4093                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4094                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4095                 if (RExC_rx->gofs < (U32)min)
4096                     RExC_rx->gofs = min;
4097             } else {
4098                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4099                 RExC_rx->gofs = 0;
4100             }
4101         }
4102 #ifdef TRIE_STUDY_OPT
4103 #ifdef FULL_TRIE_STUDY
4104         else if (PL_regkind[OP(scan)] == TRIE) {
4105             /* NOTE - There is similar code to this block above for handling
4106                BRANCH nodes on the initial study.  If you change stuff here
4107                check there too. */
4108             regnode *trie_node= scan;
4109             regnode *tail= regnext(scan);
4110             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4111             I32 max1 = 0, min1 = I32_MAX;
4112             struct regnode_charclass_class accum;
4113
4114             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4115                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4116             if (flags & SCF_DO_STCLASS)
4117                 cl_init_zero(pRExC_state, &accum);
4118
4119             if (!trie->jump) {
4120                 min1= trie->minlen;
4121                 max1= trie->maxlen;
4122             } else {
4123                 const regnode *nextbranch= NULL;
4124                 U32 word;
4125
4126                 for ( word=1 ; word <= trie->wordcount ; word++)
4127                 {
4128                     I32 deltanext=0, minnext=0, f = 0, fake;
4129                     struct regnode_charclass_class this_class;
4130
4131                     data_fake.flags = 0;
4132                     if (data) {
4133                         data_fake.whilem_c = data->whilem_c;
4134                         data_fake.last_closep = data->last_closep;
4135                     }
4136                     else
4137                         data_fake.last_closep = &fake;
4138                     data_fake.pos_delta = delta;
4139                     if (flags & SCF_DO_STCLASS) {
4140                         cl_init(pRExC_state, &this_class);
4141                         data_fake.start_class = &this_class;
4142                         f = SCF_DO_STCLASS_AND;
4143                     }
4144                     if (flags & SCF_WHILEM_VISITED_POS)
4145                         f |= SCF_WHILEM_VISITED_POS;
4146
4147                     if (trie->jump[word]) {
4148                         if (!nextbranch)
4149                             nextbranch = trie_node + trie->jump[0];
4150                         scan= trie_node + trie->jump[word];
4151                         /* We go from the jump point to the branch that follows
4152                            it. Note this means we need the vestigal unused branches
4153                            even though they arent otherwise used.
4154                          */
4155                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4156                             &deltanext, (regnode *)nextbranch, &data_fake,
4157                             stopparen, recursed, NULL, f,depth+1);
4158                     }
4159                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4160                         nextbranch= regnext((regnode*)nextbranch);
4161
4162                     if (min1 > (I32)(minnext + trie->minlen))
4163                         min1 = minnext + trie->minlen;
4164                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4165                         max1 = minnext + deltanext + trie->maxlen;
4166                     if (deltanext == I32_MAX)
4167                         is_inf = is_inf_internal = 1;
4168
4169                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4170                         pars++;
4171                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4172                         if ( stopmin > min + min1)
4173                             stopmin = min + min1;
4174                         flags &= ~SCF_DO_SUBSTR;
4175                         if (data)
4176                             data->flags |= SCF_SEEN_ACCEPT;
4177                     }
4178                     if (data) {
4179                         if (data_fake.flags & SF_HAS_EVAL)
4180                             data->flags |= SF_HAS_EVAL;
4181                         data->whilem_c = data_fake.whilem_c;
4182                     }
4183                     if (flags & SCF_DO_STCLASS)
4184                         cl_or(pRExC_state, &accum, &this_class);
4185                 }
4186             }
4187             if (flags & SCF_DO_SUBSTR) {
4188                 data->pos_min += min1;
4189                 data->pos_delta += max1 - min1;
4190                 if (max1 != min1 || is_inf)
4191                     data->longest = &(data->longest_float);
4192             }
4193             min += min1;
4194             delta += max1 - min1;
4195             if (flags & SCF_DO_STCLASS_OR) {
4196                 cl_or(pRExC_state, data->start_class, &accum);
4197                 if (min1) {
4198                     cl_and(data->start_class, and_withp);
4199                     flags &= ~SCF_DO_STCLASS;
4200                 }
4201             }
4202             else if (flags & SCF_DO_STCLASS_AND) {
4203                 if (min1) {
4204                     cl_and(data->start_class, &accum);
4205                     flags &= ~SCF_DO_STCLASS;
4206                 }
4207                 else {
4208                     /* Switch to OR mode: cache the old value of
4209                      * data->start_class */
4210                     INIT_AND_WITHP;
4211                     StructCopy(data->start_class, and_withp,
4212                                struct regnode_charclass_class);
4213                     flags &= ~SCF_DO_STCLASS_AND;
4214                     StructCopy(&accum, data->start_class,
4215                                struct regnode_charclass_class);
4216                     flags |= SCF_DO_STCLASS_OR;
4217                     data->start_class->flags |= ANYOF_EOS;
4218                 }
4219             }
4220             scan= tail;
4221             continue;
4222         }
4223 #else
4224         else if (PL_regkind[OP(scan)] == TRIE) {
4225             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4226             U8*bang=NULL;
4227
4228             min += trie->minlen;
4229             delta += (trie->maxlen - trie->minlen);
4230             flags &= ~SCF_DO_STCLASS; /* xxx */
4231             if (flags & SCF_DO_SUBSTR) {
4232                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4233                 data->pos_min += trie->minlen;
4234                 data->pos_delta += (trie->maxlen - trie->minlen);
4235                 if (trie->maxlen != trie->minlen)
4236                     data->longest = &(data->longest_float);
4237             }
4238             if (trie->jump) /* no more substrings -- for now /grr*/
4239                 flags &= ~SCF_DO_SUBSTR;
4240         }
4241 #endif /* old or new */
4242 #endif /* TRIE_STUDY_OPT */
4243
4244         /* Else: zero-length, ignore. */
4245         scan = regnext(scan);
4246     }
4247     if (frame) {
4248         last = frame->last;
4249         scan = frame->next;
4250         stopparen = frame->stop;
4251         frame = frame->prev;
4252         goto fake_study_recurse;
4253     }
4254
4255   finish:
4256     assert(!frame);
4257     DEBUG_STUDYDATA("pre-fin:",data,depth);
4258
4259     *scanp = scan;
4260     *deltap = is_inf_internal ? I32_MAX : delta;
4261     if (flags & SCF_DO_SUBSTR && is_inf)
4262         data->pos_delta = I32_MAX - data->pos_min;
4263     if (is_par > (I32)U8_MAX)
4264         is_par = 0;
4265     if (is_par && pars==1 && data) {
4266         data->flags |= SF_IN_PAR;
4267         data->flags &= ~SF_HAS_PAR;
4268     }
4269     else if (pars && data) {
4270         data->flags |= SF_HAS_PAR;
4271         data->flags &= ~SF_IN_PAR;
4272     }
4273     if (flags & SCF_DO_STCLASS_OR)
4274         cl_and(data->start_class, and_withp);
4275     if (flags & SCF_TRIE_RESTUDY)
4276         data->flags |=  SCF_TRIE_RESTUDY;
4277
4278     DEBUG_STUDYDATA("post-fin:",data,depth);
4279
4280     return min < stopmin ? min : stopmin;
4281 }
4282
4283 STATIC U32
4284 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4285 {
4286     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4287
4288     PERL_ARGS_ASSERT_ADD_DATA;
4289
4290     Renewc(RExC_rxi->data,
4291            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4292            char, struct reg_data);
4293     if(count)
4294         Renew(RExC_rxi->data->what, count + n, U8);
4295     else
4296         Newx(RExC_rxi->data->what, n, U8);
4297     RExC_rxi->data->count = count + n;
4298     Copy(s, RExC_rxi->data->what + count, n, U8);
4299     return count;
4300 }
4301
4302 /*XXX: todo make this not included in a non debugging perl */
4303 #ifndef PERL_IN_XSUB_RE
4304 void
4305 Perl_reginitcolors(pTHX)
4306 {
4307     dVAR;
4308     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4309     if (s) {
4310         char *t = savepv(s);
4311         int i = 0;
4312         PL_colors[0] = t;
4313         while (++i < 6) {
4314             t = strchr(t, '\t');
4315             if (t) {
4316                 *t = '\0';
4317                 PL_colors[i] = ++t;
4318             }
4319             else
4320                 PL_colors[i] = t = (char *)"";
4321         }
4322     } else {
4323         int i = 0;
4324         while (i < 6)
4325             PL_colors[i++] = (char *)"";
4326     }
4327     PL_colorset = 1;
4328 }
4329 #endif
4330
4331
4332 #ifdef TRIE_STUDY_OPT
4333 #define CHECK_RESTUDY_GOTO                                  \
4334         if (                                                \
4335               (data.flags & SCF_TRIE_RESTUDY)               \
4336               && ! restudied++                              \
4337         )     goto reStudy
4338 #else
4339 #define CHECK_RESTUDY_GOTO
4340 #endif
4341
4342 /*
4343  - pregcomp - compile a regular expression into internal code
4344  *
4345  * We can't allocate space until we know how big the compiled form will be,
4346  * but we can't compile it (and thus know how big it is) until we've got a
4347  * place to put the code.  So we cheat:  we compile it twice, once with code
4348  * generation turned off and size counting turned on, and once "for real".
4349  * This also means that we don't allocate space until we are sure that the
4350  * thing really will compile successfully, and we never have to move the
4351  * code and thus invalidate pointers into it.  (Note that it has to be in
4352  * one piece because free() must be able to free it all.) [NB: not true in perl]
4353  *
4354  * Beware that the optimization-preparation code in here knows about some
4355  * of the structure of the compiled regexp.  [I'll say.]
4356  */
4357
4358
4359
4360 #ifndef PERL_IN_XSUB_RE
4361 #define RE_ENGINE_PTR &PL_core_reg_engine
4362 #else
4363 extern const struct regexp_engine my_reg_engine;
4364 #define RE_ENGINE_PTR &my_reg_engine
4365 #endif
4366
4367 #ifndef PERL_IN_XSUB_RE
4368 REGEXP *
4369 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4370 {
4371     dVAR;
4372     HV * const table = GvHV(PL_hintgv);
4373
4374     PERL_ARGS_ASSERT_PREGCOMP;
4375
4376     /* Dispatch a request to compile a regexp to correct
4377        regexp engine. */
4378     if (table) {
4379         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4380         GET_RE_DEBUG_FLAGS_DECL;
4381         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4382             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4383             DEBUG_COMPILE_r({
4384                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4385                     SvIV(*ptr));
4386             });
4387             return CALLREGCOMP_ENG(eng, pattern, flags);
4388         }
4389     }
4390     return Perl_re_compile(aTHX_ pattern, flags);
4391 }
4392 #endif
4393
4394 REGEXP *
4395 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4396 {
4397     dVAR;
4398     REGEXP *rx;
4399     struct regexp *r;
4400     register regexp_internal *ri;
4401     STRLEN plen;
4402     char  *exp;
4403     char* xend;
4404     regnode *scan;
4405     I32 flags;
4406     I32 minlen = 0;
4407     U32 pm_flags;
4408
4409     /* these are all flags - maybe they should be turned
4410      * into a single int with different bit masks */
4411     I32 sawlookahead = 0;
4412     I32 sawplus = 0;
4413     I32 sawopen = 0;
4414     bool used_setjump = FALSE;
4415
4416     U8 jump_ret = 0;
4417     dJMPENV;
4418     scan_data_t data;
4419     RExC_state_t RExC_state;
4420     RExC_state_t * const pRExC_state = &RExC_state;
4421 #ifdef TRIE_STUDY_OPT
4422     int restudied;
4423     RExC_state_t copyRExC_state;
4424 #endif
4425     GET_RE_DEBUG_FLAGS_DECL;
4426
4427     PERL_ARGS_ASSERT_RE_COMPILE;
4428
4429     DEBUG_r(if (!PL_colorset) reginitcolors());
4430
4431     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4432     RExC_uni_semantics = 0;
4433
4434     /****************** LONG JUMP TARGET HERE***********************/
4435     /* Longjmp back to here if have to switch in midstream to utf8 */
4436     if (! RExC_orig_utf8) {
4437         JMPENV_PUSH(jump_ret);
4438         used_setjump = TRUE;
4439     }
4440
4441     if (jump_ret == 0) {    /* First time through */
4442         exp = SvPV(pattern, plen);
4443         xend = exp + plen;
4444         /* ignore the utf8ness if the pattern is 0 length */
4445         if (plen == 0) {
4446             RExC_utf8 = RExC_orig_utf8 = 0;
4447         }
4448
4449         DEBUG_COMPILE_r({
4450             SV *dsv= sv_newmortal();
4451             RE_PV_QUOTED_DECL(s, RExC_utf8,
4452                 dsv, exp, plen, 60);
4453             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4454                            PL_colors[4],PL_colors[5],s);
4455         });
4456     }
4457     else {  /* longjumped back */
4458         STRLEN len = plen;
4459
4460         /* If the cause for the longjmp was other than changing to utf8, pop
4461          * our own setjmp, and longjmp to the correct handler */
4462         if (jump_ret != UTF8_LONGJMP) {
4463             JMPENV_POP;
4464             JMPENV_JUMP(jump_ret);
4465         }
4466
4467         GET_RE_DEBUG_FLAGS;
4468
4469         /* It's possible to write a regexp in ascii that represents Unicode
4470         codepoints outside of the byte range, such as via \x{100}. If we
4471         detect such a sequence we have to convert the entire pattern to utf8
4472         and then recompile, as our sizing calculation will have been based
4473         on 1 byte == 1 character, but we will need to use utf8 to encode
4474         at least some part of the pattern, and therefore must convert the whole
4475         thing.
4476         -- dmq */
4477         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4478             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4479         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4480         xend = exp + len;
4481         RExC_orig_utf8 = RExC_utf8 = 1;
4482         SAVEFREEPV(exp);
4483     }
4484
4485 #ifdef TRIE_STUDY_OPT
4486     restudied = 0;
4487 #endif
4488
4489     /* Set to use unicode semantics if the pattern is in utf8 and has the
4490      * 'depends' charset specified, as it means unicode when utf8  */
4491     pm_flags = orig_pm_flags;
4492
4493     if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4494         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4495     }
4496
4497     RExC_precomp = exp;
4498     RExC_flags = pm_flags;
4499     RExC_sawback = 0;
4500
4501     RExC_seen = 0;
4502     RExC_in_lookbehind = 0;
4503     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4504     RExC_seen_evals = 0;
4505     RExC_extralen = 0;
4506
4507     /* First pass: determine size, legality. */
4508     RExC_parse = exp;
4509     RExC_start = exp;
4510     RExC_end = xend;
4511     RExC_naughty = 0;
4512     RExC_npar = 1;
4513     RExC_nestroot = 0;
4514     RExC_size = 0L;
4515     RExC_emit = &PL_regdummy;
4516     RExC_whilem_seen = 0;
4517     RExC_open_parens = NULL;
4518     RExC_close_parens = NULL;
4519     RExC_opend = NULL;
4520     RExC_paren_names = NULL;
4521 #ifdef DEBUGGING
4522     RExC_paren_name_list = NULL;
4523 #endif
4524     RExC_recurse = NULL;
4525     RExC_recurse_count = 0;
4526
4527 #if 0 /* REGC() is (currently) a NOP at the first pass.
4528        * Clever compilers notice this and complain. --jhi */
4529     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4530 #endif
4531     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4532     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4533         RExC_precomp = NULL;
4534         return(NULL);
4535     }
4536
4537     /* Here, finished first pass.  Get rid of any added setjmp */
4538     if (used_setjump) {
4539         JMPENV_POP;
4540     }
4541
4542     DEBUG_PARSE_r({
4543         PerlIO_printf(Perl_debug_log,
4544             "Required size %"IVdf" nodes\n"
4545             "Starting second pass (creation)\n",
4546             (IV)RExC_size);
4547         RExC_lastnum=0;
4548         RExC_lastparse=NULL;
4549     });
4550
4551     /* The first pass could have found things that force Unicode semantics */
4552     if ((RExC_utf8 || RExC_uni_semantics)
4553          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4554     {
4555         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4556     }
4557
4558     /* Small enough for pointer-storage convention?
4559        If extralen==0, this means that we will not need long jumps. */
4560     if (RExC_size >= 0x10000L && RExC_extralen)
4561         RExC_size += RExC_extralen;
4562     else
4563         RExC_extralen = 0;
4564     if (RExC_whilem_seen > 15)
4565         RExC_whilem_seen = 15;
4566
4567     /* Allocate space and zero-initialize. Note, the two step process
4568        of zeroing when in debug mode, thus anything assigned has to
4569        happen after that */
4570     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4571     r = (struct regexp*)SvANY(rx);
4572     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4573          char, regexp_internal);
4574     if ( r == NULL || ri == NULL )
4575         FAIL("Regexp out of space");
4576 #ifdef DEBUGGING
4577     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4578     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4579 #else
4580     /* bulk initialize base fields with 0. */
4581     Zero(ri, sizeof(regexp_internal), char);
4582 #endif
4583
4584     /* non-zero initialization begins here */
4585     RXi_SET( r, ri );
4586     r->engine= RE_ENGINE_PTR;
4587     r->extflags = pm_flags;
4588     {
4589         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4590         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4591
4592         /* The caret is output if there are any defaults: if not all the STD
4593          * flags are set, or if no character set specifier is needed */
4594         bool has_default =
4595                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4596                     || ! has_charset);
4597         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4598         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4599                             >> RXf_PMf_STD_PMMOD_SHIFT);
4600         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4601         char *p;
4602         /* Allocate for the worst case, which is all the std flags are turned
4603          * on.  If more precision is desired, we could do a population count of
4604          * the flags set.  This could be done with a small lookup table, or by
4605          * shifting, masking and adding, or even, when available, assembly
4606          * language for a machine-language population count.
4607          * We never output a minus, as all those are defaults, so are
4608          * covered by the caret */
4609         const STRLEN wraplen = plen + has_p + has_runon
4610             + has_default       /* If needs a caret */
4611
4612                 /* If needs a character set specifier */
4613             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4614             + (sizeof(STD_PAT_MODS) - 1)
4615             + (sizeof("(?:)") - 1);
4616
4617         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4618         SvPOK_on(rx);
4619         SvFLAGS(rx) |= SvUTF8(pattern);
4620         *p++='('; *p++='?';
4621
4622         /* If a default, cover it using the caret */
4623         if (has_default) {
4624             *p++= DEFAULT_PAT_MOD;
4625         }
4626         if (has_charset) {
4627             STRLEN len;
4628             const char* const name = get_regex_charset_name(r->extflags, &len);
4629             Copy(name, p, len, char);
4630             p += len;
4631         }
4632         if (has_p)
4633             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4634         {
4635             char ch;
4636             while((ch = *fptr++)) {
4637                 if(reganch & 1)
4638                     *p++ = ch;
4639                 reganch >>= 1;
4640             }
4641         }
4642
4643         *p++ = ':';
4644         Copy(RExC_precomp, p, plen, char);
4645         assert ((RX_WRAPPED(rx) - p) < 16);
4646         r->pre_prefix = p - RX_WRAPPED(rx);
4647         p += plen;
4648         if (has_runon)
4649             *p++ = '\n';
4650         *p++ = ')';
4651         *p = 0;
4652         SvCUR_set(rx, p - SvPVX_const(rx));
4653     }
4654
4655     r->intflags = 0;
4656     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4657
4658     if (RExC_seen & REG_SEEN_RECURSE) {
4659         Newxz(RExC_open_parens, RExC_npar,regnode *);
4660         SAVEFREEPV(RExC_open_parens);
4661         Newxz(RExC_close_parens,RExC_npar,regnode *);
4662         SAVEFREEPV(RExC_close_parens);
4663     }
4664
4665     /* Useful during FAIL. */
4666 #ifdef RE_TRACK_PATTERN_OFFSETS
4667     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4668     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4669                           "%s %"UVuf" bytes for offset annotations.\n",
4670                           ri->u.offsets ? "Got" : "Couldn't get",
4671                           (UV)((2*RExC_size+1) * sizeof(U32))));
4672 #endif
4673     SetProgLen(ri,RExC_size);
4674     RExC_rx_sv = rx;
4675     RExC_rx = r;
4676     RExC_rxi = ri;
4677     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4678
4679     /* Second pass: emit code. */
4680     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4681     RExC_parse = exp;
4682     RExC_end = xend;
4683     RExC_naughty = 0;
4684     RExC_npar = 1;
4685     RExC_emit_start = ri->program;
4686     RExC_emit = ri->program;
4687     RExC_emit_bound = ri->program + RExC_size + 1;
4688
4689     /* Store the count of eval-groups for security checks: */
4690     RExC_rx->seen_evals = RExC_seen_evals;
4691     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4692     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4693         ReREFCNT_dec(rx);
4694         return(NULL);
4695     }
4696     /* XXXX To minimize changes to RE engine we always allocate
4697        3-units-long substrs field. */
4698     Newx(r->substrs, 1, struct reg_substr_data);
4699     if (RExC_recurse_count) {
4700         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4701         SAVEFREEPV(RExC_recurse);
4702     }
4703
4704 reStudy:
4705     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4706     Zero(r->substrs, 1, struct reg_substr_data);
4707
4708 #ifdef TRIE_STUDY_OPT
4709     if (!restudied) {
4710         StructCopy(&zero_scan_data, &data, scan_data_t);
4711         copyRExC_state = RExC_state;
4712     } else {
4713         U32 seen=RExC_seen;
4714         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4715
4716         RExC_state = copyRExC_state;
4717         if (seen & REG_TOP_LEVEL_BRANCHES)
4718             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4719         else
4720             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4721         if (data.last_found) {
4722             SvREFCNT_dec(data.longest_fixed);
4723             SvREFCNT_dec(data.longest_float);
4724             SvREFCNT_dec(data.last_found);
4725         }
4726         StructCopy(&zero_scan_data, &data, scan_data_t);
4727     }
4728 #else
4729     StructCopy(&zero_scan_data, &data, scan_data_t);
4730 #endif
4731
4732     /* Dig out information for optimizations. */
4733     r->extflags = RExC_flags; /* was pm_op */
4734     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4735
4736     if (UTF)
4737         SvUTF8_on(rx);  /* Unicode in it? */
4738     ri->regstclass = NULL;
4739     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4740         r->intflags |= PREGf_NAUGHTY;
4741     scan = ri->program + 1;             /* First BRANCH. */
4742
4743     /* testing for BRANCH here tells us whether there is "must appear"
4744        data in the pattern. If there is then we can use it for optimisations */
4745     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4746         I32 fake;
4747         STRLEN longest_float_length, longest_fixed_length;
4748         struct regnode_charclass_class ch_class; /* pointed to by data */
4749         int stclass_flag;
4750         I32 last_close = 0; /* pointed to by data */
4751         regnode *first= scan;
4752         regnode *first_next= regnext(first);
4753         /*
4754          * Skip introductions and multiplicators >= 1
4755          * so that we can extract the 'meat' of the pattern that must
4756          * match in the large if() sequence following.
4757          * NOTE that EXACT is NOT covered here, as it is normally
4758          * picked up by the optimiser separately.
4759          *
4760          * This is unfortunate as the optimiser isnt handling lookahead
4761          * properly currently.
4762          *
4763          */
4764         while ((OP(first) == OPEN && (sawopen = 1)) ||
4765                /* An OR of *one* alternative - should not happen now. */
4766             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4767             /* for now we can't handle lookbehind IFMATCH*/
4768             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4769             (OP(first) == PLUS) ||
4770             (OP(first) == MINMOD) ||
4771                /* An {n,m} with n>0 */
4772             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4773             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4774         {
4775                 /*
4776                  * the only op that could be a regnode is PLUS, all the rest
4777                  * will be regnode_1 or regnode_2.
4778                  *
4779                  */
4780                 if (OP(first) == PLUS)
4781                     sawplus = 1;
4782                 else
4783                     first += regarglen[OP(first)];
4784
4785                 first = NEXTOPER(first);
4786                 first_next= regnext(first);
4787         }
4788
4789         /* Starting-point info. */
4790       again:
4791         DEBUG_PEEP("first:",first,0);
4792         /* Ignore EXACT as we deal with it later. */
4793         if (PL_regkind[OP(first)] == EXACT) {
4794             if (OP(first) == EXACT)
4795                 NOOP;   /* Empty, get anchored substr later. */
4796             else
4797                 ri->regstclass = first;
4798         }
4799 #ifdef TRIE_STCLASS
4800         else if (PL_regkind[OP(first)] == TRIE &&
4801                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4802         {
4803             regnode *trie_op;
4804             /* this can happen only on restudy */
4805             if ( OP(first) == TRIE ) {
4806                 struct regnode_1 *trieop = (struct regnode_1 *)
4807                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4808                 StructCopy(first,trieop,struct regnode_1);
4809                 trie_op=(regnode *)trieop;
4810             } else {
4811                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4812                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4813                 StructCopy(first,trieop,struct regnode_charclass);
4814                 trie_op=(regnode *)trieop;
4815             }
4816             OP(trie_op)+=2;
4817             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4818             ri->regstclass = trie_op;
4819         }
4820 #endif
4821         else if (REGNODE_SIMPLE(OP(first)))
4822             ri->regstclass = first;
4823         else if (PL_regkind[OP(first)] == BOUND ||
4824                  PL_regkind[OP(first)] == NBOUND)
4825             ri->regstclass = first;
4826         else if (PL_regkind[OP(first)] == BOL) {
4827             r->extflags |= (OP(first) == MBOL
4828                            ? RXf_ANCH_MBOL
4829                            : (OP(first) == SBOL
4830                               ? RXf_ANCH_SBOL
4831                               : RXf_ANCH_BOL));
4832             first = NEXTOPER(first);
4833             goto again;
4834         }
4835         else if (OP(first) == GPOS) {
4836             r->extflags |= RXf_ANCH_GPOS;
4837             first = NEXTOPER(first);
4838             goto again;
4839         }
4840         else if ((!sawopen || !RExC_sawback) &&
4841             (OP(first) == STAR &&
4842             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4843             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4844         {
4845             /* turn .* into ^.* with an implied $*=1 */
4846             const int type =
4847                 (OP(NEXTOPER(first)) == REG_ANY)
4848                     ? RXf_ANCH_MBOL
4849                     : RXf_ANCH_SBOL;
4850             r->extflags |= type;
4851             r->intflags |= PREGf_IMPLICIT;
4852             first = NEXTOPER(first);
4853             goto again;
4854         }
4855         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4856             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4857             /* x+ must match at the 1st pos of run of x's */
4858             r->intflags |= PREGf_SKIP;
4859
4860         /* Scan is after the zeroth branch, first is atomic matcher. */
4861 #ifdef TRIE_STUDY_OPT
4862         DEBUG_PARSE_r(
4863             if (!restudied)
4864                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4865                               (IV)(first - scan + 1))
4866         );
4867 #else
4868         DEBUG_PARSE_r(
4869             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4870                 (IV)(first - scan + 1))
4871         );
4872 #endif
4873
4874
4875         /*
4876         * If there's something expensive in the r.e., find the
4877         * longest literal string that must appear and make it the
4878         * regmust.  Resolve ties in favor of later strings, since
4879         * the regstart check works with the beginning of the r.e.
4880         * and avoiding duplication strengthens checking.  Not a
4881         * strong reason, but sufficient in the absence of others.
4882         * [Now we resolve ties in favor of the earlier string if
4883         * it happens that c_offset_min has been invalidated, since the
4884         * earlier string may buy us something the later one won't.]
4885         */
4886
4887         data.longest_fixed = newSVpvs("");
4888         data.longest_float = newSVpvs("");
4889         data.last_found = newSVpvs("");
4890         data.longest = &(data.longest_fixed);
4891         first = scan;
4892         if (!ri->regstclass) {
4893             cl_init(pRExC_state, &ch_class);
4894             data.start_class = &ch_class;
4895             stclass_flag = SCF_DO_STCLASS_AND;
4896         } else                          /* XXXX Check for BOUND? */
4897             stclass_flag = 0;
4898         data.last_closep = &last_close;
4899
4900         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4901             &data, -1, NULL, NULL,
4902             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4903
4904
4905         CHECK_RESTUDY_GOTO;
4906
4907
4908         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4909              && data.last_start_min == 0 && data.last_end > 0
4910              && !RExC_seen_zerolen
4911              && !(RExC_seen & REG_SEEN_VERBARG)
4912              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4913             r->extflags |= RXf_CHECK_ALL;
4914         scan_commit(pRExC_state, &data,&minlen,0);
4915         SvREFCNT_dec(data.last_found);
4916
4917         /* Note that code very similar to this but for anchored string
4918            follows immediately below, changes may need to be made to both.
4919            Be careful.
4920          */
4921         longest_float_length = CHR_SVLEN(data.longest_float);
4922         if (longest_float_length
4923             || (data.flags & SF_FL_BEFORE_EOL
4924                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4925                     || (RExC_flags & RXf_PMf_MULTILINE))))
4926         {
4927             I32 t,ml;
4928
4929             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4930                 && data.offset_fixed == data.offset_float_min
4931                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4932                     goto remove_float;          /* As in (a)+. */
4933
4934             /* copy the information about the longest float from the reg_scan_data
4935                over to the program. */
4936             if (SvUTF8(data.longest_float)) {
4937                 r->float_utf8 = data.longest_float;
4938                 r->float_substr = NULL;
4939             } else {
4940                 r->float_substr = data.longest_float;
4941                 r->float_utf8 = NULL;
4942             }
4943             /* float_end_shift is how many chars that must be matched that
4944                follow this item. We calculate it ahead of time as once the
4945                lookbehind offset is added in we lose the ability to correctly
4946                calculate it.*/
4947             ml = data.minlen_float ? *(data.minlen_float)
4948                                    : (I32)longest_float_length;
4949             r->float_end_shift = ml - data.offset_float_min
4950                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4951                 + data.lookbehind_float;
4952             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4953             r->float_max_offset = data.offset_float_max;
4954             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4955                 r->float_max_offset -= data.lookbehind_float;
4956
4957             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4958                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4959                            || (RExC_flags & RXf_PMf_MULTILINE)));
4960             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4961         }
4962         else {
4963           remove_float:
4964             r->float_substr = r->float_utf8 = NULL;
4965             SvREFCNT_dec(data.longest_float);
4966             longest_float_length = 0;
4967         }
4968
4969         /* Note that code very similar to this but for floating string
4970            is immediately above, changes may need to be made to both.
4971            Be careful.
4972          */
4973         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4974         if (longest_fixed_length
4975             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4976                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4977                     || (RExC_flags & RXf_PMf_MULTILINE))))
4978         {
4979             I32 t,ml;
4980
4981             /* copy the information about the longest fixed
4982                from the reg_scan_data over to the program. */
4983             if (SvUTF8(data.longest_fixed)) {
4984                 r->anchored_utf8 = data.longest_fixed;
4985                 r->anchored_substr = NULL;
4986             } else {
4987                 r->anchored_substr = data.longest_fixed;
4988                 r->anchored_utf8 = NULL;
4989             }
4990             /* fixed_end_shift is how many chars that must be matched that
4991                follow this item. We calculate it ahead of time as once the
4992                lookbehind offset is added in we lose the ability to correctly
4993                calculate it.*/
4994             ml = data.minlen_fixed ? *(data.minlen_fixed)
4995                                    : (I32)longest_fixed_length;
4996             r->anchored_end_shift = ml - data.offset_fixed
4997                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4998                 + data.lookbehind_fixed;
4999             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5000
5001             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5002                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5003                      || (RExC_flags & RXf_PMf_MULTILINE)));
5004             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5005         }
5006         else {
5007             r->anchored_substr = r->anchored_utf8 = NULL;
5008             SvREFCNT_dec(data.longest_fixed);
5009             longest_fixed_length = 0;
5010         }
5011         if (ri->regstclass
5012             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5013             ri->regstclass = NULL;
5014
5015         /* If the synthetic start class were to ever be used when EOS is set,
5016          * that bit would have to be cleared, as it is shared with another */
5017         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5018             && stclass_flag
5019             && !(data.start_class->flags & ANYOF_EOS)
5020             && !cl_is_anything(data.start_class))
5021         {
5022             const U32 n = add_data(pRExC_state, 1, "f");
5023
5024             Newx(RExC_rxi->data->data[n], 1,
5025                 struct regnode_charclass_class);
5026             StructCopy(data.start_class,
5027                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5028                        struct regnode_charclass_class);
5029             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5030             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5031             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5032                       regprop(r, sv, (regnode*)data.start_class);
5033                       PerlIO_printf(Perl_debug_log,
5034                                     "synthetic stclass \"%s\".\n",
5035                                     SvPVX_const(sv));});
5036         }
5037
5038         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5039         if (longest_fixed_length > longest_float_length) {
5040             r->check_end_shift = r->anchored_end_shift;
5041             r->check_substr = r->anchored_substr;
5042             r->check_utf8 = r->anchored_utf8;
5043             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5044             if (r->extflags & RXf_ANCH_SINGLE)
5045                 r->extflags |= RXf_NOSCAN;
5046         }
5047         else {
5048             r->check_end_shift = r->float_end_shift;
5049             r->check_substr = r->float_substr;
5050             r->check_utf8 = r->float_utf8;
5051             r->check_offset_min = r->float_min_offset;
5052             r->check_offset_max = r->float_max_offset;
5053         }
5054         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5055            This should be changed ASAP!  */
5056         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5057             r->extflags |= RXf_USE_INTUIT;
5058             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5059                 r->extflags |= RXf_INTUIT_TAIL;
5060         }
5061         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5062         if ( (STRLEN)minlen < longest_float_length )
5063             minlen= longest_float_length;
5064         if ( (STRLEN)minlen < longest_fixed_length )
5065             minlen= longest_fixed_length;
5066         */
5067     }
5068     else {
5069         /* Several toplevels. Best we can is to set minlen. */
5070         I32 fake;
5071         struct regnode_charclass_class ch_class;
5072         I32 last_close = 0;
5073
5074         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5075
5076         scan = ri->program + 1;
5077         cl_init(pRExC_state, &ch_class);
5078         data.start_class = &ch_class;
5079         data.last_closep = &last_close;
5080
5081
5082         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5083             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5084
5085         CHECK_RESTUDY_GOTO;
5086
5087         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5088                 = r->float_substr = r->float_utf8 = NULL;
5089
5090         /* If the synthetic start class were to ever be used when EOS is set,
5091          * that bit would have to be cleared, as it is shared with another */
5092         if (!(data.start_class->flags & ANYOF_EOS)
5093             && !cl_is_anything(data.start_class))
5094         {
5095             const U32 n = add_data(pRExC_state, 1, "f");
5096
5097             Newx(RExC_rxi->data->data[n], 1,
5098                 struct regnode_charclass_class);
5099             StructCopy(data.start_class,
5100                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5101                        struct regnode_charclass_class);
5102             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5103             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5104             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5105                       regprop(r, sv, (regnode*)data.start_class);
5106                       PerlIO_printf(Perl_debug_log,
5107                                     "synthetic stclass \"%s\".\n",
5108                                     SvPVX_const(sv));});
5109         }
5110     }
5111
5112     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5113        the "real" pattern. */
5114     DEBUG_OPTIMISE_r({
5115         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5116                       (IV)minlen, (IV)r->minlen);
5117     });
5118     r->minlenret = minlen;
5119     if (r->minlen < minlen)
5120         r->minlen = minlen;
5121
5122     if (RExC_seen & REG_SEEN_GPOS)
5123         r->extflags |= RXf_GPOS_SEEN;
5124     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5125         r->extflags |= RXf_LOOKBEHIND_SEEN;
5126     if (RExC_seen & REG_SEEN_EVAL)
5127         r->extflags |= RXf_EVAL_SEEN;
5128     if (RExC_seen & REG_SEEN_CANY)
5129         r->extflags |= RXf_CANY_SEEN;
5130     if (RExC_seen & REG_SEEN_VERBARG)
5131         r->intflags |= PREGf_VERBARG_SEEN;
5132     if (RExC_seen & REG_SEEN_CUTGROUP)
5133         r->intflags |= PREGf_CUTGROUP_SEEN;
5134     if (RExC_paren_names)
5135         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5136     else
5137         RXp_PAREN_NAMES(r) = NULL;
5138
5139 #ifdef STUPID_PATTERN_CHECKS
5140     if (RX_PRELEN(rx) == 0)
5141         r->extflags |= RXf_NULL;
5142     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5143         /* XXX: this should happen BEFORE we compile */
5144         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5145     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5146         r->extflags |= RXf_WHITE;
5147     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5148         r->extflags |= RXf_START_ONLY;
5149 #else
5150     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5151             /* XXX: this should happen BEFORE we compile */
5152             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5153     else {
5154         regnode *first = ri->program + 1;
5155         U8 fop = OP(first);
5156
5157         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5158             r->extflags |= RXf_NULL;
5159         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5160             r->extflags |= RXf_START_ONLY;
5161         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5162                              && OP(regnext(first)) == END)
5163             r->extflags |= RXf_WHITE;
5164     }
5165 #endif
5166 #ifdef DEBUGGING
5167     if (RExC_paren_names) {
5168         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5169         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5170     } else
5171 #endif
5172         ri->name_list_idx = 0;
5173
5174     if (RExC_recurse_count) {
5175         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5176             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5177             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5178         }
5179     }
5180     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5181     /* assume we don't need to swap parens around before we match */
5182
5183     DEBUG_DUMP_r({
5184         PerlIO_printf(Perl_debug_log,"Final program:\n");
5185         regdump(r);
5186     });
5187 #ifdef RE_TRACK_PATTERN_OFFSETS
5188     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5189         const U32 len = ri->u.offsets[0];
5190         U32 i;
5191         GET_RE_DEBUG_FLAGS_DECL;
5192         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5193         for (i = 1; i <= len; i++) {
5194             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5195                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5196                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5197             }
5198         PerlIO_printf(Perl_debug_log, "\n");
5199     });
5200 #endif
5201     return rx;
5202 }
5203
5204 #undef RE_ENGINE_PTR
5205
5206
5207 SV*
5208 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5209                     const U32 flags)
5210 {
5211     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5212
5213     PERL_UNUSED_ARG(value);
5214
5215     if (flags & RXapif_FETCH) {
5216         return reg_named_buff_fetch(rx, key, flags);
5217     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5218         Perl_croak_no_modify(aTHX);
5219         return NULL;
5220     } else if (flags & RXapif_EXISTS) {
5221         return reg_named_buff_exists(rx, key, flags)
5222             ? &PL_sv_yes
5223             : &PL_sv_no;
5224     } else if (flags & RXapif_REGNAMES) {
5225         return reg_named_buff_all(rx, flags);
5226     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5227         return reg_named_buff_scalar(rx, flags);
5228     } else {
5229         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5230         return NULL;
5231     }
5232 }
5233
5234 SV*
5235 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5236                          const U32 flags)
5237 {
5238     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5239     PERL_UNUSED_ARG(lastkey);
5240
5241     if (flags & RXapif_FIRSTKEY)
5242         return reg_named_buff_firstkey(rx, flags);
5243     else if (flags & RXapif_NEXTKEY)
5244         return reg_named_buff_nextkey(rx, flags);
5245     else {
5246         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5247         return NULL;
5248     }
5249 }
5250
5251 SV*
5252 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5253                           const U32 flags)
5254 {
5255     AV *retarray = NULL;
5256     SV *ret;
5257     struct regexp *const rx = (struct regexp *)SvANY(r);
5258
5259     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5260
5261     if (flags & RXapif_ALL)
5262         retarray=newAV();
5263
5264     if (rx && RXp_PAREN_NAMES(rx)) {
5265         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5266         if (he_str) {
5267             IV i;
5268             SV* sv_dat=HeVAL(he_str);
5269             I32 *nums=(I32*)SvPVX(sv_dat);
5270             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5271                 if ((I32)(rx->nparens) >= nums[i]
5272                     && rx->offs[nums[i]].start != -1
5273                     && rx->offs[nums[i]].end != -1)
5274                 {
5275                     ret = newSVpvs("");
5276                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5277                     if (!retarray)
5278                         return ret;
5279                 } else {
5280                     ret = newSVsv(&PL_sv_undef);
5281                 }
5282                 if (retarray)
5283                     av_push(retarray, ret);
5284             }
5285             if (retarray)
5286                 return newRV_noinc(MUTABLE_SV(retarray));
5287         }
5288     }
5289     return NULL;
5290 }
5291
5292 bool
5293 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5294                            const U32 flags)
5295 {
5296     struct regexp *const rx = (struct regexp *)SvANY(r);
5297
5298     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5299
5300     if (rx && RXp_PAREN_NAMES(rx)) {
5301         if (flags & RXapif_ALL) {
5302             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5303         } else {
5304             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5305             if (sv) {
5306                 SvREFCNT_dec(sv);
5307                 return TRUE;
5308             } else {
5309                 return FALSE;
5310             }
5311         }
5312     } else {
5313         return FALSE;
5314     }
5315 }
5316
5317 SV*
5318 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5319 {
5320     struct regexp *const rx = (struct regexp *)SvANY(r);
5321
5322     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5323
5324     if ( rx && RXp_PAREN_NAMES(rx) ) {
5325         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5326
5327         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5328     } else {
5329         return FALSE;
5330     }
5331 }
5332
5333 SV*
5334 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5335 {
5336     struct regexp *const rx = (struct regexp *)SvANY(r);
5337     GET_RE_DEBUG_FLAGS_DECL;
5338
5339     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5340
5341     if (rx && RXp_PAREN_NAMES(rx)) {
5342         HV *hv = RXp_PAREN_NAMES(rx);
5343         HE *temphe;
5344         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5345             IV i;
5346             IV parno = 0;
5347             SV* sv_dat = HeVAL(temphe);
5348             I32 *nums = (I32*)SvPVX(sv_dat);
5349             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5350                 if ((I32)(rx->lastparen) >= nums[i] &&
5351                     rx->offs[nums[i]].start != -1 &&
5352                     rx->offs[nums[i]].end != -1)
5353                 {
5354                     parno = nums[i];
5355                     break;
5356                 }
5357             }
5358             if (parno || flags & RXapif_ALL) {
5359                 return newSVhek(HeKEY_hek(temphe));
5360             }
5361         }
5362     }
5363     return NULL;
5364 }
5365
5366 SV*
5367 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5368 {
5369     SV *ret;
5370     AV *av;
5371     I32 length;
5372     struct regexp *const rx = (struct regexp *)SvANY(r);
5373
5374     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5375
5376     if (rx && RXp_PAREN_NAMES(rx)) {
5377         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5378             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5379         } else if (flags & RXapif_ONE) {
5380             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5381             av = MUTABLE_AV(SvRV(ret));
5382             length = av_len(av);
5383             SvREFCNT_dec(ret);
5384             return newSViv(length + 1);
5385         } else {
5386             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5387             return NULL;
5388         }
5389     }
5390     return &PL_sv_undef;
5391 }
5392
5393 SV*
5394 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5395 {
5396     struct regexp *const rx = (struct regexp *)SvANY(r);
5397     AV *av = newAV();
5398
5399     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5400
5401     if (rx && RXp_PAREN_NAMES(rx)) {
5402         HV *hv= RXp_PAREN_NAMES(rx);
5403         HE *temphe;
5404         (void)hv_iterinit(hv);
5405         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5406             IV i;
5407             IV parno = 0;
5408             SV* sv_dat = HeVAL(temphe);
5409             I32 *nums = (I32*)SvPVX(sv_dat);
5410             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5411                 if ((I32)(rx->lastparen) >= nums[i] &&
5412                     rx->offs[nums[i]].start != -1 &&
5413                     rx->offs[nums[i]].end != -1)
5414                 {
5415                     parno = nums[i];
5416                     break;
5417                 }
5418             }
5419             if (parno || flags & RXapif_ALL) {
5420                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5421             }
5422         }
5423     }
5424
5425     return newRV_noinc(MUTABLE_SV(av));
5426 }
5427
5428 void
5429 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5430                              SV * const sv)
5431 {
5432     struct regexp *const rx = (struct regexp *)SvANY(r);
5433     char *s = NULL;
5434     I32 i = 0;
5435     I32 s1, t1;
5436
5437     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5438
5439     if (!rx->subbeg) {
5440         sv_setsv(sv,&PL_sv_undef);
5441         return;
5442     }
5443     else
5444     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5445         /* $` */
5446         i = rx->offs[0].start;
5447         s = rx->subbeg;
5448     }
5449     else
5450     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5451         /* $' */
5452         s = rx->subbeg + rx->offs[0].end;
5453         i = rx->sublen - rx->offs[0].end;
5454     }
5455     else
5456     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5457         (s1 = rx->offs[paren].start) != -1 &&
5458         (t1 = rx->offs[paren].end) != -1)
5459     {
5460         /* $& $1 ... */
5461         i = t1 - s1;
5462         s = rx->subbeg + s1;
5463     } else {
5464         sv_setsv(sv,&PL_sv_undef);
5465         return;
5466     }
5467     assert(rx->sublen >= (s - rx->subbeg) + i );
5468     if (i >= 0) {
5469         const int oldtainted = PL_tainted;
5470         TAINT_NOT;
5471         sv_setpvn(sv, s, i);
5472         PL_tainted = oldtainted;
5473         if ( (rx->extflags & RXf_CANY_SEEN)
5474             ? (RXp_MATCH_UTF8(rx)
5475                         && (!i || is_utf8_string((U8*)s, i)))
5476             : (RXp_MATCH_UTF8(rx)) )
5477         {
5478             SvUTF8_on(sv);
5479         }
5480         else
5481             SvUTF8_off(sv);
5482         if (PL_tainting) {
5483             if (RXp_MATCH_TAINTED(rx)) {
5484                 if (SvTYPE(sv) >= SVt_PVMG) {
5485                     MAGIC* const mg = SvMAGIC(sv);
5486                     MAGIC* mgt;
5487                     PL_tainted = 1;
5488                     SvMAGIC_set(sv, mg->mg_moremagic);
5489                     SvTAINT(sv);
5490                     if ((mgt = SvMAGIC(sv))) {
5491                         mg->mg_moremagic = mgt;
5492                         SvMAGIC_set(sv, mg);
5493                     }
5494                 } else {
5495                     PL_tainted = 1;
5496                     SvTAINT(sv);
5497                 }
5498             } else
5499                 SvTAINTED_off(sv);
5500         }
5501     } else {
5502         sv_setsv(sv,&PL_sv_undef);
5503         return;
5504     }
5505 }
5506
5507 void
5508 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5509                                                          SV const * const value)
5510 {
5511     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5512
5513     PERL_UNUSED_ARG(rx);
5514     PERL_UNUSED_ARG(paren);
5515     PERL_UNUSED_ARG(value);
5516
5517     if (!PL_localizing)
5518         Perl_croak_no_modify(aTHX);
5519 }
5520
5521 I32
5522 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5523                               const I32 paren)
5524 {
5525     struct regexp *const rx = (struct regexp *)SvANY(r);
5526     I32 i;
5527     I32 s1, t1;
5528
5529     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5530
5531     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5532         switch (paren) {
5533       /* $` / ${^PREMATCH} */
5534       case RX_BUFF_IDX_PREMATCH:
5535         if (rx->offs[0].start != -1) {
5536                         i = rx->offs[0].start;
5537                         if (i > 0) {
5538                                 s1 = 0;
5539                                 t1 = i;
5540                                 goto getlen;
5541                         }
5542             }
5543         return 0;
5544       /* $' / ${^POSTMATCH} */
5545       case RX_BUFF_IDX_POSTMATCH:
5546             if (rx->offs[0].end != -1) {
5547                         i = rx->sublen - rx->offs[0].end;
5548                         if (i > 0) {
5549                                 s1 = rx->offs[0].end;
5550                                 t1 = rx->sublen;
5551                                 goto getlen;
5552                         }
5553             }
5554         return 0;
5555       /* $& / ${^MATCH}, $1, $2, ... */
5556       default:
5557             if (paren <= (I32)rx->nparens &&
5558             (s1 = rx->offs[paren].start) != -1 &&
5559             (t1 = rx->offs[paren].end) != -1)
5560             {
5561             i = t1 - s1;
5562             goto getlen;
5563         } else {
5564             if (ckWARN(WARN_UNINITIALIZED))
5565                 report_uninit((const SV *)sv);
5566             return 0;
5567         }
5568     }
5569   getlen:
5570     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5571         const char * const s = rx->subbeg + s1;
5572         const U8 *ep;
5573         STRLEN el;
5574
5575         i = t1 - s1;
5576         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5577                         i = el;
5578     }
5579     return i;
5580 }
5581
5582 SV*
5583 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5584 {
5585     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5586         PERL_UNUSED_ARG(rx);
5587         if (0)
5588             return NULL;
5589         else
5590             return newSVpvs("Regexp");
5591 }
5592
5593 /* Scans the name of a named buffer from the pattern.
5594  * If flags is REG_RSN_RETURN_NULL returns null.
5595  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5596  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5597  * to the parsed name as looked up in the RExC_paren_names hash.
5598  * If there is an error throws a vFAIL().. type exception.
5599  */
5600
5601 #define REG_RSN_RETURN_NULL    0
5602 #define REG_RSN_RETURN_NAME    1
5603 #define REG_RSN_RETURN_DATA    2
5604
5605 STATIC SV*
5606 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5607 {
5608     char *name_start = RExC_parse;
5609
5610     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5611
5612     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5613          /* skip IDFIRST by using do...while */
5614         if (UTF)
5615             do {
5616                 RExC_parse += UTF8SKIP(RExC_parse);
5617             } while (isALNUM_utf8((U8*)RExC_parse));
5618         else
5619             do {
5620                 RExC_parse++;
5621             } while (isALNUM(*RExC_parse));
5622     }
5623
5624     if ( flags ) {
5625         SV* sv_name
5626             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5627                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5628         if ( flags == REG_RSN_RETURN_NAME)
5629             return sv_name;
5630         else if (flags==REG_RSN_RETURN_DATA) {
5631             HE *he_str = NULL;
5632             SV *sv_dat = NULL;
5633             if ( ! sv_name )      /* should not happen*/
5634                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5635             if (RExC_paren_names)
5636                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5637             if ( he_str )
5638                 sv_dat = HeVAL(he_str);
5639             if ( ! sv_dat )
5640                 vFAIL("Reference to nonexistent named group");
5641             return sv_dat;
5642         }
5643         else {
5644             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5645         }
5646         /* NOT REACHED */
5647     }
5648     return NULL;
5649 }
5650
5651 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5652     int rem=(int)(RExC_end - RExC_parse);                       \
5653     int cut;                                                    \
5654     int num;                                                    \
5655     int iscut=0;                                                \
5656     if (rem>10) {                                               \
5657         rem=10;                                                 \
5658         iscut=1;                                                \
5659     }                                                           \
5660     cut=10-rem;                                                 \
5661     if (RExC_lastparse!=RExC_parse)                             \
5662         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5663             rem, RExC_parse,                                    \
5664             cut + 4,                                            \
5665             iscut ? "..." : "<"                                 \
5666         );                                                      \
5667     else                                                        \
5668         PerlIO_printf(Perl_debug_log,"%16s","");                \
5669                                                                 \
5670     if (SIZE_ONLY)                                              \
5671        num = RExC_size + 1;                                     \
5672     else                                                        \
5673        num=REG_NODE_NUM(RExC_emit);                             \
5674     if (RExC_lastnum!=num)                                      \
5675        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5676     else                                                        \
5677        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5678     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5679         (int)((depth*2)), "",                                   \
5680         (funcname)                                              \
5681     );                                                          \
5682     RExC_lastnum=num;                                           \
5683     RExC_lastparse=RExC_parse;                                  \
5684 })
5685
5686
5687
5688 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5689     DEBUG_PARSE_MSG((funcname));                            \
5690     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5691 })
5692 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5693     DEBUG_PARSE_MSG((funcname));                            \
5694     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5695 })
5696
5697 /* This section of code defines the inversion list object and its methods.  The
5698  * interfaces are highly subject to change, so as much as possible is static to
5699  * this file.  An inversion list is here implemented as a malloc'd C array with
5700  * some added info.  More will be coming when functionality is added later.
5701  *
5702  * Some of the methods should always be private to the implementation, and some
5703  * should eventually be made public */
5704
5705 #define INVLIST_INITIAL_LEN 10
5706 #define INVLIST_ARRAY_KEY "array"
5707 #define INVLIST_MAX_KEY "max"
5708 #define INVLIST_LEN_KEY "len"
5709
5710 PERL_STATIC_INLINE UV*
5711 S_invlist_array(pTHX_ HV* const invlist)
5712 {
5713     /* Returns the pointer to the inversion list's array.  Every time the
5714      * length changes, this needs to be called in case malloc or realloc moved
5715      * it */
5716
5717     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5718
5719     PERL_ARGS_ASSERT_INVLIST_ARRAY;
5720
5721     if (list_ptr == NULL) {
5722         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5723                                                             INVLIST_ARRAY_KEY);
5724     }
5725
5726     return INT2PTR(UV *, SvUV(*list_ptr));
5727 }
5728
5729 PERL_STATIC_INLINE void
5730 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5731 {
5732     PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5733
5734     /* Sets the array stored in the inversion list to the memory beginning with
5735      * the parameter */
5736
5737     if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5738         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5739                                                             INVLIST_ARRAY_KEY);
5740     }
5741 }
5742
5743 PERL_STATIC_INLINE UV
5744 S_invlist_len(pTHX_ HV* const invlist)
5745 {
5746     /* Returns the current number of elements in the inversion list's array */
5747
5748     SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5749
5750     PERL_ARGS_ASSERT_INVLIST_LEN;
5751
5752     if (len_ptr == NULL) {
5753         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5754                                                             INVLIST_LEN_KEY);
5755     }
5756
5757     return SvUV(*len_ptr);
5758 }
5759
5760 PERL_STATIC_INLINE UV
5761 S_invlist_max(pTHX_ HV* const invlist)
5762 {
5763     /* Returns the maximum number of elements storable in the inversion list's
5764      * array, without having to realloc() */
5765
5766     SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5767
5768     PERL_ARGS_ASSERT_INVLIST_MAX;
5769
5770     if (max_ptr == NULL) {
5771         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5772                                                             INVLIST_MAX_KEY);
5773     }
5774
5775     return SvUV(*max_ptr);
5776 }
5777
5778 PERL_STATIC_INLINE void
5779 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5780 {
5781     /* Sets the current number of elements stored in the inversion list */
5782
5783     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5784
5785     if (len != 0 && len > invlist_max(invlist)) {
5786         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5787     }
5788
5789     if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5790         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5791                                                             INVLIST_LEN_KEY);
5792     }
5793 }
5794
5795 PERL_STATIC_INLINE void
5796 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5797 {
5798
5799     /* Sets the maximum number of elements storable in the inversion list
5800      * without having to realloc() */
5801
5802     PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5803
5804     if (max < invlist_len(invlist)) {
5805         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5806     }
5807
5808     if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5809         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5810                                                             INVLIST_LEN_KEY);
5811     }
5812 }
5813
5814 #ifndef PERL_IN_XSUB_RE
5815 HV*
5816 Perl__new_invlist(pTHX_ IV initial_size)
5817 {
5818
5819     /* Return a pointer to a newly constructed inversion list, with enough
5820      * space to store 'initial_size' elements.  If that number is negative, a
5821      * system default is used instead */
5822
5823     HV* invlist = newHV();
5824     UV* list;
5825
5826     if (initial_size < 0) {
5827         initial_size = INVLIST_INITIAL_LEN;
5828     }
5829
5830     /* Allocate the initial space */
5831     Newx(list, initial_size, UV);
5832     invlist_set_array(invlist, list);
5833
5834     /* set_len has to come before set_max, as the latter inspects the len */
5835     invlist_set_len(invlist, 0);
5836     invlist_set_max(invlist, initial_size);
5837
5838     return invlist;
5839 }
5840 #endif
5841
5842 PERL_STATIC_INLINE void
5843 S_invlist_destroy(pTHX_ HV* const invlist)
5844 {
5845    /* Inversion list destructor */
5846
5847     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5848
5849     PERL_ARGS_ASSERT_INVLIST_DESTROY;
5850
5851     if (list_ptr != NULL) {
5852         UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5853         Safefree(list);
5854     }
5855 }
5856
5857 STATIC void
5858 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5859 {
5860     /* Change the maximum size of an inversion list (up or down) */
5861
5862     UV* orig_array;
5863     UV* array;
5864     const UV old_max = invlist_max(invlist);
5865
5866     PERL_ARGS_ASSERT_INVLIST_EXTEND;
5867
5868     if (old_max == new_max) {   /* If a no-op */
5869         return;
5870     }
5871
5872     array = orig_array = invlist_array(invlist);
5873     Renew(array, new_max, UV);
5874
5875     /* If the size change moved the list in memory, set the new one */
5876     if (array != orig_array) {
5877         invlist_set_array(invlist, array);
5878     }
5879
5880     invlist_set_max(invlist, new_max);
5881
5882 }
5883
5884 PERL_STATIC_INLINE void
5885 S_invlist_trim(pTHX_ HV* const invlist)
5886 {
5887     PERL_ARGS_ASSERT_INVLIST_TRIM;
5888
5889     /* Change the length of the inversion list to how many entries it currently
5890      * has */
5891
5892     invlist_extend(invlist, invlist_len(invlist));
5893 }
5894
5895 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
5896  * etc */
5897
5898 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
5899
5900 #ifndef PERL_IN_XSUB_RE
5901 void
5902 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
5903 {
5904    /* Subject to change or removal.  Append the range from 'start' to 'end' at
5905     * the end of the inversion list.  The range must be above any existing
5906     * ones. */
5907
5908     UV* array = invlist_array(invlist);
5909     UV max = invlist_max(invlist);
5910     UV len = invlist_len(invlist);
5911
5912     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
5913
5914     if (len > 0) {
5915
5916         /* Here, the existing list is non-empty. The current max entry in the
5917          * list is generally the first value not in the set, except when the
5918          * set extends to the end of permissible values, in which case it is
5919          * the first entry in that final set, and so this call is an attempt to
5920          * append out-of-order */
5921
5922         UV final_element = len - 1;
5923         if (array[final_element] > start
5924             || ELEMENT_IN_INVLIST_SET(final_element))
5925         {
5926             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
5927         }
5928
5929         /* Here, it is a legal append.  If the new range begins with the first
5930          * value not in the set, it is extending the set, so the new first
5931          * value not in the set is one greater than the newly extended range.
5932          * */
5933         if (array[final_element] == start) {
5934             if (end != UV_MAX) {
5935                 array[final_element] = end + 1;
5936             }
5937             else {
5938                 /* But if the end is the maximum representable on the machine,
5939                  * just let the range that this would extend have no end */
5940                 invlist_set_len(invlist, len - 1);
5941             }
5942             return;
5943         }
5944     }
5945
5946     /* Here the new range doesn't extend any existing set.  Add it */
5947
5948     len += 2;   /* Includes an element each for the start and end of range */
5949
5950     /* If overflows the existing space, extend, which may cause the array to be
5951      * moved */
5952     if (max < len) {
5953         invlist_extend(invlist, len);
5954         array = invlist_array(invlist);
5955     }
5956
5957     invlist_set_len(invlist, len);
5958
5959     /* The next item on the list starts the range, the one after that is
5960      * one past the new range.  */
5961     array[len - 2] = start;
5962     if (end != UV_MAX) {
5963         array[len - 1] = end + 1;
5964     }
5965     else {
5966         /* But if the end is the maximum representable on the machine, just let
5967          * the range have no end */
5968         invlist_set_len(invlist, len - 1);
5969     }
5970 }
5971 #endif
5972
5973 PERL_STATIC_INLINE HV*
5974 S_invlist_union(pTHX_ HV* const a, HV* const b)
5975 {
5976     /* Return a new inversion list which is the union of two inversion lists.
5977      * The basis for this comes from "Unicode Demystified" Chapter 13 by
5978      * Richard Gillam, published by Addison-Wesley, and explained at some
5979      * length there.  The preface says to incorporate its examples into your
5980      * code at your own risk.
5981      *
5982      * The algorithm is like a merge sort.
5983      *
5984      * XXX A potential performance improvement is to keep track as we go along
5985      * if only one of the inputs contributes to the result, meaning the other
5986      * is a subset of that one.  In that case, we can skip the final copy and
5987      * return the larger of the input lists */
5988
5989     UV* array_a = invlist_array(a);   /* a's array */
5990     UV* array_b = invlist_array(b);
5991     UV len_a = invlist_len(a);  /* length of a's array */
5992     UV len_b = invlist_len(b);
5993
5994     HV* u;                      /* the resulting union */
5995     UV* array_u;
5996     UV len_u;
5997
5998     UV i_a = 0;             /* current index into a's array */
5999     UV i_b = 0;
6000     UV i_u = 0;
6001
6002     /* running count, as explained in the algorithm source book; items are
6003      * stopped accumulating and are output when the count changes to/from 0.
6004      * The count is incremented when we start a range that's in the set, and
6005      * decremented when we start a range that's not in the set.  So its range
6006      * is 0 to 2.  Only when the count is zero is something not in the set.
6007      */
6008     UV count = 0;
6009
6010     PERL_ARGS_ASSERT_INVLIST_UNION;
6011
6012     /* Size the union for the worst case: that the sets are completely
6013      * disjoint */
6014     u = _new_invlist(len_a + len_b);
6015     array_u = invlist_array(u);
6016
6017     /* Go through each list item by item, stopping when exhausted one of
6018      * them */
6019     while (i_a < len_a && i_b < len_b) {
6020         UV cp;      /* The element to potentially add to the union's array */
6021         bool cp_in_set;   /* is it in the the input list's set or not */
6022
6023         /* We need to take one or the other of the two inputs for the union.
6024          * Since we are merging two sorted lists, we take the smaller of the
6025          * next items.  In case of a tie, we take the one that is in its set
6026          * first.  If we took one not in the set first, it would decrement the
6027          * count, possibly to 0 which would cause it to be output as ending the
6028          * range, and the next time through we would take the same number, and
6029          * output it again as beginning the next range.  By doing it the
6030          * opposite way, there is no possibility that the count will be
6031          * momentarily decremented to 0, and thus the two adjoining ranges will
6032          * be seamlessly merged.  (In a tie and both are in the set or both not
6033          * in the set, it doesn't matter which we take first.) */
6034         if (array_a[i_a] < array_b[i_b]
6035             || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6036         {
6037             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6038             cp= array_a[i_a++];
6039         }
6040         else {
6041             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6042             cp= array_b[i_b++];
6043         }
6044
6045         /* Here, have chosen which of the two inputs to look at.  Only output
6046          * if the running count changes to/from 0, which marks the
6047          * beginning/end of a range in that's in the set */
6048         if (cp_in_set) {
6049             if (count == 0) {
6050                 array_u[i_u++] = cp;
6051             }
6052             count++;
6053         }
6054         else {
6055             count--;
6056             if (count == 0) {
6057                 array_u[i_u++] = cp;
6058             }
6059         }
6060     }
6061
6062     /* Here, we are finished going through at least one of the lists, which
6063      * means there is something remaining in at most one.  We check if the list
6064      * that hasn't been exhausted is positioned such that we are in the middle
6065      * of a range in its set or not.  (We are in the set if the next item in
6066      * the array marks the beginning of something not in the set)   If in the
6067      * set, we decrement 'count'; if 0, there is potentially more to output.
6068      * There are four cases:
6069      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6070      *     in the union is entirely from the non-exhausted set.
6071      *  2) Both were in their sets, count is 2.  Nothing further should
6072      *     be output, as everything that remains will be in the exhausted
6073      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6074      *     that
6075      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6076      *     Nothing further should be output because the union includes
6077      *     everything from the exhausted set.  Not decrementing insures that.
6078      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6079      *     decrementing to 0 insures that we look at the remainder of the
6080      *     non-exhausted set */
6081     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6082         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6083     {
6084         count--;
6085     }
6086
6087     /* The final length is what we've output so far, plus what else is about to
6088      * be output.  (If 'count' is non-zero, then the input list we exhausted
6089      * has everything remaining up to the machine's limit in its set, and hence
6090      * in the union, so there will be no further output. */
6091     len_u = i_u;
6092     if (count == 0) {
6093         /* At most one of the subexpressions will be non-zero */
6094         len_u += (len_a - i_a) + (len_b - i_b);
6095     }
6096
6097     /* Set result to final length, which can change the pointer to array_u, so
6098      * re-find it */
6099     if (len_u != invlist_len(u)) {
6100         invlist_set_len(u, len_u);
6101         invlist_trim(u);
6102         array_u = invlist_array(u);
6103     }
6104
6105     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6106      * the other) ended with everything above it not in its set.  That means
6107      * that the remaining part of the union is precisely the same as the
6108      * non-exhausted list, so can just copy it unchanged.  (If both list were
6109      * exhausted at the same time, then the operations below will be both 0.)
6110      */
6111     if (count == 0) {
6112         IV copy_count; /* At most one will have a non-zero copy count */
6113         if ((copy_count = len_a - i_a) > 0) {
6114             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6115         }
6116         else if ((copy_count = len_b - i_b) > 0) {
6117             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6118         }
6119     }
6120
6121     return u;
6122 }
6123
6124 PERL_STATIC_INLINE HV*
6125 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6126 {
6127     /* Return the intersection of two inversion lists.  The basis for this
6128      * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6129      * by Addison-Wesley, and explained at some length there.  The preface says
6130      * to incorporate its examples into your code at your own risk.
6131      *
6132      * The algorithm is like a merge sort, and is essentially the same as the
6133      * union above
6134      */
6135
6136     UV* array_a = invlist_array(a);   /* a's array */
6137     UV* array_b = invlist_array(b);
6138     UV len_a = invlist_len(a);  /* length of a's array */
6139     UV len_b = invlist_len(b);
6140
6141     HV* r;                   /* the resulting intersection */
6142     UV* array_r;
6143     UV len_r;
6144
6145     UV i_a = 0;             /* current index into a's array */
6146     UV i_b = 0;
6147     UV i_r = 0;
6148
6149     /* running count, as explained in the algorithm source book; items are
6150      * stopped accumulating and are output when the count changes to/from 2.
6151      * The count is incremented when we start a range that's in the set, and
6152      * decremented when we start a range that's not in the set.  So its range
6153      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6154      */
6155     UV count = 0;
6156
6157     PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6158
6159     /* Size the intersection for the worst case: that the intersection ends up
6160      * fragmenting everything to be completely disjoint */
6161     r= _new_invlist(len_a + len_b);
6162     array_r = invlist_array(r);
6163
6164     /* Go through each list item by item, stopping when exhausted one of
6165      * them */
6166     while (i_a < len_a && i_b < len_b) {
6167         UV cp;      /* The element to potentially add to the intersection's
6168                        array */
6169         bool cp_in_set; /* Is it in the input list's set or not */
6170
6171         /* We need to take one or the other of the two inputs for the union.
6172          * Since we are merging two sorted lists, we take the smaller of the
6173          * next items.  In case of a tie, we take the one that is not in its
6174          * set first (a difference from the union algorithm).  If we took one
6175          * in the set first, it would increment the count, possibly to 2 which
6176          * would cause it to be output as starting a range in the intersection,
6177          * and the next time through we would take that same number, and output
6178          * it again as ending the set.  By doing it the opposite of this, we
6179          * there is no possibility that the count will be momentarily
6180          * incremented to 2.  (In a tie and both are in the set or both not in
6181          * the set, it doesn't matter which we take first.) */
6182         if (array_a[i_a] < array_b[i_b]
6183             || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6184         {
6185             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6186             cp= array_a[i_a++];
6187         }
6188         else {
6189             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6190             cp= array_b[i_b++];
6191         }
6192
6193         /* Here, have chosen which of the two inputs to look at.  Only output
6194          * if the running count changes to/from 2, which marks the
6195          * beginning/end of a range that's in the intersection */
6196         if (cp_in_set) {
6197             count++;
6198             if (count == 2) {
6199                 array_r[i_r++] = cp;
6200             }
6201         }
6202         else {
6203             if (count == 2) {
6204                 array_r[i_r++] = cp;
6205             }
6206             count--;
6207         }
6208     }
6209
6210     /* Here, we are finished going through at least one of the sets, which
6211      * means there is something remaining in at most one.  See the comments in
6212      * the union code */
6213     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6214         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6215     {
6216         count--;
6217     }
6218
6219     /* The final length is what we've output so far plus what else is in the
6220      * intersection.  Only one of the subexpressions below will be non-zero */
6221     len_r = i_r;
6222     if (count == 2) {
6223         len_r += (len_a - i_a) + (len_b - i_b);
6224     }
6225
6226     /* Set result to final length, which can change the pointer to array_r, so
6227      * re-find it */
6228     if (len_r != invlist_len(r)) {
6229         invlist_set_len(r, len_r);
6230         invlist_trim(r);
6231         array_r = invlist_array(r);
6232     }
6233
6234     /* Finish outputting any remaining */
6235     if (count == 2) { /* Only one of will have a non-zero copy count */
6236         IV copy_count;
6237         if ((copy_count = len_a - i_a) > 0) {
6238             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6239         }
6240         else if ((copy_count = len_b - i_b) > 0) {
6241             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6242         }
6243     }
6244
6245     return r;
6246 }
6247
6248 STATIC HV*
6249 S_add_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6250 {
6251     /* Add the range from 'start' to 'end' inclusive to the inversion list's
6252      * set.  A pointer to the inversion list is returned.  This may actually be
6253      * a new list, in which case the passed in one has been destroyed */
6254
6255     HV* range_invlist;
6256     HV* added_invlist;
6257
6258     UV len = invlist_len(invlist);
6259
6260     PERL_ARGS_ASSERT_ADD_RANGE_TO_INVLIST;
6261
6262     /* If comes after the final entry, can just append it to the end */
6263     if (len == 0
6264         || start >= invlist_array(invlist)
6265                                     [invlist_len(invlist) - 1])
6266     {
6267         _append_range_to_invlist(invlist, start, end);
6268         return invlist;
6269     }
6270
6271     /* Here, can't just append things, create and return a new inversion list
6272      * which is the union of this range and the existing inversion list */
6273     range_invlist = _new_invlist(2);
6274     _append_range_to_invlist(range_invlist, start, end);
6275
6276     added_invlist = invlist_union(invlist, range_invlist);
6277
6278     /* The passed in list can be freed, as well as our temporary */
6279     invlist_destroy(range_invlist);
6280     if (invlist != added_invlist) {
6281         invlist_destroy(invlist);
6282     }
6283
6284     return added_invlist;
6285 }
6286
6287 /* End of inversion list object */
6288
6289 /*
6290  - reg - regular expression, i.e. main body or parenthesized thing
6291  *
6292  * Caller must absorb opening parenthesis.
6293  *
6294  * Combining parenthesis handling with the base level of regular expression
6295  * is a trifle forced, but the need to tie the tails of the branches to what
6296  * follows makes it hard to avoid.
6297  */
6298 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6299 #ifdef DEBUGGING
6300 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6301 #else
6302 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6303 #endif
6304
6305 STATIC regnode *
6306 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6307     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6308 {
6309     dVAR;
6310     register regnode *ret;              /* Will be the head of the group. */
6311     register regnode *br;
6312     register regnode *lastbr;
6313     register regnode *ender = NULL;
6314     register I32 parno = 0;
6315     I32 flags;
6316     U32 oregflags = RExC_flags;
6317     bool have_branch = 0;
6318     bool is_open = 0;
6319     I32 freeze_paren = 0;
6320     I32 after_freeze = 0;
6321
6322     /* for (?g), (?gc), and (?o) warnings; warning
6323        about (?c) will warn about (?g) -- japhy    */
6324
6325 #define WASTED_O  0x01
6326 #define WASTED_G  0x02
6327 #define WASTED_C  0x04
6328 #define WASTED_GC (0x02|0x04)
6329     I32 wastedflags = 0x00;
6330
6331     char * parse_start = RExC_parse; /* MJD */
6332     char * const oregcomp_parse = RExC_parse;
6333
6334     GET_RE_DEBUG_FLAGS_DECL;
6335
6336     PERL_ARGS_ASSERT_REG;
6337     DEBUG_PARSE("reg ");
6338
6339     *flagp = 0;                         /* Tentatively. */
6340
6341
6342     /* Make an OPEN node, if parenthesized. */
6343     if (paren) {
6344         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6345             char *start_verb = RExC_parse;
6346             STRLEN verb_len = 0;
6347             char *start_arg = NULL;
6348             unsigned char op = 0;
6349             int argok = 1;
6350             int internal_argval = 0; /* internal_argval is only useful if !argok */
6351             while ( *RExC_parse && *RExC_parse != ')' ) {
6352                 if ( *RExC_parse == ':' ) {
6353                     start_arg = RExC_parse + 1;
6354                     break;
6355                 }
6356                 RExC_parse++;
6357             }
6358             ++start_verb;
6359             verb_len = RExC_parse - start_verb;
6360             if ( start_arg ) {
6361                 RExC_parse++;
6362                 while ( *RExC_parse && *RExC_parse != ')' )
6363                     RExC_parse++;
6364                 if ( *RExC_parse != ')' )
6365                     vFAIL("Unterminated verb pattern argument");
6366                 if ( RExC_parse == start_arg )
6367                     start_arg = NULL;
6368             } else {
6369                 if ( *RExC_parse != ')' )
6370                     vFAIL("Unterminated verb pattern");
6371             }
6372
6373             switch ( *start_verb ) {
6374             case 'A':  /* (*ACCEPT) */
6375                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6376                     op = ACCEPT;
6377                     internal_argval = RExC_nestroot;
6378                 }
6379                 break;
6380             case 'C':  /* (*COMMIT) */
6381                 if ( memEQs(start_verb,verb_len,"COMMIT") )
6382                     op = COMMIT;
6383                 break;
6384             case 'F':  /* (*FAIL) */
6385                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6386                     op = OPFAIL;
6387                     argok = 0;
6388                 }
6389                 break;
6390             case ':':  /* (*:NAME) */
6391             case 'M':  /* (*MARK:NAME) */
6392                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6393                     op = MARKPOINT;
6394                     argok = -1;
6395                 }
6396                 break;
6397             case 'P':  /* (*PRUNE) */
6398                 if ( memEQs(start_verb,verb_len,"PRUNE") )
6399                     op = PRUNE;
6400                 break;
6401             case 'S':   /* (*SKIP) */
6402                 if ( memEQs(start_verb,verb_len,"SKIP") )
6403                     op = SKIP;
6404                 break;
6405             case 'T':  /* (*THEN) */
6406                 /* [19:06] <TimToady> :: is then */
6407                 if ( memEQs(start_verb,verb_len,"THEN") ) {
6408                     op = CUTGROUP;
6409                     RExC_seen |= REG_SEEN_CUTGROUP;
6410                 }
6411                 break;
6412             }
6413             if ( ! op ) {
6414                 RExC_parse++;
6415                 vFAIL3("Unknown verb pattern '%.*s'",
6416                     verb_len, start_verb);
6417             }
6418             if ( argok ) {
6419                 if ( start_arg && internal_argval ) {
6420                     vFAIL3("Verb pattern '%.*s' may not have an argument",
6421                         verb_len, start_verb);
6422                 } else if ( argok < 0 && !start_arg ) {
6423                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6424                         verb_len, start_verb);
6425                 } else {
6426                     ret = reganode(pRExC_state, op, internal_argval);
6427                     if ( ! internal_argval && ! SIZE_ONLY ) {
6428                         if (start_arg) {
6429                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6430                             ARG(ret) = add_data( pRExC_state, 1, "S" );
6431                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6432                             ret->flags = 0;
6433                         } else {
6434                             ret->flags = 1;
6435                         }
6436                     }
6437                 }
6438                 if (!internal_argval)
6439                     RExC_seen |= REG_SEEN_VERBARG;
6440             } else if ( start_arg ) {
6441                 vFAIL3("Verb pattern '%.*s' may not have an argument",
6442                         verb_len, start_verb);
6443             } else {
6444                 ret = reg_node(pRExC_state, op);
6445             }
6446             nextchar(pRExC_state);
6447             return ret;
6448         } else
6449         if (*RExC_parse == '?') { /* (?...) */
6450             bool is_logical = 0;
6451             const char * const seqstart = RExC_parse;
6452             bool has_use_defaults = FALSE;
6453
6454             RExC_parse++;
6455             paren = *RExC_parse++;
6456             ret = NULL;                 /* For look-ahead/behind. */
6457             switch (paren) {
6458
6459             case 'P':   /* (?P...) variants for those used to PCRE/Python */
6460                 paren = *RExC_parse++;
6461                 if ( paren == '<')         /* (?P<...>) named capture */
6462                     goto named_capture;
6463                 else if (paren == '>') {   /* (?P>name) named recursion */
6464                     goto named_recursion;
6465                 }
6466                 else if (paren == '=') {   /* (?P=...)  named backref */
6467                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
6468                        you change this make sure you change that */
6469                     char* name_start = RExC_parse;
6470                     U32 num = 0;
6471                     SV *sv_dat = reg_scan_name(pRExC_state,
6472                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6473                     if (RExC_parse == name_start || *RExC_parse != ')')
6474                         vFAIL2("Sequence %.3s... not terminated",parse_start);
6475
6476                     if (!SIZE_ONLY) {
6477                         num = add_data( pRExC_state, 1, "S" );
6478                         RExC_rxi->data->data[num]=(void*)sv_dat;
6479                         SvREFCNT_inc_simple_void(sv_dat);
6480                     }
6481                     RExC_sawback = 1;
6482                     ret = reganode(pRExC_state,
6483                                    ((! FOLD)
6484                                      ? NREF
6485                                      : (MORE_ASCII_RESTRICTED)
6486                                        ? NREFFA
6487                                        : (AT_LEAST_UNI_SEMANTICS)
6488                                          ? NREFFU
6489                                          : (LOC)
6490                                            ? NREFFL
6491                                            : NREFF),
6492                                     num);
6493                     *flagp |= HASWIDTH;
6494
6495                     Set_Node_Offset(ret, parse_start+1);
6496                     Set_Node_Cur_Length(ret); /* MJD */
6497
6498                     nextchar(pRExC_state);
6499                     return ret;
6500                 }
6501                 RExC_parse++;
6502                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6503                 /*NOTREACHED*/
6504             case '<':           /* (?<...) */
6505                 if (*RExC_parse == '!')
6506                     paren = ',';
6507                 else if (*RExC_parse != '=')
6508               named_capture:
6509                 {               /* (?<...>) */
6510                     char *name_start;
6511                     SV *svname;
6512                     paren= '>';
6513             case '\'':          /* (?'...') */
6514                     name_start= RExC_parse;
6515                     svname = reg_scan_name(pRExC_state,
6516                         SIZE_ONLY ?  /* reverse test from the others */
6517                         REG_RSN_RETURN_NAME :
6518                         REG_RSN_RETURN_NULL);
6519                     if (RExC_parse == name_start) {
6520                         RExC_parse++;
6521                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6522                         /*NOTREACHED*/
6523                     }
6524                     if (*RExC_parse != paren)
6525                         vFAIL2("Sequence (?%c... not terminated",
6526                             paren=='>' ? '<' : paren);
6527                     if (SIZE_ONLY) {
6528                         HE *he_str;
6529                         SV *sv_dat = NULL;
6530                         if (!svname) /* shouldn't happen */
6531                             Perl_croak(aTHX_
6532                                 "panic: reg_scan_name returned NULL");
6533                         if (!RExC_paren_names) {
6534                             RExC_paren_names= newHV();
6535                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
6536 #ifdef DEBUGGING
6537                             RExC_paren_name_list= newAV();
6538                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6539 #endif
6540                         }
6541                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6542                         if ( he_str )
6543                             sv_dat = HeVAL(he_str);
6544                         if ( ! sv_dat ) {
6545                             /* croak baby croak */
6546                             Perl_croak(aTHX_
6547                                 "panic: paren_name hash element allocation failed");
6548                         } else if ( SvPOK(sv_dat) ) {
6549                             /* (?|...) can mean we have dupes so scan to check
6550                                its already been stored. Maybe a flag indicating
6551                                we are inside such a construct would be useful,
6552                                but the arrays are likely to be quite small, so
6553                                for now we punt -- dmq */
6554                             IV count = SvIV(sv_dat);
6555                             I32 *pv = (I32*)SvPVX(sv_dat);
6556                             IV i;
6557                             for ( i = 0 ; i < count ; i++ ) {
6558                                 if ( pv[i] == RExC_npar ) {
6559                                     count = 0;
6560                                     break;
6561                                 }
6562                             }
6563                             if ( count ) {
6564                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6565                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6566                                 pv[count] = RExC_npar;
6567                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6568                             }
6569                         } else {
6570                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
6571                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6572                             SvIOK_on(sv_dat);
6573                             SvIV_set(sv_dat, 1);
6574                         }
6575 #ifdef DEBUGGING
6576                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6577                             SvREFCNT_dec(svname);
6578 #endif
6579
6580                         /*sv_dump(sv_dat);*/
6581                     }
6582                     nextchar(pRExC_state);
6583                     paren = 1;
6584                     goto capturing_parens;
6585                 }
6586                 RExC_seen |= REG_SEEN_LOOKBEHIND;
6587                 RExC_in_lookbehind++;
6588                 RExC_parse++;
6589             case '=':           /* (?=...) */
6590                 RExC_seen_zerolen++;
6591                 break;
6592             case '!':           /* (?!...) */
6593                 RExC_seen_zerolen++;
6594                 if (*RExC_parse == ')') {
6595                     ret=reg_node(pRExC_state, OPFAIL);
6596                     nextchar(pRExC_state);
6597                     return ret;
6598                 }
6599                 break;
6600             case '|':           /* (?|...) */
6601                 /* branch reset, behave like a (?:...) except that
6602                    buffers in alternations share the same numbers */
6603                 paren = ':';
6604                 after_freeze = freeze_paren = RExC_npar;
6605                 break;
6606             case ':':           /* (?:...) */
6607             case '>':           /* (?>...) */
6608                 break;
6609             case '$':           /* (?$...) */
6610             case '@':           /* (?@...) */
6611                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6612                 break;
6613             case '#':           /* (?#...) */
6614                 while (*RExC_parse && *RExC_parse != ')')
6615                     RExC_parse++;
6616                 if (*RExC_parse != ')')
6617                     FAIL("Sequence (?#... not terminated");
6618                 nextchar(pRExC_state);
6619                 *flagp = TRYAGAIN;
6620                 return NULL;
6621             case '0' :           /* (?0) */
6622             case 'R' :           /* (?R) */
6623                 if (*RExC_parse != ')')
6624                     FAIL("Sequence (?R) not terminated");
6625                 ret = reg_node(pRExC_state, GOSTART);
6626                 *flagp |= POSTPONED;
6627                 nextchar(pRExC_state);
6628                 return ret;
6629                 /*notreached*/
6630             { /* named and numeric backreferences */
6631                 I32 num;
6632             case '&':            /* (?&NAME) */
6633                 parse_start = RExC_parse - 1;
6634               named_recursion:
6635                 {
6636                     SV *sv_dat = reg_scan_name(pRExC_state,
6637                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6638                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6639                 }
6640                 goto gen_recurse_regop;
6641                 /* NOT REACHED */
6642             case '+':
6643                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6644                     RExC_parse++;
6645                     vFAIL("Illegal pattern");
6646                 }
6647                 goto parse_recursion;
6648                 /* NOT REACHED*/
6649             case '-': /* (?-1) */
6650                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6651                     RExC_parse--; /* rewind to let it be handled later */
6652                     goto parse_flags;
6653                 }
6654                 /*FALLTHROUGH */
6655             case '1': case '2': case '3': case '4': /* (?1) */
6656             case '5': case '6': case '7': case '8': case '9':
6657                 RExC_parse--;
6658               parse_recursion:
6659                 num = atoi(RExC_parse);
6660                 parse_start = RExC_parse - 1; /* MJD */
6661                 if (*RExC_parse == '-')
6662                     RExC_parse++;
6663                 while (isDIGIT(*RExC_parse))
6664                         RExC_parse++;
6665                 if (*RExC_parse!=')')
6666                     vFAIL("Expecting close bracket");
6667
6668               gen_recurse_regop:
6669                 if ( paren == '-' ) {
6670                     /*
6671                     Diagram of capture buffer numbering.
6672                     Top line is the normal capture buffer numbers
6673                     Bottom line is the negative indexing as from
6674                     the X (the (?-2))
6675
6676                     +   1 2    3 4 5 X          6 7
6677                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6678                     -   5 4    3 2 1 X          x x
6679
6680                     */
6681                     num = RExC_npar + num;
6682                     if (num < 1)  {
6683                         RExC_parse++;
6684                         vFAIL("Reference to nonexistent group");
6685                     }
6686                 } else if ( paren == '+' ) {
6687                     num = RExC_npar + num - 1;
6688                 }
6689
6690                 ret = reganode(pRExC_state, GOSUB, num);
6691                 if (!SIZE_ONLY) {
6692                     if (num > (I32)RExC_rx->nparens) {
6693                         RExC_parse++;
6694                         vFAIL("Reference to nonexistent group");
6695                     }
6696                     ARG2L_SET( ret, RExC_recurse_count++);
6697                     RExC_emit++;
6698                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6699                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6700                 } else {
6701                     RExC_size++;
6702                 }
6703                 RExC_seen |= REG_SEEN_RECURSE;
6704                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6705                 Set_Node_Offset(ret, parse_start); /* MJD */
6706
6707                 *flagp |= POSTPONED;
6708                 nextchar(pRExC_state);
6709                 return ret;
6710             } /* named and numeric backreferences */
6711             /* NOT REACHED */
6712
6713             case '?':           /* (??...) */
6714                 is_logical = 1;
6715                 if (*RExC_parse != '{') {
6716                     RExC_parse++;
6717                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6718                     /*NOTREACHED*/
6719                 }
6720                 *flagp |= POSTPONED;
6721                 paren = *RExC_parse++;
6722                 /* FALL THROUGH */
6723             case '{':           /* (?{...}) */
6724             {
6725                 I32 count = 1;
6726                 U32 n = 0;
6727                 char c;
6728                 char *s = RExC_parse;
6729
6730                 RExC_seen_zerolen++;
6731                 RExC_seen |= REG_SEEN_EVAL;
6732                 while (count && (c = *RExC_parse)) {
6733                     if (c == '\\') {
6734                         if (RExC_parse[1])
6735                             RExC_parse++;
6736                     }
6737                     else if (c == '{')
6738                         count++;
6739                     else if (c == '}')
6740                         count--;
6741                     RExC_parse++;
6742                 }
6743                 if (*RExC_parse != ')') {
6744                     RExC_parse = s;
6745                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6746                 }
6747                 if (!SIZE_ONLY) {
6748                     PAD *pad;
6749                     OP_4tree *sop, *rop;
6750                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6751
6752                     ENTER;
6753                     Perl_save_re_context(aTHX);
6754                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6755                     sop->op_private |= OPpREFCOUNTED;
6756                     /* re_dup will OpREFCNT_inc */
6757                     OpREFCNT_set(sop, 1);
6758                     LEAVE;
6759
6760                     n = add_data(pRExC_state, 3, "nop");
6761                     RExC_rxi->data->data[n] = (void*)rop;
6762                     RExC_rxi->data->data[n+1] = (void*)sop;
6763                     RExC_rxi->data->data[n+2] = (void*)pad;
6764                     SvREFCNT_dec(sv);
6765                 }
6766                 else {                                          /* First pass */
6767                     if (PL_reginterp_cnt < ++RExC_seen_evals
6768                         && IN_PERL_RUNTIME)
6769                         /* No compiled RE interpolated, has runtime
6770                            components ===> unsafe.  */
6771                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6772                     if (PL_tainting && PL_tainted)
6773                         FAIL("Eval-group in insecure regular expression");
6774 #if PERL_VERSION > 8
6775                     if (IN_PERL_COMPILETIME)
6776                         PL_cv_has_eval = 1;
6777 #endif
6778                 }
6779
6780                 nextchar(pRExC_state);
6781                 if (is_logical) {
6782                     ret = reg_node(pRExC_state, LOGICAL);
6783                     if (!SIZE_ONLY)
6784                         ret->flags = 2;
6785                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6786                     /* deal with the length of this later - MJD */
6787                     return ret;
6788                 }
6789                 ret = reganode(pRExC_state, EVAL, n);
6790                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6791                 Set_Node_Offset(ret, parse_start);
6792                 return ret;
6793             }
6794             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6795             {
6796                 int is_define= 0;
6797                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6798                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6799                         || RExC_parse[1] == '<'
6800                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6801                         I32 flag;
6802
6803                         ret = reg_node(pRExC_state, LOGICAL);
6804                         if (!SIZE_ONLY)
6805                             ret->flags = 1;
6806                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6807                         goto insert_if;
6808                     }
6809                 }
6810                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6811                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6812                 {
6813                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6814                     char *name_start= RExC_parse++;
6815                     U32 num = 0;
6816                     SV *sv_dat=reg_scan_name(pRExC_state,
6817                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6818                     if (RExC_parse == name_start || *RExC_parse != ch)
6819                         vFAIL2("Sequence (?(%c... not terminated",
6820                             (ch == '>' ? '<' : ch));
6821                     RExC_parse++;
6822                     if (!SIZE_ONLY) {
6823                         num = add_data( pRExC_state, 1, "S" );
6824                         RExC_rxi->data->data[num]=(void*)sv_dat;
6825                         SvREFCNT_inc_simple_void(sv_dat);
6826                     }
6827                     ret = reganode(pRExC_state,NGROUPP,num);
6828                     goto insert_if_check_paren;
6829                 }
6830                 else if (RExC_parse[0] == 'D' &&
6831                          RExC_parse[1] == 'E' &&
6832                          RExC_parse[2] == 'F' &&
6833                          RExC_parse[3] == 'I' &&
6834                          RExC_parse[4] == 'N' &&
6835                          RExC_parse[5] == 'E')
6836                 {
6837                     ret = reganode(pRExC_state,DEFINEP,0);
6838                     RExC_parse +=6 ;
6839                     is_define = 1;
6840                     goto insert_if_check_paren;
6841                 }
6842                 else if (RExC_parse[0] == 'R') {
6843                     RExC_parse++;
6844                     parno = 0;
6845                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6846                         parno = atoi(RExC_parse++);
6847                         while (isDIGIT(*RExC_parse))
6848                             RExC_parse++;
6849                     } else if (RExC_parse[0] == '&') {
6850                         SV *sv_dat;
6851                         RExC_parse++;
6852                         sv_dat = reg_scan_name(pRExC_state,
6853                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6854                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6855                     }
6856                     ret = reganode(pRExC_state,INSUBP,parno);
6857                     goto insert_if_check_paren;
6858                 }
6859                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6860                     /* (?(1)...) */
6861                     char c;
6862                     parno = atoi(RExC_parse++);
6863
6864                     while (isDIGIT(*RExC_parse))
6865                         RExC_parse++;
6866                     ret = reganode(pRExC_state, GROUPP, parno);
6867
6868                  insert_if_check_paren:
6869                     if ((c = *nextchar(pRExC_state)) != ')')
6870                         vFAIL("Switch condition not recognized");
6871                   insert_if:
6872                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6873                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6874                     if (br == NULL)
6875                         br = reganode(pRExC_state, LONGJMP, 0);
6876                     else
6877                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6878                     c = *nextchar(pRExC_state);
6879                     if (flags&HASWIDTH)
6880                         *flagp |= HASWIDTH;
6881                     if (c == '|') {
6882                         if (is_define)
6883                             vFAIL("(?(DEFINE)....) does not allow branches");
6884                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6885                         regbranch(pRExC_state, &flags, 1,depth+1);
6886                         REGTAIL(pRExC_state, ret, lastbr);
6887                         if (flags&HASWIDTH)
6888                             *flagp |= HASWIDTH;
6889                         c = *nextchar(pRExC_state);
6890                     }
6891                     else
6892                         lastbr = NULL;
6893                     if (c != ')')
6894                         vFAIL("Switch (?(condition)... contains too many branches");
6895                     ender = reg_node(pRExC_state, TAIL);
6896                     REGTAIL(pRExC_state, br, ender);
6897                     if (lastbr) {
6898                         REGTAIL(pRExC_state, lastbr, ender);
6899                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6900                     }
6901                     else
6902                         REGTAIL(pRExC_state, ret, ender);
6903                     RExC_size++; /* XXX WHY do we need this?!!
6904                                     For large programs it seems to be required
6905                                     but I can't figure out why. -- dmq*/
6906                     return ret;
6907                 }
6908                 else {
6909                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6910                 }
6911             }
6912             case 0:
6913                 RExC_parse--; /* for vFAIL to print correctly */
6914                 vFAIL("Sequence (? incomplete");
6915                 break;
6916             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6917                                        that follow */
6918                 has_use_defaults = TRUE;
6919                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6920                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
6921                                                 ? REGEX_UNICODE_CHARSET
6922                                                 : REGEX_DEPENDS_CHARSET);
6923                 goto parse_flags;
6924             default:
6925                 --RExC_parse;
6926                 parse_flags:      /* (?i) */
6927             {
6928                 U32 posflags = 0, negflags = 0;
6929                 U32 *flagsp = &posflags;
6930                 bool has_charset_modifier = 0;
6931                 regex_charset cs = REGEX_DEPENDS_CHARSET;
6932
6933                 while (*RExC_parse) {
6934                     /* && strchr("iogcmsx", *RExC_parse) */
6935                     /* (?g), (?gc) and (?o) are useless here
6936                        and must be globally applied -- japhy */
6937                     switch (*RExC_parse) {
6938                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6939                     case LOCALE_PAT_MOD:
6940                         if (has_charset_modifier || flagsp == &negflags) {
6941                             goto fail_modifiers;
6942                         }
6943                         cs = REGEX_LOCALE_CHARSET;
6944                         has_charset_modifier = 1;
6945                         break;
6946                     case UNICODE_PAT_MOD:
6947                         if (has_charset_modifier || flagsp == &negflags) {
6948                             goto fail_modifiers;
6949                         }
6950                         cs = REGEX_UNICODE_CHARSET;
6951                         has_charset_modifier = 1;
6952                         break;
6953                     case ASCII_RESTRICT_PAT_MOD:
6954                         if (has_charset_modifier || flagsp == &negflags) {
6955                             goto fail_modifiers;
6956                         }
6957                         if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
6958                             /* Doubled modifier implies more restricted */
6959                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
6960                             RExC_parse++;
6961                         }
6962                         else {
6963                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
6964                         }
6965                         has_charset_modifier = 1;
6966                         break;
6967                     case DEPENDS_PAT_MOD:
6968                         if (has_use_defaults
6969                             || has_charset_modifier
6970                             || flagsp == &negflags)
6971                         {
6972                             goto fail_modifiers;
6973                         }
6974
6975                         /* The dual charset means unicode semantics if the
6976                          * pattern (or target, not known until runtime) are
6977                          * utf8, or something in the pattern indicates unicode
6978                          * semantics */
6979                         cs = (RExC_utf8 || RExC_uni_semantics)
6980                              ? REGEX_UNICODE_CHARSET
6981                              : REGEX_DEPENDS_CHARSET;
6982                         has_charset_modifier = 1;
6983                         break;
6984                     case ONCE_PAT_MOD: /* 'o' */
6985                     case GLOBAL_PAT_MOD: /* 'g' */
6986                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6987                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6988                             if (! (wastedflags & wflagbit) ) {
6989                                 wastedflags |= wflagbit;
6990                                 vWARN5(
6991                                     RExC_parse + 1,
6992                                     "Useless (%s%c) - %suse /%c modifier",
6993                                     flagsp == &negflags ? "?-" : "?",
6994                                     *RExC_parse,
6995                                     flagsp == &negflags ? "don't " : "",
6996                                     *RExC_parse
6997                                 );
6998                             }
6999                         }
7000                         break;
7001
7002                     case CONTINUE_PAT_MOD: /* 'c' */
7003                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7004                             if (! (wastedflags & WASTED_C) ) {
7005                                 wastedflags |= WASTED_GC;
7006                                 vWARN3(
7007                                     RExC_parse + 1,
7008                                     "Useless (%sc) - %suse /gc modifier",
7009                                     flagsp == &negflags ? "?-" : "?",
7010                                     flagsp == &negflags ? "don't " : ""
7011                                 );
7012                             }
7013                         }
7014                         break;
7015                     case KEEPCOPY_PAT_MOD: /* 'p' */
7016                         if (flagsp == &negflags) {
7017                             if (SIZE_ONLY)
7018                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7019                         } else {
7020                             *flagsp |= RXf_PMf_KEEPCOPY;
7021                         }
7022                         break;
7023                     case '-':
7024                         /* A flag is a default iff it is following a minus, so
7025                          * if there is a minus, it means will be trying to
7026                          * re-specify a default which is an error */
7027                         if (has_use_defaults || flagsp == &negflags) {
7028             fail_modifiers:
7029                             RExC_parse++;
7030                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7031                             /*NOTREACHED*/
7032                         }
7033                         flagsp = &negflags;
7034                         wastedflags = 0;  /* reset so (?g-c) warns twice */
7035                         break;
7036                     case ':':
7037                         paren = ':';
7038                         /*FALLTHROUGH*/
7039                     case ')':
7040                         RExC_flags |= posflags;
7041                         RExC_flags &= ~negflags;
7042                         set_regex_charset(&RExC_flags, cs);
7043                         if (paren != ':') {
7044                             oregflags |= posflags;
7045                             oregflags &= ~negflags;
7046                             set_regex_charset(&oregflags, cs);
7047                         }
7048                         nextchar(pRExC_state);
7049                         if (paren != ':') {
7050                             *flagp = TRYAGAIN;
7051                             return NULL;
7052                         } else {
7053                             ret = NULL;
7054                             goto parse_rest;
7055                         }
7056                         /*NOTREACHED*/
7057                     default:
7058                         RExC_parse++;
7059                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7060                         /*NOTREACHED*/
7061                     }
7062                     ++RExC_parse;
7063                 }
7064             }} /* one for the default block, one for the switch */
7065         }
7066         else {                  /* (...) */
7067           capturing_parens:
7068             parno = RExC_npar;
7069             RExC_npar++;
7070
7071             ret = reganode(pRExC_state, OPEN, parno);
7072             if (!SIZE_ONLY ){
7073                 if (!RExC_nestroot)
7074                     RExC_nestroot = parno;
7075                 if (RExC_seen & REG_SEEN_RECURSE
7076                     && !RExC_open_parens[parno-1])
7077                 {
7078                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7079                         "Setting open paren #%"IVdf" to %d\n",
7080                         (IV)parno, REG_NODE_NUM(ret)));
7081                     RExC_open_parens[parno-1]= ret;
7082                 }
7083             }
7084             Set_Node_Length(ret, 1); /* MJD */
7085             Set_Node_Offset(ret, RExC_parse); /* MJD */
7086             is_open = 1;
7087         }
7088     }
7089     else                        /* ! paren */
7090         ret = NULL;
7091
7092    parse_rest:
7093     /* Pick up the branches, linking them together. */
7094     parse_start = RExC_parse;   /* MJD */
7095     br = regbranch(pRExC_state, &flags, 1,depth+1);
7096
7097     if (freeze_paren) {
7098         if (RExC_npar > after_freeze)
7099             after_freeze = RExC_npar;
7100         RExC_npar = freeze_paren;
7101     }
7102
7103     /*     branch_len = (paren != 0); */
7104
7105     if (br == NULL)
7106         return(NULL);
7107     if (*RExC_parse == '|') {
7108         if (!SIZE_ONLY && RExC_extralen) {
7109             reginsert(pRExC_state, BRANCHJ, br, depth+1);
7110         }
7111         else {                  /* MJD */
7112             reginsert(pRExC_state, BRANCH, br, depth+1);
7113             Set_Node_Length(br, paren != 0);
7114             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7115         }
7116         have_branch = 1;
7117         if (SIZE_ONLY)
7118             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
7119     }
7120     else if (paren == ':') {
7121         *flagp |= flags&SIMPLE;
7122     }
7123     if (is_open) {                              /* Starts with OPEN. */
7124         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
7125     }
7126     else if (paren != '?')              /* Not Conditional */
7127         ret = br;
7128     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7129     lastbr = br;
7130     while (*RExC_parse == '|') {
7131         if (!SIZE_ONLY && RExC_extralen) {
7132             ender = reganode(pRExC_state, LONGJMP,0);
7133             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7134         }
7135         if (SIZE_ONLY)
7136             RExC_extralen += 2;         /* Account for LONGJMP. */
7137         nextchar(pRExC_state);
7138         if (freeze_paren) {
7139             if (RExC_npar > after_freeze)
7140                 after_freeze = RExC_npar;
7141             RExC_npar = freeze_paren;
7142         }
7143         br = regbranch(pRExC_state, &flags, 0, depth+1);
7144
7145         if (br == NULL)
7146             return(NULL);
7147         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
7148         lastbr = br;
7149         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7150     }
7151
7152     if (have_branch || paren != ':') {
7153         /* Make a closing node, and hook it on the end. */
7154         switch (paren) {
7155         case ':':
7156             ender = reg_node(pRExC_state, TAIL);
7157             break;
7158         case 1:
7159             ender = reganode(pRExC_state, CLOSE, parno);
7160             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7161                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7162                         "Setting close paren #%"IVdf" to %d\n",
7163                         (IV)parno, REG_NODE_NUM(ender)));
7164                 RExC_close_parens[parno-1]= ender;
7165                 if (RExC_nestroot == parno)
7166                     RExC_nestroot = 0;
7167             }
7168             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7169             Set_Node_Length(ender,1); /* MJD */
7170             break;
7171         case '<':
7172         case ',':
7173         case '=':
7174         case '!':
7175             *flagp &= ~HASWIDTH;
7176             /* FALL THROUGH */
7177         case '>':
7178             ender = reg_node(pRExC_state, SUCCEED);
7179             break;
7180         case 0:
7181             ender = reg_node(pRExC_state, END);
7182             if (!SIZE_ONLY) {
7183                 assert(!RExC_opend); /* there can only be one! */
7184                 RExC_opend = ender;
7185             }
7186             break;
7187         }
7188         REGTAIL(pRExC_state, lastbr, ender);
7189
7190         if (have_branch && !SIZE_ONLY) {
7191             if (depth==1)
7192                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7193
7194             /* Hook the tails of the branches to the closing node. */
7195             for (br = ret; br; br = regnext(br)) {
7196                 const U8 op = PL_regkind[OP(br)];
7197                 if (op == BRANCH) {
7198                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7199                 }
7200                 else if (op == BRANCHJ) {
7201                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7202                 }
7203             }
7204         }
7205     }
7206
7207     {
7208         const char *p;
7209         static const char parens[] = "=!<,>";
7210
7211         if (paren && (p = strchr(parens, paren))) {
7212             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7213             int flag = (p - parens) > 1;
7214
7215             if (paren == '>')
7216                 node = SUSPEND, flag = 0;
7217             reginsert(pRExC_state, node,ret, depth+1);
7218             Set_Node_Cur_Length(ret);
7219             Set_Node_Offset(ret, parse_start + 1);
7220             ret->flags = flag;
7221             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7222         }
7223     }
7224
7225     /* Check for proper termination. */
7226     if (paren) {
7227         RExC_flags = oregflags;
7228         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7229             RExC_parse = oregcomp_parse;
7230             vFAIL("Unmatched (");
7231         }
7232     }
7233     else if (!paren && RExC_parse < RExC_end) {
7234         if (*RExC_parse == ')') {
7235             RExC_parse++;
7236             vFAIL("Unmatched )");
7237         }
7238         else
7239             FAIL("Junk on end of regexp");      /* "Can't happen". */
7240         /* NOTREACHED */
7241     }
7242
7243     if (RExC_in_lookbehind) {
7244         RExC_in_lookbehind--;
7245     }
7246     if (after_freeze)
7247         RExC_npar = after_freeze;
7248     return(ret);
7249 }
7250
7251 /*
7252  - regbranch - one alternative of an | operator
7253  *
7254  * Implements the concatenation operator.
7255  */
7256 STATIC regnode *
7257 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7258 {
7259     dVAR;
7260     register regnode *ret;
7261     register regnode *chain = NULL;
7262     register regnode *latest;
7263     I32 flags = 0, c = 0;
7264     GET_RE_DEBUG_FLAGS_DECL;
7265
7266     PERL_ARGS_ASSERT_REGBRANCH;
7267
7268     DEBUG_PARSE("brnc");
7269
7270     if (first)
7271         ret = NULL;
7272     else {
7273         if (!SIZE_ONLY && RExC_extralen)
7274             ret = reganode(pRExC_state, BRANCHJ,0);
7275         else {
7276             ret = reg_node(pRExC_state, BRANCH);
7277             Set_Node_Length(ret, 1);
7278         }
7279     }
7280
7281     if (!first && SIZE_ONLY)
7282         RExC_extralen += 1;                     /* BRANCHJ */
7283
7284     *flagp = WORST;                     /* Tentatively. */
7285
7286     RExC_parse--;
7287     nextchar(pRExC_state);
7288     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7289         flags &= ~TRYAGAIN;
7290         latest = regpiece(pRExC_state, &flags,depth+1);
7291         if (latest == NULL) {
7292             if (flags & TRYAGAIN)
7293                 continue;
7294             return(NULL);
7295         }
7296         else if (ret == NULL)
7297             ret = latest;
7298         *flagp |= flags&(HASWIDTH|POSTPONED);
7299         if (chain == NULL)      /* First piece. */
7300             *flagp |= flags&SPSTART;
7301         else {
7302             RExC_naughty++;
7303             REGTAIL(pRExC_state, chain, latest);
7304         }
7305         chain = latest;
7306         c++;
7307     }
7308     if (chain == NULL) {        /* Loop ran zero times. */
7309         chain = reg_node(pRExC_state, NOTHING);
7310         if (ret == NULL)
7311             ret = chain;
7312     }
7313     if (c == 1) {
7314         *flagp |= flags&SIMPLE;
7315     }
7316
7317     return ret;
7318 }
7319
7320 /*
7321  - regpiece - something followed by possible [*+?]
7322  *
7323  * Note that the branching code sequences used for ? and the general cases
7324  * of * and + are somewhat optimized:  they use the same NOTHING node as
7325  * both the endmarker for their branch list and the body of the last branch.
7326  * It might seem that this node could be dispensed with entirely, but the
7327  * endmarker role is not redundant.
7328  */
7329 STATIC regnode *
7330 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7331 {
7332     dVAR;
7333     register regnode *ret;
7334     register char op;
7335     register char *next;
7336     I32 flags;
7337     const char * const origparse = RExC_parse;
7338     I32 min;
7339     I32 max = REG_INFTY;
7340     char *parse_start;
7341     const char *maxpos = NULL;
7342     GET_RE_DEBUG_FLAGS_DECL;
7343
7344     PERL_ARGS_ASSERT_REGPIECE;
7345
7346     DEBUG_PARSE("piec");
7347
7348     ret = regatom(pRExC_state, &flags,depth+1);
7349     if (ret == NULL) {
7350         if (flags & TRYAGAIN)
7351             *flagp |= TRYAGAIN;
7352         return(NULL);
7353     }
7354
7355     op = *RExC_parse;
7356
7357     if (op == '{' && regcurly(RExC_parse)) {
7358         maxpos = NULL;
7359         parse_start = RExC_parse; /* MJD */
7360         next = RExC_parse + 1;
7361         while (isDIGIT(*next) || *next == ',') {
7362             if (*next == ',') {
7363                 if (maxpos)
7364                     break;
7365                 else
7366                     maxpos = next;
7367             }
7368             next++;
7369         }
7370         if (*next == '}') {             /* got one */
7371             if (!maxpos)
7372                 maxpos = next;
7373             RExC_parse++;
7374             min = atoi(RExC_parse);
7375             if (*maxpos == ',')
7376                 maxpos++;
7377             else
7378                 maxpos = RExC_parse;
7379             max = atoi(maxpos);
7380             if (!max && *maxpos != '0')
7381                 max = REG_INFTY;                /* meaning "infinity" */
7382             else if (max >= REG_INFTY)
7383                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7384             RExC_parse = next;
7385             nextchar(pRExC_state);
7386
7387         do_curly:
7388             if ((flags&SIMPLE)) {
7389                 RExC_naughty += 2 + RExC_naughty / 2;
7390                 reginsert(pRExC_state, CURLY, ret, depth+1);
7391                 Set_Node_Offset(ret, parse_start+1); /* MJD */
7392                 Set_Node_Cur_Length(ret);
7393             }
7394             else {
7395                 regnode * const w = reg_node(pRExC_state, WHILEM);
7396
7397                 w->flags = 0;
7398                 REGTAIL(pRExC_state, ret, w);
7399                 if (!SIZE_ONLY && RExC_extralen) {
7400                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
7401                     reginsert(pRExC_state, NOTHING,ret, depth+1);
7402                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
7403                 }
7404                 reginsert(pRExC_state, CURLYX,ret, depth+1);
7405                                 /* MJD hk */
7406                 Set_Node_Offset(ret, parse_start+1);
7407                 Set_Node_Length(ret,
7408                                 op == '{' ? (RExC_parse - parse_start) : 1);
7409
7410                 if (!SIZE_ONLY && RExC_extralen)
7411                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
7412                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7413                 if (SIZE_ONLY)
7414                     RExC_whilem_seen++, RExC_extralen += 3;
7415                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
7416             }
7417             ret->flags = 0;
7418
7419             if (min > 0)
7420                 *flagp = WORST;
7421             if (max > 0)
7422                 *flagp |= HASWIDTH;
7423             if (max < min)
7424                 vFAIL("Can't do {n,m} with n > m");
7425             if (!SIZE_ONLY) {
7426                 ARG1_SET(ret, (U16)min);
7427                 ARG2_SET(ret, (U16)max);
7428             }
7429
7430             goto nest_check;
7431         }
7432     }
7433
7434     if (!ISMULT1(op)) {
7435         *flagp = flags;
7436         return(ret);
7437     }
7438
7439 #if 0                           /* Now runtime fix should be reliable. */
7440
7441     /* if this is reinstated, don't forget to put this back into perldiag:
7442
7443             =item Regexp *+ operand could be empty at {#} in regex m/%s/
7444
7445            (F) The part of the regexp subject to either the * or + quantifier
7446            could match an empty string. The {#} shows in the regular
7447            expression about where the problem was discovered.
7448
7449     */
7450
7451     if (!(flags&HASWIDTH) && op != '?')
7452       vFAIL("Regexp *+ operand could be empty");
7453 #endif
7454
7455     parse_start = RExC_parse;
7456     nextchar(pRExC_state);
7457
7458     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7459
7460     if (op == '*' && (flags&SIMPLE)) {
7461         reginsert(pRExC_state, STAR, ret, depth+1);
7462         ret->flags = 0;
7463         RExC_naughty += 4;
7464     }
7465     else if (op == '*') {
7466         min = 0;
7467         goto do_curly;
7468     }
7469     else if (op == '+' && (flags&SIMPLE)) {
7470         reginsert(pRExC_state, PLUS, ret, depth+1);
7471         ret->flags = 0;
7472         RExC_naughty += 3;
7473     }
7474     else if (op == '+') {
7475         min = 1;
7476         goto do_curly;
7477     }
7478     else if (op == '?') {
7479         min = 0; max = 1;
7480         goto do_curly;
7481     }
7482   nest_check:
7483     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7484         ckWARN3reg(RExC_parse,
7485                    "%.*s matches null string many times",
7486                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7487                    origparse);
7488     }
7489
7490     if (RExC_parse < RExC_end && *RExC_parse == '?') {
7491         nextchar(pRExC_state);
7492         reginsert(pRExC_state, MINMOD, ret, depth+1);
7493         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7494     }
7495 #ifndef REG_ALLOW_MINMOD_SUSPEND
7496     else
7497 #endif
7498     if (RExC_parse < RExC_end && *RExC_parse == '+') {
7499         regnode *ender;
7500         nextchar(pRExC_state);
7501         ender = reg_node(pRExC_state, SUCCEED);
7502         REGTAIL(pRExC_state, ret, ender);
7503         reginsert(pRExC_state, SUSPEND, ret, depth+1);
7504         ret->flags = 0;
7505         ender = reg_node(pRExC_state, TAIL);
7506         REGTAIL(pRExC_state, ret, ender);
7507         /*ret= ender;*/
7508     }
7509
7510     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7511         RExC_parse++;
7512         vFAIL("Nested quantifiers");
7513     }
7514
7515     return(ret);
7516 }
7517
7518
7519 /* reg_namedseq(pRExC_state,UVp)
7520
7521    This is expected to be called by a parser routine that has
7522    recognized '\N' and needs to handle the rest. RExC_parse is
7523    expected to point at the first char following the N at the time
7524    of the call.
7525
7526    The \N may be inside (indicated by valuep not being NULL) or outside a
7527    character class.
7528
7529    \N may begin either a named sequence, or if outside a character class, mean
7530    to match a non-newline.  For non single-quoted regexes, the tokenizer has
7531    attempted to decide which, and in the case of a named sequence converted it
7532    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7533    where c1... are the characters in the sequence.  For single-quoted regexes,
7534    the tokenizer passes the \N sequence through unchanged; this code will not
7535    attempt to determine this nor expand those.  The net effect is that if the
7536    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7537    signals that this \N occurrence means to match a non-newline.
7538
7539    Only the \N{U+...} form should occur in a character class, for the same
7540    reason that '.' inside a character class means to just match a period: it
7541    just doesn't make sense.
7542
7543    If valuep is non-null then it is assumed that we are parsing inside
7544    of a charclass definition and the first codepoint in the resolved
7545    string is returned via *valuep and the routine will return NULL.
7546    In this mode if a multichar string is returned from the charnames
7547    handler, a warning will be issued, and only the first char in the
7548    sequence will be examined. If the string returned is zero length
7549    then the value of *valuep is undefined and NON-NULL will
7550    be returned to indicate failure. (This will NOT be a valid pointer
7551    to a regnode.)
7552
7553    If valuep is null then it is assumed that we are parsing normal text and a
7554    new EXACT node is inserted into the program containing the resolved string,
7555    and a pointer to the new node is returned.  But if the string is zero length
7556    a NOTHING node is emitted instead.
7557
7558    On success RExC_parse is set to the char following the endbrace.
7559    Parsing failures will generate a fatal error via vFAIL(...)
7560  */
7561 STATIC regnode *
7562 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7563 {
7564     char * endbrace;    /* '}' following the name */
7565     regnode *ret = NULL;
7566 #ifdef DEBUGGING
7567     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
7568 #endif
7569     char* p;
7570
7571     GET_RE_DEBUG_FLAGS_DECL;
7572
7573     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7574
7575     GET_RE_DEBUG_FLAGS;
7576
7577     /* The [^\n] meaning of \N ignores spaces and comments under the /x
7578      * modifier.  The other meaning does not */
7579     p = (RExC_flags & RXf_PMf_EXTENDED)
7580         ? regwhite( pRExC_state, RExC_parse )
7581         : RExC_parse;
7582
7583     /* Disambiguate between \N meaning a named character versus \N meaning
7584      * [^\n].  The former is assumed when it can't be the latter. */
7585     if (*p != '{' || regcurly(p)) {
7586         RExC_parse = p;
7587         if (valuep) {
7588             /* no bare \N in a charclass */
7589             vFAIL("\\N in a character class must be a named character: \\N{...}");
7590         }
7591         nextchar(pRExC_state);
7592         ret = reg_node(pRExC_state, REG_ANY);
7593         *flagp |= HASWIDTH|SIMPLE;
7594         RExC_naughty++;
7595         RExC_parse--;
7596         Set_Node_Length(ret, 1); /* MJD */
7597         return ret;
7598     }
7599
7600     /* Here, we have decided it should be a named sequence */
7601
7602     /* The test above made sure that the next real character is a '{', but
7603      * under the /x modifier, it could be separated by space (or a comment and
7604      * \n) and this is not allowed (for consistency with \x{...} and the
7605      * tokenizer handling of \N{NAME}). */
7606     if (*RExC_parse != '{') {
7607         vFAIL("Missing braces on \\N{}");
7608     }
7609
7610     RExC_parse++;       /* Skip past the '{' */
7611
7612     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7613         || ! (endbrace == RExC_parse            /* nothing between the {} */
7614               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
7615                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7616     {
7617         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
7618         vFAIL("\\N{NAME} must be resolved by the lexer");
7619     }
7620
7621     if (endbrace == RExC_parse) {   /* empty: \N{} */
7622         if (! valuep) {
7623             RExC_parse = endbrace + 1;
7624             return reg_node(pRExC_state,NOTHING);
7625         }
7626
7627         if (SIZE_ONLY) {
7628             ckWARNreg(RExC_parse,
7629                     "Ignoring zero length \\N{} in character class"
7630             );
7631             RExC_parse = endbrace + 1;
7632         }
7633         *valuep = 0;
7634         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7635     }
7636
7637     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
7638     RExC_parse += 2;    /* Skip past the 'U+' */
7639
7640     if (valuep) {   /* In a bracketed char class */
7641         /* We only pay attention to the first char of
7642         multichar strings being returned. I kinda wonder
7643         if this makes sense as it does change the behaviour
7644         from earlier versions, OTOH that behaviour was broken
7645         as well. XXX Solution is to recharacterize as
7646         [rest-of-class]|multi1|multi2... */
7647
7648         STRLEN length_of_hex;
7649         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7650             | PERL_SCAN_DISALLOW_PREFIX
7651             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7652
7653         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7654         if (endchar < endbrace) {
7655             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7656         }
7657
7658         length_of_hex = (STRLEN)(endchar - RExC_parse);
7659         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7660
7661         /* The tokenizer should have guaranteed validity, but it's possible to
7662          * bypass it by using single quoting, so check */
7663         if (length_of_hex == 0
7664             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7665         {
7666             RExC_parse += length_of_hex;        /* Includes all the valid */
7667             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7668                             ? UTF8SKIP(RExC_parse)
7669                             : 1;
7670             /* Guard against malformed utf8 */
7671             if (RExC_parse >= endchar) RExC_parse = endchar;
7672             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7673         }
7674
7675         RExC_parse = endbrace + 1;
7676         if (endchar == endbrace) return NULL;
7677
7678         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7679     }
7680     else {      /* Not a char class */
7681         char *s;            /* String to put in generated EXACT node */
7682         STRLEN len = 0;     /* Its current byte length */
7683         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7684                                stream */
7685         ret = reg_node(pRExC_state,
7686                            (U8) ((! FOLD) ? EXACT
7687                                           : (LOC)
7688                                              ? EXACTFL
7689                                              : (MORE_ASCII_RESTRICTED)
7690                                                ? EXACTFA
7691                                                : (AT_LEAST_UNI_SEMANTICS)
7692                                                  ? EXACTFU
7693                                                  : EXACTF));
7694         s= STRING(ret);
7695
7696         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7697          * the input which is of the form now 'c1.c2.c3...}' until find the
7698          * ending brace or exceed length 255.  The characters that exceed this
7699          * limit are dropped.  The limit could be relaxed should it become
7700          * desirable by reparsing this as (?:\N{NAME}), so could generate
7701          * multiple EXACT nodes, as is done for just regular input.  But this
7702          * is primarily a named character, and not intended to be a huge long
7703          * string, so 255 bytes should be good enough */
7704         while (1) {
7705             STRLEN length_of_hex;
7706             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7707                             | PERL_SCAN_DISALLOW_PREFIX
7708                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7709             UV cp;  /* Ord of current character */
7710             bool use_this_char_fold = FOLD;
7711
7712             /* Code points are separated by dots.  If none, there is only one
7713              * code point, and is terminated by the brace */
7714             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7715
7716             /* The values are Unicode even on EBCDIC machines */
7717             length_of_hex = (STRLEN)(endchar - RExC_parse);
7718             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7719             if ( length_of_hex == 0
7720                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7721             {
7722                 RExC_parse += length_of_hex;        /* Includes all the valid */
7723                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7724                                 ? UTF8SKIP(RExC_parse)
7725                                 : 1;
7726                 /* Guard against malformed utf8 */
7727                 if (RExC_parse >= endchar) RExC_parse = endchar;
7728                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7729             }
7730
7731             /* XXX ? Change to ANYOF node
7732             if (FOLD
7733                 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
7734                 && is_TRICKYFOLD_cp(cp))
7735             {
7736             }
7737             */
7738
7739             /* Under /aa, we can't mix ASCII with non- in a fold.  If we are
7740              * folding, and the source isn't ASCII, look through all the
7741              * characters it folds to.  If any one of them is ASCII, forbid
7742              * this fold.  (cp is uni, so the 127 below is correct even for
7743              * EBCDIC).  Similarly under locale rules, we don't mix under 256
7744              * with above 255.  XXX It really doesn't make sense to have \N{}
7745              * which means a Unicode rules under locale.  I (khw) think this
7746              * should be warned about, but the counter argument is that people
7747              * who have programmed around Perl's earlier lack of specifying the
7748              * rules and used \N{} to force Unicode things in a local
7749              * environment shouldn't get suddenly a warning */
7750             if (use_this_char_fold) {
7751                 if (LOC && cp < 256) {  /* Fold not known until run-time */
7752                     use_this_char_fold = FALSE;
7753                 }
7754                 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7755                          || (cp > 255 && LOC))
7756                 {
7757                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7758                 U8* s = tmpbuf;
7759                 U8* e;
7760                 STRLEN foldlen;
7761
7762                 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7763                 e = s + foldlen;
7764
7765                 while (s < e) {
7766                     if (isASCII(*s)
7767                         || (LOC && (UTF8_IS_INVARIANT(*s)
7768                                     || UTF8_IS_DOWNGRADEABLE_START(*s))))
7769                     {
7770                         use_this_char_fold = FALSE;
7771                         break;
7772                     }
7773                     s += UTF8SKIP(s);
7774                 }
7775                 }
7776             }
7777
7778             if (! use_this_char_fold) { /* Not folding, just append to the
7779                                            string */
7780                 STRLEN unilen;
7781
7782                 /* Quit before adding this character if would exceed limit */
7783                 if (len + UNISKIP(cp) > U8_MAX) break;
7784
7785                 unilen = reguni(pRExC_state, cp, s);
7786                 if (unilen > 0) {
7787                     s   += unilen;
7788                     len += unilen;
7789                 }
7790             } else {    /* Folding, output the folded equivalent */
7791                 STRLEN foldlen,numlen;
7792                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7793                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7794
7795                 /* Quit before exceeding size limit */
7796                 if (len + foldlen > U8_MAX) break;
7797
7798                 for (foldbuf = tmpbuf;
7799                     foldlen;
7800                     foldlen -= numlen)
7801                 {
7802                     cp = utf8_to_uvchr(foldbuf, &numlen);
7803                     if (numlen > 0) {
7804                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7805                         s       += unilen;
7806                         len     += unilen;
7807                         /* In EBCDIC the numlen and unilen can differ. */
7808                         foldbuf += numlen;
7809                         if (numlen >= foldlen)
7810                             break;
7811                     }
7812                     else
7813                         break; /* "Can't happen." */
7814                 }
7815             }
7816
7817             /* Point to the beginning of the next character in the sequence. */
7818             RExC_parse = endchar + 1;
7819
7820             /* Quit if no more characters */
7821             if (RExC_parse >= endbrace) break;
7822         }
7823
7824
7825         if (SIZE_ONLY) {
7826             if (RExC_parse < endbrace) {
7827                 ckWARNreg(RExC_parse - 1,
7828                           "Using just the first characters returned by \\N{}");
7829             }
7830
7831             RExC_size += STR_SZ(len);
7832         } else {
7833             STR_LEN(ret) = len;
7834             RExC_emit += STR_SZ(len);
7835         }
7836
7837         RExC_parse = endbrace + 1;
7838
7839         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7840                                with malformed in t/re/pat_advanced.t */
7841         RExC_parse --;
7842         Set_Node_Cur_Length(ret); /* MJD */
7843         nextchar(pRExC_state);
7844     }
7845
7846     return ret;
7847 }
7848
7849
7850 /*
7851  * reg_recode
7852  *
7853  * It returns the code point in utf8 for the value in *encp.
7854  *    value: a code value in the source encoding
7855  *    encp:  a pointer to an Encode object
7856  *
7857  * If the result from Encode is not a single character,
7858  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7859  */
7860 STATIC UV
7861 S_reg_recode(pTHX_ const char value, SV **encp)
7862 {
7863     STRLEN numlen = 1;
7864     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7865     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7866     const STRLEN newlen = SvCUR(sv);
7867     UV uv = UNICODE_REPLACEMENT;
7868
7869     PERL_ARGS_ASSERT_REG_RECODE;
7870
7871     if (newlen)
7872         uv = SvUTF8(sv)
7873              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7874              : *(U8*)s;
7875
7876     if (!newlen || numlen != newlen) {
7877         uv = UNICODE_REPLACEMENT;
7878         *encp = NULL;
7879     }
7880     return uv;
7881 }
7882
7883
7884 /*
7885  - regatom - the lowest level
7886
7887    Try to identify anything special at the start of the pattern. If there
7888    is, then handle it as required. This may involve generating a single regop,
7889    such as for an assertion; or it may involve recursing, such as to
7890    handle a () structure.
7891
7892    If the string doesn't start with something special then we gobble up
7893    as much literal text as we can.
7894
7895    Once we have been able to handle whatever type of thing started the
7896    sequence, we return.
7897
7898    Note: we have to be careful with escapes, as they can be both literal
7899    and special, and in the case of \10 and friends can either, depending
7900    on context. Specifically there are two separate switches for handling
7901    escape sequences, with the one for handling literal escapes requiring
7902    a dummy entry for all of the special escapes that are actually handled
7903    by the other.
7904 */
7905
7906 STATIC regnode *
7907 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7908 {
7909     dVAR;
7910     register regnode *ret = NULL;
7911     I32 flags;
7912     char *parse_start = RExC_parse;
7913     U8 op;
7914     GET_RE_DEBUG_FLAGS_DECL;
7915     DEBUG_PARSE("atom");
7916     *flagp = WORST;             /* Tentatively. */
7917
7918     PERL_ARGS_ASSERT_REGATOM;
7919
7920 tryagain:
7921     switch ((U8)*RExC_parse) {
7922     case '^':
7923         RExC_seen_zerolen++;
7924         nextchar(pRExC_state);
7925         if (RExC_flags & RXf_PMf_MULTILINE)
7926             ret = reg_node(pRExC_state, MBOL);
7927         else if (RExC_flags & RXf_PMf_SINGLELINE)
7928             ret = reg_node(pRExC_state, SBOL);
7929         else
7930             ret = reg_node(pRExC_state, BOL);
7931         Set_Node_Length(ret, 1); /* MJD */
7932         break;
7933     case '$':
7934         nextchar(pRExC_state);
7935         if (*RExC_parse)
7936             RExC_seen_zerolen++;
7937         if (RExC_flags & RXf_PMf_MULTILINE)
7938             ret = reg_node(pRExC_state, MEOL);
7939         else if (RExC_flags & RXf_PMf_SINGLELINE)
7940             ret = reg_node(pRExC_state, SEOL);
7941         else
7942             ret = reg_node(pRExC_state, EOL);
7943         Set_Node_Length(ret, 1); /* MJD */
7944         break;
7945     case '.':
7946         nextchar(pRExC_state);
7947         if (RExC_flags & RXf_PMf_SINGLELINE)
7948             ret = reg_node(pRExC_state, SANY);
7949         else
7950             ret = reg_node(pRExC_state, REG_ANY);
7951         *flagp |= HASWIDTH|SIMPLE;
7952         RExC_naughty++;
7953         Set_Node_Length(ret, 1); /* MJD */
7954         break;
7955     case '[':
7956     {
7957         char * const oregcomp_parse = ++RExC_parse;
7958         ret = regclass(pRExC_state,depth+1);
7959         if (*RExC_parse != ']') {
7960             RExC_parse = oregcomp_parse;
7961             vFAIL("Unmatched [");
7962         }
7963         nextchar(pRExC_state);
7964         *flagp |= HASWIDTH|SIMPLE;
7965         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7966         break;
7967     }
7968     case '(':
7969         nextchar(pRExC_state);
7970         ret = reg(pRExC_state, 1, &flags,depth+1);
7971         if (ret == NULL) {
7972                 if (flags & TRYAGAIN) {
7973                     if (RExC_parse == RExC_end) {
7974                          /* Make parent create an empty node if needed. */
7975                         *flagp |= TRYAGAIN;
7976                         return(NULL);
7977                     }
7978                     goto tryagain;
7979                 }
7980                 return(NULL);
7981         }
7982         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7983         break;
7984     case '|':
7985     case ')':
7986         if (flags & TRYAGAIN) {
7987             *flagp |= TRYAGAIN;
7988             return NULL;
7989         }
7990         vFAIL("Internal urp");
7991                                 /* Supposed to be caught earlier. */
7992         break;
7993     case '{':
7994         if (!regcurly(RExC_parse)) {
7995             RExC_parse++;
7996             goto defchar;
7997         }
7998         /* FALL THROUGH */
7999     case '?':
8000     case '+':
8001     case '*':
8002         RExC_parse++;
8003         vFAIL("Quantifier follows nothing");
8004         break;
8005     case LATIN_SMALL_LETTER_SHARP_S:
8006     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8007     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8008 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8009 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
8010     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8011 #endif
8012         do_foldchar:
8013         if (!LOC && FOLD) {
8014             U32 len,cp;
8015             len=0; /* silence a spurious compiler warning */
8016             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
8017                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8018                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8019                 ret = reganode(pRExC_state, FOLDCHAR, cp);
8020                 Set_Node_Length(ret, 1); /* MJD */
8021                 nextchar(pRExC_state); /* kill whitespace under /x */
8022                 return ret;
8023             }
8024         }
8025         goto outer_default;
8026     case '\\':
8027         /* Special Escapes
8028
8029            This switch handles escape sequences that resolve to some kind
8030            of special regop and not to literal text. Escape sequnces that
8031            resolve to literal text are handled below in the switch marked
8032            "Literal Escapes".
8033
8034            Every entry in this switch *must* have a corresponding entry
8035            in the literal escape switch. However, the opposite is not
8036            required, as the default for this switch is to jump to the
8037            literal text handling code.
8038         */
8039         switch ((U8)*++RExC_parse) {
8040         case LATIN_SMALL_LETTER_SHARP_S:
8041         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8042         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8043                    goto do_foldchar;
8044         /* Special Escapes */
8045         case 'A':
8046             RExC_seen_zerolen++;
8047             ret = reg_node(pRExC_state, SBOL);
8048             *flagp |= SIMPLE;
8049             goto finish_meta_pat;
8050         case 'G':
8051             ret = reg_node(pRExC_state, GPOS);
8052             RExC_seen |= REG_SEEN_GPOS;
8053             *flagp |= SIMPLE;
8054             goto finish_meta_pat;
8055         case 'K':
8056             RExC_seen_zerolen++;
8057             ret = reg_node(pRExC_state, KEEPS);
8058             *flagp |= SIMPLE;
8059             /* XXX:dmq : disabling in-place substitution seems to
8060              * be necessary here to avoid cases of memory corruption, as
8061              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8062              */
8063             RExC_seen |= REG_SEEN_LOOKBEHIND;
8064             goto finish_meta_pat;
8065         case 'Z':
8066             ret = reg_node(pRExC_state, SEOL);
8067             *flagp |= SIMPLE;
8068             RExC_seen_zerolen++;                /* Do not optimize RE away */
8069             goto finish_meta_pat;
8070         case 'z':
8071             ret = reg_node(pRExC_state, EOS);
8072             *flagp |= SIMPLE;
8073             RExC_seen_zerolen++;                /* Do not optimize RE away */
8074             goto finish_meta_pat;
8075         case 'C':
8076             ret = reg_node(pRExC_state, CANY);
8077             RExC_seen |= REG_SEEN_CANY;
8078             *flagp |= HASWIDTH|SIMPLE;
8079             goto finish_meta_pat;
8080         case 'X':
8081             ret = reg_node(pRExC_state, CLUMP);
8082             *flagp |= HASWIDTH;
8083             goto finish_meta_pat;
8084         case 'w':
8085             switch (get_regex_charset(RExC_flags)) {
8086                 case REGEX_LOCALE_CHARSET:
8087                     op = ALNUML;
8088                     break;
8089                 case REGEX_UNICODE_CHARSET:
8090                     op = ALNUMU;
8091                     break;
8092                 case REGEX_ASCII_RESTRICTED_CHARSET:
8093                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8094                     op = ALNUMA;
8095                     break;
8096                 case REGEX_DEPENDS_CHARSET:
8097                     op = ALNUM;
8098                     break;
8099                 default:
8100                     goto bad_charset;
8101             }
8102             ret = reg_node(pRExC_state, op);
8103             *flagp |= HASWIDTH|SIMPLE;
8104             goto finish_meta_pat;
8105         case 'W':
8106             switch (get_regex_charset(RExC_flags)) {
8107                 case REGEX_LOCALE_CHARSET:
8108                     op = NALNUML;
8109                     break;
8110                 case REGEX_UNICODE_CHARSET:
8111                     op = NALNUMU;
8112                     break;
8113                 case REGEX_ASCII_RESTRICTED_CHARSET:
8114                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8115                     op = NALNUMA;
8116                     break;
8117                 case REGEX_DEPENDS_CHARSET:
8118                     op = NALNUM;
8119                     break;
8120                 default:
8121                     goto bad_charset;
8122             }
8123             ret = reg_node(pRExC_state, op);
8124             *flagp |= HASWIDTH|SIMPLE;
8125             goto finish_meta_pat;
8126         case 'b':
8127             RExC_seen_zerolen++;
8128             RExC_seen |= REG_SEEN_LOOKBEHIND;
8129             switch (get_regex_charset(RExC_flags)) {
8130                 case REGEX_LOCALE_CHARSET:
8131                     op = BOUNDL;
8132                     break;
8133                 case REGEX_UNICODE_CHARSET:
8134                     op = BOUNDU;
8135                     break;
8136                 case REGEX_ASCII_RESTRICTED_CHARSET:
8137                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8138                     op = BOUNDA;
8139                     break;
8140                 case REGEX_DEPENDS_CHARSET:
8141                     op = BOUND;
8142                     break;
8143                 default:
8144                     goto bad_charset;
8145             }
8146             ret = reg_node(pRExC_state, op);
8147             FLAGS(ret) = get_regex_charset(RExC_flags);
8148             *flagp |= SIMPLE;
8149             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8150                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8151             }
8152             goto finish_meta_pat;
8153         case 'B':
8154             RExC_seen_zerolen++;
8155             RExC_seen |= REG_SEEN_LOOKBEHIND;
8156             switch (get_regex_charset(RExC_flags)) {
8157                 case REGEX_LOCALE_CHARSET:
8158                     op = NBOUNDL;
8159                     break;
8160                 case REGEX_UNICODE_CHARSET:
8161                     op = NBOUNDU;
8162                     break;
8163                 case REGEX_ASCII_RESTRICTED_CHARSET:
8164                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8165                     op = NBOUNDA;
8166                     break;
8167                 case REGEX_DEPENDS_CHARSET:
8168                     op = NBOUND;
8169                     break;
8170                 default:
8171                     goto bad_charset;
8172             }
8173             ret = reg_node(pRExC_state, op);
8174             FLAGS(ret) = get_regex_charset(RExC_flags);
8175             *flagp |= SIMPLE;
8176             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8177                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8178             }
8179             goto finish_meta_pat;
8180         case 's':
8181             switch (get_regex_charset(RExC_flags)) {
8182                 case REGEX_LOCALE_CHARSET:
8183                     op = SPACEL;
8184                     break;
8185                 case REGEX_UNICODE_CHARSET:
8186                     op = SPACEU;
8187                     break;
8188                 case REGEX_ASCII_RESTRICTED_CHARSET:
8189                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8190                     op = SPACEA;
8191                     break;
8192                 case REGEX_DEPENDS_CHARSET:
8193                     op = SPACE;
8194                     break;
8195                 default:
8196                     goto bad_charset;
8197             }
8198             ret = reg_node(pRExC_state, op);
8199             *flagp |= HASWIDTH|SIMPLE;
8200             goto finish_meta_pat;
8201         case 'S':
8202             switch (get_regex_charset(RExC_flags)) {
8203                 case REGEX_LOCALE_CHARSET:
8204                     op = NSPACEL;
8205                     break;
8206                 case REGEX_UNICODE_CHARSET:
8207                     op = NSPACEU;
8208                     break;
8209                 case REGEX_ASCII_RESTRICTED_CHARSET:
8210                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8211                     op = NSPACEA;
8212                     break;
8213                 case REGEX_DEPENDS_CHARSET:
8214                     op = NSPACE;
8215                     break;
8216                 default:
8217                     goto bad_charset;
8218             }
8219             ret = reg_node(pRExC_state, op);
8220             *flagp |= HASWIDTH|SIMPLE;
8221             goto finish_meta_pat;
8222         case 'd':
8223             switch (get_regex_charset(RExC_flags)) {
8224                 case REGEX_LOCALE_CHARSET:
8225                     op = DIGITL;
8226                     break;
8227                 case REGEX_ASCII_RESTRICTED_CHARSET:
8228                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8229                     op = DIGITA;
8230                     break;
8231                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8232                 case REGEX_UNICODE_CHARSET:
8233                     op = DIGIT;
8234                     break;
8235                 default:
8236                     goto bad_charset;
8237             }
8238             ret = reg_node(pRExC_state, op);
8239             *flagp |= HASWIDTH|SIMPLE;
8240             goto finish_meta_pat;
8241         case 'D':
8242             switch (get_regex_charset(RExC_flags)) {
8243                 case REGEX_LOCALE_CHARSET:
8244                     op = NDIGITL;
8245                     break;
8246                 case REGEX_ASCII_RESTRICTED_CHARSET:
8247                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8248                     op = NDIGITA;
8249                     break;
8250                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8251                 case REGEX_UNICODE_CHARSET:
8252                     op = NDIGIT;
8253                     break;
8254                 default:
8255                     goto bad_charset;
8256             }
8257             ret = reg_node(pRExC_state, op);
8258             *flagp |= HASWIDTH|SIMPLE;
8259             goto finish_meta_pat;
8260         case 'R':
8261             ret = reg_node(pRExC_state, LNBREAK);
8262             *flagp |= HASWIDTH|SIMPLE;
8263             goto finish_meta_pat;
8264         case 'h':
8265             ret = reg_node(pRExC_state, HORIZWS);
8266             *flagp |= HASWIDTH|SIMPLE;
8267             goto finish_meta_pat;
8268         case 'H':
8269             ret = reg_node(pRExC_state, NHORIZWS);
8270             *flagp |= HASWIDTH|SIMPLE;
8271             goto finish_meta_pat;
8272         case 'v':
8273             ret = reg_node(pRExC_state, VERTWS);
8274             *flagp |= HASWIDTH|SIMPLE;
8275             goto finish_meta_pat;
8276         case 'V':
8277             ret = reg_node(pRExC_state, NVERTWS);
8278             *flagp |= HASWIDTH|SIMPLE;
8279          finish_meta_pat:
8280             nextchar(pRExC_state);
8281             Set_Node_Length(ret, 2); /* MJD */
8282             break;
8283         case 'p':
8284         case 'P':
8285             {
8286                 char* const oldregxend = RExC_end;
8287 #ifdef DEBUGGING
8288                 char* parse_start = RExC_parse - 2;
8289 #endif
8290
8291                 if (RExC_parse[1] == '{') {
8292                   /* a lovely hack--pretend we saw [\pX] instead */
8293                     RExC_end = strchr(RExC_parse, '}');
8294                     if (!RExC_end) {
8295                         const U8 c = (U8)*RExC_parse;
8296                         RExC_parse += 2;
8297                         RExC_end = oldregxend;
8298                         vFAIL2("Missing right brace on \\%c{}", c);
8299                     }
8300                     RExC_end++;
8301                 }
8302                 else {
8303                     RExC_end = RExC_parse + 2;
8304                     if (RExC_end > oldregxend)
8305                         RExC_end = oldregxend;
8306                 }
8307                 RExC_parse--;
8308
8309                 ret = regclass(pRExC_state,depth+1);
8310
8311                 RExC_end = oldregxend;
8312                 RExC_parse--;
8313
8314                 Set_Node_Offset(ret, parse_start + 2);
8315                 Set_Node_Cur_Length(ret);
8316                 nextchar(pRExC_state);
8317                 *flagp |= HASWIDTH|SIMPLE;
8318             }
8319             break;
8320         case 'N':
8321             /* Handle \N and \N{NAME} here and not below because it can be
8322             multicharacter. join_exact() will join them up later on.
8323             Also this makes sure that things like /\N{BLAH}+/ and
8324             \N{BLAH} being multi char Just Happen. dmq*/
8325             ++RExC_parse;
8326             ret= reg_namedseq(pRExC_state, NULL, flagp);
8327             break;
8328         case 'k':    /* Handle \k<NAME> and \k'NAME' */
8329         parse_named_seq:
8330         {
8331             char ch= RExC_parse[1];
8332             if (ch != '<' && ch != '\'' && ch != '{') {
8333                 RExC_parse++;
8334                 vFAIL2("Sequence %.2s... not terminated",parse_start);
8335             } else {
8336                 /* this pretty much dupes the code for (?P=...) in reg(), if
8337                    you change this make sure you change that */
8338                 char* name_start = (RExC_parse += 2);
8339                 U32 num = 0;
8340                 SV *sv_dat = reg_scan_name(pRExC_state,
8341                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8342                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8343                 if (RExC_parse == name_start || *RExC_parse != ch)
8344                     vFAIL2("Sequence %.3s... not terminated",parse_start);
8345
8346                 if (!SIZE_ONLY) {
8347                     num = add_data( pRExC_state, 1, "S" );
8348                     RExC_rxi->data->data[num]=(void*)sv_dat;
8349                     SvREFCNT_inc_simple_void(sv_dat);
8350                 }
8351
8352                 RExC_sawback = 1;
8353                 ret = reganode(pRExC_state,
8354                                ((! FOLD)
8355                                  ? NREF
8356                                  : (MORE_ASCII_RESTRICTED)
8357                                    ? NREFFA
8358                                    : (AT_LEAST_UNI_SEMANTICS)
8359                                      ? NREFFU
8360                                      : (LOC)
8361                                        ? NREFFL
8362                                        : NREFF),
8363                                 num);
8364                 *flagp |= HASWIDTH;
8365
8366                 /* override incorrect value set in reganode MJD */
8367                 Set_Node_Offset(ret, parse_start+1);
8368                 Set_Node_Cur_Length(ret); /* MJD */
8369                 nextchar(pRExC_state);
8370
8371             }
8372             break;
8373         }
8374         case 'g':
8375         case '1': case '2': case '3': case '4':
8376         case '5': case '6': case '7': case '8': case '9':
8377             {
8378                 I32 num;
8379                 bool isg = *RExC_parse == 'g';
8380                 bool isrel = 0;
8381                 bool hasbrace = 0;
8382                 if (isg) {
8383                     RExC_parse++;
8384                     if (*RExC_parse == '{') {
8385                         RExC_parse++;
8386                         hasbrace = 1;
8387                     }
8388                     if (*RExC_parse == '-') {
8389                         RExC_parse++;
8390                         isrel = 1;
8391                     }
8392                     if (hasbrace && !isDIGIT(*RExC_parse)) {
8393                         if (isrel) RExC_parse--;
8394                         RExC_parse -= 2;
8395                         goto parse_named_seq;
8396                 }   }
8397                 num = atoi(RExC_parse);
8398                 if (isg && num == 0)
8399                     vFAIL("Reference to invalid group 0");
8400                 if (isrel) {
8401                     num = RExC_npar - num;
8402                     if (num < 1)
8403                         vFAIL("Reference to nonexistent or unclosed group");
8404                 }
8405                 if (!isg && num > 9 && num >= RExC_npar)
8406                     goto defchar;
8407                 else {
8408                     char * const parse_start = RExC_parse - 1; /* MJD */
8409                     while (isDIGIT(*RExC_parse))
8410                         RExC_parse++;
8411                     if (parse_start == RExC_parse - 1)
8412                         vFAIL("Unterminated \\g... pattern");
8413                     if (hasbrace) {
8414                         if (*RExC_parse != '}')
8415                             vFAIL("Unterminated \\g{...} pattern");
8416                         RExC_parse++;
8417                     }
8418                     if (!SIZE_ONLY) {
8419                         if (num > (I32)RExC_rx->nparens)
8420                             vFAIL("Reference to nonexistent group");
8421                     }
8422                     RExC_sawback = 1;
8423                     ret = reganode(pRExC_state,
8424                                    ((! FOLD)
8425                                      ? REF
8426                                      : (MORE_ASCII_RESTRICTED)
8427                                        ? REFFA
8428                                        : (AT_LEAST_UNI_SEMANTICS)
8429                                          ? REFFU
8430                                          : (LOC)
8431                                            ? REFFL
8432                                            : REFF),
8433                                     num);
8434                     *flagp |= HASWIDTH;
8435
8436                     /* override incorrect value set in reganode MJD */
8437                     Set_Node_Offset(ret, parse_start+1);
8438                     Set_Node_Cur_Length(ret); /* MJD */
8439                     RExC_parse--;
8440                     nextchar(pRExC_state);
8441                 }
8442             }
8443             break;
8444         case '\0':
8445             if (RExC_parse >= RExC_end)
8446                 FAIL("Trailing \\");
8447             /* FALL THROUGH */
8448         default:
8449             /* Do not generate "unrecognized" warnings here, we fall
8450                back into the quick-grab loop below */
8451             parse_start--;
8452             goto defchar;
8453         }
8454         break;
8455
8456     case '#':
8457         if (RExC_flags & RXf_PMf_EXTENDED) {
8458             if ( reg_skipcomment( pRExC_state ) )
8459                 goto tryagain;
8460         }
8461         /* FALL THROUGH */
8462
8463     default:
8464         outer_default:{
8465             register STRLEN len;
8466             register UV ender;
8467             register char *p;
8468             char *s;
8469             STRLEN foldlen;
8470             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8471             regnode * orig_emit;
8472
8473             parse_start = RExC_parse - 1;
8474
8475             RExC_parse++;
8476
8477         defchar:
8478             ender = 0;
8479             orig_emit = RExC_emit; /* Save the original output node position in
8480                                       case we need to output a different node
8481                                       type */
8482             ret = reg_node(pRExC_state,
8483                            (U8) ((! FOLD) ? EXACT
8484                                           : (LOC)
8485                                              ? EXACTFL
8486                                              : (MORE_ASCII_RESTRICTED)
8487                                                ? EXACTFA
8488                                                : (AT_LEAST_UNI_SEMANTICS)
8489                                                  ? EXACTFU
8490                                                  : EXACTF)
8491                     );
8492             s = STRING(ret);
8493             for (len = 0, p = RExC_parse - 1;
8494               len < 127 && p < RExC_end;
8495               len++)
8496             {
8497                 char * const oldp = p;
8498
8499                 if (RExC_flags & RXf_PMf_EXTENDED)
8500                     p = regwhite( pRExC_state, p );
8501                 switch ((U8)*p) {
8502                 case LATIN_SMALL_LETTER_SHARP_S:
8503                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8504                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8505                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8506                                 goto normal_default;
8507                 case '^':
8508                 case '$':
8509                 case '.':
8510                 case '[':
8511                 case '(':
8512                 case ')':
8513                 case '|':
8514                     goto loopdone;
8515                 case '\\':
8516                     /* Literal Escapes Switch
8517
8518                        This switch is meant to handle escape sequences that
8519                        resolve to a literal character.
8520
8521                        Every escape sequence that represents something
8522                        else, like an assertion or a char class, is handled
8523                        in the switch marked 'Special Escapes' above in this
8524                        routine, but also has an entry here as anything that
8525                        isn't explicitly mentioned here will be treated as
8526                        an unescaped equivalent literal.
8527                     */
8528
8529                     switch ((U8)*++p) {
8530                     /* These are all the special escapes. */
8531                     case LATIN_SMALL_LETTER_SHARP_S:
8532                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8533                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8534                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8535                                 goto normal_default;
8536                     case 'A':             /* Start assertion */
8537                     case 'b': case 'B':   /* Word-boundary assertion*/
8538                     case 'C':             /* Single char !DANGEROUS! */
8539                     case 'd': case 'D':   /* digit class */
8540                     case 'g': case 'G':   /* generic-backref, pos assertion */
8541                     case 'h': case 'H':   /* HORIZWS */
8542                     case 'k': case 'K':   /* named backref, keep marker */
8543                     case 'N':             /* named char sequence */
8544                     case 'p': case 'P':   /* Unicode property */
8545                               case 'R':   /* LNBREAK */
8546                     case 's': case 'S':   /* space class */
8547                     case 'v': case 'V':   /* VERTWS */
8548                     case 'w': case 'W':   /* word class */
8549                     case 'X':             /* eXtended Unicode "combining character sequence" */
8550                     case 'z': case 'Z':   /* End of line/string assertion */
8551                         --p;
8552                         goto loopdone;
8553
8554                     /* Anything after here is an escape that resolves to a
8555                        literal. (Except digits, which may or may not)
8556                      */
8557                     case 'n':
8558                         ender = '\n';
8559                         p++;
8560                         break;
8561                     case 'r':
8562                         ender = '\r';
8563                         p++;
8564                         break;
8565                     case 't':
8566                         ender = '\t';
8567                         p++;
8568                         break;
8569                     case 'f':
8570                         ender = '\f';
8571                         p++;
8572                         break;
8573                     case 'e':
8574                           ender = ASCII_TO_NATIVE('\033');
8575                         p++;
8576                         break;
8577                     case 'a':
8578                           ender = ASCII_TO_NATIVE('\007');
8579                         p++;
8580                         break;
8581                     case 'o':
8582                         {
8583                             STRLEN brace_len = len;
8584                             UV result;
8585                             const char* error_msg;
8586
8587                             bool valid = grok_bslash_o(p,
8588                                                        &result,
8589                                                        &brace_len,
8590                                                        &error_msg,
8591                                                        1);
8592                             p += brace_len;
8593                             if (! valid) {
8594                                 RExC_parse = p; /* going to die anyway; point
8595                                                    to exact spot of failure */
8596                                 vFAIL(error_msg);
8597                             }
8598                             else
8599                             {
8600                                 ender = result;
8601                             }
8602                             if (PL_encoding && ender < 0x100) {
8603                                 goto recode_encoding;
8604                             }
8605                             if (ender > 0xff) {
8606                                 REQUIRE_UTF8;
8607                             }
8608                             break;
8609                         }
8610                     case 'x':
8611                         if (*++p == '{') {
8612                             char* const e = strchr(p, '}');
8613
8614                             if (!e) {
8615                                 RExC_parse = p + 1;
8616                                 vFAIL("Missing right brace on \\x{}");
8617                             }
8618                             else {
8619                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8620                                     | PERL_SCAN_DISALLOW_PREFIX;
8621                                 STRLEN numlen = e - p - 1;
8622                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8623                                 if (ender > 0xff)
8624                                     REQUIRE_UTF8;
8625                                 p = e + 1;
8626                             }
8627                         }
8628                         else {
8629                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8630                             STRLEN numlen = 2;
8631                             ender = grok_hex(p, &numlen, &flags, NULL);
8632                             p += numlen;
8633                         }
8634                         if (PL_encoding && ender < 0x100)
8635                             goto recode_encoding;
8636                         break;
8637                     case 'c':
8638                         p++;
8639                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8640                         break;
8641                     case '0': case '1': case '2': case '3':case '4':
8642                     case '5': case '6': case '7': case '8':case '9':
8643                         if (*p == '0' ||
8644                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8645                         {
8646                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8647                             STRLEN numlen = 3;
8648                             ender = grok_oct(p, &numlen, &flags, NULL);
8649                             if (ender > 0xff) {
8650                                 REQUIRE_UTF8;
8651                             }
8652                             p += numlen;
8653                         }
8654                         else {
8655                             --p;
8656                             goto loopdone;
8657                         }
8658                         if (PL_encoding && ender < 0x100)
8659                             goto recode_encoding;
8660                         break;
8661                     recode_encoding:
8662                         {
8663                             SV* enc = PL_encoding;
8664                             ender = reg_recode((const char)(U8)ender, &enc);
8665                             if (!enc && SIZE_ONLY)
8666                                 ckWARNreg(p, "Invalid escape in the specified encoding");
8667                             REQUIRE_UTF8;
8668                         }
8669                         break;
8670                     case '\0':
8671                         if (p >= RExC_end)
8672                             FAIL("Trailing \\");
8673                         /* FALL THROUGH */
8674                     default:
8675                         if (!SIZE_ONLY&& isALPHA(*p)) {
8676                             /* Include any { following the alpha to emphasize
8677                              * that it could be part of an escape at some point
8678                              * in the future */
8679                             int len = (*(p + 1) == '{') ? 2 : 1;
8680                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8681                         }
8682                         goto normal_default;
8683                     }
8684                     break;
8685                 default:
8686                   normal_default:
8687                     if (UTF8_IS_START(*p) && UTF) {
8688                         STRLEN numlen;
8689                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8690                                                &numlen, UTF8_ALLOW_DEFAULT);
8691                         p += numlen;
8692                     }
8693                     else
8694                         ender = (U8) *p++;
8695                     break;
8696                 } /* End of switch on the literal */
8697
8698                 /* Certain characters are problematic because their folded
8699                  * length is so different from their original length that it
8700                  * isn't handleable by the optimizer.  They are therefore not
8701                  * placed in an EXACTish node; and are here handled specially.
8702                  * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8703                  * putting it in a special node keeps regexec from having to
8704                  * deal with a non-utf8 multi-char fold */
8705                 if (FOLD
8706                     && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
8707                     && is_TRICKYFOLD_cp(ender))
8708                 {
8709                     /* If is in middle of outputting characters into an
8710                      * EXACTish node, go output what we have so far, and
8711                      * position the parse so that this will be called again
8712                      * immediately */
8713                     if (len) {
8714                         p  = RExC_parse + len - 1;
8715                         goto loopdone;
8716                     }
8717                     else {
8718
8719                         /* Here we are ready to output our tricky fold
8720                          * character.  What's done is to pretend it's in a
8721                          * [bracketed] class, and let the code that deals with
8722                          * those handle it, as that code has all the
8723                          * intelligence necessary.  First save the current
8724                          * parse state, get rid of the already allocated EXACT
8725                          * node that the ANYOFV node will replace, and point
8726                          * the parse to a buffer which we fill with the
8727                          * character we want the regclass code to think is
8728                          * being parsed */
8729                         char* const oldregxend = RExC_end;
8730                         char tmpbuf[2];
8731                         RExC_emit = orig_emit;
8732                         RExC_parse = tmpbuf;
8733                         if (UTF) {
8734                             tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8735                             tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8736                             RExC_end = RExC_parse + 2;
8737                         }
8738                         else {
8739                             tmpbuf[0] = (char) ender;
8740                             RExC_end = RExC_parse + 1;
8741                         }
8742
8743                         ret = regclass(pRExC_state,depth+1);
8744
8745                         /* Here, have parsed the buffer.  Reset the parse to
8746                          * the actual input, and return */
8747                         RExC_end = oldregxend;
8748                         RExC_parse = p - 1;
8749
8750                         Set_Node_Offset(ret, RExC_parse);
8751                         Set_Node_Cur_Length(ret);
8752                         nextchar(pRExC_state);
8753                         *flagp |= HASWIDTH|SIMPLE;
8754                         return ret;
8755                     }
8756                 }
8757
8758                 if ( RExC_flags & RXf_PMf_EXTENDED)
8759                     p = regwhite( pRExC_state, p );
8760                 if (UTF && FOLD) {
8761                     /* Prime the casefolded buffer.  Locale rules, which apply
8762                      * only to code points < 256, aren't known until execution,
8763                      * so for them, just output the original character using
8764                      * utf8 */
8765                     if (LOC && ender < 256) {
8766                         if (UNI_IS_INVARIANT(ender)) {
8767                             *tmpbuf = (U8) ender;
8768                             foldlen = 1;
8769                         } else {
8770                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8771                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8772                             foldlen = 2;
8773                         }
8774                     }
8775                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
8776                                                  */
8777                         ender = toLOWER(ender);
8778                         *tmpbuf = (U8) ender;
8779                         foldlen = 1;
8780                     }
8781                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8782
8783                         /* Locale and /aa require more selectivity about the
8784                          * fold, so are handled below.  Otherwise, here, just
8785                          * use the fold */
8786                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8787                     }
8788                     else {
8789                         /* Under locale rules or /aa we are not to mix,
8790                          * respectively, ords < 256 or ASCII with non-.  So
8791                          * reject folds that mix them, using only the
8792                          * non-folded code point.  So do the fold to a
8793                          * temporary, and inspect each character in it. */
8794                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8795                         U8* s = trialbuf;
8796                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8797                         U8* e = s + foldlen;
8798                         bool fold_ok = TRUE;
8799
8800                         while (s < e) {
8801                             if (isASCII(*s)
8802                                 || (LOC && (UTF8_IS_INVARIANT(*s)
8803                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
8804                             {
8805                                 fold_ok = FALSE;
8806                                 break;
8807                             }
8808                             s += UTF8SKIP(s);
8809                         }
8810                         if (fold_ok) {
8811                             Copy(trialbuf, tmpbuf, foldlen, U8);
8812                             ender = tmpender;
8813                         }
8814                         else {
8815                             uvuni_to_utf8(tmpbuf, ender);
8816                             foldlen = UNISKIP(ender);
8817                         }
8818                     }
8819                 }
8820                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8821                     if (len)
8822                         p = oldp;
8823                     else if (UTF) {
8824                          if (FOLD) {
8825                               /* Emit all the Unicode characters. */
8826                               STRLEN numlen;
8827                               for (foldbuf = tmpbuf;
8828                                    foldlen;
8829                                    foldlen -= numlen) {
8830                                    ender = utf8_to_uvchr(foldbuf, &numlen);
8831                                    if (numlen > 0) {
8832                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
8833                                         s       += unilen;
8834                                         len     += unilen;
8835                                         /* In EBCDIC the numlen
8836                                          * and unilen can differ. */
8837                                         foldbuf += numlen;
8838                                         if (numlen >= foldlen)
8839                                              break;
8840                                    }
8841                                    else
8842                                         break; /* "Can't happen." */
8843                               }
8844                          }
8845                          else {
8846                               const STRLEN unilen = reguni(pRExC_state, ender, s);
8847                               if (unilen > 0) {
8848                                    s   += unilen;
8849                                    len += unilen;
8850                               }
8851                          }
8852                     }
8853                     else {
8854                         len++;
8855                         REGC((char)ender, s++);
8856                     }
8857                     break;
8858                 }
8859                 if (UTF) {
8860                      if (FOLD) {
8861                           /* Emit all the Unicode characters. */
8862                           STRLEN numlen;
8863                           for (foldbuf = tmpbuf;
8864                                foldlen;
8865                                foldlen -= numlen) {
8866                                ender = utf8_to_uvchr(foldbuf, &numlen);
8867                                if (numlen > 0) {
8868                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
8869                                     len     += unilen;
8870                                     s       += unilen;
8871                                     /* In EBCDIC the numlen
8872                                      * and unilen can differ. */
8873                                     foldbuf += numlen;
8874                                     if (numlen >= foldlen)
8875                                          break;
8876                                }
8877                                else
8878                                     break;
8879                           }
8880                      }
8881                      else {
8882                           const STRLEN unilen = reguni(pRExC_state, ender, s);
8883                           if (unilen > 0) {
8884                                s   += unilen;
8885                                len += unilen;
8886                           }
8887                      }
8888                      len--;
8889                 }
8890                 else
8891                     REGC((char)ender, s++);
8892             }
8893         loopdone:   /* Jumped to when encounters something that shouldn't be in
8894                        the node */
8895             RExC_parse = p - 1;
8896             Set_Node_Cur_Length(ret); /* MJD */
8897             nextchar(pRExC_state);
8898             {
8899                 /* len is STRLEN which is unsigned, need to copy to signed */
8900                 IV iv = len;
8901                 if (iv < 0)
8902                     vFAIL("Internal disaster");
8903             }
8904             if (len > 0)
8905                 *flagp |= HASWIDTH;
8906             if (len == 1 && UNI_IS_INVARIANT(ender))
8907                 *flagp |= SIMPLE;
8908
8909             if (SIZE_ONLY)
8910                 RExC_size += STR_SZ(len);
8911             else {
8912                 STR_LEN(ret) = len;
8913                 RExC_emit += STR_SZ(len);
8914             }
8915         }
8916         break;
8917     }
8918
8919     return(ret);
8920
8921 /* Jumped to when an unrecognized character set is encountered */
8922 bad_charset:
8923     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8924     return(NULL);
8925 }
8926
8927 STATIC char *
8928 S_regwhite( RExC_state_t *pRExC_state, char *p )
8929 {
8930     const char *e = RExC_end;
8931
8932     PERL_ARGS_ASSERT_REGWHITE;
8933
8934     while (p < e) {
8935         if (isSPACE(*p))
8936             ++p;
8937         else if (*p == '#') {
8938             bool ended = 0;
8939             do {
8940                 if (*p++ == '\n') {
8941                     ended = 1;
8942                     break;
8943                 }
8944             } while (p < e);
8945             if (!ended)
8946                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8947         }
8948         else
8949             break;
8950     }
8951     return p;
8952 }
8953
8954 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8955    Character classes ([:foo:]) can also be negated ([:^foo:]).
8956    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8957    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8958    but trigger failures because they are currently unimplemented. */
8959
8960 #define POSIXCC_DONE(c)   ((c) == ':')
8961 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8962 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8963
8964 STATIC I32
8965 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8966 {
8967     dVAR;
8968     I32 namedclass = OOB_NAMEDCLASS;
8969
8970     PERL_ARGS_ASSERT_REGPPOSIXCC;
8971
8972     if (value == '[' && RExC_parse + 1 < RExC_end &&
8973         /* I smell either [: or [= or [. -- POSIX has been here, right? */
8974         POSIXCC(UCHARAT(RExC_parse))) {
8975         const char c = UCHARAT(RExC_parse);
8976         char* const s = RExC_parse++;
8977
8978         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8979             RExC_parse++;
8980         if (RExC_parse == RExC_end)
8981             /* Grandfather lone [:, [=, [. */
8982             RExC_parse = s;
8983         else {
8984             const char* const t = RExC_parse++; /* skip over the c */
8985             assert(*t == c);
8986
8987             if (UCHARAT(RExC_parse) == ']') {
8988                 const char *posixcc = s + 1;
8989                 RExC_parse++; /* skip over the ending ] */
8990
8991                 if (*s == ':') {
8992                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8993                     const I32 skip = t - posixcc;
8994
8995                     /* Initially switch on the length of the name.  */
8996                     switch (skip) {
8997                     case 4:
8998                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8999                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9000                         break;
9001                     case 5:
9002                         /* Names all of length 5.  */
9003                         /* alnum alpha ascii blank cntrl digit graph lower
9004                            print punct space upper  */
9005                         /* Offset 4 gives the best switch position.  */
9006                         switch (posixcc[4]) {
9007                         case 'a':
9008                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9009                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9010                             break;
9011                         case 'e':
9012                             if (memEQ(posixcc, "spac", 4)) /* space */
9013                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9014                             break;
9015                         case 'h':
9016                             if (memEQ(posixcc, "grap", 4)) /* graph */
9017                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9018                             break;
9019                         case 'i':
9020                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9021                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9022                             break;
9023                         case 'k':
9024                             if (memEQ(posixcc, "blan", 4)) /* blank */
9025                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9026                             break;
9027                         case 'l':
9028                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9029                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9030                             break;
9031                         case 'm':
9032                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9033                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9034                             break;
9035                         case 'r':
9036                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9037                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9038                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9039                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9040                             break;
9041                         case 't':
9042                             if (memEQ(posixcc, "digi", 4)) /* digit */
9043                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9044                             else if (memEQ(posixcc, "prin", 4)) /* print */
9045                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9046                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9047                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9048                             break;
9049                         }
9050                         break;
9051                     case 6:
9052                         if (memEQ(posixcc, "xdigit", 6))
9053                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9054                         break;
9055                     }
9056
9057                     if (namedclass == OOB_NAMEDCLASS)
9058                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9059                                       t - s - 1, s + 1);
9060                     assert (posixcc[skip] == ':');
9061                     assert (posixcc[skip+1] == ']');
9062                 } else if (!SIZE_ONLY) {
9063                     /* [[=foo=]] and [[.foo.]] are still future. */
9064
9065                     /* adjust RExC_parse so the warning shows after
9066                        the class closes */
9067                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9068                         RExC_parse++;
9069                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9070                 }
9071             } else {
9072                 /* Maternal grandfather:
9073                  * "[:" ending in ":" but not in ":]" */
9074                 RExC_parse = s;
9075             }
9076         }
9077     }
9078
9079     return namedclass;
9080 }
9081
9082 STATIC void
9083 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9084 {
9085     dVAR;
9086
9087     PERL_ARGS_ASSERT_CHECKPOSIXCC;
9088
9089     if (POSIXCC(UCHARAT(RExC_parse))) {
9090         const char *s = RExC_parse;
9091         const char  c = *s++;
9092
9093         while (isALNUM(*s))
9094             s++;
9095         if (*s && c == *s && s[1] == ']') {
9096             ckWARN3reg(s+2,
9097                        "POSIX syntax [%c %c] belongs inside character classes",
9098                        c, c);
9099
9100             /* [[=foo=]] and [[.foo.]] are still future. */
9101             if (POSIXCC_NOTYET(c)) {
9102                 /* adjust RExC_parse so the error shows after
9103                    the class closes */
9104                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9105                     NOOP;
9106                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9107             }
9108         }
9109     }
9110 }
9111
9112 /* No locale test, and always Unicode semantics */
9113 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
9114 ANYOF_##NAME:                                                                  \
9115         for (value = 0; value < 256; value++)                                  \
9116             if (TEST)                                                          \
9117             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);  \
9118     yesno = '+';                                                               \
9119     what = WORD;                                                               \
9120     break;                                                                     \
9121 case ANYOF_N##NAME:                                                            \
9122         for (value = 0; value < 256; value++)                                  \
9123             if (!TEST)                                                         \
9124             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);  \
9125     yesno = '!';                                                               \
9126     what = WORD;                                                               \
9127     break
9128
9129 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9130  * there are two tests passed in, to use depending on that. There aren't any
9131  * cases where the label is different from the name, so no need for that
9132  * parameter */
9133 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
9134 ANYOF_##NAME:                                                                  \
9135     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
9136     else if (UNI_SEMANTICS) {                                                  \
9137         for (value = 0; value < 256; value++) {                                \
9138             if (TEST_8(value)) stored +=                                       \
9139                       set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);  \
9140         }                                                                      \
9141     }                                                                          \
9142     else {                                                                     \
9143         for (value = 0; value < 128; value++) {                                \
9144             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
9145                 set_regclass_bit(pRExC_state, ret,                     \
9146                                    (U8) UNI_TO_NATIVE(value), &nonbitmap);                 \
9147         }                                                                      \
9148     }                                                                          \
9149     yesno = '+';                                                               \
9150     what = WORD;                                                               \
9151     break;                                                                     \
9152 case ANYOF_N##NAME:                                                            \
9153     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
9154     else if (UNI_SEMANTICS) {                                                  \
9155         for (value = 0; value < 256; value++) {                                \
9156             if (! TEST_8(value)) stored +=                                     \
9157                     set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);    \
9158         }                                                                      \
9159     }                                                                          \
9160     else {                                                                     \
9161         for (value = 0; value < 128; value++) {                                \
9162             if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
9163                         pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap);    \
9164         }                                                                      \
9165         if (AT_LEAST_ASCII_RESTRICTED) {                                       \
9166             for (value = 128; value < 256; value++) {                          \
9167              stored += set_regclass_bit(                                     \
9168                            pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
9169             }                                                                  \
9170             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8;                  \
9171         }                                                                      \
9172         else {                                                                 \
9173             /* For a non-ut8 target string with DEPENDS semantics, all above   \
9174              * ASCII Latin1 code points match the complement of any of the     \
9175              * classes.  But in utf8, they have their Unicode semantics, so    \
9176              * can't just set them in the bitmap, or else regexec.c will think \
9177              * they matched when they shouldn't. */                            \
9178             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8;          \
9179         }                                                                      \
9180     }                                                                          \
9181     yesno = '!';                                                               \
9182     what = WORD;                                                               \
9183     break
9184
9185 /*
9186    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
9187    so that it is possible to override the option here without having to
9188    rebuild the entire core. as we are required to do if we change regcomp.h
9189    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
9190 */
9191 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
9192 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
9193 #endif
9194
9195 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
9196 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
9197 #else
9198 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
9199 #endif
9200
9201 STATIC U8
9202 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
9203 {
9204
9205     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9206      * Locale folding is done at run-time, so this function should not be
9207      * called for nodes that are for locales.
9208      *
9209      * This function simply sets the bit corresponding to the fold of the input
9210      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
9211      * 'F' is 'f'.
9212      *
9213      * It also sets any necessary flags, and returns the number of bits that
9214      * actually changed from 0 to 1 */
9215
9216     U8 stored = 0;
9217     U8 fold;
9218
9219     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9220
9221     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9222                                     : PL_fold[value];
9223
9224     /* It assumes the bit for 'value' has already been set */
9225     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9226         ANYOF_BITMAP_SET(node, fold);
9227         stored++;
9228     }
9229     if ((_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED))
9230         || (! UNI_SEMANTICS
9231             && ! isASCII(value)
9232             && PL_fold_latin1[value] != value))
9233     {   /* A character that has a fold outside of Latin1 matches outside the
9234            bitmap, but only when the target string is utf8.  Similarly when we
9235            don't have unicode semantics for the above ASCII Latin-1 characters,
9236            and they have a fold, they should match if the target is utf8, and
9237            not otherwise */
9238         if (! *nonbitmap_ptr) {
9239             *nonbitmap_ptr = _new_invlist(2);
9240         }
9241         *nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value);
9242         ANYOF_FLAGS(node) |= ANYOF_UTF8;
9243     }
9244
9245     return stored;
9246 }
9247
9248
9249 PERL_STATIC_INLINE U8
9250 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
9251 {
9252     /* This inline function sets a bit in the bitmap if not already set, and if
9253      * appropriate, its fold, returning the number of bits that actually
9254      * changed from 0 to 1 */
9255
9256     U8 stored;
9257
9258     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9259
9260     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
9261         return 0;
9262     }
9263
9264     ANYOF_BITMAP_SET(node, value);
9265     stored = 1;
9266
9267     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
9268         stored += set_regclass_bit_fold(pRExC_state, node, value, nonbitmap_ptr);
9269     }
9270
9271     return stored;
9272 }
9273
9274 /*
9275    parse a class specification and produce either an ANYOF node that
9276    matches the pattern or perhaps will be optimized into an EXACTish node
9277    instead. The node contains a bit map for the first 256 characters, with the
9278    corresponding bit set if that character is in the list.  For characters
9279    above 255, a range list is used */
9280
9281 STATIC regnode *
9282 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9283 {
9284     dVAR;
9285     register UV nextvalue;
9286     register IV prevvalue = OOB_UNICODE;
9287     register IV range = 0;
9288     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9289     register regnode *ret;
9290     STRLEN numlen;
9291     IV namedclass;
9292     char *rangebegin = NULL;
9293     bool need_class = 0;
9294     SV *listsv = NULL;
9295     UV n;
9296     HV* nonbitmap = NULL;
9297     AV* unicode_alternate  = NULL;
9298 #ifdef EBCDIC
9299     UV literal_endpoint = 0;
9300 #endif
9301     UV stored = 0;  /* how many chars stored in the bitmap */
9302
9303     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9304         case we need to change the emitted regop to an EXACT. */
9305     const char * orig_parse = RExC_parse;
9306     GET_RE_DEBUG_FLAGS_DECL;
9307
9308     PERL_ARGS_ASSERT_REGCLASS;
9309 #ifndef DEBUGGING
9310     PERL_UNUSED_ARG(depth);
9311 #endif
9312
9313     DEBUG_PARSE("clas");
9314
9315     /* Assume we are going to generate an ANYOF node. */
9316     ret = reganode(pRExC_state, ANYOF, 0);
9317
9318
9319     if (!SIZE_ONLY) {
9320         ANYOF_FLAGS(ret) = 0;
9321     }
9322
9323     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
9324         RExC_naughty++;
9325         RExC_parse++;
9326         if (!SIZE_ONLY)
9327             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9328     }
9329
9330     if (SIZE_ONLY) {
9331         RExC_size += ANYOF_SKIP;
9332 #ifdef ANYOF_ADD_LOC_SKIP
9333         if (LOC) {
9334             RExC_size += ANYOF_ADD_LOC_SKIP;
9335         }
9336 #endif
9337         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9338     }
9339     else {
9340         RExC_emit += ANYOF_SKIP;
9341         if (LOC) {
9342             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9343 #ifdef ANYOF_ADD_LOC_SKIP
9344             RExC_emit += ANYOF_ADD_LOC_SKIP;
9345 #endif
9346         }
9347         ANYOF_BITMAP_ZERO(ret);
9348         listsv = newSVpvs("# comment\n");
9349     }
9350
9351     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9352
9353     if (!SIZE_ONLY && POSIXCC(nextvalue))
9354         checkposixcc(pRExC_state);
9355
9356     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9357     if (UCHARAT(RExC_parse) == ']')
9358         goto charclassloop;
9359
9360 parseit:
9361     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9362
9363     charclassloop:
9364
9365         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9366
9367         if (!range)
9368             rangebegin = RExC_parse;
9369         if (UTF) {
9370             value = utf8n_to_uvchr((U8*)RExC_parse,
9371                                    RExC_end - RExC_parse,
9372                                    &numlen, UTF8_ALLOW_DEFAULT);
9373             RExC_parse += numlen;
9374         }
9375         else
9376             value = UCHARAT(RExC_parse++);
9377
9378         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9379         if (value == '[' && POSIXCC(nextvalue))
9380             namedclass = regpposixcc(pRExC_state, value);
9381         else if (value == '\\') {
9382             if (UTF) {
9383                 value = utf8n_to_uvchr((U8*)RExC_parse,
9384                                    RExC_end - RExC_parse,
9385                                    &numlen, UTF8_ALLOW_DEFAULT);
9386                 RExC_parse += numlen;
9387             }
9388             else
9389                 value = UCHARAT(RExC_parse++);
9390             /* Some compilers cannot handle switching on 64-bit integer
9391              * values, therefore value cannot be an UV.  Yes, this will
9392              * be a problem later if we want switch on Unicode.
9393              * A similar issue a little bit later when switching on
9394              * namedclass. --jhi */
9395             switch ((I32)value) {
9396             case 'w':   namedclass = ANYOF_ALNUM;       break;
9397             case 'W':   namedclass = ANYOF_NALNUM;      break;
9398             case 's':   namedclass = ANYOF_SPACE;       break;
9399             case 'S':   namedclass = ANYOF_NSPACE;      break;
9400             case 'd':   namedclass = ANYOF_DIGIT;       break;
9401             case 'D':   namedclass = ANYOF_NDIGIT;      break;
9402             case 'v':   namedclass = ANYOF_VERTWS;      break;
9403             case 'V':   namedclass = ANYOF_NVERTWS;     break;
9404             case 'h':   namedclass = ANYOF_HORIZWS;     break;
9405             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
9406             case 'N':  /* Handle \N{NAME} in class */
9407                 {
9408                     /* We only pay attention to the first char of
9409                     multichar strings being returned. I kinda wonder
9410                     if this makes sense as it does change the behaviour
9411                     from earlier versions, OTOH that behaviour was broken
9412                     as well. */
9413                     UV v; /* value is register so we cant & it /grrr */
9414                     if (reg_namedseq(pRExC_state, &v, NULL)) {
9415                         goto parseit;
9416                     }
9417                     value= v;
9418                 }
9419                 break;
9420             case 'p':
9421             case 'P':
9422                 {
9423                 char *e;
9424                 if (RExC_parse >= RExC_end)
9425                     vFAIL2("Empty \\%c{}", (U8)value);
9426                 if (*RExC_parse == '{') {
9427                     const U8 c = (U8)value;
9428                     e = strchr(RExC_parse++, '}');
9429                     if (!e)
9430                         vFAIL2("Missing right brace on \\%c{}", c);
9431                     while (isSPACE(UCHARAT(RExC_parse)))
9432                         RExC_parse++;
9433                     if (e == RExC_parse)
9434                         vFAIL2("Empty \\%c{}", c);
9435                     n = e - RExC_parse;
9436                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9437                         n--;
9438                 }
9439                 else {
9440                     e = RExC_parse;
9441                     n = 1;
9442                 }
9443                 if (!SIZE_ONLY) {
9444                     if (UCHARAT(RExC_parse) == '^') {
9445                          RExC_parse++;
9446                          n--;
9447                          value = value == 'p' ? 'P' : 'p'; /* toggle */
9448                          while (isSPACE(UCHARAT(RExC_parse))) {
9449                               RExC_parse++;
9450                               n--;
9451                          }
9452                     }
9453
9454                     /* Add the property name to the list.  If /i matching, give
9455                      * a different name which consists of the normal name
9456                      * sandwiched between two underscores and '_i'.  The design
9457                      * is discussed in the commit message for this. */
9458                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9459                                         (value=='p' ? '+' : '!'),
9460                                         (FOLD) ? "__" : "",
9461                                         (int)n,
9462                                         RExC_parse,
9463                                         (FOLD) ? "_i" : ""
9464                                     );
9465                 }
9466                 RExC_parse = e + 1;
9467
9468                 /* The \p could match something in the Latin1 range, hence
9469                  * something that isn't utf8 */
9470                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
9471                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
9472
9473                 /* \p means they want Unicode semantics */
9474                 RExC_uni_semantics = 1;
9475                 }
9476                 break;
9477             case 'n':   value = '\n';                   break;
9478             case 'r':   value = '\r';                   break;
9479             case 't':   value = '\t';                   break;
9480             case 'f':   value = '\f';                   break;
9481             case 'b':   value = '\b';                   break;
9482             case 'e':   value = ASCII_TO_NATIVE('\033');break;
9483             case 'a':   value = ASCII_TO_NATIVE('\007');break;
9484             case 'o':
9485                 RExC_parse--;   /* function expects to be pointed at the 'o' */
9486                 {
9487                     const char* error_msg;
9488                     bool valid = grok_bslash_o(RExC_parse,
9489                                                &value,
9490                                                &numlen,
9491                                                &error_msg,
9492                                                SIZE_ONLY);
9493                     RExC_parse += numlen;
9494                     if (! valid) {
9495                         vFAIL(error_msg);
9496                     }
9497                 }
9498                 if (PL_encoding && value < 0x100) {
9499                     goto recode_encoding;
9500                 }
9501                 break;
9502             case 'x':
9503                 if (*RExC_parse == '{') {
9504                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9505                         | PERL_SCAN_DISALLOW_PREFIX;
9506                     char * const e = strchr(RExC_parse++, '}');
9507                     if (!e)
9508                         vFAIL("Missing right brace on \\x{}");
9509
9510                     numlen = e - RExC_parse;
9511                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9512                     RExC_parse = e + 1;
9513                 }
9514                 else {
9515                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9516                     numlen = 2;
9517                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9518                     RExC_parse += numlen;
9519                 }
9520                 if (PL_encoding && value < 0x100)
9521                     goto recode_encoding;
9522                 break;
9523             case 'c':
9524                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9525                 break;
9526             case '0': case '1': case '2': case '3': case '4':
9527             case '5': case '6': case '7':
9528                 {
9529                     /* Take 1-3 octal digits */
9530                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9531                     numlen = 3;
9532                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9533                     RExC_parse += numlen;
9534                     if (PL_encoding && value < 0x100)
9535                         goto recode_encoding;
9536                     break;
9537                 }
9538             recode_encoding:
9539                 {
9540                     SV* enc = PL_encoding;
9541                     value = reg_recode((const char)(U8)value, &enc);
9542                     if (!enc && SIZE_ONLY)
9543                         ckWARNreg(RExC_parse,
9544                                   "Invalid escape in the specified encoding");
9545                     break;
9546                 }
9547             default:
9548                 /* Allow \_ to not give an error */
9549                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9550                     ckWARN2reg(RExC_parse,
9551                                "Unrecognized escape \\%c in character class passed through",
9552                                (int)value);
9553                 }
9554                 break;
9555             }
9556         } /* end of \blah */
9557 #ifdef EBCDIC
9558         else
9559             literal_endpoint++;
9560 #endif
9561
9562         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9563
9564             /* What matches in a locale is not known until runtime, so need to
9565              * (one time per class) allocate extra space to pass to regexec.
9566              * The space will contain a bit for each named class that is to be
9567              * matched against.  This isn't needed for \p{} and pseudo-classes,
9568              * as they are not affected by locale, and hence are dealt with
9569              * separately */
9570             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9571                 need_class = 1;
9572                 if (SIZE_ONLY) {
9573 #ifdef ANYOF_CLASS_ADD_SKIP
9574                     RExC_size += ANYOF_CLASS_ADD_SKIP;
9575 #endif
9576                 }
9577                 else {
9578 #ifdef ANYOF_CLASS_ADD_SKIP
9579                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
9580 #endif
9581                     ANYOF_CLASS_ZERO(ret);
9582                 }
9583                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9584             }
9585
9586             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
9587              * literal */
9588             if (range) {
9589                 if (!SIZE_ONLY) {
9590                     const int w =
9591                         RExC_parse >= rangebegin ?
9592                         RExC_parse - rangebegin : 0;
9593                     ckWARN4reg(RExC_parse,
9594                                "False [] range \"%*.*s\"",
9595                                w, w, rangebegin);
9596
9597                     if (prevvalue < 256) {
9598                         stored +=
9599                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &nonbitmap);
9600                         stored +=
9601                          set_regclass_bit(pRExC_state, ret, '-', &nonbitmap);
9602                     }
9603                     else {
9604                         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9605                         Perl_sv_catpvf(aTHX_ listsv,
9606                            "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
9607                     }
9608                 }
9609
9610                 range = 0; /* this was not a true range */
9611             }
9612
9613
9614
9615             if (!SIZE_ONLY) {
9616                 const char *what = NULL;
9617                 char yesno = 0;
9618
9619                 /* Possible truncation here but in some 64-bit environments
9620                  * the compiler gets heartburn about switch on 64-bit values.
9621                  * A similar issue a little earlier when switching on value.
9622                  * --jhi */
9623                 switch ((I32)namedclass) {
9624
9625                 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9626                 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9627                 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9628                 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9629                 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9630                 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9631                 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9632                 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9633                 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9634                 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9635 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
9636                 /* \s, \w match all unicode if utf8. */
9637                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9638                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9639 #else
9640                 /* \s, \w match ascii and locale only */
9641                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace");
9642                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord");
9643 #endif
9644                 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9645                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9646                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9647                 case ANYOF_ASCII:
9648                     if (LOC)
9649                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9650                     else {
9651                         for (value = 0; value < 128; value++)
9652                             stored +=
9653                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
9654                     }
9655                     yesno = '+';
9656                     what = NULL;        /* Doesn't match outside ascii, so
9657                                            don't want to add +utf8:: */
9658                     break;
9659                 case ANYOF_NASCII:
9660                     if (LOC)
9661                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9662                     else {
9663                         for (value = 128; value < 256; value++)
9664                             stored +=
9665                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
9666                     }
9667                     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9668                     yesno = '!';
9669                     what = "ASCII";
9670                     break;
9671                 case ANYOF_DIGIT:
9672                     if (LOC)
9673                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9674                     else {
9675                         /* consecutive digits assumed */
9676                         for (value = '0'; value <= '9'; value++)
9677                             stored +=
9678                               set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
9679                     }
9680                     yesno = '+';
9681                     what = POSIX_CC_UNI_NAME("Digit");
9682                     break;
9683                 case ANYOF_NDIGIT:
9684                     if (LOC)
9685                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9686                     else {
9687                         /* consecutive digits assumed */
9688                         for (value = 0; value < '0'; value++)
9689                             stored +=
9690                               set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
9691                         for (value = '9' + 1; value < 256; value++)
9692                             stored +=
9693                               set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
9694                     }
9695                     yesno = '!';
9696                     what = POSIX_CC_UNI_NAME("Digit");
9697                     if (AT_LEAST_ASCII_RESTRICTED ) {
9698                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9699                     }
9700                     break;
9701                 case ANYOF_MAX:
9702                     /* this is to handle \p and \P */
9703                     break;
9704                 default:
9705                     vFAIL("Invalid [::] class");
9706                     break;
9707                 }
9708                 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9709                     /* Strings such as "+utf8::isWord\n" */
9710                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9711                     ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9712                 }
9713
9714                 continue;
9715             }
9716         } /* end of namedclass \blah */
9717
9718         if (range) {
9719             if (prevvalue > (IV)value) /* b-a */ {
9720                 const int w = RExC_parse - rangebegin;
9721                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9722                 range = 0; /* not a valid range */
9723             }
9724         }
9725         else {
9726             prevvalue = value; /* save the beginning of the range */
9727             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
9728                 RExC_parse[1] != ']') {
9729                 RExC_parse++;
9730
9731                 /* a bad range like \w-, [:word:]- ? */
9732                 if (namedclass > OOB_NAMEDCLASS) {
9733                     if (ckWARN(WARN_REGEXP)) {
9734                         const int w =
9735                             RExC_parse >= rangebegin ?
9736                             RExC_parse - rangebegin : 0;
9737                         vWARN4(RExC_parse,
9738                                "False [] range \"%*.*s\"",
9739                                w, w, rangebegin);
9740                     }
9741                     if (!SIZE_ONLY)
9742                         stored +=
9743                             set_regclass_bit(pRExC_state, ret, '-', &nonbitmap);
9744                 } else
9745                     range = 1;  /* yeah, it's a range! */
9746                 continue;       /* but do it the next time */
9747             }
9748         }
9749
9750         /* non-Latin1 code point implies unicode semantics.  Must be set in
9751          * pass1 so is there for the whole of pass 2 */
9752         if (value > 255) {
9753             RExC_uni_semantics = 1;
9754         }
9755
9756         /* now is the next time */
9757         if (!SIZE_ONLY) {
9758             if (prevvalue < 256) {
9759                 const IV ceilvalue = value < 256 ? value : 255;
9760                 IV i;
9761 #ifdef EBCDIC
9762                 /* In EBCDIC [\x89-\x91] should include
9763                  * the \x8e but [i-j] should not. */
9764                 if (literal_endpoint == 2 &&
9765                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9766                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9767                 {
9768                     if (isLOWER(prevvalue)) {
9769                         for (i = prevvalue; i <= ceilvalue; i++)
9770                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9771                                 stored +=
9772                                   set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
9773                             }
9774                     } else {
9775                         for (i = prevvalue; i <= ceilvalue; i++)
9776                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9777                                 stored +=
9778                                   set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
9779                             }
9780                     }
9781                 }
9782                 else
9783 #endif
9784                       for (i = prevvalue; i <= ceilvalue; i++) {
9785                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
9786                       }
9787           }
9788           if (value > 255) {
9789             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
9790             const UV natvalue      = NATIVE_TO_UNI(value);
9791             if (! nonbitmap) {
9792                 nonbitmap = _new_invlist(2);
9793             }
9794             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
9795             ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9796         }
9797 #if 0
9798
9799                 /* If the code point requires utf8 to represent, and we are not
9800                  * folding, it can't match unless the target is in utf8.  Only
9801                  * a few code points above 255 fold to below it, so XXX an
9802                  * optimization would be to know which ones and set the flag
9803                  * appropriately. */
9804                 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
9805                                     ? ANYOF_NONBITMAP
9806                                     : ANYOF_UTF8;
9807                 if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
9808
9809                     /* The \t sets the whole range */
9810                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
9811                                    prevnatvalue, natvalue);
9812
9813                     /* Currently, we don't look at every value in the range.
9814                      * Therefore we have to assume the worst case: that if
9815                      * folding, it will match more than one character.  But in
9816                      * lookbehind patterns, can only be single character
9817                      * length, so disallow those folds */
9818                     if (FOLD && ! RExC_in_lookbehind) {
9819                       OP(ret) = ANYOFV;
9820                     }
9821                 }
9822                 else if (prevnatvalue == natvalue) {
9823                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
9824                     if (FOLD) {
9825                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
9826                          STRLEN foldlen;
9827                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
9828
9829 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
9830                          if (RExC_precomp[0] == ':' &&
9831                              RExC_precomp[1] == '[' &&
9832                              (f == 0xDF || f == 0x92)) {
9833                              f = NATIVE_TO_UNI(f);
9834                         }
9835 #endif
9836                          /* If folding and foldable and a single
9837                           * character, insert also the folded version
9838                           * to the charclass. */
9839                          if (f != value) {
9840 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
9841                              if ((RExC_precomp[0] == ':' &&
9842                                   RExC_precomp[1] == '[' &&
9843                                   (f == 0xA2 &&
9844                                    (value == 0xFB05 || value == 0xFB06))) ?
9845                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
9846                                  foldlen == (STRLEN)UNISKIP(f) )
9847 #else
9848                               if (foldlen == (STRLEN)UNISKIP(f))
9849 #endif
9850                                   Perl_sv_catpvf(aTHX_ listsv,
9851                                                  "%04"UVxf"\n", f);
9852                               else if (! RExC_in_lookbehind) {
9853                                   /* Any multicharacter foldings
9854                                    * (disallowed in lookbehind patterns)
9855                                    * require the following transform:
9856                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
9857                                    * where E folds into "pq" and F folds
9858                                    * into "rst", all other characters
9859                                    * fold to single characters.  We save
9860                                    * away these multicharacter foldings,
9861                                    * to be later saved as part of the
9862                                    * additional "s" data. */
9863                                   SV *sv;
9864
9865                                   if (!unicode_alternate)
9866                                       unicode_alternate = newAV();
9867                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
9868                                                      TRUE);
9869                                   av_push(unicode_alternate, sv);
9870                                   OP(ret) = ANYOFV;
9871                               }
9872                          }
9873
9874                          /* If folding and the value is one of the Greek
9875                           * sigmas insert a few more sigmas to make the
9876                           * folding rules of the sigmas to work right.
9877                           * Note that not all the possible combinations
9878                           * are handled here: some of them are handled
9879                           * by the standard folding rules, and some of
9880                           * them (literal or EXACTF cases) are handled
9881                           * during runtime in regexec.c:S_find_byclass(). */
9882                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
9883                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9884                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
9885                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9886                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
9887                          }
9888                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
9889                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9890                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
9891                     }
9892                 }
9893             }
9894 #endif
9895 #ifdef EBCDIC
9896             literal_endpoint = 0;
9897 #endif
9898         }
9899
9900         range = 0; /* this range (if it was one) is done now */
9901     }
9902
9903
9904
9905     if (SIZE_ONLY)
9906         return ret;
9907     /****** !SIZE_ONLY AFTER HERE *********/
9908
9909     /* Finish up the non-bitmap entries */
9910     if (nonbitmap) {
9911         UV* nonbitmap_array;
9912         UV i;
9913
9914         /* If folding, we add to the list all characters that could fold to or
9915          * from the ones already on the list */
9916         if (FOLD) {
9917             HV* fold_intersection;
9918             UV* fold_list;
9919
9920             /* This is a list of all the characters that participate in folds
9921              * (except marks, etc in multi-char folds */
9922             if (! PL_utf8_foldable) {
9923                 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
9924                 PL_utf8_foldable = _swash_to_invlist(swash);
9925             }
9926
9927             /* This is a hash that for a particular fold gives all characters
9928              * that are involved in it */
9929             if (! PL_utf8_foldclosures) {
9930
9931                 /* If we were unable to find any folds, then we likely won't be
9932                  * able to find the closures.  So just create an empty list.
9933                  * Folding will effectively be restricted to the non-Unicode
9934                  * rules hard-coded into Perl.  (This case happens legitimately
9935                  * during compilation of Perl itself before the Unicode tables
9936                  * are generated) */
9937                 if (invlist_len(PL_utf8_foldable) == 0) {
9938                     PL_utf8_foldclosures = _new_invlist(0);
9939                 } else {
9940                     /* If the folds haven't been read in, call a fold function
9941                      * to force that */
9942                     if (! PL_utf8_tofold) {
9943                         U8 dummy[UTF8_MAXBYTES+1];
9944                         STRLEN dummy_len;
9945                         to_utf8_fold((U8*) "A", dummy, &dummy_len);
9946                     }
9947                     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9948                 }
9949             }
9950
9951             /* Only the characters in this class that participate in folds need
9952              * be checked.  Get the intersection of this class and all the
9953              * possible characters that are foldable.  This can quickly narrow
9954              * down a large class */
9955             fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
9956
9957             /* Now look at the foldable characters in this class individually */
9958             fold_list = invlist_array(fold_intersection);
9959             for (i = 0; i < invlist_len(fold_intersection); i++) {
9960                 UV j;
9961
9962                 /* The next entry is the beginning of the range that is in the
9963                  * class */
9964                 UV start = fold_list[i++];
9965
9966
9967                 /* The next entry is the beginning of the next range, which
9968                  * isn't in the class, so the end of the current range is one
9969                  * less than that */
9970                 UV end = fold_list[i] - 1;
9971
9972                 /* Look at every character in the range */
9973                 for (j = start; j <= end; j++) {
9974
9975                     /* Get its fold */
9976                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
9977                     STRLEN foldlen;
9978                     const UV f = to_uni_fold(j, foldbuf, &foldlen);
9979
9980                     if (foldlen > (STRLEN)UNISKIP(f)) {
9981
9982                         /* Any multicharacter foldings (disallowed in
9983                          * lookbehind patterns) require the following
9984                          * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
9985                          * E folds into "pq" and F folds into "rst", all other
9986                          * characters fold to single characters.  We save away
9987                          * these multicharacter foldings, to be later saved as
9988                          * part of the additional "s" data. */
9989                         if (! RExC_in_lookbehind) {
9990                             SV *sv;
9991                             U8* loc = foldbuf;
9992                             U8* e = foldbuf + foldlen;
9993
9994                             /* If any of the folded characters of this are in
9995                              * the Latin1 range, tell the regex engine that
9996                              * this can match a non-utf8 target string.  The
9997                              * only multi-byte fold whose source is in the
9998                              * Latin1 range (U+00DF) applies only when the
9999                              * target string is utf8, or under unicode rules */
10000                             if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10001                                 while (loc < e) {
10002
10003                                     /* Can't mix ascii with non- under /aa */
10004                                     if (MORE_ASCII_RESTRICTED
10005                                         && (isASCII(*loc) != isASCII(j)))
10006                                     {
10007                                         goto end_multi_fold;
10008                                     }
10009                                     if (UTF8_IS_INVARIANT(*loc)
10010                                         || UTF8_IS_DOWNGRADEABLE_START(*loc))
10011                                     {
10012                                         /* Can't mix above and below 256 under
10013                                          * LOC */
10014                                         if (LOC) {
10015                                             goto end_multi_fold;
10016                                         }
10017                                         ANYOF_FLAGS(ret)
10018                                                 |= ANYOF_NONBITMAP_NON_UTF8;
10019                                         break;
10020                                     }
10021                                     loc += UTF8SKIP(loc);
10022                                 }
10023                             }
10024                             ANYOF_FLAGS(ret) |= ANYOF_UTF8;
10025
10026                             if (!unicode_alternate) {
10027                                 unicode_alternate = newAV();
10028                             }
10029                             sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE);
10030                             av_push(unicode_alternate, sv);
10031
10032                             /* This node is variable length */
10033                             OP(ret) = ANYOFV;
10034                         end_multi_fold: ;
10035                         }
10036                     }
10037                     else {
10038                         /* Single character fold.  Add everything in its fold
10039                          * closure to the list that this node should match */
10040                         SV** listp;
10041
10042                         /* The fold closures data structure is a hash with the
10043                          * keys being every character that is folded to, like
10044                          * 'k', and the values each an array of everything that
10045                          * folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
10046                         if ((listp = hv_fetch(PL_utf8_foldclosures,
10047                                       (char *) foldbuf, foldlen, FALSE)))
10048                         {
10049                             AV* list = (AV*) *listp;
10050                             IV k;
10051                             for (k = 0; k <= av_len(list); k++) {
10052                                 SV** c_p = av_fetch(list, k, FALSE);
10053                                 UV c;
10054                                 if (c_p == NULL) {
10055                                     Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10056                                 }
10057                                 c = SvUV(*c_p);
10058
10059                                 /* /aa doesn't allow folds between ASCII and
10060                                  * non-; /l doesn't allow them between above
10061                                  * and below 256 */
10062                                 if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j)))
10063                                      || (LOC && ((c < 256) != (j < 256))))
10064                                 {
10065                                     continue;
10066                                 }
10067
10068                                 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10069                                     stored += set_regclass_bit(pRExC_state, ret, (U8) c, &nonbitmap);
10070                                 }
10071                                     /* It may be that the code point is already
10072                                      * in this range or already in the bitmap,
10073                                      * in which case we need do nothing */
10074                                 else if ((c < start || c > end)
10075                                          && (c > 255
10076                                              || ! ANYOF_BITMAP_TEST(ret, c)))
10077                                 {
10078                                     nonbitmap = add_range_to_invlist(nonbitmap, c, c);
10079                                 }
10080                             }
10081                         }
10082                     }
10083                 }
10084             }
10085             invlist_destroy(fold_intersection);
10086         } /* End of processing all the folds */
10087
10088         /*  Here have the full list of items to match that aren't in the
10089          *  bitmap.  Convert to the structure that the rest of the code is
10090          *  expecting.   XXX That rest of the code should convert to this
10091          *  structure */
10092         nonbitmap_array = invlist_array(nonbitmap);
10093         for (i = 0; i < invlist_len(nonbitmap); i++) {
10094
10095             /* The next entry is the beginning of the range that is in the
10096              * class */
10097             UV start = nonbitmap_array[i++];
10098
10099             /* The next entry is the beginning of the next range, which isn't
10100              * in the class, so the end of the current range is one less than
10101              * that */
10102             UV end = nonbitmap_array[i] - 1;
10103
10104             if (start == end) {
10105                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10106             }
10107             else {
10108                 /* The \t sets the whole range */
10109                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10110                         /* XXX EBCDIC */
10111                                    start, end);
10112             }
10113         }
10114         invlist_destroy(nonbitmap);
10115     }
10116
10117     /* Here, we have calculated what code points should be in the character
10118      * class.   Now we can see about various optimizations.  Fold calculation
10119      * needs to take place before inversion.  Otherwise /[^k]/i would invert to
10120      * include K, which under /i would match k. */
10121
10122     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
10123      * set the FOLD flag yet, so this this does optimize those.  It doesn't
10124      * optimize locale.  Doing so perhaps could be done as long as there is
10125      * nothing like \w in it; some thought also would have to be given to the
10126      * interaction with above 0x100 chars */
10127     if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
10128         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10129             ANYOF_BITMAP(ret)[value] ^= 0xFF;
10130         stored = 256 - stored;
10131
10132         /* The inversion means that everything above 255 is matched; and at the
10133          * same time we clear the invert flag */
10134         ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
10135     }
10136
10137     /* Folding in the bitmap is taken care of above, but not for locale (for
10138      * which we have to wait to see what folding is in effect at runtime), and
10139      * for things not in the bitmap.  Set run-time fold flag for these */
10140     if (FOLD && (LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
10141         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10142     }
10143
10144     /* A single character class can be "optimized" into an EXACTish node.
10145      * Note that since we don't currently count how many characters there are
10146      * outside the bitmap, we are XXX missing optimization possibilities for
10147      * them.  This optimization can't happen unless this is a truly single
10148      * character class, which means that it can't be an inversion into a
10149      * many-character class, and there must be no possibility of there being
10150      * things outside the bitmap.  'stored' (only) for locales doesn't include
10151      * \w, etc, so have to make a special test that they aren't present
10152      *
10153      * Similarly A 2-character class of the very special form like [bB] can be
10154      * optimized into an EXACTFish node, but only for non-locales, and for
10155      * characters which only have the two folds; so things like 'fF' and 'Ii'
10156      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10157      * FI'. */
10158     if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
10159         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10160                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10161             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10162                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10163                                  /* If the latest code point has a fold whose
10164                                   * bit is set, it must be the only other one */
10165                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10166                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10167     {
10168         /* Note that the information needed to decide to do this optimization
10169          * is not currently available until the 2nd pass, and that the actually
10170          * used EXACTish node takes less space than the calculated ANYOF node,
10171          * and hence the amount of space calculated in the first pass is larger
10172          * than actually used, so this optimization doesn't gain us any space.
10173          * But an EXACT node is faster than an ANYOF node, and can be combined
10174          * with any adjacent EXACT nodes later by the optimizer for further
10175          * gains.  The speed of executing an EXACTF is similar to an ANYOF
10176          * node, so the optimization advantage comes from the ability to join
10177          * it to adjacent EXACT nodes */
10178
10179         const char * cur_parse= RExC_parse;
10180         U8 op;
10181         RExC_emit = (regnode *)orig_emit;
10182         RExC_parse = (char *)orig_parse;
10183
10184         if (stored == 1) {
10185
10186             /* A locale node with one point can be folded; all the other cases
10187              * with folding will have two points, since we calculate them above
10188              */
10189             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10190                  op = EXACTFL;
10191             }
10192             else {
10193                 op = EXACT;
10194             }
10195         }   /* else 2 chars in the bit map: the folds of each other */
10196         else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10197
10198             /* To join adjacent nodes, they must be the exact EXACTish type.
10199              * Try to use the most likely type, by using EXACTFU if the regex
10200              * calls for them, or is required because the character is
10201              * non-ASCII */
10202             op = EXACTFU;
10203         }
10204         else {    /* Otherwise, more likely to be EXACTF type */
10205             op = EXACTF;
10206         }
10207
10208         ret = reg_node(pRExC_state, op);
10209         RExC_parse = (char *)cur_parse;
10210         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10211             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10212             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10213             STR_LEN(ret)= 2;
10214             RExC_emit += STR_SZ(2);
10215         }
10216         else {
10217             *STRING(ret)= (char)value;
10218             STR_LEN(ret)= 1;
10219             RExC_emit += STR_SZ(1);
10220         }
10221         SvREFCNT_dec(listsv);
10222         return ret;
10223     }
10224
10225     {
10226         AV * const av = newAV();
10227         SV *rv;
10228         /* The 0th element stores the character class description
10229          * in its textual form: used later (regexec.c:Perl_regclass_swash())
10230          * to initialize the appropriate swash (which gets stored in
10231          * the 1st element), and also useful for dumping the regnode.
10232          * The 2nd element stores the multicharacter foldings,
10233          * used later (regexec.c:S_reginclass()). */
10234         av_store(av, 0, listsv);
10235         av_store(av, 1, NULL);
10236         av_store(av, 2, MUTABLE_SV(unicode_alternate));
10237         rv = newRV_noinc(MUTABLE_SV(av));
10238         n = add_data(pRExC_state, 1, "s");
10239         RExC_rxi->data->data[n] = (void*)rv;
10240         ARG_SET(ret, n);
10241     }
10242     return ret;
10243 }
10244 #undef _C_C_T_
10245
10246
10247 /* reg_skipcomment()
10248
10249    Absorbs an /x style # comments from the input stream.
10250    Returns true if there is more text remaining in the stream.
10251    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10252    terminates the pattern without including a newline.
10253
10254    Note its the callers responsibility to ensure that we are
10255    actually in /x mode
10256
10257 */
10258
10259 STATIC bool
10260 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10261 {
10262     bool ended = 0;
10263
10264     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10265
10266     while (RExC_parse < RExC_end)
10267         if (*RExC_parse++ == '\n') {
10268             ended = 1;
10269             break;
10270         }
10271     if (!ended) {
10272         /* we ran off the end of the pattern without ending
10273            the comment, so we have to add an \n when wrapping */
10274         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10275         return 0;
10276     } else
10277         return 1;
10278 }
10279
10280 /* nextchar()
10281
10282    Advances the parse position, and optionally absorbs
10283    "whitespace" from the inputstream.
10284
10285    Without /x "whitespace" means (?#...) style comments only,
10286    with /x this means (?#...) and # comments and whitespace proper.
10287
10288    Returns the RExC_parse point from BEFORE the scan occurs.
10289
10290    This is the /x friendly way of saying RExC_parse++.
10291 */
10292
10293 STATIC char*
10294 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10295 {
10296     char* const retval = RExC_parse++;
10297
10298     PERL_ARGS_ASSERT_NEXTCHAR;
10299
10300     for (;;) {
10301         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10302                 RExC_parse[2] == '#') {
10303             while (*RExC_parse != ')') {
10304                 if (RExC_parse == RExC_end)
10305                     FAIL("Sequence (?#... not terminated");
10306                 RExC_parse++;
10307             }
10308             RExC_parse++;
10309             continue;
10310         }
10311         if (RExC_flags & RXf_PMf_EXTENDED) {
10312             if (isSPACE(*RExC_parse)) {
10313                 RExC_parse++;
10314                 continue;
10315             }
10316             else if (*RExC_parse == '#') {
10317                 if ( reg_skipcomment( pRExC_state ) )
10318                     continue;
10319             }
10320         }
10321         return retval;
10322     }
10323 }
10324
10325 /*
10326 - reg_node - emit a node
10327 */
10328 STATIC regnode *                        /* Location. */
10329 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10330 {
10331     dVAR;
10332     register regnode *ptr;
10333     regnode * const ret = RExC_emit;
10334     GET_RE_DEBUG_FLAGS_DECL;
10335
10336     PERL_ARGS_ASSERT_REG_NODE;
10337
10338     if (SIZE_ONLY) {
10339         SIZE_ALIGN(RExC_size);
10340         RExC_size += 1;
10341         return(ret);
10342     }
10343     if (RExC_emit >= RExC_emit_bound)
10344         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10345
10346     NODE_ALIGN_FILL(ret);
10347     ptr = ret;
10348     FILL_ADVANCE_NODE(ptr, op);
10349     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 1);
10350 #ifdef RE_TRACK_PATTERN_OFFSETS
10351     if (RExC_offsets) {         /* MJD */
10352         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10353               "reg_node", __LINE__,
10354               PL_reg_name[op],
10355               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10356                 ? "Overwriting end of array!\n" : "OK",
10357               (UV)(RExC_emit - RExC_emit_start),
10358               (UV)(RExC_parse - RExC_start),
10359               (UV)RExC_offsets[0]));
10360         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10361     }
10362 #endif
10363     RExC_emit = ptr;
10364     return(ret);
10365 }
10366
10367 /*
10368 - reganode - emit a node with an argument
10369 */
10370 STATIC regnode *                        /* Location. */
10371 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10372 {
10373     dVAR;
10374     register regnode *ptr;
10375     regnode * const ret = RExC_emit;
10376     GET_RE_DEBUG_FLAGS_DECL;
10377
10378     PERL_ARGS_ASSERT_REGANODE;
10379
10380     if (SIZE_ONLY) {
10381         SIZE_ALIGN(RExC_size);
10382         RExC_size += 2;
10383         /*
10384            We can't do this:
10385
10386            assert(2==regarglen[op]+1);
10387
10388            Anything larger than this has to allocate the extra amount.
10389            If we changed this to be:
10390
10391            RExC_size += (1 + regarglen[op]);
10392
10393            then it wouldn't matter. Its not clear what side effect
10394            might come from that so its not done so far.
10395            -- dmq
10396         */
10397         return(ret);
10398     }
10399     if (RExC_emit >= RExC_emit_bound)
10400         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10401
10402     NODE_ALIGN_FILL(ret);
10403     ptr = ret;
10404     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10405     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 2);
10406 #ifdef RE_TRACK_PATTERN_OFFSETS
10407     if (RExC_offsets) {         /* MJD */
10408         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10409               "reganode",
10410               __LINE__,
10411               PL_reg_name[op],
10412               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10413               "Overwriting end of array!\n" : "OK",
10414               (UV)(RExC_emit - RExC_emit_start),
10415               (UV)(RExC_parse - RExC_start),
10416               (UV)RExC_offsets[0]));
10417         Set_Cur_Node_Offset;
10418     }
10419 #endif
10420     RExC_emit = ptr;
10421     return(ret);
10422 }
10423
10424 /*
10425 - reguni - emit (if appropriate) a Unicode character
10426 */
10427 STATIC STRLEN
10428 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10429 {
10430     dVAR;
10431
10432     PERL_ARGS_ASSERT_REGUNI;
10433
10434     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10435 }
10436
10437 /*
10438 - reginsert - insert an operator in front of already-emitted operand
10439 *
10440 * Means relocating the operand.
10441 */
10442 STATIC void
10443 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10444 {
10445     dVAR;
10446     register regnode *src;
10447     register regnode *dst;
10448     register regnode *place;
10449     const int offset = regarglen[(U8)op];
10450     const int size = NODE_STEP_REGNODE + offset;
10451     GET_RE_DEBUG_FLAGS_DECL;
10452
10453     PERL_ARGS_ASSERT_REGINSERT;
10454     PERL_UNUSED_ARG(depth);
10455 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10456     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10457     if (SIZE_ONLY) {
10458         RExC_size += size;
10459         return;
10460     }
10461
10462     src = RExC_emit;
10463     RExC_emit += size;
10464     dst = RExC_emit;
10465     if (RExC_open_parens) {
10466         int paren;
10467         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10468         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10469             if ( RExC_open_parens[paren] >= opnd ) {
10470                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10471                 RExC_open_parens[paren] += size;
10472             } else {
10473                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10474             }
10475             if ( RExC_close_parens[paren] >= opnd ) {
10476                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10477                 RExC_close_parens[paren] += size;
10478             } else {
10479                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10480             }
10481         }
10482     }
10483
10484     while (src > opnd) {
10485         StructCopy(--src, --dst, regnode);
10486 #ifdef RE_TRACK_PATTERN_OFFSETS
10487         if (RExC_offsets) {     /* MJD 20010112 */
10488             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10489                   "reg_insert",
10490                   __LINE__,
10491                   PL_reg_name[op],
10492                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10493                     ? "Overwriting end of array!\n" : "OK",
10494                   (UV)(src - RExC_emit_start),
10495                   (UV)(dst - RExC_emit_start),
10496                   (UV)RExC_offsets[0]));
10497             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10498             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10499         }
10500 #endif
10501     }
10502
10503
10504     place = opnd;               /* Op node, where operand used to be. */
10505 #ifdef RE_TRACK_PATTERN_OFFSETS
10506     if (RExC_offsets) {         /* MJD */
10507         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10508               "reginsert",
10509               __LINE__,
10510               PL_reg_name[op],
10511               (UV)(place - RExC_emit_start) > RExC_offsets[0]
10512               ? "Overwriting end of array!\n" : "OK",
10513               (UV)(place - RExC_emit_start),
10514               (UV)(RExC_parse - RExC_start),
10515               (UV)RExC_offsets[0]));
10516         Set_Node_Offset(place, RExC_parse);
10517         Set_Node_Length(place, 1);
10518     }
10519 #endif
10520     src = NEXTOPER(place);
10521     FILL_ADVANCE_NODE(place, op);
10522     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (place) - 1);
10523     Zero(src, offset, regnode);
10524 }
10525
10526 /*
10527 - regtail - set the next-pointer at the end of a node chain of p to val.
10528 - SEE ALSO: regtail_study
10529 */
10530 /* TODO: All three parms should be const */
10531 STATIC void
10532 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10533 {
10534     dVAR;
10535     register regnode *scan;
10536     GET_RE_DEBUG_FLAGS_DECL;
10537
10538     PERL_ARGS_ASSERT_REGTAIL;
10539 #ifndef DEBUGGING
10540     PERL_UNUSED_ARG(depth);
10541 #endif
10542
10543     if (SIZE_ONLY)
10544         return;
10545
10546     /* Find last node. */
10547     scan = p;
10548     for (;;) {
10549         regnode * const temp = regnext(scan);
10550         DEBUG_PARSE_r({
10551             SV * const mysv=sv_newmortal();
10552             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10553             regprop(RExC_rx, mysv, scan);
10554             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10555                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10556                     (temp == NULL ? "->" : ""),
10557                     (temp == NULL ? PL_reg_name[OP(val)] : "")
10558             );
10559         });
10560         if (temp == NULL)
10561             break;
10562         scan = temp;
10563     }
10564
10565     if (reg_off_by_arg[OP(scan)]) {
10566         ARG_SET(scan, val - scan);
10567     }
10568     else {
10569         NEXT_OFF(scan) = val - scan;
10570     }
10571 }
10572
10573 #ifdef DEBUGGING
10574 /*
10575 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10576 - Look for optimizable sequences at the same time.
10577 - currently only looks for EXACT chains.
10578
10579 This is experimental code. The idea is to use this routine to perform
10580 in place optimizations on branches and groups as they are constructed,
10581 with the long term intention of removing optimization from study_chunk so
10582 that it is purely analytical.
10583
10584 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10585 to control which is which.
10586
10587 */
10588 /* TODO: All four parms should be const */
10589
10590 STATIC U8
10591 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10592 {
10593     dVAR;
10594     register regnode *scan;
10595     U8 exact = PSEUDO;
10596 #ifdef EXPERIMENTAL_INPLACESCAN
10597     I32 min = 0;
10598 #endif
10599     GET_RE_DEBUG_FLAGS_DECL;
10600
10601     PERL_ARGS_ASSERT_REGTAIL_STUDY;
10602
10603
10604     if (SIZE_ONLY)
10605         return exact;
10606
10607     /* Find last node. */
10608
10609     scan = p;
10610     for (;;) {
10611         regnode * const temp = regnext(scan);
10612 #ifdef EXPERIMENTAL_INPLACESCAN
10613         if (PL_regkind[OP(scan)] == EXACT)
10614             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10615                 return EXACT;
10616 #endif
10617         if ( exact ) {
10618             switch (OP(scan)) {
10619                 case EXACT:
10620                 case EXACTF:
10621                 case EXACTFA:
10622                 case EXACTFU:
10623                 case EXACTFL:
10624                         if( exact == PSEUDO )
10625                             exact= OP(scan);
10626                         else if ( exact != OP(scan) )
10627                             exact= 0;
10628                 case NOTHING:
10629                     break;
10630                 default:
10631                     exact= 0;
10632             }
10633         }
10634         DEBUG_PARSE_r({
10635             SV * const mysv=sv_newmortal();
10636             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10637             regprop(RExC_rx, mysv, scan);
10638             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10639                 SvPV_nolen_const(mysv),
10640                 REG_NODE_NUM(scan),
10641                 PL_reg_name[exact]);
10642         });
10643         if (temp == NULL)
10644             break;
10645         scan = temp;
10646     }
10647     DEBUG_PARSE_r({
10648         SV * const mysv_val=sv_newmortal();
10649         DEBUG_PARSE_MSG("");
10650         regprop(RExC_rx, mysv_val, val);
10651         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10652                       SvPV_nolen_const(mysv_val),
10653                       (IV)REG_NODE_NUM(val),
10654                       (IV)(val - scan)
10655         );
10656     });
10657     if (reg_off_by_arg[OP(scan)]) {
10658         ARG_SET(scan, val - scan);
10659     }
10660     else {
10661         NEXT_OFF(scan) = val - scan;
10662     }
10663
10664     return exact;
10665 }
10666 #endif
10667
10668 /*
10669  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10670  */
10671 #ifdef DEBUGGING
10672 static void
10673 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10674 {
10675     int bit;
10676     int set=0;
10677     regex_charset cs;
10678
10679     for (bit=0; bit<32; bit++) {
10680         if (flags & (1<<bit)) {
10681             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
10682                 continue;
10683             }
10684             if (!set++ && lead)
10685                 PerlIO_printf(Perl_debug_log, "%s",lead);
10686             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10687         }
10688     }
10689     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10690             if (!set++ && lead) {
10691                 PerlIO_printf(Perl_debug_log, "%s",lead);
10692             }
10693             switch (cs) {
10694                 case REGEX_UNICODE_CHARSET:
10695                     PerlIO_printf(Perl_debug_log, "UNICODE");
10696                     break;
10697                 case REGEX_LOCALE_CHARSET:
10698                     PerlIO_printf(Perl_debug_log, "LOCALE");
10699                     break;
10700                 case REGEX_ASCII_RESTRICTED_CHARSET:
10701                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10702                     break;
10703                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10704                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10705                     break;
10706                 default:
10707                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10708                     break;
10709             }
10710     }
10711     if (lead)  {
10712         if (set)
10713             PerlIO_printf(Perl_debug_log, "\n");
10714         else
10715             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10716     }
10717 }
10718 #endif
10719
10720 void
10721 Perl_regdump(pTHX_ const regexp *r)
10722 {
10723 #ifdef DEBUGGING
10724     dVAR;
10725     SV * const sv = sv_newmortal();
10726     SV *dsv= sv_newmortal();
10727     RXi_GET_DECL(r,ri);
10728     GET_RE_DEBUG_FLAGS_DECL;
10729
10730     PERL_ARGS_ASSERT_REGDUMP;
10731
10732     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10733
10734     /* Header fields of interest. */
10735     if (r->anchored_substr) {
10736         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10737             RE_SV_DUMPLEN(r->anchored_substr), 30);
10738         PerlIO_printf(Perl_debug_log,
10739                       "anchored %s%s at %"IVdf" ",
10740                       s, RE_SV_TAIL(r->anchored_substr),
10741                       (IV)r->anchored_offset);
10742     } else if (r->anchored_utf8) {
10743         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10744             RE_SV_DUMPLEN(r->anchored_utf8), 30);
10745         PerlIO_printf(Perl_debug_log,
10746                       "anchored utf8 %s%s at %"IVdf" ",
10747                       s, RE_SV_TAIL(r->anchored_utf8),
10748                       (IV)r->anchored_offset);
10749     }
10750     if (r->float_substr) {
10751         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10752             RE_SV_DUMPLEN(r->float_substr), 30);
10753         PerlIO_printf(Perl_debug_log,
10754                       "floating %s%s at %"IVdf"..%"UVuf" ",
10755                       s, RE_SV_TAIL(r->float_substr),
10756                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10757     } else if (r->float_utf8) {
10758         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10759             RE_SV_DUMPLEN(r->float_utf8), 30);
10760         PerlIO_printf(Perl_debug_log,
10761                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10762                       s, RE_SV_TAIL(r->float_utf8),
10763                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10764     }
10765     if (r->check_substr || r->check_utf8)
10766         PerlIO_printf(Perl_debug_log,
10767                       (const char *)
10768                       (r->check_substr == r->float_substr
10769                        && r->check_utf8 == r->float_utf8
10770                        ? "(checking floating" : "(checking anchored"));
10771     if (r->extflags & RXf_NOSCAN)
10772         PerlIO_printf(Perl_debug_log, " noscan");
10773     if (r->extflags & RXf_CHECK_ALL)
10774         PerlIO_printf(Perl_debug_log, " isall");
10775     if (r->check_substr || r->check_utf8)
10776         PerlIO_printf(Perl_debug_log, ") ");
10777
10778     if (ri->regstclass) {
10779         regprop(r, sv, ri->regstclass);
10780         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10781     }
10782     if (r->extflags & RXf_ANCH) {
10783         PerlIO_printf(Perl_debug_log, "anchored");
10784         if (r->extflags & RXf_ANCH_BOL)
10785             PerlIO_printf(Perl_debug_log, "(BOL)");
10786         if (r->extflags & RXf_ANCH_MBOL)
10787             PerlIO_printf(Perl_debug_log, "(MBOL)");
10788         if (r->extflags & RXf_ANCH_SBOL)
10789             PerlIO_printf(Perl_debug_log, "(SBOL)");
10790         if (r->extflags & RXf_ANCH_GPOS)
10791             PerlIO_printf(Perl_debug_log, "(GPOS)");
10792         PerlIO_putc(Perl_debug_log, ' ');
10793     }
10794     if (r->extflags & RXf_GPOS_SEEN)
10795         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10796     if (r->intflags & PREGf_SKIP)
10797         PerlIO_printf(Perl_debug_log, "plus ");
10798     if (r->intflags & PREGf_IMPLICIT)
10799         PerlIO_printf(Perl_debug_log, "implicit ");
10800     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10801     if (r->extflags & RXf_EVAL_SEEN)
10802         PerlIO_printf(Perl_debug_log, "with eval ");
10803     PerlIO_printf(Perl_debug_log, "\n");
10804     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10805 #else
10806     PERL_ARGS_ASSERT_REGDUMP;
10807     PERL_UNUSED_CONTEXT;
10808     PERL_UNUSED_ARG(r);
10809 #endif  /* DEBUGGING */
10810 }
10811
10812 /*
10813 - regprop - printable representation of opcode
10814 */
10815 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10816 STMT_START { \
10817         if (do_sep) {                           \
10818             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10819             if (flags & ANYOF_INVERT)           \
10820                 /*make sure the invert info is in each */ \
10821                 sv_catpvs(sv, "^");             \
10822             do_sep = 0;                         \
10823         }                                       \
10824 } STMT_END
10825
10826 void
10827 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10828 {
10829 #ifdef DEBUGGING
10830     dVAR;
10831     register int k;
10832     RXi_GET_DECL(prog,progi);
10833     GET_RE_DEBUG_FLAGS_DECL;
10834
10835     PERL_ARGS_ASSERT_REGPROP;
10836
10837     sv_setpvs(sv, "");
10838
10839     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
10840         /* It would be nice to FAIL() here, but this may be called from
10841            regexec.c, and it would be hard to supply pRExC_state. */
10842         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10843     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10844
10845     k = PL_regkind[OP(o)];
10846
10847     if (k == EXACT) {
10848         sv_catpvs(sv, " ");
10849         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10850          * is a crude hack but it may be the best for now since
10851          * we have no flag "this EXACTish node was UTF-8"
10852          * --jhi */
10853         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
10854                   PERL_PV_ESCAPE_UNI_DETECT |
10855                   PERL_PV_ESCAPE_NONASCII   |
10856                   PERL_PV_PRETTY_ELLIPSES   |
10857                   PERL_PV_PRETTY_LTGT       |
10858                   PERL_PV_PRETTY_NOCLEAR
10859                   );
10860     } else if (k == TRIE) {
10861         /* print the details of the trie in dumpuntil instead, as
10862          * progi->data isn't available here */
10863         const char op = OP(o);
10864         const U32 n = ARG(o);
10865         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
10866                (reg_ac_data *)progi->data->data[n] :
10867                NULL;
10868         const reg_trie_data * const trie
10869             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
10870
10871         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
10872         DEBUG_TRIE_COMPILE_r(
10873             Perl_sv_catpvf(aTHX_ sv,
10874                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
10875                 (UV)trie->startstate,
10876                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
10877                 (UV)trie->wordcount,
10878                 (UV)trie->minlen,
10879                 (UV)trie->maxlen,
10880                 (UV)TRIE_CHARCOUNT(trie),
10881                 (UV)trie->uniquecharcount
10882             )
10883         );
10884         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
10885             int i;
10886             int rangestart = -1;
10887             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
10888             sv_catpvs(sv, "[");
10889             for (i = 0; i <= 256; i++) {
10890                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
10891                     if (rangestart == -1)
10892                         rangestart = i;
10893                 } else if (rangestart != -1) {
10894                     if (i <= rangestart + 3)
10895                         for (; rangestart < i; rangestart++)
10896                             put_byte(sv, rangestart);
10897                     else {
10898                         put_byte(sv, rangestart);
10899                         sv_catpvs(sv, "-");
10900                         put_byte(sv, i - 1);
10901                     }
10902                     rangestart = -1;
10903                 }
10904             }
10905             sv_catpvs(sv, "]");
10906         }
10907
10908     } else if (k == CURLY) {
10909         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
10910             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
10911         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
10912     }
10913     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
10914         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
10915     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
10916         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
10917         if ( RXp_PAREN_NAMES(prog) ) {
10918             if ( k != REF || (OP(o) < NREF)) {
10919                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
10920                 SV **name= av_fetch(list, ARG(o), 0 );
10921                 if (name)
10922                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
10923             }
10924             else {
10925                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
10926                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
10927                 I32 *nums=(I32*)SvPVX(sv_dat);
10928                 SV **name= av_fetch(list, nums[0], 0 );
10929                 I32 n;
10930                 if (name) {
10931                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
10932                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
10933                                     (n ? "," : ""), (IV)nums[n]);
10934                     }
10935                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
10936                 }
10937             }
10938         }
10939     } else if (k == GOSUB)
10940         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
10941     else if (k == VERB) {
10942         if (!o->flags)
10943             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
10944                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
10945     } else if (k == LOGICAL)
10946         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
10947     else if (k == FOLDCHAR)
10948         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
10949     else if (k == ANYOF) {
10950         int i, rangestart = -1;
10951         const U8 flags = ANYOF_FLAGS(o);
10952         int do_sep = 0;
10953
10954         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
10955         static const char * const anyofs[] = {
10956             "\\w",
10957             "\\W",
10958             "\\s",
10959             "\\S",
10960             "\\d",
10961             "\\D",
10962             "[:alnum:]",
10963             "[:^alnum:]",
10964             "[:alpha:]",
10965             "[:^alpha:]",
10966             "[:ascii:]",
10967             "[:^ascii:]",
10968             "[:cntrl:]",
10969             "[:^cntrl:]",
10970             "[:graph:]",
10971             "[:^graph:]",
10972             "[:lower:]",
10973             "[:^lower:]",
10974             "[:print:]",
10975             "[:^print:]",
10976             "[:punct:]",
10977             "[:^punct:]",
10978             "[:upper:]",
10979             "[:^upper:]",
10980             "[:xdigit:]",
10981             "[:^xdigit:]",
10982             "[:space:]",
10983             "[:^space:]",
10984             "[:blank:]",
10985             "[:^blank:]"
10986         };
10987
10988         if (flags & ANYOF_LOCALE)
10989             sv_catpvs(sv, "{loc}");
10990         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
10991             sv_catpvs(sv, "{i}");
10992         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
10993         if (flags & ANYOF_INVERT)
10994             sv_catpvs(sv, "^");
10995
10996         /* output what the standard cp 0-255 bitmap matches */
10997         for (i = 0; i <= 256; i++) {
10998             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
10999                 if (rangestart == -1)
11000                     rangestart = i;
11001             } else if (rangestart != -1) {
11002                 if (i <= rangestart + 3)
11003                     for (; rangestart < i; rangestart++)
11004                         put_byte(sv, rangestart);
11005                 else {
11006                     put_byte(sv, rangestart);
11007                     sv_catpvs(sv, "-");
11008                     put_byte(sv, i - 1);
11009                 }
11010                 do_sep = 1;
11011                 rangestart = -1;
11012             }
11013         }
11014
11015         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11016         /* output any special charclass tests (used entirely under use locale) */
11017         if (ANYOF_CLASS_TEST_ANY_SET(o))
11018             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11019                 if (ANYOF_CLASS_TEST(o,i)) {
11020                     sv_catpv(sv, anyofs[i]);
11021                     do_sep = 1;
11022                 }
11023
11024         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11025
11026         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11027             sv_catpvs(sv, "{non-utf8-latin1-all}");
11028         }
11029
11030         /* output information about the unicode matching */
11031         if (flags & ANYOF_UNICODE_ALL)
11032             sv_catpvs(sv, "{unicode_all}");
11033         else if (flags & ANYOF_UTF8)
11034             sv_catpvs(sv, "{unicode}");
11035         if (flags & ANYOF_NONBITMAP_NON_UTF8)
11036             sv_catpvs(sv, "{outside bitmap}");
11037
11038         {
11039             SV *lv;
11040             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11041
11042             if (lv) {
11043                 if (sw) {
11044                     U8 s[UTF8_MAXBYTES_CASE+1];
11045
11046                     for (i = 0; i <= 256; i++) { /* just the first 256 */
11047                         uvchr_to_utf8(s, i);
11048
11049                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
11050                             if (rangestart == -1)
11051                                 rangestart = i;
11052                         } else if (rangestart != -1) {
11053                             if (i <= rangestart + 3)
11054                                 for (; rangestart < i; rangestart++) {
11055                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
11056                                     U8 *p;
11057                                     for(p = s; p < e; p++)
11058                                         put_byte(sv, *p);
11059                                 }
11060                             else {
11061                                 const U8 *e = uvchr_to_utf8(s,rangestart);
11062                                 U8 *p;
11063                                 for (p = s; p < e; p++)
11064                                     put_byte(sv, *p);
11065                                 sv_catpvs(sv, "-");
11066                                 e = uvchr_to_utf8(s, i-1);
11067                                 for (p = s; p < e; p++)
11068                                     put_byte(sv, *p);
11069                                 }
11070                                 rangestart = -1;
11071                             }
11072                         }
11073
11074                     sv_catpvs(sv, "..."); /* et cetera */
11075                 }
11076
11077                 {
11078                     char *s = savesvpv(lv);
11079                     char * const origs = s;
11080
11081                     while (*s && *s != '\n')
11082                         s++;
11083
11084                     if (*s == '\n') {
11085                         const char * const t = ++s;
11086
11087                         while (*s) {
11088                             if (*s == '\n')
11089                                 *s = ' ';
11090                             s++;
11091                         }
11092                         if (s[-1] == ' ')
11093                             s[-1] = 0;
11094
11095                         sv_catpv(sv, t);
11096                     }
11097
11098                     Safefree(origs);
11099                 }
11100             }
11101         }
11102
11103         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11104     }
11105     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11106         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11107 #else
11108     PERL_UNUSED_CONTEXT;
11109     PERL_UNUSED_ARG(sv);
11110     PERL_UNUSED_ARG(o);
11111     PERL_UNUSED_ARG(prog);
11112 #endif  /* DEBUGGING */
11113 }
11114
11115 SV *
11116 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11117 {                               /* Assume that RE_INTUIT is set */
11118     dVAR;
11119     struct regexp *const prog = (struct regexp *)SvANY(r);
11120     GET_RE_DEBUG_FLAGS_DECL;
11121
11122     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11123     PERL_UNUSED_CONTEXT;
11124
11125     DEBUG_COMPILE_r(
11126         {
11127             const char * const s = SvPV_nolen_const(prog->check_substr
11128                       ? prog->check_substr : prog->check_utf8);
11129
11130             if (!PL_colorset) reginitcolors();
11131             PerlIO_printf(Perl_debug_log,
11132                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11133                       PL_colors[4],
11134                       prog->check_substr ? "" : "utf8 ",
11135                       PL_colors[5],PL_colors[0],
11136                       s,
11137                       PL_colors[1],
11138                       (strlen(s) > 60 ? "..." : ""));
11139         } );
11140
11141     return prog->check_substr ? prog->check_substr : prog->check_utf8;
11142 }
11143
11144 /*
11145    pregfree()
11146
11147    handles refcounting and freeing the perl core regexp structure. When
11148    it is necessary to actually free the structure the first thing it
11149    does is call the 'free' method of the regexp_engine associated to
11150    the regexp, allowing the handling of the void *pprivate; member
11151    first. (This routine is not overridable by extensions, which is why
11152    the extensions free is called first.)
11153
11154    See regdupe and regdupe_internal if you change anything here.
11155 */
11156 #ifndef PERL_IN_XSUB_RE
11157 void
11158 Perl_pregfree(pTHX_ REGEXP *r)
11159 {
11160     SvREFCNT_dec(r);
11161 }
11162
11163 void
11164 Perl_pregfree2(pTHX_ REGEXP *rx)
11165 {
11166     dVAR;
11167     struct regexp *const r = (struct regexp *)SvANY(rx);
11168     GET_RE_DEBUG_FLAGS_DECL;
11169
11170     PERL_ARGS_ASSERT_PREGFREE2;
11171
11172     if (r->mother_re) {
11173         ReREFCNT_dec(r->mother_re);
11174     } else {
11175         CALLREGFREE_PVT(rx); /* free the private data */
11176         SvREFCNT_dec(RXp_PAREN_NAMES(r));
11177     }
11178     if (r->substrs) {
11179         SvREFCNT_dec(r->anchored_substr);
11180         SvREFCNT_dec(r->anchored_utf8);
11181         SvREFCNT_dec(r->float_substr);
11182         SvREFCNT_dec(r->float_utf8);
11183         Safefree(r->substrs);
11184     }
11185     RX_MATCH_COPY_FREE(rx);
11186 #ifdef PERL_OLD_COPY_ON_WRITE
11187     SvREFCNT_dec(r->saved_copy);
11188 #endif
11189     Safefree(r->offs);
11190 }
11191
11192 /*  reg_temp_copy()
11193
11194     This is a hacky workaround to the structural issue of match results
11195     being stored in the regexp structure which is in turn stored in
11196     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11197     could be PL_curpm in multiple contexts, and could require multiple
11198     result sets being associated with the pattern simultaneously, such
11199     as when doing a recursive match with (??{$qr})
11200
11201     The solution is to make a lightweight copy of the regexp structure
11202     when a qr// is returned from the code executed by (??{$qr}) this
11203     lightweight copy doesn't actually own any of its data except for
11204     the starp/end and the actual regexp structure itself.
11205
11206 */
11207
11208
11209 REGEXP *
11210 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11211 {
11212     struct regexp *ret;
11213     struct regexp *const r = (struct regexp *)SvANY(rx);
11214     register const I32 npar = r->nparens+1;
11215
11216     PERL_ARGS_ASSERT_REG_TEMP_COPY;
11217
11218     if (!ret_x)
11219         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11220     ret = (struct regexp *)SvANY(ret_x);
11221
11222     (void)ReREFCNT_inc(rx);
11223     /* We can take advantage of the existing "copied buffer" mechanism in SVs
11224        by pointing directly at the buffer, but flagging that the allocated
11225        space in the copy is zero. As we've just done a struct copy, it's now
11226        a case of zero-ing that, rather than copying the current length.  */
11227     SvPV_set(ret_x, RX_WRAPPED(rx));
11228     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11229     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11230            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11231     SvLEN_set(ret_x, 0);
11232     SvSTASH_set(ret_x, NULL);
11233     SvMAGIC_set(ret_x, NULL);
11234     Newx(ret->offs, npar, regexp_paren_pair);
11235     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11236     if (r->substrs) {
11237         Newx(ret->substrs, 1, struct reg_substr_data);
11238         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11239
11240         SvREFCNT_inc_void(ret->anchored_substr);
11241         SvREFCNT_inc_void(ret->anchored_utf8);
11242         SvREFCNT_inc_void(ret->float_substr);
11243         SvREFCNT_inc_void(ret->float_utf8);
11244
11245         /* check_substr and check_utf8, if non-NULL, point to either their
11246            anchored or float namesakes, and don't hold a second reference.  */
11247     }
11248     RX_MATCH_COPIED_off(ret_x);
11249 #ifdef PERL_OLD_COPY_ON_WRITE
11250     ret->saved_copy = NULL;
11251 #endif
11252     ret->mother_re = rx;
11253
11254     return ret_x;
11255 }
11256 #endif
11257
11258 /* regfree_internal()
11259
11260    Free the private data in a regexp. This is overloadable by
11261    extensions. Perl takes care of the regexp structure in pregfree(),
11262    this covers the *pprivate pointer which technically perl doesn't
11263    know about, however of course we have to handle the
11264    regexp_internal structure when no extension is in use.
11265
11266    Note this is called before freeing anything in the regexp
11267    structure.
11268  */
11269
11270 void
11271 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11272 {
11273     dVAR;
11274     struct regexp *const r = (struct regexp *)SvANY(rx);
11275     RXi_GET_DECL(r,ri);
11276     GET_RE_DEBUG_FLAGS_DECL;
11277
11278     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11279
11280     DEBUG_COMPILE_r({
11281         if (!PL_colorset)
11282             reginitcolors();
11283         {
11284             SV *dsv= sv_newmortal();
11285             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11286                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11287             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11288                 PL_colors[4],PL_colors[5],s);
11289         }
11290     });
11291 #ifdef RE_TRACK_PATTERN_OFFSETS
11292     if (ri->u.offsets)
11293         Safefree(ri->u.offsets);             /* 20010421 MJD */
11294 #endif
11295     if (ri->data) {
11296         int n = ri->data->count;
11297         PAD* new_comppad = NULL;
11298         PAD* old_comppad;
11299         PADOFFSET refcnt;
11300
11301         while (--n >= 0) {
11302           /* If you add a ->what type here, update the comment in regcomp.h */
11303             switch (ri->data->what[n]) {
11304             case 'a':
11305             case 's':
11306             case 'S':
11307             case 'u':
11308                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11309                 break;
11310             case 'f':
11311                 Safefree(ri->data->data[n]);
11312                 break;
11313             case 'p':
11314                 new_comppad = MUTABLE_AV(ri->data->data[n]);
11315                 break;
11316             case 'o':
11317                 if (new_comppad == NULL)
11318                     Perl_croak(aTHX_ "panic: pregfree comppad");
11319                 PAD_SAVE_LOCAL(old_comppad,
11320                     /* Watch out for global destruction's random ordering. */
11321                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11322                 );
11323                 OP_REFCNT_LOCK;
11324                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11325                 OP_REFCNT_UNLOCK;
11326                 if (!refcnt)
11327                     op_free((OP_4tree*)ri->data->data[n]);
11328
11329                 PAD_RESTORE_LOCAL(old_comppad);
11330                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11331                 new_comppad = NULL;
11332                 break;
11333             case 'n':
11334                 break;
11335             case 'T':
11336                 { /* Aho Corasick add-on structure for a trie node.
11337                      Used in stclass optimization only */
11338                     U32 refcount;
11339                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11340                     OP_REFCNT_LOCK;
11341                     refcount = --aho->refcount;
11342                     OP_REFCNT_UNLOCK;
11343                     if ( !refcount ) {
11344                         PerlMemShared_free(aho->states);
11345                         PerlMemShared_free(aho->fail);
11346                          /* do this last!!!! */
11347                         PerlMemShared_free(ri->data->data[n]);
11348                         PerlMemShared_free(ri->regstclass);
11349                     }
11350                 }
11351                 break;
11352             case 't':
11353                 {
11354                     /* trie structure. */
11355                     U32 refcount;
11356                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11357                     OP_REFCNT_LOCK;
11358                     refcount = --trie->refcount;
11359                     OP_REFCNT_UNLOCK;
11360                     if ( !refcount ) {
11361                         PerlMemShared_free(trie->charmap);
11362                         PerlMemShared_free(trie->states);
11363                         PerlMemShared_free(trie->trans);
11364                         if (trie->bitmap)
11365                             PerlMemShared_free(trie->bitmap);
11366                         if (trie->jump)
11367                             PerlMemShared_free(trie->jump);
11368                         PerlMemShared_free(trie->wordinfo);
11369                         /* do this last!!!! */
11370                         PerlMemShared_free(ri->data->data[n]);
11371                     }
11372                 }
11373                 break;
11374             default:
11375                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11376             }
11377         }
11378         Safefree(ri->data->what);
11379         Safefree(ri->data);
11380     }
11381
11382     Safefree(ri);
11383 }
11384
11385 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11386 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11387 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
11388
11389 /*
11390    re_dup - duplicate a regexp.
11391
11392    This routine is expected to clone a given regexp structure. It is only
11393    compiled under USE_ITHREADS.
11394
11395    After all of the core data stored in struct regexp is duplicated
11396    the regexp_engine.dupe method is used to copy any private data
11397    stored in the *pprivate pointer. This allows extensions to handle
11398    any duplication it needs to do.
11399
11400    See pregfree() and regfree_internal() if you change anything here.
11401 */
11402 #if defined(USE_ITHREADS)
11403 #ifndef PERL_IN_XSUB_RE
11404 void
11405 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11406 {
11407     dVAR;
11408     I32 npar;
11409     const struct regexp *r = (const struct regexp *)SvANY(sstr);
11410     struct regexp *ret = (struct regexp *)SvANY(dstr);
11411
11412     PERL_ARGS_ASSERT_RE_DUP_GUTS;
11413
11414     npar = r->nparens+1;
11415     Newx(ret->offs, npar, regexp_paren_pair);
11416     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11417     if(ret->swap) {
11418         /* no need to copy these */
11419         Newx(ret->swap, npar, regexp_paren_pair);
11420     }
11421
11422     if (ret->substrs) {
11423         /* Do it this way to avoid reading from *r after the StructCopy().
11424            That way, if any of the sv_dup_inc()s dislodge *r from the L1
11425            cache, it doesn't matter.  */
11426         const bool anchored = r->check_substr
11427             ? r->check_substr == r->anchored_substr
11428             : r->check_utf8 == r->anchored_utf8;
11429         Newx(ret->substrs, 1, struct reg_substr_data);
11430         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11431
11432         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11433         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11434         ret->float_substr = sv_dup_inc(ret->float_substr, param);
11435         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11436
11437         /* check_substr and check_utf8, if non-NULL, point to either their
11438            anchored or float namesakes, and don't hold a second reference.  */
11439
11440         if (ret->check_substr) {
11441             if (anchored) {
11442                 assert(r->check_utf8 == r->anchored_utf8);
11443                 ret->check_substr = ret->anchored_substr;
11444                 ret->check_utf8 = ret->anchored_utf8;
11445             } else {
11446                 assert(r->check_substr == r->float_substr);
11447                 assert(r->check_utf8 == r->float_utf8);
11448                 ret->check_substr = ret->float_substr;
11449                 ret->check_utf8 = ret->float_utf8;
11450             }
11451         } else if (ret->check_utf8) {
11452             if (anchored) {
11453                 ret->check_utf8 = ret->anchored_utf8;
11454             } else {
11455                 ret->check_utf8 = ret->float_utf8;
11456             }
11457         }
11458     }
11459
11460     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11461
11462     if (ret->pprivate)
11463         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11464
11465     if (RX_MATCH_COPIED(dstr))
11466         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
11467     else
11468         ret->subbeg = NULL;
11469 #ifdef PERL_OLD_COPY_ON_WRITE
11470     ret->saved_copy = NULL;
11471 #endif
11472
11473     if (ret->mother_re) {
11474         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11475             /* Our storage points directly to our mother regexp, but that's
11476                1: a buffer in a different thread
11477                2: something we no longer hold a reference on
11478                so we need to copy it locally.  */
11479             /* Note we need to sue SvCUR() on our mother_re, because it, in
11480                turn, may well be pointing to its own mother_re.  */
11481             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11482                                    SvCUR(ret->mother_re)+1));
11483             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11484         }
11485         ret->mother_re      = NULL;
11486     }
11487     ret->gofs = 0;
11488 }
11489 #endif /* PERL_IN_XSUB_RE */
11490
11491 /*
11492    regdupe_internal()
11493
11494    This is the internal complement to regdupe() which is used to copy
11495    the structure pointed to by the *pprivate pointer in the regexp.
11496    This is the core version of the extension overridable cloning hook.
11497    The regexp structure being duplicated will be copied by perl prior
11498    to this and will be provided as the regexp *r argument, however
11499    with the /old/ structures pprivate pointer value. Thus this routine
11500    may override any copying normally done by perl.
11501
11502    It returns a pointer to the new regexp_internal structure.
11503 */
11504
11505 void *
11506 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11507 {
11508     dVAR;
11509     struct regexp *const r = (struct regexp *)SvANY(rx);
11510     regexp_internal *reti;
11511     int len, npar;
11512     RXi_GET_DECL(r,ri);
11513
11514     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11515
11516     npar = r->nparens+1;
11517     len = ProgLen(ri);
11518
11519     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11520     Copy(ri->program, reti->program, len+1, regnode);
11521
11522
11523     reti->regstclass = NULL;
11524
11525     if (ri->data) {
11526         struct reg_data *d;
11527         const int count = ri->data->count;
11528         int i;
11529
11530         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11531                 char, struct reg_data);
11532         Newx(d->what, count, U8);
11533
11534         d->count = count;
11535         for (i = 0; i < count; i++) {
11536             d->what[i] = ri->data->what[i];
11537             switch (d->what[i]) {
11538                 /* legal options are one of: sSfpontTua
11539                    see also regcomp.h and pregfree() */
11540             case 'a': /* actually an AV, but the dup function is identical.  */
11541             case 's':
11542             case 'S':
11543             case 'p': /* actually an AV, but the dup function is identical.  */
11544             case 'u': /* actually an HV, but the dup function is identical.  */
11545                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11546                 break;
11547             case 'f':
11548                 /* This is cheating. */
11549                 Newx(d->data[i], 1, struct regnode_charclass_class);
11550                 StructCopy(ri->data->data[i], d->data[i],
11551                             struct regnode_charclass_class);
11552                 reti->regstclass = (regnode*)d->data[i];
11553                 break;
11554             case 'o':
11555                 /* Compiled op trees are readonly and in shared memory,
11556                    and can thus be shared without duplication. */
11557                 OP_REFCNT_LOCK;
11558                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11559                 OP_REFCNT_UNLOCK;
11560                 break;
11561             case 'T':
11562                 /* Trie stclasses are readonly and can thus be shared
11563                  * without duplication. We free the stclass in pregfree
11564                  * when the corresponding reg_ac_data struct is freed.
11565                  */
11566                 reti->regstclass= ri->regstclass;
11567                 /* Fall through */
11568             case 't':
11569                 OP_REFCNT_LOCK;
11570                 ((reg_trie_data*)ri->data->data[i])->refcount++;
11571                 OP_REFCNT_UNLOCK;
11572                 /* Fall through */
11573             case 'n':
11574                 d->data[i] = ri->data->data[i];
11575                 break;
11576             default:
11577                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11578             }
11579         }
11580
11581         reti->data = d;
11582     }
11583     else
11584         reti->data = NULL;
11585
11586     reti->name_list_idx = ri->name_list_idx;
11587
11588 #ifdef RE_TRACK_PATTERN_OFFSETS
11589     if (ri->u.offsets) {
11590         Newx(reti->u.offsets, 2*len+1, U32);
11591         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11592     }
11593 #else
11594     SetProgLen(reti,len);
11595 #endif
11596
11597     return (void*)reti;
11598 }
11599
11600 #endif    /* USE_ITHREADS */
11601
11602 #ifndef PERL_IN_XSUB_RE
11603
11604 /*
11605  - regnext - dig the "next" pointer out of a node
11606  */
11607 regnode *
11608 Perl_regnext(pTHX_ register regnode *p)
11609 {
11610     dVAR;
11611     register I32 offset;
11612
11613     if (!p)
11614         return(NULL);
11615
11616     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
11617         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11618     }
11619
11620     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11621     if (offset == 0)
11622         return(NULL);
11623
11624     return(p+offset);
11625 }
11626 #endif
11627
11628 STATIC void
11629 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11630 {
11631     va_list args;
11632     STRLEN l1 = strlen(pat1);
11633     STRLEN l2 = strlen(pat2);
11634     char buf[512];
11635     SV *msv;
11636     const char *message;
11637
11638     PERL_ARGS_ASSERT_RE_CROAK2;
11639
11640     if (l1 > 510)
11641         l1 = 510;
11642     if (l1 + l2 > 510)
11643         l2 = 510 - l1;
11644     Copy(pat1, buf, l1 , char);
11645     Copy(pat2, buf + l1, l2 , char);
11646     buf[l1 + l2] = '\n';
11647     buf[l1 + l2 + 1] = '\0';
11648 #ifdef I_STDARG
11649     /* ANSI variant takes additional second argument */
11650     va_start(args, pat2);
11651 #else
11652     va_start(args);
11653 #endif
11654     msv = vmess(buf, &args);
11655     va_end(args);
11656     message = SvPV_const(msv,l1);
11657     if (l1 > 512)
11658         l1 = 512;
11659     Copy(message, buf, l1 , char);
11660     buf[l1-1] = '\0';                   /* Overwrite \n */
11661     Perl_croak(aTHX_ "%s", buf);
11662 }
11663
11664 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
11665
11666 #ifndef PERL_IN_XSUB_RE
11667 void
11668 Perl_save_re_context(pTHX)
11669 {
11670     dVAR;
11671
11672     struct re_save_state *state;
11673
11674     SAVEVPTR(PL_curcop);
11675     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11676
11677     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11678     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11679     SSPUSHUV(SAVEt_RE_STATE);
11680
11681     Copy(&PL_reg_state, state, 1, struct re_save_state);
11682
11683     PL_reg_start_tmp = 0;
11684     PL_reg_start_tmpl = 0;
11685     PL_reg_oldsaved = NULL;
11686     PL_reg_oldsavedlen = 0;
11687     PL_reg_maxiter = 0;
11688     PL_reg_leftiter = 0;
11689     PL_reg_poscache = NULL;
11690     PL_reg_poscache_size = 0;
11691 #ifdef PERL_OLD_COPY_ON_WRITE
11692     PL_nrs = NULL;
11693 #endif
11694
11695     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11696     if (PL_curpm) {
11697         const REGEXP * const rx = PM_GETRE(PL_curpm);
11698         if (rx) {
11699             U32 i;
11700             for (i = 1; i <= RX_NPARENS(rx); i++) {
11701                 char digits[TYPE_CHARS(long)];
11702                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11703                 GV *const *const gvp
11704                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11705
11706                 if (gvp) {
11707                     GV * const gv = *gvp;
11708                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11709                         save_scalar(gv);
11710                 }
11711             }
11712         }
11713     }
11714 }
11715 #endif
11716
11717 static void
11718 clear_re(pTHX_ void *r)
11719 {
11720     dVAR;
11721     ReREFCNT_dec((REGEXP *)r);
11722 }
11723
11724 #ifdef DEBUGGING
11725
11726 STATIC void
11727 S_put_byte(pTHX_ SV *sv, int c)
11728 {
11729     PERL_ARGS_ASSERT_PUT_BYTE;
11730
11731     /* Our definition of isPRINT() ignores locales, so only bytes that are
11732        not part of UTF-8 are considered printable. I assume that the same
11733        holds for UTF-EBCDIC.
11734        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11735        which Wikipedia says:
11736
11737        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11738        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11739        identical, to the ASCII delete (DEL) or rubout control character.
11740        ) So the old condition can be simplified to !isPRINT(c)  */
11741     if (!isPRINT(c)) {
11742         if (c < 256) {
11743             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11744         }
11745         else {
11746             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11747         }
11748     }
11749     else {
11750         const char string = c;
11751         if (c == '-' || c == ']' || c == '\\' || c == '^')
11752             sv_catpvs(sv, "\\");
11753         sv_catpvn(sv, &string, 1);
11754     }
11755 }
11756
11757
11758 #define CLEAR_OPTSTART \
11759     if (optstart) STMT_START { \
11760             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11761             optstart=NULL; \
11762     } STMT_END
11763
11764 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11765
11766 STATIC const regnode *
11767 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11768             const regnode *last, const regnode *plast,
11769             SV* sv, I32 indent, U32 depth)
11770 {
11771     dVAR;
11772     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
11773     register const regnode *next;
11774     const regnode *optstart= NULL;
11775
11776     RXi_GET_DECL(r,ri);
11777     GET_RE_DEBUG_FLAGS_DECL;
11778
11779     PERL_ARGS_ASSERT_DUMPUNTIL;
11780
11781 #ifdef DEBUG_DUMPUNTIL
11782     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11783         last ? last-start : 0,plast ? plast-start : 0);
11784 #endif
11785
11786     if (plast && plast < last)
11787         last= plast;
11788
11789     while (PL_regkind[op] != END && (!last || node < last)) {
11790         /* While that wasn't END last time... */
11791         NODE_ALIGN(node);
11792         op = OP(node);
11793         if (op == CLOSE || op == WHILEM)
11794             indent--;
11795         next = regnext((regnode *)node);
11796
11797         /* Where, what. */
11798         if (OP(node) == OPTIMIZED) {
11799             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11800                 optstart = node;
11801             else
11802                 goto after_print;
11803         } else
11804             CLEAR_OPTSTART;
11805
11806         regprop(r, sv, node);
11807         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11808                       (int)(2*indent + 1), "", SvPVX_const(sv));
11809
11810         if (OP(node) != OPTIMIZED) {
11811             if (next == NULL)           /* Next ptr. */
11812                 PerlIO_printf(Perl_debug_log, " (0)");
11813             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11814                 PerlIO_printf(Perl_debug_log, " (FAIL)");
11815             else
11816                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11817             (void)PerlIO_putc(Perl_debug_log, '\n');
11818         }
11819
11820       after_print:
11821         if (PL_regkind[(U8)op] == BRANCHJ) {
11822             assert(next);
11823             {
11824                 register const regnode *nnode = (OP(next) == LONGJMP
11825                                              ? regnext((regnode *)next)
11826                                              : next);
11827                 if (last && nnode > last)
11828                     nnode = last;
11829                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11830             }
11831         }
11832         else if (PL_regkind[(U8)op] == BRANCH) {
11833             assert(next);
11834             DUMPUNTIL(NEXTOPER(node), next);
11835         }
11836         else if ( PL_regkind[(U8)op]  == TRIE ) {
11837             const regnode *this_trie = node;
11838             const char op = OP(node);
11839             const U32 n = ARG(node);
11840             const reg_ac_data * const ac = op>=AHOCORASICK ?
11841                (reg_ac_data *)ri->data->data[n] :
11842                NULL;
11843             const reg_trie_data * const trie =
11844                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11845 #ifdef DEBUGGING
11846             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11847 #endif
11848             const regnode *nextbranch= NULL;
11849             I32 word_idx;
11850             sv_setpvs(sv, "");
11851             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
11852                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
11853
11854                 PerlIO_printf(Perl_debug_log, "%*s%s ",
11855                    (int)(2*(indent+3)), "",
11856                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
11857                             PL_colors[0], PL_colors[1],
11858                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
11859                             PERL_PV_PRETTY_ELLIPSES    |
11860                             PERL_PV_PRETTY_LTGT
11861                             )
11862                             : "???"
11863                 );
11864                 if (trie->jump) {
11865                     U16 dist= trie->jump[word_idx+1];
11866                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
11867                                   (UV)((dist ? this_trie + dist : next) - start));
11868                     if (dist) {
11869                         if (!nextbranch)
11870                             nextbranch= this_trie + trie->jump[0];
11871                         DUMPUNTIL(this_trie + dist, nextbranch);
11872                     }
11873                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
11874                         nextbranch= regnext((regnode *)nextbranch);
11875                 } else {
11876                     PerlIO_printf(Perl_debug_log, "\n");
11877                 }
11878             }
11879             if (last && next > last)
11880                 node= last;
11881             else
11882                 node= next;
11883         }
11884         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
11885             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
11886                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
11887         }
11888         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
11889             assert(next);
11890             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
11891         }
11892         else if ( op == PLUS || op == STAR) {
11893             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
11894         }
11895         else if (PL_regkind[(U8)op] == ANYOF) {
11896             /* arglen 1 + class block */
11897             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
11898                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
11899             node = NEXTOPER(node);
11900         }
11901         else if (PL_regkind[(U8)op] == EXACT) {
11902             /* Literal string, where present. */
11903             node += NODE_SZ_STR(node) - 1;
11904             node = NEXTOPER(node);
11905         }
11906         else {
11907             node = NEXTOPER(node);
11908             node += regarglen[(U8)op];
11909         }
11910         if (op == CURLYX || op == OPEN)
11911             indent++;
11912     }
11913     CLEAR_OPTSTART;
11914 #ifdef DEBUG_DUMPUNTIL
11915     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
11916 #endif
11917     return node;
11918 }
11919
11920 #endif  /* DEBUGGING */
11921
11922 /*
11923  * Local variables:
11924  * c-indentation-style: bsd
11925  * c-basic-offset: 4
11926  * indent-tabs-mode: t
11927  * End:
11928  *
11929  * ex: set ts=8 sts=4 sw=4 noet:
11930  */