src/5013011/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     I32         in_lookbehind;
 145     I32         contains_locale;
 146 #if ADD_TO_REGEXEC
 147     char        *starttry;              /* -Dr: where regtry was called. */
 148 #define RExC_starttry   (pRExC_state->starttry)
 149 #endif
 150 #ifdef DEBUGGING
 151     const char  *lastparse;
 152     I32         lastnum;
 153     AV          *paren_name_list;       /* idx -> name */
 154 #define RExC_lastparse  (pRExC_state->lastparse)
 155 #define RExC_lastnum    (pRExC_state->lastnum)
 156 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 157 #endif
 158 } RExC_state_t;
 159
 160 #define RExC_flags      (pRExC_state->flags)
 161 #define RExC_precomp    (pRExC_state->precomp)
 162 #define RExC_rx_sv      (pRExC_state->rx_sv)
 163 #define RExC_rx         (pRExC_state->rx)
 164 #define RExC_rxi        (pRExC_state->rxi)
 165 #define RExC_start      (pRExC_state->start)
 166 #define RExC_end        (pRExC_state->end)
 167 #define RExC_parse      (pRExC_state->parse)
 168 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 169 #ifdef RE_TRACK_PATTERN_OFFSETS
 170 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 171 #endif
 172 #define RExC_emit       (pRExC_state->emit)
 173 #define RExC_emit_start (pRExC_state->emit_start)
 174 #define RExC_emit_bound (pRExC_state->emit_bound)
 175 #define RExC_naughty    (pRExC_state->naughty)
 176 #define RExC_sawback    (pRExC_state->sawback)
 177 #define RExC_seen       (pRExC_state->seen)
 178 #define RExC_size       (pRExC_state->size)
 179 #define RExC_npar       (pRExC_state->npar)
 180 #define RExC_nestroot   (pRExC_state->nestroot)
 181 #define RExC_extralen   (pRExC_state->extralen)
 182 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 183 #define RExC_seen_evals (pRExC_state->seen_evals)
 184 #define RExC_utf8       (pRExC_state->utf8)
 185 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 186 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 187 #define RExC_open_parens        (pRExC_state->open_parens)
 188 #define RExC_close_parens       (pRExC_state->close_parens)
 189 #define RExC_opend      (pRExC_state->opend)
 190 #define RExC_paren_names        (pRExC_state->paren_names)
 191 #define RExC_recurse    (pRExC_state->recurse)
 192 #define RExC_recurse_count      (pRExC_state->recurse_count)
 193 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 194 #define RExC_contains_locale    (pRExC_state->contains_locale)
 195
 196
 197 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 198 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 199         ((*s) == '{' && regcurly(s)))
 200
 201 #ifdef SPSTART
 202 #undef SPSTART          /* dratted cpp namespace... */
 203 #endif
 204 /*
 205  * Flags to be passed up and down.
 206  */
 207 #define WORST           0       /* Worst case. */
 208 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 209
 210 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 211  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 212 #define SIMPLE          0x02
 213 #define SPSTART         0x04    /* Starts with * or +. */
 214 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 215 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 216
 217 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 218
 219 /* whether trie related optimizations are enabled */
 220 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 221 #define TRIE_STUDY_OPT
 222 #define FULL_TRIE_STUDY
 223 #define TRIE_STCLASS
 224 #endif
 225
 226
 227
 228 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 229 #define PBITVAL(paren) (1 << ((paren) & 7))
 230 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 231 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 232 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 233
 234 /* If not already in utf8, do a longjmp back to the beginning */
 235 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 236 #define REQUIRE_UTF8    STMT_START {                                       \
 237                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 238                         } STMT_END
 239
 240 /* About scan_data_t.
 241
 242   During optimisation we recurse through the regexp program performing
 243   various inplace (keyhole style) optimisations. In addition study_chunk
 244   and scan_commit populate this data structure with information about
 245   what strings MUST appear in the pattern. We look for the longest
 246   string that must appear at a fixed location, and we look for the
 247   longest string that may appear at a floating location. So for instance
 248   in the pattern:
 249
 250     /FOO[xX]A.*B[xX]BAR/
 251
 252   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 253   strings (because they follow a .* construct). study_chunk will identify
 254   both FOO and BAR as being the longest fixed and floating strings respectively.
 255
 256   The strings can be composites, for instance
 257
 258      /(f)(o)(o)/
 259
 260   will result in a composite fixed substring 'foo'.
 261
 262   For each string some basic information is maintained:
 263
 264   - offset or min_offset
 265     This is the position the string must appear at, or not before.
 266     It also implicitly (when combined with minlenp) tells us how many
 267     characters must match before the string we are searching for.
 268     Likewise when combined with minlenp and the length of the string it
 269     tells us how many characters must appear after the string we have
 270     found.
 271
 272   - max_offset
 273     Only used for floating strings. This is the rightmost point that
 274     the string can appear at. If set to I32 max it indicates that the
 275     string can occur infinitely far to the right.
 276
 277   - minlenp
 278     A pointer to the minimum length of the pattern that the string
 279     was found inside. This is important as in the case of positive
 280     lookahead or positive lookbehind we can have multiple patterns
 281     involved. Consider
 282
 283     /(?=FOO).*F/
 284
 285     The minimum length of the pattern overall is 3, the minimum length
 286     of the lookahead part is 3, but the minimum length of the part that
 287     will actually match is 1. So 'FOO's minimum length is 3, but the
 288     minimum length for the F is 1. This is important as the minimum length
 289     is used to determine offsets in front of and behind the string being
 290     looked for.  Since strings can be composites this is the length of the
 291     pattern at the time it was committed with a scan_commit. Note that
 292     the length is calculated by study_chunk, so that the minimum lengths
 293     are not known until the full pattern has been compiled, thus the
 294     pointer to the value.
 295
 296   - lookbehind
 297
 298     In the case of lookbehind the string being searched for can be
 299     offset past the start point of the final matching string.
 300     If this value was just blithely removed from the min_offset it would
 301     invalidate some of the calculations for how many chars must match
 302     before or after (as they are derived from min_offset and minlen and
 303     the length of the string being searched for).
 304     When the final pattern is compiled and the data is moved from the
 305     scan_data_t structure into the regexp structure the information
 306     about lookbehind is factored in, with the information that would
 307     have been lost precalculated in the end_shift field for the
 308     associated string.
 309
 310   The fields pos_min and pos_delta are used to store the minimum offset
 311   and the delta to the maximum offset at the current point in the pattern.
 312
 313 */
 314
 315 typedef struct scan_data_t {
 316     /*I32 len_min;      unused */
 317     /*I32 len_delta;    unused */
 318     I32 pos_min;
 319     I32 pos_delta;
 320     SV *last_found;
 321     I32 last_end;           /* min value, <0 unless valid. */
 322     I32 last_start_min;
 323     I32 last_start_max;
 324     SV **longest;           /* Either &l_fixed, or &l_float. */
 325     SV *longest_fixed;      /* longest fixed string found in pattern */
 326     I32 offset_fixed;       /* offset where it starts */
 327     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 328     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 329     SV *longest_float;      /* longest floating string found in pattern */
 330     I32 offset_float_min;   /* earliest point in string it can appear */
 331     I32 offset_float_max;   /* latest point in string it can appear */
 332     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 333     I32 lookbehind_float;   /* is the position of the string modified by LB */
 334     I32 flags;
 335     I32 whilem_c;
 336     I32 *last_closep;
 337     struct regnode_charclass_class *start_class;
 338 } scan_data_t;
 339
 340 /*
 341  * Forward declarations for pregcomp()'s friends.
 342  */
 343
 344 static const scan_data_t zero_scan_data =
 345   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 346
 347 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 348 #define SF_BEFORE_SEOL          0x0001
 349 #define SF_BEFORE_MEOL          0x0002
 350 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 351 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 352
 353 #ifdef NO_UNARY_PLUS
 354 #  define SF_FIX_SHIFT_EOL      (0+2)
 355 #  define SF_FL_SHIFT_EOL               (0+4)
 356 #else
 357 #  define SF_FIX_SHIFT_EOL      (+2)
 358 #  define SF_FL_SHIFT_EOL               (+4)
 359 #endif
 360
 361 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 362 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 363
 364 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 365 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 366 #define SF_IS_INF               0x0040
 367 #define SF_HAS_PAR              0x0080
 368 #define SF_IN_PAR               0x0100
 369 #define SF_HAS_EVAL             0x0200
 370 #define SCF_DO_SUBSTR           0x0400
 371 #define SCF_DO_STCLASS_AND      0x0800
 372 #define SCF_DO_STCLASS_OR       0x1000
 373 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 374 #define SCF_WHILEM_VISITED_POS  0x2000
 375
 376 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 377 #define SCF_SEEN_ACCEPT         0x8000
 378
 379 #define UTF cBOOL(RExC_utf8)
 380 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 381 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 382 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 383 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 384 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 385 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 386 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 387
 388 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 389
 390 #define OOB_UNICODE             12345678
 391 #define OOB_NAMEDCLASS          -1
 392
 393 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 394 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 395
 396
 397 /* length of regex to show in messages that don't mark a position within */
 398 #define RegexLengthToShowInErrorMessages 127
 399
 400 /*
 401  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 402  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 403  * op/pragma/warn/regcomp.
 404  */
 405 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 406 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 407
 408 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 409
 410 /*
 411  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 412  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 413  * "...".
 414  */
 415 #define _FAIL(code) STMT_START {                                        \
 416     const char *ellipses = "";                                          \
 417     IV len = RExC_end - RExC_precomp;                                   \
 418                                                                         \
 419     if (!SIZE_ONLY)                                                     \
 420         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 421     if (len > RegexLengthToShowInErrorMessages) {                       \
 422         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 423         len = RegexLengthToShowInErrorMessages - 10;                    \
 424         ellipses = "...";                                               \
 425     }                                                                   \
 426     code;                                                               \
 427 } STMT_END
 428
 429 #define FAIL(msg) _FAIL(                            \
 430     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 431             msg, (int)len, RExC_precomp, ellipses))
 432
 433 #define FAIL2(msg,arg) _FAIL(                       \
 434     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 435             arg, (int)len, RExC_precomp, ellipses))
 436
 437 /*
 438  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 439  */
 440 #define Simple_vFAIL(m) STMT_START {                                    \
 441     const IV offset = RExC_parse - RExC_precomp;                        \
 442     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 443             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 444 } STMT_END
 445
 446 /*
 447  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 448  */
 449 #define vFAIL(m) STMT_START {                           \
 450     if (!SIZE_ONLY)                                     \
 451         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 452     Simple_vFAIL(m);                                    \
 453 } STMT_END
 454
 455 /*
 456  * Like Simple_vFAIL(), but accepts two arguments.
 457  */
 458 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 459     const IV offset = RExC_parse - RExC_precomp;                        \
 460     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 461             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 462 } STMT_END
 463
 464 /*
 465  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 466  */
 467 #define vFAIL2(m,a1) STMT_START {                       \
 468     if (!SIZE_ONLY)                                     \
 469         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 470     Simple_vFAIL2(m, a1);                               \
 471 } STMT_END
 472
 473
 474 /*
 475  * Like Simple_vFAIL(), but accepts three arguments.
 476  */
 477 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 478     const IV offset = RExC_parse - RExC_precomp;                \
 479     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 480             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 481 } STMT_END
 482
 483 /*
 484  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 485  */
 486 #define vFAIL3(m,a1,a2) STMT_START {                    \
 487     if (!SIZE_ONLY)                                     \
 488         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 489     Simple_vFAIL3(m, a1, a2);                           \
 490 } STMT_END
 491
 492 /*
 493  * Like Simple_vFAIL(), but accepts four arguments.
 494  */
 495 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 496     const IV offset = RExC_parse - RExC_precomp;                \
 497     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 498             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 499 } STMT_END
 500
 501 #define ckWARNreg(loc,m) STMT_START {                                   \
 502     const IV offset = loc - RExC_precomp;                               \
 503     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 504             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 505 } STMT_END
 506
 507 #define ckWARNregdep(loc,m) STMT_START {                                \
 508     const IV offset = loc - RExC_precomp;                               \
 509     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 510             m REPORT_LOCATION,                                          \
 511             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 512 } STMT_END
 513
 514 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 515     const IV offset = loc - RExC_precomp;                               \
 516     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 517             m REPORT_LOCATION,                                          \
 518             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 519 } STMT_END
 520
 521 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 522     const IV offset = loc - RExC_precomp;                               \
 523     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 524             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 525 } STMT_END
 526
 527 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 528     const IV offset = loc - RExC_precomp;                               \
 529     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 530             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 531 } STMT_END
 532
 533 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 534     const IV offset = loc - RExC_precomp;                               \
 535     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 536             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 537 } STMT_END
 538
 539 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 540     const IV offset = loc - RExC_precomp;                               \
 541     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 542             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 543 } STMT_END
 544
 545 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 546     const IV offset = loc - RExC_precomp;                               \
 547     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 548             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 549 } STMT_END
 550
 551 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 552     const IV offset = loc - RExC_precomp;                               \
 553     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 554             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 555 } STMT_END
 556
 557
 558 /* Allow for side effects in s */
 559 #define REGC(c,s) STMT_START {                  \
 560     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 561 } STMT_END
 562
 563 /* Macros for recording node offsets.   20001227 mjd@plover.com
 564  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 565  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 566  * Element 0 holds the number n.
 567  * Position is 1 indexed.
 568  */
 569 #ifndef RE_TRACK_PATTERN_OFFSETS
 570 #define Set_Node_Offset_To_R(node,byte)
 571 #define Set_Node_Offset(node,byte)
 572 #define Set_Cur_Node_Offset
 573 #define Set_Node_Length_To_R(node,len)
 574 #define Set_Node_Length(node,len)
 575 #define Set_Node_Cur_Length(node)
 576 #define Node_Offset(n)
 577 #define Node_Length(n)
 578 #define Set_Node_Offset_Length(node,offset,len)
 579 #define ProgLen(ri) ri->u.proglen
 580 #define SetProgLen(ri,x) ri->u.proglen = x
 581 #else
 582 #define ProgLen(ri) ri->u.offsets[0]
 583 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 584 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 585     if (! SIZE_ONLY) {                                                  \
 586         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 587                     __LINE__, (int)(node), (int)(byte)));               \
 588         if((node) < 0) {                                                \
 589             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 590         } else {                                                        \
 591             RExC_offsets[2*(node)-1] = (byte);                          \
 592         }                                                               \
 593     }                                                                   \
 594 } STMT_END
 595
 596 #define Set_Node_Offset(node,byte) \
 597     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 598 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 599
 600 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 601     if (! SIZE_ONLY) {                                                  \
 602         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 603                 __LINE__, (int)(node), (int)(len)));                    \
 604         if((node) < 0) {                                                \
 605             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 606         } else {                                                        \
 607             RExC_offsets[2*(node)] = (len);                             \
 608         }                                                               \
 609     }                                                                   \
 610 } STMT_END
 611
 612 #define Set_Node_Length(node,len) \
 613     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 614 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 615 #define Set_Node_Cur_Length(node) \
 616     Set_Node_Length(node, RExC_parse - parse_start)
 617
 618 /* Get offsets and lengths */
 619 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 620 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 621
 622 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 623     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 624     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 625 } STMT_END
 626 #endif
 627
 628 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 629 #define EXPERIMENTAL_INPLACESCAN
 630 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 631
 632 #define DEBUG_STUDYDATA(str,data,depth)                              \
 633 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 634     PerlIO_printf(Perl_debug_log,                                    \
 635         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 636         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 637         (int)(depth)*2, "",                                          \
 638         (IV)((data)->pos_min),                                       \
 639         (IV)((data)->pos_delta),                                     \
 640         (UV)((data)->flags),                                         \
 641         (IV)((data)->whilem_c),                                      \
 642         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 643         is_inf ? "INF " : ""                                         \
 644     );                                                               \
 645     if ((data)->last_found)                                          \
 646         PerlIO_printf(Perl_debug_log,                                \
 647             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 648             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 649             SvPVX_const((data)->last_found),                         \
 650             (IV)((data)->last_end),                                  \
 651             (IV)((data)->last_start_min),                            \
 652             (IV)((data)->last_start_max),                            \
 653             ((data)->longest &&                                      \
 654              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 655             SvPVX_const((data)->longest_fixed),                      \
 656             (IV)((data)->offset_fixed),                              \
 657             ((data)->longest &&                                      \
 658              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 659             SvPVX_const((data)->longest_float),                      \
 660             (IV)((data)->offset_float_min),                          \
 661             (IV)((data)->offset_float_max)                           \
 662         );                                                           \
 663     PerlIO_printf(Perl_debug_log,"\n");                              \
 664 });
 665
 666 static void clear_re(pTHX_ void *r);
 667
 668 /* Mark that we cannot extend a found fixed substring at this point.
 669    Update the longest found anchored substring and the longest found
 670    floating substrings if needed. */
 671
 672 STATIC void
 673 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 674 {
 675     const STRLEN l = CHR_SVLEN(data->last_found);
 676     const STRLEN old_l = CHR_SVLEN(*data->longest);
 677     GET_RE_DEBUG_FLAGS_DECL;
 678
 679     PERL_ARGS_ASSERT_SCAN_COMMIT;
 680
 681     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 682         SvSetMagicSV(*data->longest, data->last_found);
 683         if (*data->longest == data->longest_fixed) {
 684             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 685             if (data->flags & SF_BEFORE_EOL)
 686                 data->flags
 687                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 688             else
 689                 data->flags &= ~SF_FIX_BEFORE_EOL;
 690             data->minlen_fixed=minlenp;
 691             data->lookbehind_fixed=0;
 692         }
 693         else { /* *data->longest == data->longest_float */
 694             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 695             data->offset_float_max = (l
 696                                       ? data->last_start_max
 697                                       : data->pos_min + data->pos_delta);
 698             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 699                 data->offset_float_max = I32_MAX;
 700             if (data->flags & SF_BEFORE_EOL)
 701                 data->flags
 702                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 703             else
 704                 data->flags &= ~SF_FL_BEFORE_EOL;
 705             data->minlen_float=minlenp;
 706             data->lookbehind_float=0;
 707         }
 708     }
 709     SvCUR_set(data->last_found, 0);
 710     {
 711         SV * const sv = data->last_found;
 712         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 713             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 714             if (mg)
 715                 mg->mg_len = 0;
 716         }
 717     }
 718     data->last_end = -1;
 719     data->flags &= ~SF_BEFORE_EOL;
 720     DEBUG_STUDYDATA("commit: ",data,0);
 721 }
 722
 723 /* Can match anything (initialization) */
 724 STATIC void
 725 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 726 {
 727     PERL_ARGS_ASSERT_CL_ANYTHING;
 728
 729     ANYOF_BITMAP_SETALL(cl);
 730     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 731                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL
 732                     /* Even though no bitmap is in use here, we need to set
 733                      * the flag below so an AND with a node that does have one
 734                      * doesn't lose that one.  The flag should get cleared if
 735                      * the other one doesn't; and the code in regexec.c is
 736                      * structured so this being set when not needed does no
 737                      * harm.  It seemed a little cleaner to set it here than do
 738                      * a special case in cl_and() */
 739                 |ANYOF_NONBITMAP_NON_UTF8;
 740
 741     /* If any portion of the regex is to operate under locale rules,
 742      * initialization includes it.  The reason this isn't done for all regexes
 743      * is that the optimizer was written under the assumption that locale was
 744      * all-or-nothing.  Given the complexity and lack of documentation in the
 745      * optimizer, and that there are inadequate test cases for locale, so many
 746      * parts of it may not work properly, it is safest to avoid locale unless
 747      * necessary. */
 748     if (RExC_contains_locale) {
 749         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 750         cl->flags |= ANYOF_LOCALE;
 751     }
 752     else {
 753         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 754     }
 755 }
 756
 757 /* Can match anything (initialization) */
 758 STATIC int
 759 S_cl_is_anything(const struct regnode_charclass_class *cl)
 760 {
 761     int value;
 762
 763     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 764
 765     for (value = 0; value <= ANYOF_MAX; value += 2)
 766         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 767             return 1;
 768     if (!(cl->flags & ANYOF_UNICODE_ALL))
 769         return 0;
 770     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 771         return 0;
 772     return 1;
 773 }
 774
 775 /* Can match anything (initialization) */
 776 STATIC void
 777 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 778 {
 779     PERL_ARGS_ASSERT_CL_INIT;
 780
 781     Zero(cl, 1, struct regnode_charclass_class);
 782     cl->type = ANYOF;
 783     cl_anything(pRExC_state, cl);
 784     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 785 }
 786
 787 /* These two functions currently do the exact same thing */
 788 #define cl_init_zero            S_cl_init
 789
 790 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 791  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 792  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 793 STATIC void
 794 S_cl_and(struct regnode_charclass_class *cl,
 795         const struct regnode_charclass_class *and_with)
 796 {
 797     PERL_ARGS_ASSERT_CL_AND;
 798
 799     assert(and_with->type == ANYOF);
 800
 801     /* I (khw) am not sure all these restrictions are necessary XXX */
 802     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 803         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 804         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 805         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 806         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 807         int i;
 808
 809         if (and_with->flags & ANYOF_INVERT)
 810             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 811                 cl->bitmap[i] &= ~and_with->bitmap[i];
 812         else
 813             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 814                 cl->bitmap[i] &= and_with->bitmap[i];
 815     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 816
 817     if (and_with->flags & ANYOF_INVERT) {
 818
 819         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 820          * aren't affected by the inversion.  Those that are affected are
 821          * handled individually below */
 822         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 823         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 824         cl->flags |= affected_flags;
 825
 826         /* We currently don't know how to deal with things that aren't in the
 827          * bitmap, but we know that the intersection is no greater than what
 828          * is already in cl, so let there be false positives that get sorted
 829          * out after the synthetic start class succeeds, and the node is
 830          * matched for real. */
 831
 832         /* The inversion of these two flags indicate that the resulting
 833          * intersection doesn't have them */
 834         if (and_with->flags & ANYOF_UNICODE_ALL) {
 835             cl->flags &= ~ANYOF_UNICODE_ALL;
 836         }
 837         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 838             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 839         }
 840     }
 841     else {   /* and'd node is not inverted */
 842         if (! ANYOF_NONBITMAP(and_with)) {
 843
 844             /* Here 'and_with' doesn't match anything outside the bitmap
 845              * (except possibly ANYOF_UNICODE_ALL), which means the
 846              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 847              * which case we don't know what the intersection is, but it's no
 848              * greater than what cl already has, so can just leave it alone,
 849              * with possible false positives */
 850             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 851                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 852                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 853             }
 854         }
 855         else if (! ANYOF_NONBITMAP(cl)) {
 856
 857             /* Here, 'and_with' does match something outside the bitmap, and cl
 858              * doesn't have a list of things to match outside the bitmap.  If
 859              * cl can match all code points above 255, the intersection will
 860              * be those above-255 code points that 'and_with' matches.  There
 861              * may be false positives from code points in 'and_with' that are
 862              * outside the bitmap but below 256, but those get sorted out
 863              * after the synthetic start class succeeds).  If cl can't match
 864              * all Unicode code points, it means here that it can't match *
 865              * anything outside the bitmap, so we leave the bitmap empty */
 866             if (cl->flags & ANYOF_UNICODE_ALL) {
 867                 ARG_SET(cl, ARG(and_with));
 868             }
 869         }
 870         else {
 871             /* Here, both 'and_with' and cl match something outside the
 872              * bitmap.  Currently we do not do the intersection, so just match
 873              * whatever cl had at the beginning.  */
 874         }
 875
 876
 877         /* Take the intersection of the two sets of flags */
 878         cl->flags &= and_with->flags;
 879     }
 880 }
 881
 882 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 883  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 884  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 885 STATIC void
 886 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 887 {
 888     PERL_ARGS_ASSERT_CL_OR;
 889
 890     if (or_with->flags & ANYOF_INVERT) {
 891
 892         /* Here, the or'd node is to be inverted.  This means we take the
 893          * complement of everything not in the bitmap, but currently we don't
 894          * know what that is, so give up and match anything */
 895         if (ANYOF_NONBITMAP(or_with)) {
 896             cl_anything(pRExC_state, cl);
 897         }
 898         /* We do not use
 899          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 900          *   <= (B1 | !B2) | (CL1 | !CL2)
 901          * which is wasteful if CL2 is small, but we ignore CL2:
 902          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 903          * XXXX Can we handle case-fold?  Unclear:
 904          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 905          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 906          */
 907         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 908              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 909              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 910             int i;
 911
 912             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 913                 cl->bitmap[i] |= ~or_with->bitmap[i];
 914         } /* XXXX: logic is complicated otherwise */
 915         else {
 916             cl_anything(pRExC_state, cl);
 917         }
 918
 919         /* And, we can just take the union of the flags that aren't affected
 920          * by the inversion */
 921         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 922
 923         /* For the remaining flags:
 924             ANYOF_UNICODE_ALL and inverted means to not match anything above
 925                     255, which means that the union with cl should just be
 926                     what cl has in it, so can ignore this flag
 927             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 928                     is 127-255 to match them, but then invert that, so the
 929                     union with cl should just be what cl has in it, so can
 930                     ignore this flag
 931          */
 932     } else {    /* 'or_with' is not inverted */
 933         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 934         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 935              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 936                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 937             int i;
 938
 939             /* OR char bitmap and class bitmap separately */
 940             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 941                 cl->bitmap[i] |= or_with->bitmap[i];
 942             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 943                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 944                     cl->classflags[i] |= or_with->classflags[i];
 945                 cl->flags |= ANYOF_CLASS;
 946             }
 947         }
 948         else { /* XXXX: logic is complicated, leave it along for a moment. */
 949             cl_anything(pRExC_state, cl);
 950         }
 951
 952         if (ANYOF_NONBITMAP(or_with)) {
 953
 954             /* Use the added node's outside-the-bit-map match if there isn't a
 955              * conflict.  If there is a conflict (both nodes match something
 956              * outside the bitmap, but what they match outside is not the same
 957              * pointer, and hence not easily compared until XXX we extend
 958              * inversion lists this far), give up and allow the start class to
 959              * match everything outside the bitmap.  If that stuff is all above
 960              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 961             if (! ANYOF_NONBITMAP(cl)) {
 962                 ARG_SET(cl, ARG(or_with));
 963             }
 964             else if (ARG(cl) != ARG(or_with)) {
 965
 966                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
 967                     cl_anything(pRExC_state, cl);
 968                 }
 969                 else {
 970                     cl->flags |= ANYOF_UNICODE_ALL;
 971                 }
 972             }
 973
 974         /* Take the union */
 975         cl->flags |= or_with->flags;
 976         }
 977     }
 978 }
 979
 980 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 981 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 982 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 983 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 984
 985
 986 #ifdef DEBUGGING
 987 /*
 988    dump_trie(trie,widecharmap,revcharmap)
 989    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 990    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 991
 992    These routines dump out a trie in a somewhat readable format.
 993    The _interim_ variants are used for debugging the interim
 994    tables that are used to generate the final compressed
 995    representation which is what dump_trie expects.
 996
 997    Part of the reason for their existence is to provide a form
 998    of documentation as to how the different representations function.
 999
1000 */
1001
1002 /*
1003   Dumps the final compressed table form of the trie to Perl_debug_log.
1004   Used for debugging make_trie().
1005 */
1006
1007 STATIC void
1008 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1009             AV *revcharmap, U32 depth)
1010 {
1011     U32 state;
1012     SV *sv=sv_newmortal();
1013     int colwidth= widecharmap ? 6 : 4;
1014     U16 word;
1015     GET_RE_DEBUG_FLAGS_DECL;
1016
1017     PERL_ARGS_ASSERT_DUMP_TRIE;
1018
1019     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1020         (int)depth * 2 + 2,"",
1021         "Match","Base","Ofs" );
1022
1023     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1024         SV ** const tmp = av_fetch( revcharmap, state, 0);
1025         if ( tmp ) {
1026             PerlIO_printf( Perl_debug_log, "%*s",
1027                 colwidth,
1028                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1029                             PL_colors[0], PL_colors[1],
1030                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031                             PERL_PV_ESCAPE_FIRSTCHAR
1032                 )
1033             );
1034         }
1035     }
1036     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1037         (int)depth * 2 + 2,"");
1038
1039     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1040         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1041     PerlIO_printf( Perl_debug_log, "\n");
1042
1043     for( state = 1 ; state < trie->statecount ; state++ ) {
1044         const U32 base = trie->states[ state ].trans.base;
1045
1046         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1047
1048         if ( trie->states[ state ].wordnum ) {
1049             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1050         } else {
1051             PerlIO_printf( Perl_debug_log, "%6s", "" );
1052         }
1053
1054         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1055
1056         if ( base ) {
1057             U32 ofs = 0;
1058
1059             while( ( base + ofs  < trie->uniquecharcount ) ||
1060                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1061                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1062                     ofs++;
1063
1064             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1065
1066             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1067                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1068                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1069                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1070                 {
1071                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1072                     colwidth,
1073                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1074                 } else {
1075                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1076                 }
1077             }
1078
1079             PerlIO_printf( Perl_debug_log, "]");
1080
1081         }
1082         PerlIO_printf( Perl_debug_log, "\n" );
1083     }
1084     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1085     for (word=1; word <= trie->wordcount; word++) {
1086         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1087             (int)word, (int)(trie->wordinfo[word].prev),
1088             (int)(trie->wordinfo[word].len));
1089     }
1090     PerlIO_printf(Perl_debug_log, "\n" );
1091 }
1092 /*
1093   Dumps a fully constructed but uncompressed trie in list form.
1094   List tries normally only are used for construction when the number of
1095   possible chars (trie->uniquecharcount) is very high.
1096   Used for debugging make_trie().
1097 */
1098 STATIC void
1099 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1100                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1101                          U32 depth)
1102 {
1103     U32 state;
1104     SV *sv=sv_newmortal();
1105     int colwidth= widecharmap ? 6 : 4;
1106     GET_RE_DEBUG_FLAGS_DECL;
1107
1108     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1109
1110     /* print out the table precompression.  */
1111     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1112         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1113         "------:-----+-----------------\n" );
1114
1115     for( state=1 ; state < next_alloc ; state ++ ) {
1116         U16 charid;
1117
1118         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1119             (int)depth * 2 + 2,"", (UV)state  );
1120         if ( ! trie->states[ state ].wordnum ) {
1121             PerlIO_printf( Perl_debug_log, "%5s| ","");
1122         } else {
1123             PerlIO_printf( Perl_debug_log, "W%4x| ",
1124                 trie->states[ state ].wordnum
1125             );
1126         }
1127         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1128             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1129             if ( tmp ) {
1130                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1131                     colwidth,
1132                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1133                             PL_colors[0], PL_colors[1],
1134                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1135                             PERL_PV_ESCAPE_FIRSTCHAR
1136                     ) ,
1137                     TRIE_LIST_ITEM(state,charid).forid,
1138                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1139                 );
1140                 if (!(charid % 10))
1141                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1142                         (int)((depth * 2) + 14), "");
1143             }
1144         }
1145         PerlIO_printf( Perl_debug_log, "\n");
1146     }
1147 }
1148
1149 /*
1150   Dumps a fully constructed but uncompressed trie in table form.
1151   This is the normal DFA style state transition table, with a few
1152   twists to facilitate compression later.
1153   Used for debugging make_trie().
1154 */
1155 STATIC void
1156 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1157                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1158                           U32 depth)
1159 {
1160     U32 state;
1161     U16 charid;
1162     SV *sv=sv_newmortal();
1163     int colwidth= widecharmap ? 6 : 4;
1164     GET_RE_DEBUG_FLAGS_DECL;
1165
1166     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1167
1168     /*
1169        print out the table precompression so that we can do a visual check
1170        that they are identical.
1171      */
1172
1173     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1174
1175     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1176         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1177         if ( tmp ) {
1178             PerlIO_printf( Perl_debug_log, "%*s",
1179                 colwidth,
1180                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1181                             PL_colors[0], PL_colors[1],
1182                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1183                             PERL_PV_ESCAPE_FIRSTCHAR
1184                 )
1185             );
1186         }
1187     }
1188
1189     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1190
1191     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1192         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1193     }
1194
1195     PerlIO_printf( Perl_debug_log, "\n" );
1196
1197     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1198
1199         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1200             (int)depth * 2 + 2,"",
1201             (UV)TRIE_NODENUM( state ) );
1202
1203         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1205             if (v)
1206                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1207             else
1208                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1209         }
1210         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1211             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1212         } else {
1213             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1214             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1215         }
1216     }
1217 }
1218
1219 #endif
1220
1221
1222 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1223   startbranch: the first branch in the whole branch sequence
1224   first      : start branch of sequence of branch-exact nodes.
1225                May be the same as startbranch
1226   last       : Thing following the last branch.
1227                May be the same as tail.
1228   tail       : item following the branch sequence
1229   count      : words in the sequence
1230   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1231   depth      : indent depth
1232
1233 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1234
1235 A trie is an N'ary tree where the branches are determined by digital
1236 decomposition of the key. IE, at the root node you look up the 1st character and
1237 follow that branch repeat until you find the end of the branches. Nodes can be
1238 marked as "accepting" meaning they represent a complete word. Eg:
1239
1240   /he|she|his|hers/
1241
1242 would convert into the following structure. Numbers represent states, letters
1243 following numbers represent valid transitions on the letter from that state, if
1244 the number is in square brackets it represents an accepting state, otherwise it
1245 will be in parenthesis.
1246
1247       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1248       |    |
1249       |   (2)
1250       |    |
1251      (1)   +-i->(6)-+-s->[7]
1252       |
1253       +-s->(3)-+-h->(4)-+-e->[5]
1254
1255       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1256
1257 This shows that when matching against the string 'hers' we will begin at state 1
1258 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1259 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1260 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1261 single traverse. We store a mapping from accepting to state to which word was
1262 matched, and then when we have multiple possibilities we try to complete the
1263 rest of the regex in the order in which they occured in the alternation.
1264
1265 The only prior NFA like behaviour that would be changed by the TRIE support is
1266 the silent ignoring of duplicate alternations which are of the form:
1267
1268  / (DUPE|DUPE) X? (?{ ... }) Y /x
1269
1270 Thus EVAL blocks following a trie may be called a different number of times with
1271 and without the optimisation. With the optimisations dupes will be silently
1272 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1273 the following demonstrates:
1274
1275  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1276
1277 which prints out 'word' three times, but
1278
1279  'words'=~/(word|word|word)(?{ print $1 })S/
1280
1281 which doesnt print it out at all. This is due to other optimisations kicking in.
1282
1283 Example of what happens on a structural level:
1284
1285 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1286
1287    1: CURLYM[1] {1,32767}(18)
1288    5:   BRANCH(8)
1289    6:     EXACT <ac>(16)
1290    8:   BRANCH(11)
1291    9:     EXACT <ad>(16)
1292   11:   BRANCH(14)
1293   12:     EXACT <ab>(16)
1294   16:   SUCCEED(0)
1295   17:   NOTHING(18)
1296   18: END(0)
1297
1298 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1299 and should turn into:
1300
1301    1: CURLYM[1] {1,32767}(18)
1302    5:   TRIE(16)
1303         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1304           <ac>
1305           <ad>
1306           <ab>
1307   16:   SUCCEED(0)
1308   17:   NOTHING(18)
1309   18: END(0)
1310
1311 Cases where tail != last would be like /(?foo|bar)baz/:
1312
1313    1: BRANCH(4)
1314    2:   EXACT <foo>(8)
1315    4: BRANCH(7)
1316    5:   EXACT <bar>(8)
1317    7: TAIL(8)
1318    8: EXACT <baz>(10)
1319   10: END(0)
1320
1321 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1322 and would end up looking like:
1323
1324     1: TRIE(8)
1325       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1326         <foo>
1327         <bar>
1328    7: TAIL(8)
1329    8: EXACT <baz>(10)
1330   10: END(0)
1331
1332     d = uvuni_to_utf8_flags(d, uv, 0);
1333
1334 is the recommended Unicode-aware way of saying
1335
1336     *(d++) = uv;
1337 */
1338
1339 #define TRIE_STORE_REVCHAR                                                 \
1340     STMT_START {                                                           \
1341         if (UTF) {                                                         \
1342             SV *zlopp = newSV(2);                                          \
1343             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1344             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1345             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1346             SvPOK_on(zlopp);                                               \
1347             SvUTF8_on(zlopp);                                              \
1348             av_push(revcharmap, zlopp);                                    \
1349         } else {                                                           \
1350             char ooooff = (char)uvc;                                               \
1351             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1352         }                                                                  \
1353         } STMT_END
1354
1355 #define TRIE_READ_CHAR STMT_START {                                           \
1356     wordlen++;                                                                \
1357     if ( UTF ) {                                                              \
1358         if ( folder ) {                                                       \
1359             if ( foldlen > 0 ) {                                              \
1360                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1361                foldlen -= len;                                                \
1362                scan += len;                                                   \
1363                len = 0;                                                       \
1364             } else {                                                          \
1365                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1366                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1367                 foldlen -= UNISKIP( uvc );                                    \
1368                 scan = foldbuf + UNISKIP( uvc );                              \
1369             }                                                                 \
1370         } else {                                                              \
1371             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1372         }                                                                     \
1373     } else {                                                                  \
1374         uvc = (U32)*uc;                                                       \
1375         len = 1;                                                              \
1376     }                                                                         \
1377 } STMT_END
1378
1379
1380
1381 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1382     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1383         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1384         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1385     }                                                           \
1386     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1387     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1388     TRIE_LIST_CUR( state )++;                                   \
1389 } STMT_END
1390
1391 #define TRIE_LIST_NEW(state) STMT_START {                       \
1392     Newxz( trie->states[ state ].trans.list,               \
1393         4, reg_trie_trans_le );                                 \
1394      TRIE_LIST_CUR( state ) = 1;                                \
1395      TRIE_LIST_LEN( state ) = 4;                                \
1396 } STMT_END
1397
1398 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1399     U16 dupe= trie->states[ state ].wordnum;                    \
1400     regnode * const noper_next = regnext( noper );              \
1401                                                                 \
1402     DEBUG_r({                                                   \
1403         /* store the word for dumping */                        \
1404         SV* tmp;                                                \
1405         if (OP(noper) != NOTHING)                               \
1406             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1407         else                                                    \
1408             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1409         av_push( trie_words, tmp );                             \
1410     });                                                         \
1411                                                                 \
1412     curword++;                                                  \
1413     trie->wordinfo[curword].prev   = 0;                         \
1414     trie->wordinfo[curword].len    = wordlen;                   \
1415     trie->wordinfo[curword].accept = state;                     \
1416                                                                 \
1417     if ( noper_next < tail ) {                                  \
1418         if (!trie->jump)                                        \
1419             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1420         trie->jump[curword] = (U16)(noper_next - convert);      \
1421         if (!jumper)                                            \
1422             jumper = noper_next;                                \
1423         if (!nextbranch)                                        \
1424             nextbranch= regnext(cur);                           \
1425     }                                                           \
1426                                                                 \
1427     if ( dupe ) {                                               \
1428         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1429         /* chain, so that when the bits of chain are later    */\
1430         /* linked together, the dups appear in the chain      */\
1431         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1432         trie->wordinfo[dupe].prev = curword;                    \
1433     } else {                                                    \
1434         /* we haven't inserted this word yet.                */ \
1435         trie->states[ state ].wordnum = curword;                \
1436     }                                                           \
1437 } STMT_END
1438
1439
1440 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1441      ( ( base + charid >=  ucharcount                                   \
1442          && base + charid < ubound                                      \
1443          && state == trie->trans[ base - ucharcount + charid ].check    \
1444          && trie->trans[ base - ucharcount + charid ].next )            \
1445            ? trie->trans[ base - ucharcount + charid ].next             \
1446            : ( state==1 ? special : 0 )                                 \
1447       )
1448
1449 #define MADE_TRIE       1
1450 #define MADE_JUMP_TRIE  2
1451 #define MADE_EXACT_TRIE 4
1452
1453 STATIC I32
1454 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1455 {
1456     dVAR;
1457     /* first pass, loop through and scan words */
1458     reg_trie_data *trie;
1459     HV *widecharmap = NULL;
1460     AV *revcharmap = newAV();
1461     regnode *cur;
1462     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1463     STRLEN len = 0;
1464     UV uvc = 0;
1465     U16 curword = 0;
1466     U32 next_alloc = 0;
1467     regnode *jumper = NULL;
1468     regnode *nextbranch = NULL;
1469     regnode *convert = NULL;
1470     U32 *prev_states; /* temp array mapping each state to previous one */
1471     /* we just use folder as a flag in utf8 */
1472     const U8 * folder = NULL;
1473
1474 #ifdef DEBUGGING
1475     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1476     AV *trie_words = NULL;
1477     /* along with revcharmap, this only used during construction but both are
1478      * useful during debugging so we store them in the struct when debugging.
1479      */
1480 #else
1481     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1482     STRLEN trie_charcount=0;
1483 #endif
1484     SV *re_trie_maxbuff;
1485     GET_RE_DEBUG_FLAGS_DECL;
1486
1487     PERL_ARGS_ASSERT_MAKE_TRIE;
1488 #ifndef DEBUGGING
1489     PERL_UNUSED_ARG(depth);
1490 #endif
1491
1492     switch (flags) {
1493         case EXACTFA:
1494         case EXACTFU: folder = PL_fold_latin1; break;
1495         case EXACTF:  folder = PL_fold; break;
1496         case EXACTFL: folder = PL_fold_locale; break;
1497     }
1498
1499     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1500     trie->refcount = 1;
1501     trie->startstate = 1;
1502     trie->wordcount = word_count;
1503     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1504     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1505     if (!(UTF && folder))
1506         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1507     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1508                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1509
1510     DEBUG_r({
1511         trie_words = newAV();
1512     });
1513
1514     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1515     if (!SvIOK(re_trie_maxbuff)) {
1516         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1517     }
1518     DEBUG_OPTIMISE_r({
1519                 PerlIO_printf( Perl_debug_log,
1520                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1521                   (int)depth * 2 + 2, "",
1522                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1523                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1524                   (int)depth);
1525     });
1526
1527    /* Find the node we are going to overwrite */
1528     if ( first == startbranch && OP( last ) != BRANCH ) {
1529         /* whole branch chain */
1530         convert = first;
1531     } else {
1532         /* branch sub-chain */
1533         convert = NEXTOPER( first );
1534     }
1535
1536     /*  -- First loop and Setup --
1537
1538        We first traverse the branches and scan each word to determine if it
1539        contains widechars, and how many unique chars there are, this is
1540        important as we have to build a table with at least as many columns as we
1541        have unique chars.
1542
1543        We use an array of integers to represent the character codes 0..255
1544        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1545        native representation of the character value as the key and IV's for the
1546        coded index.
1547
1548        *TODO* If we keep track of how many times each character is used we can
1549        remap the columns so that the table compression later on is more
1550        efficient in terms of memory by ensuring the most common value is in the
1551        middle and the least common are on the outside.  IMO this would be better
1552        than a most to least common mapping as theres a decent chance the most
1553        common letter will share a node with the least common, meaning the node
1554        will not be compressible. With a middle is most common approach the worst
1555        case is when we have the least common nodes twice.
1556
1557      */
1558
1559     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1560         regnode * const noper = NEXTOPER( cur );
1561         const U8 *uc = (U8*)STRING( noper );
1562         const U8 * const e  = uc + STR_LEN( noper );
1563         STRLEN foldlen = 0;
1564         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1565         const U8 *scan = (U8*)NULL;
1566         U32 wordlen      = 0;         /* required init */
1567         STRLEN chars = 0;
1568         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1569
1570         if (OP(noper) == NOTHING) {
1571             trie->minlen= 0;
1572             continue;
1573         }
1574         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1575             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1576                                           regardless of encoding */
1577
1578         for ( ; uc < e ; uc += len ) {
1579             TRIE_CHARCOUNT(trie)++;
1580             TRIE_READ_CHAR;
1581             chars++;
1582             if ( uvc < 256 ) {
1583                 if ( !trie->charmap[ uvc ] ) {
1584                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1585                     if ( folder )
1586                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1587                     TRIE_STORE_REVCHAR;
1588                 }
1589                 if ( set_bit ) {
1590                     /* store the codepoint in the bitmap, and its folded
1591                      * equivalent. */
1592                     TRIE_BITMAP_SET(trie,uvc);
1593
1594                     /* store the folded codepoint */
1595                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1596
1597                     if ( !UTF ) {
1598                         /* store first byte of utf8 representation of
1599                            variant codepoints */
1600                         if (! UNI_IS_INVARIANT(uvc)) {
1601                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1602                         }
1603                     }
1604                     set_bit = 0; /* We've done our bit :-) */
1605                 }
1606             } else {
1607                 SV** svpp;
1608                 if ( !widecharmap )
1609                     widecharmap = newHV();
1610
1611                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1612
1613                 if ( !svpp )
1614                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1615
1616                 if ( !SvTRUE( *svpp ) ) {
1617                     sv_setiv( *svpp, ++trie->uniquecharcount );
1618                     TRIE_STORE_REVCHAR;
1619                 }
1620             }
1621         }
1622         if( cur == first ) {
1623             trie->minlen=chars;
1624             trie->maxlen=chars;
1625         } else if (chars < trie->minlen) {
1626             trie->minlen=chars;
1627         } else if (chars > trie->maxlen) {
1628             trie->maxlen=chars;
1629         }
1630
1631     } /* end first pass */
1632     DEBUG_TRIE_COMPILE_r(
1633         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1634                 (int)depth * 2 + 2,"",
1635                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1636                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1637                 (int)trie->minlen, (int)trie->maxlen )
1638     );
1639
1640     /*
1641         We now know what we are dealing with in terms of unique chars and
1642         string sizes so we can calculate how much memory a naive
1643         representation using a flat table  will take. If it's over a reasonable
1644         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1645         conservative but potentially much slower representation using an array
1646         of lists.
1647
1648         At the end we convert both representations into the same compressed
1649         form that will be used in regexec.c for matching with. The latter
1650         is a form that cannot be used to construct with but has memory
1651         properties similar to the list form and access properties similar
1652         to the table form making it both suitable for fast searches and
1653         small enough that its feasable to store for the duration of a program.
1654
1655         See the comment in the code where the compressed table is produced
1656         inplace from the flat tabe representation for an explanation of how
1657         the compression works.
1658
1659     */
1660
1661
1662     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1663     prev_states[1] = 0;
1664
1665     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1666         /*
1667             Second Pass -- Array Of Lists Representation
1668
1669             Each state will be represented by a list of charid:state records
1670             (reg_trie_trans_le) the first such element holds the CUR and LEN
1671             points of the allocated array. (See defines above).
1672
1673             We build the initial structure using the lists, and then convert
1674             it into the compressed table form which allows faster lookups
1675             (but cant be modified once converted).
1676         */
1677
1678         STRLEN transcount = 1;
1679
1680         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1681             "%*sCompiling trie using list compiler\n",
1682             (int)depth * 2 + 2, ""));
1683
1684         trie->states = (reg_trie_state *)
1685             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1686                                   sizeof(reg_trie_state) );
1687         TRIE_LIST_NEW(1);
1688         next_alloc = 2;
1689
1690         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1691
1692             regnode * const noper = NEXTOPER( cur );
1693             U8 *uc           = (U8*)STRING( noper );
1694             const U8 * const e = uc + STR_LEN( noper );
1695             U32 state        = 1;         /* required init */
1696             U16 charid       = 0;         /* sanity init */
1697             U8 *scan         = (U8*)NULL; /* sanity init */
1698             STRLEN foldlen   = 0;         /* required init */
1699             U32 wordlen      = 0;         /* required init */
1700             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1701
1702             if (OP(noper) != NOTHING) {
1703                 for ( ; uc < e ; uc += len ) {
1704
1705                     TRIE_READ_CHAR;
1706
1707                     if ( uvc < 256 ) {
1708                         charid = trie->charmap[ uvc ];
1709                     } else {
1710                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1711                         if ( !svpp ) {
1712                             charid = 0;
1713                         } else {
1714                             charid=(U16)SvIV( *svpp );
1715                         }
1716                     }
1717                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1718                     if ( charid ) {
1719
1720                         U16 check;
1721                         U32 newstate = 0;
1722
1723                         charid--;
1724                         if ( !trie->states[ state ].trans.list ) {
1725                             TRIE_LIST_NEW( state );
1726                         }
1727                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1728                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1729                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1730                                 break;
1731                             }
1732                         }
1733                         if ( ! newstate ) {
1734                             newstate = next_alloc++;
1735                             prev_states[newstate] = state;
1736                             TRIE_LIST_PUSH( state, charid, newstate );
1737                             transcount++;
1738                         }
1739                         state = newstate;
1740                     } else {
1741                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1742                     }
1743                 }
1744             }
1745             TRIE_HANDLE_WORD(state);
1746
1747         } /* end second pass */
1748
1749         /* next alloc is the NEXT state to be allocated */
1750         trie->statecount = next_alloc;
1751         trie->states = (reg_trie_state *)
1752             PerlMemShared_realloc( trie->states,
1753                                    next_alloc
1754                                    * sizeof(reg_trie_state) );
1755
1756         /* and now dump it out before we compress it */
1757         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1758                                                          revcharmap, next_alloc,
1759                                                          depth+1)
1760         );
1761
1762         trie->trans = (reg_trie_trans *)
1763             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1764         {
1765             U32 state;
1766             U32 tp = 0;
1767             U32 zp = 0;
1768
1769
1770             for( state=1 ; state < next_alloc ; state ++ ) {
1771                 U32 base=0;
1772
1773                 /*
1774                 DEBUG_TRIE_COMPILE_MORE_r(
1775                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1776                 );
1777                 */
1778
1779                 if (trie->states[state].trans.list) {
1780                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1781                     U16 maxid=minid;
1782                     U16 idx;
1783
1784                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1785                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1786                         if ( forid < minid ) {
1787                             minid=forid;
1788                         } else if ( forid > maxid ) {
1789                             maxid=forid;
1790                         }
1791                     }
1792                     if ( transcount < tp + maxid - minid + 1) {
1793                         transcount *= 2;
1794                         trie->trans = (reg_trie_trans *)
1795                             PerlMemShared_realloc( trie->trans,
1796                                                      transcount
1797                                                      * sizeof(reg_trie_trans) );
1798                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1799                     }
1800                     base = trie->uniquecharcount + tp - minid;
1801                     if ( maxid == minid ) {
1802                         U32 set = 0;
1803                         for ( ; zp < tp ; zp++ ) {
1804                             if ( ! trie->trans[ zp ].next ) {
1805                                 base = trie->uniquecharcount + zp - minid;
1806                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1807                                 trie->trans[ zp ].check = state;
1808                                 set = 1;
1809                                 break;
1810                             }
1811                         }
1812                         if ( !set ) {
1813                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1814                             trie->trans[ tp ].check = state;
1815                             tp++;
1816                             zp = tp;
1817                         }
1818                     } else {
1819                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1820                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1821                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1822                             trie->trans[ tid ].check = state;
1823                         }
1824                         tp += ( maxid - minid + 1 );
1825                     }
1826                     Safefree(trie->states[ state ].trans.list);
1827                 }
1828                 /*
1829                 DEBUG_TRIE_COMPILE_MORE_r(
1830                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1831                 );
1832                 */
1833                 trie->states[ state ].trans.base=base;
1834             }
1835             trie->lasttrans = tp + 1;
1836         }
1837     } else {
1838         /*
1839            Second Pass -- Flat Table Representation.
1840
1841            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1842            We know that we will need Charcount+1 trans at most to store the data
1843            (one row per char at worst case) So we preallocate both structures
1844            assuming worst case.
1845
1846            We then construct the trie using only the .next slots of the entry
1847            structs.
1848
1849            We use the .check field of the first entry of the node temporarily to
1850            make compression both faster and easier by keeping track of how many non
1851            zero fields are in the node.
1852
1853            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1854            transition.
1855
1856            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1857            number representing the first entry of the node, and state as a
1858            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1859            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1860            are 2 entrys per node. eg:
1861
1862              A B       A B
1863           1. 2 4    1. 3 7
1864           2. 0 3    3. 0 5
1865           3. 0 0    5. 0 0
1866           4. 0 0    7. 0 0
1867
1868            The table is internally in the right hand, idx form. However as we also
1869            have to deal with the states array which is indexed by nodenum we have to
1870            use TRIE_NODENUM() to convert.
1871
1872         */
1873         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1874             "%*sCompiling trie using table compiler\n",
1875             (int)depth * 2 + 2, ""));
1876
1877         trie->trans = (reg_trie_trans *)
1878             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1879                                   * trie->uniquecharcount + 1,
1880                                   sizeof(reg_trie_trans) );
1881         trie->states = (reg_trie_state *)
1882             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1883                                   sizeof(reg_trie_state) );
1884         next_alloc = trie->uniquecharcount + 1;
1885
1886
1887         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1888
1889             regnode * const noper   = NEXTOPER( cur );
1890             const U8 *uc     = (U8*)STRING( noper );
1891             const U8 * const e = uc + STR_LEN( noper );
1892
1893             U32 state        = 1;         /* required init */
1894
1895             U16 charid       = 0;         /* sanity init */
1896             U32 accept_state = 0;         /* sanity init */
1897             U8 *scan         = (U8*)NULL; /* sanity init */
1898
1899             STRLEN foldlen   = 0;         /* required init */
1900             U32 wordlen      = 0;         /* required init */
1901             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1902
1903             if ( OP(noper) != NOTHING ) {
1904                 for ( ; uc < e ; uc += len ) {
1905
1906                     TRIE_READ_CHAR;
1907
1908                     if ( uvc < 256 ) {
1909                         charid = trie->charmap[ uvc ];
1910                     } else {
1911                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1912                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1913                     }
1914                     if ( charid ) {
1915                         charid--;
1916                         if ( !trie->trans[ state + charid ].next ) {
1917                             trie->trans[ state + charid ].next = next_alloc;
1918                             trie->trans[ state ].check++;
1919                             prev_states[TRIE_NODENUM(next_alloc)]
1920                                     = TRIE_NODENUM(state);
1921                             next_alloc += trie->uniquecharcount;
1922                         }
1923                         state = trie->trans[ state + charid ].next;
1924                     } else {
1925                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1926                     }
1927                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1928                 }
1929             }
1930             accept_state = TRIE_NODENUM( state );
1931             TRIE_HANDLE_WORD(accept_state);
1932
1933         } /* end second pass */
1934
1935         /* and now dump it out before we compress it */
1936         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1937                                                           revcharmap,
1938                                                           next_alloc, depth+1));
1939
1940         {
1941         /*
1942            * Inplace compress the table.*
1943
1944            For sparse data sets the table constructed by the trie algorithm will
1945            be mostly 0/FAIL transitions or to put it another way mostly empty.
1946            (Note that leaf nodes will not contain any transitions.)
1947
1948            This algorithm compresses the tables by eliminating most such
1949            transitions, at the cost of a modest bit of extra work during lookup:
1950
1951            - Each states[] entry contains a .base field which indicates the
1952            index in the state[] array wheres its transition data is stored.
1953
1954            - If .base is 0 there are no valid transitions from that node.
1955
1956            - If .base is nonzero then charid is added to it to find an entry in
1957            the trans array.
1958
1959            -If trans[states[state].base+charid].check!=state then the
1960            transition is taken to be a 0/Fail transition. Thus if there are fail
1961            transitions at the front of the node then the .base offset will point
1962            somewhere inside the previous nodes data (or maybe even into a node
1963            even earlier), but the .check field determines if the transition is
1964            valid.
1965
1966            XXX - wrong maybe?
1967            The following process inplace converts the table to the compressed
1968            table: We first do not compress the root node 1,and mark all its
1969            .check pointers as 1 and set its .base pointer as 1 as well. This
1970            allows us to do a DFA construction from the compressed table later,
1971            and ensures that any .base pointers we calculate later are greater
1972            than 0.
1973
1974            - We set 'pos' to indicate the first entry of the second node.
1975
1976            - We then iterate over the columns of the node, finding the first and
1977            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1978            and set the .check pointers accordingly, and advance pos
1979            appropriately and repreat for the next node. Note that when we copy
1980            the next pointers we have to convert them from the original
1981            NODEIDX form to NODENUM form as the former is not valid post
1982            compression.
1983
1984            - If a node has no transitions used we mark its base as 0 and do not
1985            advance the pos pointer.
1986
1987            - If a node only has one transition we use a second pointer into the
1988            structure to fill in allocated fail transitions from other states.
1989            This pointer is independent of the main pointer and scans forward
1990            looking for null transitions that are allocated to a state. When it
1991            finds one it writes the single transition into the "hole".  If the
1992            pointer doesnt find one the single transition is appended as normal.
1993
1994            - Once compressed we can Renew/realloc the structures to release the
1995            excess space.
1996
1997            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1998            specifically Fig 3.47 and the associated pseudocode.
1999
2000            demq
2001         */
2002         const U32 laststate = TRIE_NODENUM( next_alloc );
2003         U32 state, charid;
2004         U32 pos = 0, zp=0;
2005         trie->statecount = laststate;
2006
2007         for ( state = 1 ; state < laststate ; state++ ) {
2008             U8 flag = 0;
2009             const U32 stateidx = TRIE_NODEIDX( state );
2010             const U32 o_used = trie->trans[ stateidx ].check;
2011             U32 used = trie->trans[ stateidx ].check;
2012             trie->trans[ stateidx ].check = 0;
2013
2014             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2015                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2016                     if ( trie->trans[ stateidx + charid ].next ) {
2017                         if (o_used == 1) {
2018                             for ( ; zp < pos ; zp++ ) {
2019                                 if ( ! trie->trans[ zp ].next ) {
2020                                     break;
2021                                 }
2022                             }
2023                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2024                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2025                             trie->trans[ zp ].check = state;
2026                             if ( ++zp > pos ) pos = zp;
2027                             break;
2028                         }
2029                         used--;
2030                     }
2031                     if ( !flag ) {
2032                         flag = 1;
2033                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2034                     }
2035                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2036                     trie->trans[ pos ].check = state;
2037                     pos++;
2038                 }
2039             }
2040         }
2041         trie->lasttrans = pos + 1;
2042         trie->states = (reg_trie_state *)
2043             PerlMemShared_realloc( trie->states, laststate
2044                                    * sizeof(reg_trie_state) );
2045         DEBUG_TRIE_COMPILE_MORE_r(
2046                 PerlIO_printf( Perl_debug_log,
2047                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2048                     (int)depth * 2 + 2,"",
2049                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2050                     (IV)next_alloc,
2051                     (IV)pos,
2052                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2053             );
2054
2055         } /* end table compress */
2056     }
2057     DEBUG_TRIE_COMPILE_MORE_r(
2058             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2059                 (int)depth * 2 + 2, "",
2060                 (UV)trie->statecount,
2061                 (UV)trie->lasttrans)
2062     );
2063     /* resize the trans array to remove unused space */
2064     trie->trans = (reg_trie_trans *)
2065         PerlMemShared_realloc( trie->trans, trie->lasttrans
2066                                * sizeof(reg_trie_trans) );
2067
2068     {   /* Modify the program and insert the new TRIE node */
2069         U8 nodetype =(U8)(flags & 0xFF);
2070         char *str=NULL;
2071
2072 #ifdef DEBUGGING
2073         regnode *optimize = NULL;
2074 #ifdef RE_TRACK_PATTERN_OFFSETS
2075
2076         U32 mjd_offset = 0;
2077         U32 mjd_nodelen = 0;
2078 #endif /* RE_TRACK_PATTERN_OFFSETS */
2079 #endif /* DEBUGGING */
2080         /*
2081            This means we convert either the first branch or the first Exact,
2082            depending on whether the thing following (in 'last') is a branch
2083            or not and whther first is the startbranch (ie is it a sub part of
2084            the alternation or is it the whole thing.)
2085            Assuming its a sub part we convert the EXACT otherwise we convert
2086            the whole branch sequence, including the first.
2087          */
2088         /* Find the node we are going to overwrite */
2089         if ( first != startbranch || OP( last ) == BRANCH ) {
2090             /* branch sub-chain */
2091             NEXT_OFF( first ) = (U16)(last - first);
2092 #ifdef RE_TRACK_PATTERN_OFFSETS
2093             DEBUG_r({
2094                 mjd_offset= Node_Offset((convert));
2095                 mjd_nodelen= Node_Length((convert));
2096             });
2097 #endif
2098             /* whole branch chain */
2099         }
2100 #ifdef RE_TRACK_PATTERN_OFFSETS
2101         else {
2102             DEBUG_r({
2103                 const  regnode *nop = NEXTOPER( convert );
2104                 mjd_offset= Node_Offset((nop));
2105                 mjd_nodelen= Node_Length((nop));
2106             });
2107         }
2108         DEBUG_OPTIMISE_r(
2109             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2110                 (int)depth * 2 + 2, "",
2111                 (UV)mjd_offset, (UV)mjd_nodelen)
2112         );
2113 #endif
2114         /* But first we check to see if there is a common prefix we can
2115            split out as an EXACT and put in front of the TRIE node.  */
2116         trie->startstate= 1;
2117         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2118             U32 state;
2119             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2120                 U32 ofs = 0;
2121                 I32 idx = -1;
2122                 U32 count = 0;
2123                 const U32 base = trie->states[ state ].trans.base;
2124
2125                 if ( trie->states[state].wordnum )
2126                         count = 1;
2127
2128                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2129                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2130                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2131                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2132                     {
2133                         if ( ++count > 1 ) {
2134                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2135                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2136                             if ( state == 1 ) break;
2137                             if ( count == 2 ) {
2138                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2139                                 DEBUG_OPTIMISE_r(
2140                                     PerlIO_printf(Perl_debug_log,
2141                                         "%*sNew Start State=%"UVuf" Class: [",
2142                                         (int)depth * 2 + 2, "",
2143                                         (UV)state));
2144                                 if (idx >= 0) {
2145                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2146                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2147
2148                                     TRIE_BITMAP_SET(trie,*ch);
2149                                     if ( folder )
2150                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2151                                     DEBUG_OPTIMISE_r(
2152                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2153                                     );
2154                                 }
2155                             }
2156                             TRIE_BITMAP_SET(trie,*ch);
2157                             if ( folder )
2158                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2159                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2160                         }
2161                         idx = ofs;
2162                     }
2163                 }
2164                 if ( count == 1 ) {
2165                     SV **tmp = av_fetch( revcharmap, idx, 0);
2166                     STRLEN len;
2167                     char *ch = SvPV( *tmp, len );
2168                     DEBUG_OPTIMISE_r({
2169                         SV *sv=sv_newmortal();
2170                         PerlIO_printf( Perl_debug_log,
2171                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2172                             (int)depth * 2 + 2, "",
2173                             (UV)state, (UV)idx,
2174                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2175                                 PL_colors[0], PL_colors[1],
2176                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2177                                 PERL_PV_ESCAPE_FIRSTCHAR
2178                             )
2179                         );
2180                     });
2181                     if ( state==1 ) {
2182                         OP( convert ) = nodetype;
2183                         str=STRING(convert);
2184                         STR_LEN(convert)=0;
2185                     }
2186                     STR_LEN(convert) += len;
2187                     while (len--)
2188                         *str++ = *ch++;
2189                 } else {
2190 #ifdef DEBUGGING
2191                     if (state>1)
2192                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2193 #endif
2194                     break;
2195                 }
2196             }
2197             trie->prefixlen = (state-1);
2198             if (str) {
2199                 regnode *n = convert+NODE_SZ_STR(convert);
2200                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2201                 trie->startstate = state;
2202                 trie->minlen -= (state - 1);
2203                 trie->maxlen -= (state - 1);
2204 #ifdef DEBUGGING
2205                /* At least the UNICOS C compiler choked on this
2206                 * being argument to DEBUG_r(), so let's just have
2207                 * it right here. */
2208                if (
2209 #ifdef PERL_EXT_RE_BUILD
2210                    1
2211 #else
2212                    DEBUG_r_TEST
2213 #endif
2214                    ) {
2215                    regnode *fix = convert;
2216                    U32 word = trie->wordcount;
2217                    mjd_nodelen++;
2218                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2219                    while( ++fix < n ) {
2220                        Set_Node_Offset_Length(fix, 0, 0);
2221                    }
2222                    while (word--) {
2223                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2224                        if (tmp) {
2225                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2226                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2227                            else
2228                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2229                        }
2230                    }
2231                }
2232 #endif
2233                 if (trie->maxlen) {
2234                     convert = n;
2235                 } else {
2236                     NEXT_OFF(convert) = (U16)(tail - convert);
2237                     DEBUG_r(optimize= n);
2238                 }
2239             }
2240         }
2241         if (!jumper)
2242             jumper = last;
2243         if ( trie->maxlen ) {
2244             NEXT_OFF( convert ) = (U16)(tail - convert);
2245             ARG_SET( convert, data_slot );
2246             /* Store the offset to the first unabsorbed branch in
2247                jump[0], which is otherwise unused by the jump logic.
2248                We use this when dumping a trie and during optimisation. */
2249             if (trie->jump)
2250                 trie->jump[0] = (U16)(nextbranch - convert);
2251
2252             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2253              *   and there is a bitmap
2254              *   and the first "jump target" node we found leaves enough room
2255              * then convert the TRIE node into a TRIEC node, with the bitmap
2256              * embedded inline in the opcode - this is hypothetically faster.
2257              */
2258             if ( !trie->states[trie->startstate].wordnum
2259                  && trie->bitmap
2260                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2261             {
2262                 OP( convert ) = TRIEC;
2263                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2264                 PerlMemShared_free(trie->bitmap);
2265                 trie->bitmap= NULL;
2266             } else
2267                 OP( convert ) = TRIE;
2268
2269             /* store the type in the flags */
2270             convert->flags = nodetype;
2271             DEBUG_r({
2272             optimize = convert
2273                       + NODE_STEP_REGNODE
2274                       + regarglen[ OP( convert ) ];
2275             });
2276             /* XXX We really should free up the resource in trie now,
2277                    as we won't use them - (which resources?) dmq */
2278         }
2279         /* needed for dumping*/
2280         DEBUG_r(if (optimize) {
2281             regnode *opt = convert;
2282
2283             while ( ++opt < optimize) {
2284                 Set_Node_Offset_Length(opt,0,0);
2285             }
2286             /*
2287                 Try to clean up some of the debris left after the
2288                 optimisation.
2289              */
2290             while( optimize < jumper ) {
2291                 mjd_nodelen += Node_Length((optimize));
2292                 OP( optimize ) = OPTIMIZED;
2293                 Set_Node_Offset_Length(optimize,0,0);
2294                 optimize++;
2295             }
2296             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2297         });
2298     } /* end node insert */
2299     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2300
2301     /*  Finish populating the prev field of the wordinfo array.  Walk back
2302      *  from each accept state until we find another accept state, and if
2303      *  so, point the first word's .prev field at the second word. If the
2304      *  second already has a .prev field set, stop now. This will be the
2305      *  case either if we've already processed that word's accept state,
2306      *  or that state had multiple words, and the overspill words were
2307      *  already linked up earlier.
2308      */
2309     {
2310         U16 word;
2311         U32 state;
2312         U16 prev;
2313
2314         for (word=1; word <= trie->wordcount; word++) {
2315             prev = 0;
2316             if (trie->wordinfo[word].prev)
2317                 continue;
2318             state = trie->wordinfo[word].accept;
2319             while (state) {
2320                 state = prev_states[state];
2321                 if (!state)
2322                     break;
2323                 prev = trie->states[state].wordnum;
2324                 if (prev)
2325                     break;
2326             }
2327             trie->wordinfo[word].prev = prev;
2328         }
2329         Safefree(prev_states);
2330     }
2331
2332
2333     /* and now dump out the compressed format */
2334     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2335
2336     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2337 #ifdef DEBUGGING
2338     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2339     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2340 #else
2341     SvREFCNT_dec(revcharmap);
2342 #endif
2343     return trie->jump
2344            ? MADE_JUMP_TRIE
2345            : trie->startstate>1
2346              ? MADE_EXACT_TRIE
2347              : MADE_TRIE;
2348 }
2349
2350 STATIC void
2351 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2352 {
2353 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2354
2355    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2356    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2357    ISBN 0-201-10088-6
2358
2359    We find the fail state for each state in the trie, this state is the longest proper
2360    suffix of the current state's 'word' that is also a proper prefix of another word in our
2361    trie. State 1 represents the word '' and is thus the default fail state. This allows
2362    the DFA not to have to restart after its tried and failed a word at a given point, it
2363    simply continues as though it had been matching the other word in the first place.
2364    Consider
2365       'abcdgu'=~/abcdefg|cdgu/
2366    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2367    fail, which would bring us to the state representing 'd' in the second word where we would
2368    try 'g' and succeed, proceeding to match 'cdgu'.
2369  */
2370  /* add a fail transition */
2371     const U32 trie_offset = ARG(source);
2372     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2373     U32 *q;
2374     const U32 ucharcount = trie->uniquecharcount;
2375     const U32 numstates = trie->statecount;
2376     const U32 ubound = trie->lasttrans + ucharcount;
2377     U32 q_read = 0;
2378     U32 q_write = 0;
2379     U32 charid;
2380     U32 base = trie->states[ 1 ].trans.base;
2381     U32 *fail;
2382     reg_ac_data *aho;
2383     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2384     GET_RE_DEBUG_FLAGS_DECL;
2385
2386     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2387 #ifndef DEBUGGING
2388     PERL_UNUSED_ARG(depth);
2389 #endif
2390
2391
2392     ARG_SET( stclass, data_slot );
2393     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2394     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2395     aho->trie=trie_offset;
2396     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2397     Copy( trie->states, aho->states, numstates, reg_trie_state );
2398     Newxz( q, numstates, U32);
2399     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2400     aho->refcount = 1;
2401     fail = aho->fail;
2402     /* initialize fail[0..1] to be 1 so that we always have
2403        a valid final fail state */
2404     fail[ 0 ] = fail[ 1 ] = 1;
2405
2406     for ( charid = 0; charid < ucharcount ; charid++ ) {
2407         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2408         if ( newstate ) {
2409             q[ q_write ] = newstate;
2410             /* set to point at the root */
2411             fail[ q[ q_write++ ] ]=1;
2412         }
2413     }
2414     while ( q_read < q_write) {
2415         const U32 cur = q[ q_read++ % numstates ];
2416         base = trie->states[ cur ].trans.base;
2417
2418         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2419             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2420             if (ch_state) {
2421                 U32 fail_state = cur;
2422                 U32 fail_base;
2423                 do {
2424                     fail_state = fail[ fail_state ];
2425                     fail_base = aho->states[ fail_state ].trans.base;
2426                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2427
2428                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2429                 fail[ ch_state ] = fail_state;
2430                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2431                 {
2432                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2433                 }
2434                 q[ q_write++ % numstates] = ch_state;
2435             }
2436         }
2437     }
2438     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2439        when we fail in state 1, this allows us to use the
2440        charclass scan to find a valid start char. This is based on the principle
2441        that theres a good chance the string being searched contains lots of stuff
2442        that cant be a start char.
2443      */
2444     fail[ 0 ] = fail[ 1 ] = 0;
2445     DEBUG_TRIE_COMPILE_r({
2446         PerlIO_printf(Perl_debug_log,
2447                       "%*sStclass Failtable (%"UVuf" states): 0",
2448                       (int)(depth * 2), "", (UV)numstates
2449         );
2450         for( q_read=1; q_read<numstates; q_read++ ) {
2451             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2452         }
2453         PerlIO_printf(Perl_debug_log, "\n");
2454     });
2455     Safefree(q);
2456     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2457 }
2458
2459
2460 /*
2461  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2462  * These need to be revisited when a newer toolchain becomes available.
2463  */
2464 #if defined(__sparc64__) && defined(__GNUC__)
2465 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2466 #       undef  SPARC64_GCC_WORKAROUND
2467 #       define SPARC64_GCC_WORKAROUND 1
2468 #   endif
2469 #endif
2470
2471 #define DEBUG_PEEP(str,scan,depth) \
2472     DEBUG_OPTIMISE_r({if (scan){ \
2473        SV * const mysv=sv_newmortal(); \
2474        regnode *Next = regnext(scan); \
2475        regprop(RExC_rx, mysv, scan); \
2476        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2477        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2478        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2479    }});
2480
2481
2482
2483
2484
2485 #define JOIN_EXACT(scan,min,flags) \
2486     if (PL_regkind[OP(scan)] == EXACT) \
2487         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2488
2489 STATIC U32
2490 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2491     /* Merge several consecutive EXACTish nodes into one. */
2492     regnode *n = regnext(scan);
2493     U32 stringok = 1;
2494     regnode *next = scan + NODE_SZ_STR(scan);
2495     U32 merged = 0;
2496     U32 stopnow = 0;
2497 #ifdef DEBUGGING
2498     regnode *stop = scan;
2499     GET_RE_DEBUG_FLAGS_DECL;
2500 #else
2501     PERL_UNUSED_ARG(depth);
2502 #endif
2503
2504     PERL_ARGS_ASSERT_JOIN_EXACT;
2505 #ifndef EXPERIMENTAL_INPLACESCAN
2506     PERL_UNUSED_ARG(flags);
2507     PERL_UNUSED_ARG(val);
2508 #endif
2509     DEBUG_PEEP("join",scan,depth);
2510
2511     /* Skip NOTHING, merge EXACT*. */
2512     while (n &&
2513            ( PL_regkind[OP(n)] == NOTHING ||
2514              (stringok && (OP(n) == OP(scan))))
2515            && NEXT_OFF(n)
2516            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2517
2518         if (OP(n) == TAIL || n > next)
2519             stringok = 0;
2520         if (PL_regkind[OP(n)] == NOTHING) {
2521             DEBUG_PEEP("skip:",n,depth);
2522             NEXT_OFF(scan) += NEXT_OFF(n);
2523             next = n + NODE_STEP_REGNODE;
2524 #ifdef DEBUGGING
2525             if (stringok)
2526                 stop = n;
2527 #endif
2528             n = regnext(n);
2529         }
2530         else if (stringok) {
2531             const unsigned int oldl = STR_LEN(scan);
2532             regnode * const nnext = regnext(n);
2533
2534             DEBUG_PEEP("merg",n,depth);
2535
2536             merged++;
2537             if (oldl + STR_LEN(n) > U8_MAX)
2538                 break;
2539             NEXT_OFF(scan) += NEXT_OFF(n);
2540             STR_LEN(scan) += STR_LEN(n);
2541             next = n + NODE_SZ_STR(n);
2542             /* Now we can overwrite *n : */
2543             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2544 #ifdef DEBUGGING
2545             stop = next - 1;
2546 #endif
2547             n = nnext;
2548             if (stopnow) break;
2549         }
2550
2551 #ifdef EXPERIMENTAL_INPLACESCAN
2552         if (flags && !NEXT_OFF(n)) {
2553             DEBUG_PEEP("atch", val, depth);
2554             if (reg_off_by_arg[OP(n)]) {
2555                 ARG_SET(n, val - n);
2556             }
2557             else {
2558                 NEXT_OFF(n) = val - n;
2559             }
2560             stopnow = 1;
2561         }
2562 #endif
2563     }
2564 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2565 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2566 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2567 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2568
2569     if (UTF
2570         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2571         && ( STR_LEN(scan) >= 6 ) )
2572     {
2573     /*
2574     Two problematic code points in Unicode casefolding of EXACT nodes:
2575
2576     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2577     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2578
2579     which casefold to
2580
2581     Unicode                      UTF-8
2582
2583     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2584     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2585
2586     This means that in case-insensitive matching (or "loose matching",
2587     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2588     length of the above casefolded versions) can match a target string
2589     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2590     This would rather mess up the minimum length computation.
2591
2592     What we'll do is to look for the tail four bytes, and then peek
2593     at the preceding two bytes to see whether we need to decrease
2594     the minimum length by four (six minus two).
2595
2596     Thanks to the design of UTF-8, there cannot be false matches:
2597     A sequence of valid UTF-8 bytes cannot be a subsequence of
2598     another valid sequence of UTF-8 bytes.
2599
2600     */
2601          char * const s0 = STRING(scan), *s, *t;
2602          char * const s1 = s0 + STR_LEN(scan) - 1;
2603          char * const s2 = s1 - 4;
2604 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2605          const char t0[] = "\xaf\x49\xaf\x42";
2606 #else
2607          const char t0[] = "\xcc\x88\xcc\x81";
2608 #endif
2609          const char * const t1 = t0 + 3;
2610
2611          for (s = s0 + 2;
2612               s < s2 && (t = ninstr(s, s1, t0, t1));
2613               s = t + 4) {
2614 #ifdef EBCDIC
2615               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2616                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2617 #else
2618               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2619                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2620 #endif
2621                    *min -= 4;
2622          }
2623     }
2624
2625 #ifdef DEBUGGING
2626     /* Allow dumping */
2627     n = scan + NODE_SZ_STR(scan);
2628     while (n <= stop) {
2629         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2630             OP(n) = OPTIMIZED;
2631             NEXT_OFF(n) = 0;
2632         }
2633         n++;
2634     }
2635 #endif
2636     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2637     return stopnow;
2638 }
2639
2640 /* REx optimizer.  Converts nodes into quicker variants "in place".
2641    Finds fixed substrings.  */
2642
2643 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2644    to the position after last scanned or to NULL. */
2645
2646 #define INIT_AND_WITHP \
2647     assert(!and_withp); \
2648     Newx(and_withp,1,struct regnode_charclass_class); \
2649     SAVEFREEPV(and_withp)
2650
2651 /* this is a chain of data about sub patterns we are processing that
2652    need to be handled separately/specially in study_chunk. Its so
2653    we can simulate recursion without losing state.  */
2654 struct scan_frame;
2655 typedef struct scan_frame {
2656     regnode *last;  /* last node to process in this frame */
2657     regnode *next;  /* next node to process when last is reached */
2658     struct scan_frame *prev; /*previous frame*/
2659     I32 stop; /* what stopparen do we use */
2660 } scan_frame;
2661
2662
2663 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2664
2665 #define CASE_SYNST_FNC(nAmE)                                       \
2666 case nAmE:                                                         \
2667     if (flags & SCF_DO_STCLASS_AND) {                              \
2668             for (value = 0; value < 256; value++)                  \
2669                 if (!is_ ## nAmE ## _cp(value))                       \
2670                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2671     }                                                              \
2672     else {                                                         \
2673             for (value = 0; value < 256; value++)                  \
2674                 if (is_ ## nAmE ## _cp(value))                        \
2675                     ANYOF_BITMAP_SET(data->start_class, value);    \
2676     }                                                              \
2677     break;                                                         \
2678 case N ## nAmE:                                                    \
2679     if (flags & SCF_DO_STCLASS_AND) {                              \
2680             for (value = 0; value < 256; value++)                   \
2681                 if (is_ ## nAmE ## _cp(value))                         \
2682                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2683     }                                                               \
2684     else {                                                          \
2685             for (value = 0; value < 256; value++)                   \
2686                 if (!is_ ## nAmE ## _cp(value))                        \
2687                     ANYOF_BITMAP_SET(data->start_class, value);     \
2688     }                                                               \
2689     break
2690
2691
2692
2693 STATIC I32
2694 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2695                         I32 *minlenp, I32 *deltap,
2696                         regnode *last,
2697                         scan_data_t *data,
2698                         I32 stopparen,
2699                         U8* recursed,
2700                         struct regnode_charclass_class *and_withp,
2701                         U32 flags, U32 depth)
2702                         /* scanp: Start here (read-write). */
2703                         /* deltap: Write maxlen-minlen here. */
2704                         /* last: Stop before this one. */
2705                         /* data: string data about the pattern */
2706                         /* stopparen: treat close N as END */
2707                         /* recursed: which subroutines have we recursed into */
2708                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2709 {
2710     dVAR;
2711     I32 min = 0, pars = 0, code;
2712     regnode *scan = *scanp, *next;
2713     I32 delta = 0;
2714     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2715     int is_inf_internal = 0;            /* The studied chunk is infinite */
2716     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2717     scan_data_t data_fake;
2718     SV *re_trie_maxbuff = NULL;
2719     regnode *first_non_open = scan;
2720     I32 stopmin = I32_MAX;
2721     scan_frame *frame = NULL;
2722     GET_RE_DEBUG_FLAGS_DECL;
2723
2724     PERL_ARGS_ASSERT_STUDY_CHUNK;
2725
2726 #ifdef DEBUGGING
2727     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2728 #endif
2729
2730     if ( depth == 0 ) {
2731         while (first_non_open && OP(first_non_open) == OPEN)
2732             first_non_open=regnext(first_non_open);
2733     }
2734
2735
2736   fake_study_recurse:
2737     while ( scan && OP(scan) != END && scan < last ){
2738         /* Peephole optimizer: */
2739         DEBUG_STUDYDATA("Peep:", data,depth);
2740         DEBUG_PEEP("Peep",scan,depth);
2741         JOIN_EXACT(scan,&min,0);
2742
2743         /* Follow the next-chain of the current node and optimize
2744            away all the NOTHINGs from it.  */
2745         if (OP(scan) != CURLYX) {
2746             const int max = (reg_off_by_arg[OP(scan)]
2747                        ? I32_MAX
2748                        /* I32 may be smaller than U16 on CRAYs! */
2749                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2750             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2751             int noff;
2752             regnode *n = scan;
2753
2754             /* Skip NOTHING and LONGJMP. */
2755             while ((n = regnext(n))
2756                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2757                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2758                    && off + noff < max)
2759                 off += noff;
2760             if (reg_off_by_arg[OP(scan)])
2761                 ARG(scan) = off;
2762             else
2763                 NEXT_OFF(scan) = off;
2764         }
2765
2766
2767
2768         /* The principal pseudo-switch.  Cannot be a switch, since we
2769            look into several different things.  */
2770         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2771                    || OP(scan) == IFTHEN) {
2772             next = regnext(scan);
2773             code = OP(scan);
2774             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2775
2776             if (OP(next) == code || code == IFTHEN) {
2777                 /* NOTE - There is similar code to this block below for handling
2778                    TRIE nodes on a re-study.  If you change stuff here check there
2779                    too. */
2780                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2781                 struct regnode_charclass_class accum;
2782                 regnode * const startbranch=scan;
2783
2784                 if (flags & SCF_DO_SUBSTR)
2785                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2786                 if (flags & SCF_DO_STCLASS)
2787                     cl_init_zero(pRExC_state, &accum);
2788
2789                 while (OP(scan) == code) {
2790                     I32 deltanext, minnext, f = 0, fake;
2791                     struct regnode_charclass_class this_class;
2792
2793                     num++;
2794                     data_fake.flags = 0;
2795                     if (data) {
2796                         data_fake.whilem_c = data->whilem_c;
2797                         data_fake.last_closep = data->last_closep;
2798                     }
2799                     else
2800                         data_fake.last_closep = &fake;
2801
2802                     data_fake.pos_delta = delta;
2803                     next = regnext(scan);
2804                     scan = NEXTOPER(scan);
2805                     if (code != BRANCH)
2806                         scan = NEXTOPER(scan);
2807                     if (flags & SCF_DO_STCLASS) {
2808                         cl_init(pRExC_state, &this_class);
2809                         data_fake.start_class = &this_class;
2810                         f = SCF_DO_STCLASS_AND;
2811                     }
2812                     if (flags & SCF_WHILEM_VISITED_POS)
2813                         f |= SCF_WHILEM_VISITED_POS;
2814
2815                     /* we suppose the run is continuous, last=next...*/
2816                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2817                                           next, &data_fake,
2818                                           stopparen, recursed, NULL, f,depth+1);
2819                     if (min1 > minnext)
2820                         min1 = minnext;
2821                     if (max1 < minnext + deltanext)
2822                         max1 = minnext + deltanext;
2823                     if (deltanext == I32_MAX)
2824                         is_inf = is_inf_internal = 1;
2825                     scan = next;
2826                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2827                         pars++;
2828                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2829                         if ( stopmin > minnext)
2830                             stopmin = min + min1;
2831                         flags &= ~SCF_DO_SUBSTR;
2832                         if (data)
2833                             data->flags |= SCF_SEEN_ACCEPT;
2834                     }
2835                     if (data) {
2836                         if (data_fake.flags & SF_HAS_EVAL)
2837                             data->flags |= SF_HAS_EVAL;
2838                         data->whilem_c = data_fake.whilem_c;
2839                     }
2840                     if (flags & SCF_DO_STCLASS)
2841                         cl_or(pRExC_state, &accum, &this_class);
2842                 }
2843                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2844                     min1 = 0;
2845                 if (flags & SCF_DO_SUBSTR) {
2846                     data->pos_min += min1;
2847                     data->pos_delta += max1 - min1;
2848                     if (max1 != min1 || is_inf)
2849                         data->longest = &(data->longest_float);
2850                 }
2851                 min += min1;
2852                 delta += max1 - min1;
2853                 if (flags & SCF_DO_STCLASS_OR) {
2854                     cl_or(pRExC_state, data->start_class, &accum);
2855                     if (min1) {
2856                         cl_and(data->start_class, and_withp);
2857                         flags &= ~SCF_DO_STCLASS;
2858                     }
2859                 }
2860                 else if (flags & SCF_DO_STCLASS_AND) {
2861                     if (min1) {
2862                         cl_and(data->start_class, &accum);
2863                         flags &= ~SCF_DO_STCLASS;
2864                     }
2865                     else {
2866                         /* Switch to OR mode: cache the old value of
2867                          * data->start_class */
2868                         INIT_AND_WITHP;
2869                         StructCopy(data->start_class, and_withp,
2870                                    struct regnode_charclass_class);
2871                         flags &= ~SCF_DO_STCLASS_AND;
2872                         StructCopy(&accum, data->start_class,
2873                                    struct regnode_charclass_class);
2874                         flags |= SCF_DO_STCLASS_OR;
2875                         data->start_class->flags |= ANYOF_EOS;
2876                     }
2877                 }
2878
2879                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2880                 /* demq.
2881
2882                    Assuming this was/is a branch we are dealing with: 'scan' now
2883                    points at the item that follows the branch sequence, whatever
2884                    it is. We now start at the beginning of the sequence and look
2885                    for subsequences of
2886
2887                    BRANCH->EXACT=>x1
2888                    BRANCH->EXACT=>x2
2889                    tail
2890
2891                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2892
2893                    If we can find such a subsequence we need to turn the first
2894                    element into a trie and then add the subsequent branch exact
2895                    strings to the trie.
2896
2897                    We have two cases
2898
2899                      1. patterns where the whole set of branches can be converted.
2900
2901                      2. patterns where only a subset can be converted.
2902
2903                    In case 1 we can replace the whole set with a single regop
2904                    for the trie. In case 2 we need to keep the start and end
2905                    branches so
2906
2907                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2908                      becomes BRANCH TRIE; BRANCH X;
2909
2910                   There is an additional case, that being where there is a
2911                   common prefix, which gets split out into an EXACT like node
2912                   preceding the TRIE node.
2913
2914                   If x(1..n)==tail then we can do a simple trie, if not we make
2915                   a "jump" trie, such that when we match the appropriate word
2916                   we "jump" to the appropriate tail node. Essentially we turn
2917                   a nested if into a case structure of sorts.
2918
2919                 */
2920
2921                     int made=0;
2922                     if (!re_trie_maxbuff) {
2923                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2924                         if (!SvIOK(re_trie_maxbuff))
2925                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2926                     }
2927                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2928                         regnode *cur;
2929                         regnode *first = (regnode *)NULL;
2930                         regnode *last = (regnode *)NULL;
2931                         regnode *tail = scan;
2932                         U8 optype = 0;
2933                         U32 count=0;
2934
2935 #ifdef DEBUGGING
2936                         SV * const mysv = sv_newmortal();       /* for dumping */
2937 #endif
2938                         /* var tail is used because there may be a TAIL
2939                            regop in the way. Ie, the exacts will point to the
2940                            thing following the TAIL, but the last branch will
2941                            point at the TAIL. So we advance tail. If we
2942                            have nested (?:) we may have to move through several
2943                            tails.
2944                          */
2945
2946                         while ( OP( tail ) == TAIL ) {
2947                             /* this is the TAIL generated by (?:) */
2948                             tail = regnext( tail );
2949                         }
2950
2951
2952                         DEBUG_OPTIMISE_r({
2953                             regprop(RExC_rx, mysv, tail );
2954                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2955                                 (int)depth * 2 + 2, "",
2956                                 "Looking for TRIE'able sequences. Tail node is: ",
2957                                 SvPV_nolen_const( mysv )
2958                             );
2959                         });
2960
2961                         /*
2962
2963                            step through the branches, cur represents each
2964                            branch, noper is the first thing to be matched
2965                            as part of that branch and noper_next is the
2966                            regnext() of that node. if noper is an EXACT
2967                            and noper_next is the same as scan (our current
2968                            position in the regex) then the EXACT branch is
2969                            a possible optimization target. Once we have
2970                            two or more consecutive such branches we can
2971                            create a trie of the EXACT's contents and stich
2972                            it in place. If the sequence represents all of
2973                            the branches we eliminate the whole thing and
2974                            replace it with a single TRIE. If it is a
2975                            subsequence then we need to stitch it in. This
2976                            means the first branch has to remain, and needs
2977                            to be repointed at the item on the branch chain
2978                            following the last branch optimized. This could
2979                            be either a BRANCH, in which case the
2980                            subsequence is internal, or it could be the
2981                            item following the branch sequence in which
2982                            case the subsequence is at the end.
2983
2984                         */
2985
2986                         /* dont use tail as the end marker for this traverse */
2987                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2988                             regnode * const noper = NEXTOPER( cur );
2989 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2990                             regnode * const noper_next = regnext( noper );
2991 #endif
2992
2993                             DEBUG_OPTIMISE_r({
2994                                 regprop(RExC_rx, mysv, cur);
2995                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2996                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2997
2998                                 regprop(RExC_rx, mysv, noper);
2999                                 PerlIO_printf( Perl_debug_log, " -> %s",
3000                                     SvPV_nolen_const(mysv));
3001
3002                                 if ( noper_next ) {
3003                                   regprop(RExC_rx, mysv, noper_next );
3004                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3005                                     SvPV_nolen_const(mysv));
3006                                 }
3007                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3008                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3009                             });
3010                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3011                                          : PL_regkind[ OP( noper ) ] == EXACT )
3012                                   || OP(noper) == NOTHING )
3013 #ifdef NOJUMPTRIE
3014                                   && noper_next == tail
3015 #endif
3016                                   && count < U16_MAX)
3017                             {
3018                                 count++;
3019                                 if ( !first || optype == NOTHING ) {
3020                                     if (!first) first = cur;
3021                                     optype = OP( noper );
3022                                 } else {
3023                                     last = cur;
3024                                 }
3025                             } else {
3026 /*
3027     Currently we do not believe that the trie logic can
3028     handle case insensitive matching properly when the
3029     pattern is not unicode (thus forcing unicode semantics).
3030
3031     If/when this is fixed the following define can be swapped
3032     in below to fully enable trie logic.
3033
3034     XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3035     not /aa
3036
3037 #define TRIE_TYPE_IS_SAFE 1
3038
3039 */
3040 #define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
3041
3042                                 if ( last && TRIE_TYPE_IS_SAFE ) {
3043                                     make_trie( pRExC_state,
3044                                             startbranch, first, cur, tail, count,
3045                                             optype, depth+1 );
3046                                 }
3047                                 if ( PL_regkind[ OP( noper ) ] == EXACT
3048 #ifdef NOJUMPTRIE
3049                                      && noper_next == tail
3050 #endif
3051                                 ){
3052                                     count = 1;
3053                                     first = cur;
3054                                     optype = OP( noper );
3055                                 } else {
3056                                     count = 0;
3057                                     first = NULL;
3058                                     optype = 0;
3059                                 }
3060                                 last = NULL;
3061                             }
3062                         }
3063                         DEBUG_OPTIMISE_r({
3064                             regprop(RExC_rx, mysv, cur);
3065                             PerlIO_printf( Perl_debug_log,
3066                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3067                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3068
3069                         });
3070
3071                         if ( last && TRIE_TYPE_IS_SAFE ) {
3072                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3073 #ifdef TRIE_STUDY_OPT
3074                             if ( ((made == MADE_EXACT_TRIE &&
3075                                  startbranch == first)
3076                                  || ( first_non_open == first )) &&
3077                                  depth==0 ) {
3078                                 flags |= SCF_TRIE_RESTUDY;
3079                                 if ( startbranch == first
3080                                      && scan == tail )
3081                                 {
3082                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3083                                 }
3084                             }
3085 #endif
3086                         }
3087                     }
3088
3089                 } /* do trie */
3090
3091             }
3092             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3093                 scan = NEXTOPER(NEXTOPER(scan));
3094             } else                      /* single branch is optimized. */
3095                 scan = NEXTOPER(scan);
3096             continue;
3097         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3098             scan_frame *newframe = NULL;
3099             I32 paren;
3100             regnode *start;
3101             regnode *end;
3102
3103             if (OP(scan) != SUSPEND) {
3104             /* set the pointer */
3105                 if (OP(scan) == GOSUB) {
3106                     paren = ARG(scan);
3107                     RExC_recurse[ARG2L(scan)] = scan;
3108                     start = RExC_open_parens[paren-1];
3109                     end   = RExC_close_parens[paren-1];
3110                 } else {
3111                     paren = 0;
3112                     start = RExC_rxi->program + 1;
3113                     end   = RExC_opend;
3114                 }
3115                 if (!recursed) {
3116                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3117                     SAVEFREEPV(recursed);
3118                 }
3119                 if (!PAREN_TEST(recursed,paren+1)) {
3120                     PAREN_SET(recursed,paren+1);
3121                     Newx(newframe,1,scan_frame);
3122                 } else {
3123                     if (flags & SCF_DO_SUBSTR) {
3124                         SCAN_COMMIT(pRExC_state,data,minlenp);
3125                         data->longest = &(data->longest_float);
3126                     }
3127                     is_inf = is_inf_internal = 1;
3128                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3129                         cl_anything(pRExC_state, data->start_class);
3130                     flags &= ~SCF_DO_STCLASS;
3131                 }
3132             } else {
3133                 Newx(newframe,1,scan_frame);
3134                 paren = stopparen;
3135                 start = scan+2;
3136                 end = regnext(scan);
3137             }
3138             if (newframe) {
3139                 assert(start);
3140                 assert(end);
3141                 SAVEFREEPV(newframe);
3142                 newframe->next = regnext(scan);
3143                 newframe->last = last;
3144                 newframe->stop = stopparen;
3145                 newframe->prev = frame;
3146
3147                 frame = newframe;
3148                 scan =  start;
3149                 stopparen = paren;
3150                 last = end;
3151
3152                 continue;
3153             }
3154         }
3155         else if (OP(scan) == EXACT) {
3156             I32 l = STR_LEN(scan);
3157             UV uc;
3158             if (UTF) {
3159                 const U8 * const s = (U8*)STRING(scan);
3160                 l = utf8_length(s, s + l);
3161                 uc = utf8_to_uvchr(s, NULL);
3162             } else {
3163                 uc = *((U8*)STRING(scan));
3164             }
3165             min += l;
3166             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3167                 /* The code below prefers earlier match for fixed
3168                    offset, later match for variable offset.  */
3169                 if (data->last_end == -1) { /* Update the start info. */
3170                     data->last_start_min = data->pos_min;
3171                     data->last_start_max = is_inf
3172                         ? I32_MAX : data->pos_min + data->pos_delta;
3173                 }
3174                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3175                 if (UTF)
3176                     SvUTF8_on(data->last_found);
3177                 {
3178                     SV * const sv = data->last_found;
3179                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3180                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3181                     if (mg && mg->mg_len >= 0)
3182                         mg->mg_len += utf8_length((U8*)STRING(scan),
3183                                                   (U8*)STRING(scan)+STR_LEN(scan));
3184                 }
3185                 data->last_end = data->pos_min + l;
3186                 data->pos_min += l; /* As in the first entry. */
3187                 data->flags &= ~SF_BEFORE_EOL;
3188             }
3189             if (flags & SCF_DO_STCLASS_AND) {
3190                 /* Check whether it is compatible with what we know already! */
3191                 int compat = 1;
3192
3193
3194                 /* If compatible, we or it in below.  It is compatible if is
3195                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3196                  * it's for a locale.  Even if there isn't unicode semantics
3197                  * here, at runtime there may be because of matching against a
3198                  * utf8 string, so accept a possible false positive for
3199                  * latin1-range folds */
3200                 if (uc >= 0x100 ||
3201                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3202                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3203                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3204                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3205                     )
3206                 {
3207                     compat = 0;
3208                 }
3209                 ANYOF_CLASS_ZERO(data->start_class);
3210                 ANYOF_BITMAP_ZERO(data->start_class);
3211                 if (compat)
3212                     ANYOF_BITMAP_SET(data->start_class, uc);
3213                 else if (uc >= 0x100) {
3214                     int i;
3215
3216                     /* Some Unicode code points fold to the Latin1 range; as
3217                      * XXX temporary code, instead of figuring out if this is
3218                      * one, just assume it is and set all the start class bits
3219                      * that could be some such above 255 code point's fold
3220                      * which will generate fals positives.  As the code
3221                      * elsewhere that does compute the fold settles down, it
3222                      * can be extracted out and re-used here */
3223                     for (i = 0; i < 256; i++){
3224                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3225                             ANYOF_BITMAP_SET(data->start_class, i);
3226                         }
3227                     }
3228                 }
3229                 data->start_class->flags &= ~ANYOF_EOS;
3230                 if (uc < 0x100)
3231                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3232             }
3233             else if (flags & SCF_DO_STCLASS_OR) {
3234                 /* false positive possible if the class is case-folded */
3235                 if (uc < 0x100)
3236                     ANYOF_BITMAP_SET(data->start_class, uc);
3237                 else
3238                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3239                 data->start_class->flags &= ~ANYOF_EOS;
3240                 cl_and(data->start_class, and_withp);
3241             }
3242             flags &= ~SCF_DO_STCLASS;
3243         }
3244         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3245             I32 l = STR_LEN(scan);
3246             UV uc = *((U8*)STRING(scan));
3247
3248             /* Search for fixed substrings supports EXACT only. */
3249             if (flags & SCF_DO_SUBSTR) {
3250                 assert(data);
3251                 SCAN_COMMIT(pRExC_state, data, minlenp);
3252             }
3253             if (UTF) {
3254                 const U8 * const s = (U8 *)STRING(scan);
3255                 l = utf8_length(s, s + l);
3256                 uc = utf8_to_uvchr(s, NULL);
3257             }
3258             min += l;
3259             if (flags & SCF_DO_SUBSTR)
3260                 data->pos_min += l;
3261             if (flags & SCF_DO_STCLASS_AND) {
3262                 /* Check whether it is compatible with what we know already! */
3263                 int compat = 1;
3264                 if (uc >= 0x100 ||
3265                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3266                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3267                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3268                 {
3269                     compat = 0;
3270                 }
3271                 ANYOF_CLASS_ZERO(data->start_class);
3272                 ANYOF_BITMAP_ZERO(data->start_class);
3273                 if (compat) {
3274                     ANYOF_BITMAP_SET(data->start_class, uc);
3275                     data->start_class->flags &= ~ANYOF_EOS;
3276                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3277                     if (OP(scan) == EXACTFL) {
3278                         /* XXX This set is probably no longer necessary, and
3279                          * probably wrong as LOCALE now is on in the initial
3280                          * state */
3281                         data->start_class->flags |= ANYOF_LOCALE;
3282                     }
3283                     else {
3284
3285                         /* Also set the other member of the fold pair.  In case
3286                          * that unicode semantics is called for at runtime, use
3287                          * the full latin1 fold.  (Can't do this for locale,
3288                          * because not known until runtime */
3289                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3290                     }
3291                 }
3292                 else if (uc >= 0x100) {
3293                     int i;
3294                     for (i = 0; i < 256; i++){
3295                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3296                             ANYOF_BITMAP_SET(data->start_class, i);
3297                         }
3298                     }
3299                 }
3300             }
3301             else if (flags & SCF_DO_STCLASS_OR) {
3302                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3303                     /* false positive possible if the class is case-folded.
3304                        Assume that the locale settings are the same... */
3305                     if (uc < 0x100) {
3306                         ANYOF_BITMAP_SET(data->start_class, uc);
3307                         if (OP(scan) != EXACTFL) {
3308
3309                             /* And set the other member of the fold pair, but
3310                              * can't do that in locale because not known until
3311                              * run-time */
3312                             ANYOF_BITMAP_SET(data->start_class,
3313                                              PL_fold_latin1[uc]);
3314                         }
3315                     }
3316                     data->start_class->flags &= ~ANYOF_EOS;
3317                 }
3318                 cl_and(data->start_class, and_withp);
3319             }
3320             flags &= ~SCF_DO_STCLASS;
3321         }
3322         else if (REGNODE_VARIES(OP(scan))) {
3323             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3324             I32 f = flags, pos_before = 0;
3325             regnode * const oscan = scan;
3326             struct regnode_charclass_class this_class;
3327             struct regnode_charclass_class *oclass = NULL;
3328             I32 next_is_eval = 0;
3329
3330             switch (PL_regkind[OP(scan)]) {
3331             case WHILEM:                /* End of (?:...)* . */
3332                 scan = NEXTOPER(scan);
3333                 goto finish;
3334             case PLUS:
3335                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3336                     next = NEXTOPER(scan);
3337                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3338                         mincount = 1;
3339                         maxcount = REG_INFTY;
3340                         next = regnext(scan);
3341                         scan = NEXTOPER(scan);
3342                         goto do_curly;
3343                     }
3344                 }
3345                 if (flags & SCF_DO_SUBSTR)
3346                     data->pos_min++;
3347                 min++;
3348                 /* Fall through. */
3349             case STAR:
3350                 if (flags & SCF_DO_STCLASS) {
3351                     mincount = 0;
3352                     maxcount = REG_INFTY;
3353                     next = regnext(scan);
3354                     scan = NEXTOPER(scan);
3355                     goto do_curly;
3356                 }
3357                 is_inf = is_inf_internal = 1;
3358                 scan = regnext(scan);
3359                 if (flags & SCF_DO_SUBSTR) {
3360                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3361                     data->longest = &(data->longest_float);
3362                 }
3363                 goto optimize_curly_tail;
3364             case CURLY:
3365                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3366                     && (scan->flags == stopparen))
3367                 {
3368                     mincount = 1;
3369                     maxcount = 1;
3370                 } else {
3371                     mincount = ARG1(scan);
3372                     maxcount = ARG2(scan);
3373                 }
3374                 next = regnext(scan);
3375                 if (OP(scan) == CURLYX) {
3376                     I32 lp = (data ? *(data->last_closep) : 0);
3377                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3378                 }
3379                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3380                 next_is_eval = (OP(scan) == EVAL);
3381               do_curly:
3382                 if (flags & SCF_DO_SUBSTR) {
3383                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3384                     pos_before = data->pos_min;
3385                 }
3386                 if (data) {
3387                     fl = data->flags;
3388                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3389                     if (is_inf)
3390                         data->flags |= SF_IS_INF;
3391                 }
3392                 if (flags & SCF_DO_STCLASS) {
3393                     cl_init(pRExC_state, &this_class);
3394                     oclass = data->start_class;
3395                     data->start_class = &this_class;
3396                     f |= SCF_DO_STCLASS_AND;
3397                     f &= ~SCF_DO_STCLASS_OR;
3398                 }
3399                 /* Exclude from super-linear cache processing any {n,m}
3400                    regops for which the combination of input pos and regex
3401                    pos is not enough information to determine if a match
3402                    will be possible.
3403
3404                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3405                    regex pos at the \s*, the prospects for a match depend not
3406                    only on the input position but also on how many (bar\s*)
3407                    repeats into the {4,8} we are. */
3408                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3409                     f &= ~SCF_WHILEM_VISITED_POS;
3410
3411                 /* This will finish on WHILEM, setting scan, or on NULL: */
3412                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3413                                       last, data, stopparen, recursed, NULL,
3414                                       (mincount == 0
3415                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3416
3417                 if (flags & SCF_DO_STCLASS)
3418                     data->start_class = oclass;
3419                 if (mincount == 0 || minnext == 0) {
3420                     if (flags & SCF_DO_STCLASS_OR) {
3421                         cl_or(pRExC_state, data->start_class, &this_class);
3422                     }
3423                     else if (flags & SCF_DO_STCLASS_AND) {
3424                         /* Switch to OR mode: cache the old value of
3425                          * data->start_class */
3426                         INIT_AND_WITHP;
3427                         StructCopy(data->start_class, and_withp,
3428                                    struct regnode_charclass_class);
3429                         flags &= ~SCF_DO_STCLASS_AND;
3430                         StructCopy(&this_class, data->start_class,
3431                                    struct regnode_charclass_class);
3432                         flags |= SCF_DO_STCLASS_OR;
3433                         data->start_class->flags |= ANYOF_EOS;
3434                     }
3435                 } else {                /* Non-zero len */
3436                     if (flags & SCF_DO_STCLASS_OR) {
3437                         cl_or(pRExC_state, data->start_class, &this_class);
3438                         cl_and(data->start_class, and_withp);
3439                     }
3440                     else if (flags & SCF_DO_STCLASS_AND)
3441                         cl_and(data->start_class, &this_class);
3442                     flags &= ~SCF_DO_STCLASS;
3443                 }
3444                 if (!scan)              /* It was not CURLYX, but CURLY. */
3445                     scan = next;
3446                 if ( /* ? quantifier ok, except for (?{ ... }) */
3447                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3448                     && (minnext == 0) && (deltanext == 0)
3449                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3450                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3451                 {
3452                     ckWARNreg(RExC_parse,
3453                               "Quantifier unexpected on zero-length expression");
3454                 }
3455
3456                 min += minnext * mincount;
3457                 is_inf_internal |= ((maxcount == REG_INFTY
3458                                      && (minnext + deltanext) > 0)
3459                                     || deltanext == I32_MAX);
3460                 is_inf |= is_inf_internal;
3461                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3462
3463                 /* Try powerful optimization CURLYX => CURLYN. */
3464                 if (  OP(oscan) == CURLYX && data
3465                       && data->flags & SF_IN_PAR
3466                       && !(data->flags & SF_HAS_EVAL)
3467                       && !deltanext && minnext == 1 ) {
3468                     /* Try to optimize to CURLYN.  */
3469                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3470                     regnode * const nxt1 = nxt;
3471 #ifdef DEBUGGING
3472                     regnode *nxt2;
3473 #endif
3474
3475                     /* Skip open. */
3476                     nxt = regnext(nxt);
3477                     if (!REGNODE_SIMPLE(OP(nxt))
3478                         && !(PL_regkind[OP(nxt)] == EXACT
3479                              && STR_LEN(nxt) == 1))
3480                         goto nogo;
3481 #ifdef DEBUGGING
3482                     nxt2 = nxt;
3483 #endif
3484                     nxt = regnext(nxt);
3485                     if (OP(nxt) != CLOSE)
3486                         goto nogo;
3487                     if (RExC_open_parens) {
3488                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3489                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3490                     }
3491                     /* Now we know that nxt2 is the only contents: */
3492                     oscan->flags = (U8)ARG(nxt);
3493                     OP(oscan) = CURLYN;
3494                     OP(nxt1) = NOTHING; /* was OPEN. */
3495
3496 #ifdef DEBUGGING
3497                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3498                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3499                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3500                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3501                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3502                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3503 #endif
3504                 }
3505               nogo:
3506
3507                 /* Try optimization CURLYX => CURLYM. */
3508                 if (  OP(oscan) == CURLYX && data
3509                       && !(data->flags & SF_HAS_PAR)
3510                       && !(data->flags & SF_HAS_EVAL)
3511                       && !deltanext     /* atom is fixed width */
3512                       && minnext != 0   /* CURLYM can't handle zero width */
3513                 ) {
3514                     /* XXXX How to optimize if data == 0? */
3515                     /* Optimize to a simpler form.  */
3516                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3517                     regnode *nxt2;
3518
3519                     OP(oscan) = CURLYM;
3520                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3521                             && (OP(nxt2) != WHILEM))
3522                         nxt = nxt2;
3523                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3524                     /* Need to optimize away parenths. */
3525                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3526                         /* Set the parenth number.  */
3527                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3528
3529                         oscan->flags = (U8)ARG(nxt);
3530                         if (RExC_open_parens) {
3531                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3532                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3533                         }
3534                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3535                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3536
3537 #ifdef DEBUGGING
3538                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3539                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3540                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3541                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3542 #endif
3543 #if 0
3544                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3545                             regnode *nnxt = regnext(nxt1);
3546                             if (nnxt == nxt) {
3547                                 if (reg_off_by_arg[OP(nxt1)])
3548                                     ARG_SET(nxt1, nxt2 - nxt1);
3549                                 else if (nxt2 - nxt1 < U16_MAX)
3550                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3551                                 else
3552                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3553                             }
3554                             nxt1 = nnxt;
3555                         }
3556 #endif
3557                         /* Optimize again: */
3558                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3559                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3560                     }
3561                     else
3562                         oscan->flags = 0;
3563                 }
3564                 else if ((OP(oscan) == CURLYX)
3565                          && (flags & SCF_WHILEM_VISITED_POS)
3566                          /* See the comment on a similar expression above.
3567                             However, this time it's not a subexpression
3568                             we care about, but the expression itself. */
3569                          && (maxcount == REG_INFTY)
3570                          && data && ++data->whilem_c < 16) {
3571                     /* This stays as CURLYX, we can put the count/of pair. */
3572                     /* Find WHILEM (as in regexec.c) */
3573                     regnode *nxt = oscan + NEXT_OFF(oscan);
3574
3575                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3576                         nxt += ARG(nxt);
3577                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3578                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3579                 }
3580                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3581                     pars++;
3582                 if (flags & SCF_DO_SUBSTR) {
3583                     SV *last_str = NULL;
3584                     int counted = mincount != 0;
3585
3586                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3587 #if defined(SPARC64_GCC_WORKAROUND)
3588                         I32 b = 0;
3589                         STRLEN l = 0;
3590                         const char *s = NULL;
3591                         I32 old = 0;
3592
3593                         if (pos_before >= data->last_start_min)
3594                             b = pos_before;
3595                         else
3596                             b = data->last_start_min;
3597
3598                         l = 0;
3599                         s = SvPV_const(data->last_found, l);
3600                         old = b - data->last_start_min;
3601
3602 #else
3603                         I32 b = pos_before >= data->last_start_min
3604                             ? pos_before : data->last_start_min;
3605                         STRLEN l;
3606                         const char * const s = SvPV_const(data->last_found, l);
3607                         I32 old = b - data->last_start_min;
3608 #endif
3609
3610                         if (UTF)
3611                             old = utf8_hop((U8*)s, old) - (U8*)s;
3612                         l -= old;
3613                         /* Get the added string: */
3614                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3615                         if (deltanext == 0 && pos_before == b) {
3616                             /* What was added is a constant string */
3617                             if (mincount > 1) {
3618                                 SvGROW(last_str, (mincount * l) + 1);
3619                                 repeatcpy(SvPVX(last_str) + l,
3620                                           SvPVX_const(last_str), l, mincount - 1);
3621                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3622                                 /* Add additional parts. */
3623                                 SvCUR_set(data->last_found,
3624                                           SvCUR(data->last_found) - l);
3625                                 sv_catsv(data->last_found, last_str);
3626                                 {
3627                                     SV * sv = data->last_found;
3628                                     MAGIC *mg =
3629                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3630                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3631                                     if (mg && mg->mg_len >= 0)
3632                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3633                                 }
3634                                 data->last_end += l * (mincount - 1);
3635                             }
3636                         } else {
3637                             /* start offset must point into the last copy */
3638                             data->last_start_min += minnext * (mincount - 1);
3639                             data->last_start_max += is_inf ? I32_MAX
3640                                 : (maxcount - 1) * (minnext + data->pos_delta);
3641                         }
3642                     }
3643                     /* It is counted once already... */
3644                     data->pos_min += minnext * (mincount - counted);
3645                     data->pos_delta += - counted * deltanext +
3646                         (minnext + deltanext) * maxcount - minnext * mincount;
3647                     if (mincount != maxcount) {
3648                          /* Cannot extend fixed substrings found inside
3649                             the group.  */
3650                         SCAN_COMMIT(pRExC_state,data,minlenp);
3651                         if (mincount && last_str) {
3652                             SV * const sv = data->last_found;
3653                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3654                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3655
3656                             if (mg)
3657                                 mg->mg_len = -1;
3658                             sv_setsv(sv, last_str);
3659                             data->last_end = data->pos_min;
3660                             data->last_start_min =
3661                                 data->pos_min - CHR_SVLEN(last_str);
3662                             data->last_start_max = is_inf
3663                                 ? I32_MAX
3664                                 : data->pos_min + data->pos_delta
3665                                 - CHR_SVLEN(last_str);
3666                         }
3667                         data->longest = &(data->longest_float);
3668                     }
3669                     SvREFCNT_dec(last_str);
3670                 }
3671                 if (data && (fl & SF_HAS_EVAL))
3672                     data->flags |= SF_HAS_EVAL;
3673               optimize_curly_tail:
3674                 if (OP(oscan) != CURLYX) {
3675                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3676                            && NEXT_OFF(next))
3677                         NEXT_OFF(oscan) += NEXT_OFF(next);
3678                 }
3679                 continue;
3680             default:                    /* REF, ANYOFV, and CLUMP only? */
3681                 if (flags & SCF_DO_SUBSTR) {
3682                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3683                     data->longest = &(data->longest_float);
3684                 }
3685                 is_inf = is_inf_internal = 1;
3686                 if (flags & SCF_DO_STCLASS_OR)
3687                     cl_anything(pRExC_state, data->start_class);
3688                 flags &= ~SCF_DO_STCLASS;
3689                 break;
3690             }
3691         }
3692         else if (OP(scan) == LNBREAK) {
3693             if (flags & SCF_DO_STCLASS) {
3694                 int value = 0;
3695                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3696                 if (flags & SCF_DO_STCLASS_AND) {
3697                     for (value = 0; value < 256; value++)
3698                         if (!is_VERTWS_cp(value))
3699                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3700                 }
3701                 else {
3702                     for (value = 0; value < 256; value++)
3703                         if (is_VERTWS_cp(value))
3704                             ANYOF_BITMAP_SET(data->start_class, value);
3705                 }
3706                 if (flags & SCF_DO_STCLASS_OR)
3707                     cl_and(data->start_class, and_withp);
3708                 flags &= ~SCF_DO_STCLASS;
3709             }
3710             min += 1;
3711             delta += 1;
3712             if (flags & SCF_DO_SUBSTR) {
3713                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3714                 data->pos_min += 1;
3715                 data->pos_delta += 1;
3716                 data->longest = &(data->longest_float);
3717             }
3718         }
3719         else if (OP(scan) == FOLDCHAR) {
3720             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3721             flags &= ~SCF_DO_STCLASS;
3722             min += 1;
3723             delta += d;
3724             if (flags & SCF_DO_SUBSTR) {
3725                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3726                 data->pos_min += 1;
3727                 data->pos_delta += d;
3728                 data->longest = &(data->longest_float);
3729             }
3730         }
3731         else if (REGNODE_SIMPLE(OP(scan))) {
3732             int value = 0;
3733
3734             if (flags & SCF_DO_SUBSTR) {
3735                 SCAN_COMMIT(pRExC_state,data,minlenp);
3736                 data->pos_min++;
3737             }
3738             min++;
3739             if (flags & SCF_DO_STCLASS) {
3740                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3741
3742                 /* Some of the logic below assumes that switching
3743                    locale on will only add false positives. */
3744                 switch (PL_regkind[OP(scan)]) {
3745                 case SANY:
3746                 default:
3747                   do_default:
3748                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3749                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3750                         cl_anything(pRExC_state, data->start_class);
3751                     break;
3752                 case REG_ANY:
3753                     if (OP(scan) == SANY)
3754                         goto do_default;
3755                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3756                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3757                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3758                         cl_anything(pRExC_state, data->start_class);
3759                     }
3760                     if (flags & SCF_DO_STCLASS_AND || !value)
3761                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3762                     break;
3763                 case ANYOF:
3764                     if (flags & SCF_DO_STCLASS_AND)
3765                         cl_and(data->start_class,
3766                                (struct regnode_charclass_class*)scan);
3767                     else
3768                         cl_or(pRExC_state, data->start_class,
3769                               (struct regnode_charclass_class*)scan);
3770                     break;
3771                 case ALNUM:
3772                     if (flags & SCF_DO_STCLASS_AND) {
3773                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3774                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3775                             if (OP(scan) == ALNUMU) {
3776                                 for (value = 0; value < 256; value++) {
3777                                     if (!isWORDCHAR_L1(value)) {
3778                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3779                                     }
3780                                 }
3781                             } else {
3782                                 for (value = 0; value < 256; value++) {
3783                                     if (!isALNUM(value)) {
3784                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3785                                     }
3786                                 }
3787                             }
3788                         }
3789                     }
3790                     else {
3791                         if (data->start_class->flags & ANYOF_LOCALE)
3792                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3793
3794                         /* Even if under locale, set the bits for non-locale
3795                          * in case it isn't a true locale-node.  This will
3796                          * create false positives if it truly is locale */
3797                         if (OP(scan) == ALNUMU) {
3798                             for (value = 0; value < 256; value++) {
3799                                 if (isWORDCHAR_L1(value)) {
3800                                     ANYOF_BITMAP_SET(data->start_class, value);
3801                                 }
3802                             }
3803                         } else {
3804                             for (value = 0; value < 256; value++) {
3805                                 if (isALNUM(value)) {
3806                                     ANYOF_BITMAP_SET(data->start_class, value);
3807                                 }
3808                             }
3809                         }
3810                     }
3811                     break;
3812                 case NALNUM:
3813                     if (flags & SCF_DO_STCLASS_AND) {
3814                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3815                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3816                             if (OP(scan) == NALNUMU) {
3817                                 for (value = 0; value < 256; value++) {
3818                                     if (isWORDCHAR_L1(value)) {
3819                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3820                                     }
3821                                 }
3822                             } else {
3823                                 for (value = 0; value < 256; value++) {
3824                                     if (isALNUM(value)) {
3825                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3826                                     }
3827                                 }
3828                             }
3829                         }
3830                     }
3831                     else {
3832                         if (data->start_class->flags & ANYOF_LOCALE)
3833                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3834
3835                         /* Even if under locale, set the bits for non-locale in
3836                          * case it isn't a true locale-node.  This will create
3837                          * false positives if it truly is locale */
3838                         if (OP(scan) == NALNUMU) {
3839                             for (value = 0; value < 256; value++) {
3840                                 if (! isWORDCHAR_L1(value)) {
3841                                     ANYOF_BITMAP_SET(data->start_class, value);
3842                                 }
3843                             }
3844                         } else {
3845                             for (value = 0; value < 256; value++) {
3846                                 if (! isALNUM(value)) {
3847                                     ANYOF_BITMAP_SET(data->start_class, value);
3848                                 }
3849                             }
3850                         }
3851                     }
3852                     break;
3853                 case SPACE:
3854                     if (flags & SCF_DO_STCLASS_AND) {
3855                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3856                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3857                             if (OP(scan) == SPACEU) {
3858                                 for (value = 0; value < 256; value++) {
3859                                     if (!isSPACE_L1(value)) {
3860                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3861                                     }
3862                                 }
3863                             } else {
3864                                 for (value = 0; value < 256; value++) {
3865                                     if (!isSPACE(value)) {
3866                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3867                                     }
3868                                 }
3869                             }
3870                         }
3871                     }
3872                     else {
3873                         if (data->start_class->flags & ANYOF_LOCALE) {
3874                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3875                         }
3876                         if (OP(scan) == SPACEU) {
3877                             for (value = 0; value < 256; value++) {
3878                                 if (isSPACE_L1(value)) {
3879                                     ANYOF_BITMAP_SET(data->start_class, value);
3880                                 }
3881                             }
3882                         } else {
3883                             for (value = 0; value < 256; value++) {
3884                                 if (isSPACE(value)) {
3885                                     ANYOF_BITMAP_SET(data->start_class, value);
3886                                 }
3887                             }
3888                         }
3889                     }
3890                     break;
3891                 case NSPACE:
3892                     if (flags & SCF_DO_STCLASS_AND) {
3893                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3894                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3895                             if (OP(scan) == NSPACEU) {
3896                                 for (value = 0; value < 256; value++) {
3897                                     if (isSPACE_L1(value)) {
3898                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3899                                     }
3900                                 }
3901                             } else {
3902                                 for (value = 0; value < 256; value++) {
3903                                     if (isSPACE(value)) {
3904                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3905                                     }
3906                                 }
3907                             }
3908                         }
3909                     }
3910                     else {
3911                         if (data->start_class->flags & ANYOF_LOCALE)
3912                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3913                         if (OP(scan) == NSPACEU) {
3914                             for (value = 0; value < 256; value++) {
3915                                 if (!isSPACE_L1(value)) {
3916                                     ANYOF_BITMAP_SET(data->start_class, value);
3917                                 }
3918                             }
3919                         }
3920                         else {
3921                             for (value = 0; value < 256; value++) {
3922                                 if (!isSPACE(value)) {
3923                                     ANYOF_BITMAP_SET(data->start_class, value);
3924                                 }
3925                             }
3926                         }
3927                     }
3928                     break;
3929                 case DIGIT:
3930                     if (flags & SCF_DO_STCLASS_AND) {
3931                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3932                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3933                             for (value = 0; value < 256; value++)
3934                                 if (!isDIGIT(value))
3935                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3936                         }
3937                     }
3938                     else {
3939                         if (data->start_class->flags & ANYOF_LOCALE)
3940                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3941                         for (value = 0; value < 256; value++)
3942                             if (isDIGIT(value))
3943                                 ANYOF_BITMAP_SET(data->start_class, value);
3944                     }
3945                     break;
3946                 case NDIGIT:
3947                     if (flags & SCF_DO_STCLASS_AND) {
3948                         if (!(data->start_class->flags & ANYOF_LOCALE))
3949                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3950                         for (value = 0; value < 256; value++)
3951                             if (isDIGIT(value))
3952                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3953                     }
3954                     else {
3955                         if (data->start_class->flags & ANYOF_LOCALE)
3956                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3957                         for (value = 0; value < 256; value++)
3958                             if (!isDIGIT(value))
3959                                 ANYOF_BITMAP_SET(data->start_class, value);
3960                     }
3961                     break;
3962                 CASE_SYNST_FNC(VERTWS);
3963                 CASE_SYNST_FNC(HORIZWS);
3964
3965                 }
3966                 if (flags & SCF_DO_STCLASS_OR)
3967                     cl_and(data->start_class, and_withp);
3968                 flags &= ~SCF_DO_STCLASS;
3969             }
3970         }
3971         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3972             data->flags |= (OP(scan) == MEOL
3973                             ? SF_BEFORE_MEOL
3974                             : SF_BEFORE_SEOL);
3975         }
3976         else if (  PL_regkind[OP(scan)] == BRANCHJ
3977                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3978                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3979                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3980             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3981                 || OP(scan) == UNLESSM )
3982             {
3983                 /* Negative Lookahead/lookbehind
3984                    In this case we can't do fixed string optimisation.
3985                 */
3986
3987                 I32 deltanext, minnext, fake = 0;
3988                 regnode *nscan;
3989                 struct regnode_charclass_class intrnl;
3990                 int f = 0;
3991
3992                 data_fake.flags = 0;
3993                 if (data) {
3994                     data_fake.whilem_c = data->whilem_c;
3995                     data_fake.last_closep = data->last_closep;
3996                 }
3997                 else
3998                     data_fake.last_closep = &fake;
3999                 data_fake.pos_delta = delta;
4000                 if ( flags & SCF_DO_STCLASS && !scan->flags
4001                      && OP(scan) == IFMATCH ) { /* Lookahead */
4002                     cl_init(pRExC_state, &intrnl);
4003                     data_fake.start_class = &intrnl;
4004                     f |= SCF_DO_STCLASS_AND;
4005                 }
4006                 if (flags & SCF_WHILEM_VISITED_POS)
4007                     f |= SCF_WHILEM_VISITED_POS;
4008                 next = regnext(scan);
4009                 nscan = NEXTOPER(NEXTOPER(scan));
4010                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4011                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4012                 if (scan->flags) {
4013                     if (deltanext) {
4014                         FAIL("Variable length lookbehind not implemented");
4015                     }
4016                     else if (minnext > (I32)U8_MAX) {
4017                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4018                     }
4019                     scan->flags = (U8)minnext;
4020                 }
4021                 if (data) {
4022                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4023                         pars++;
4024                     if (data_fake.flags & SF_HAS_EVAL)
4025                         data->flags |= SF_HAS_EVAL;
4026                     data->whilem_c = data_fake.whilem_c;
4027                 }
4028                 if (f & SCF_DO_STCLASS_AND) {
4029                     if (flags & SCF_DO_STCLASS_OR) {
4030                         /* OR before, AND after: ideally we would recurse with
4031                          * data_fake to get the AND applied by study of the
4032                          * remainder of the pattern, and then derecurse;
4033                          * *** HACK *** for now just treat as "no information".
4034                          * See [perl #56690].
4035                          */
4036                         cl_init(pRExC_state, data->start_class);
4037                     }  else {
4038                         /* AND before and after: combine and continue */
4039                         const int was = (data->start_class->flags & ANYOF_EOS);
4040
4041                         cl_and(data->start_class, &intrnl);
4042                         if (was)
4043                             data->start_class->flags |= ANYOF_EOS;
4044                     }
4045                 }
4046             }
4047 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4048             else {
4049                 /* Positive Lookahead/lookbehind
4050                    In this case we can do fixed string optimisation,
4051                    but we must be careful about it. Note in the case of
4052                    lookbehind the positions will be offset by the minimum
4053                    length of the pattern, something we won't know about
4054                    until after the recurse.
4055                 */
4056                 I32 deltanext, fake = 0;
4057                 regnode *nscan;
4058                 struct regnode_charclass_class intrnl;
4059                 int f = 0;
4060                 /* We use SAVEFREEPV so that when the full compile
4061                     is finished perl will clean up the allocated
4062                     minlens when it's all done. This way we don't
4063                     have to worry about freeing them when we know
4064                     they wont be used, which would be a pain.
4065                  */
4066                 I32 *minnextp;
4067                 Newx( minnextp, 1, I32 );
4068                 SAVEFREEPV(minnextp);
4069
4070                 if (data) {
4071                     StructCopy(data, &data_fake, scan_data_t);
4072                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4073                         f |= SCF_DO_SUBSTR;
4074                         if (scan->flags)
4075                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4076                         data_fake.last_found=newSVsv(data->last_found);
4077                     }
4078                 }
4079                 else
4080                     data_fake.last_closep = &fake;
4081                 data_fake.flags = 0;
4082                 data_fake.pos_delta = delta;
4083                 if (is_inf)
4084                     data_fake.flags |= SF_IS_INF;
4085                 if ( flags & SCF_DO_STCLASS && !scan->flags
4086                      && OP(scan) == IFMATCH ) { /* Lookahead */
4087                     cl_init(pRExC_state, &intrnl);
4088                     data_fake.start_class = &intrnl;
4089                     f |= SCF_DO_STCLASS_AND;
4090                 }
4091                 if (flags & SCF_WHILEM_VISITED_POS)
4092                     f |= SCF_WHILEM_VISITED_POS;
4093                 next = regnext(scan);
4094                 nscan = NEXTOPER(NEXTOPER(scan));
4095
4096                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4097                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4098                 if (scan->flags) {
4099                     if (deltanext) {
4100                         FAIL("Variable length lookbehind not implemented");
4101                     }
4102                     else if (*minnextp > (I32)U8_MAX) {
4103                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4104                     }
4105                     scan->flags = (U8)*minnextp;
4106                 }
4107
4108                 *minnextp += min;
4109
4110                 if (f & SCF_DO_STCLASS_AND) {
4111                     const int was = (data->start_class->flags & ANYOF_EOS);
4112
4113                     cl_and(data->start_class, &intrnl);
4114                     if (was)
4115                         data->start_class->flags |= ANYOF_EOS;
4116                 }
4117                 if (data) {
4118                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4119                         pars++;
4120                     if (data_fake.flags & SF_HAS_EVAL)
4121                         data->flags |= SF_HAS_EVAL;
4122                     data->whilem_c = data_fake.whilem_c;
4123                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4124                         if (RExC_rx->minlen<*minnextp)
4125                             RExC_rx->minlen=*minnextp;
4126                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4127                         SvREFCNT_dec(data_fake.last_found);
4128
4129                         if ( data_fake.minlen_fixed != minlenp )
4130                         {
4131                             data->offset_fixed= data_fake.offset_fixed;
4132                             data->minlen_fixed= data_fake.minlen_fixed;
4133                             data->lookbehind_fixed+= scan->flags;
4134                         }
4135                         if ( data_fake.minlen_float != minlenp )
4136                         {
4137                             data->minlen_float= data_fake.minlen_float;
4138                             data->offset_float_min=data_fake.offset_float_min;
4139                             data->offset_float_max=data_fake.offset_float_max;
4140                             data->lookbehind_float+= scan->flags;
4141                         }
4142                     }
4143                 }
4144
4145
4146             }
4147 #endif
4148         }
4149         else if (OP(scan) == OPEN) {
4150             if (stopparen != (I32)ARG(scan))
4151                 pars++;
4152         }
4153         else if (OP(scan) == CLOSE) {
4154             if (stopparen == (I32)ARG(scan)) {
4155                 break;
4156             }
4157             if ((I32)ARG(scan) == is_par) {
4158                 next = regnext(scan);
4159
4160                 if ( next && (OP(next) != WHILEM) && next < last)
4161                     is_par = 0;         /* Disable optimization */
4162             }
4163             if (data)
4164                 *(data->last_closep) = ARG(scan);
4165         }
4166         else if (OP(scan) == EVAL) {
4167                 if (data)
4168                     data->flags |= SF_HAS_EVAL;
4169         }
4170         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4171             if (flags & SCF_DO_SUBSTR) {
4172                 SCAN_COMMIT(pRExC_state,data,minlenp);
4173                 flags &= ~SCF_DO_SUBSTR;
4174             }
4175             if (data && OP(scan)==ACCEPT) {
4176                 data->flags |= SCF_SEEN_ACCEPT;
4177                 if (stopmin > min)
4178                     stopmin = min;
4179             }
4180         }
4181         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4182         {
4183                 if (flags & SCF_DO_SUBSTR) {
4184                     SCAN_COMMIT(pRExC_state,data,minlenp);
4185                     data->longest = &(data->longest_float);
4186                 }
4187                 is_inf = is_inf_internal = 1;
4188                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4189                     cl_anything(pRExC_state, data->start_class);
4190                 flags &= ~SCF_DO_STCLASS;
4191         }
4192         else if (OP(scan) == GPOS) {
4193             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4194                 !(delta || is_inf || (data && data->pos_delta)))
4195             {
4196                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4197                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4198                 if (RExC_rx->gofs < (U32)min)
4199                     RExC_rx->gofs = min;
4200             } else {
4201                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4202                 RExC_rx->gofs = 0;
4203             }
4204         }
4205 #ifdef TRIE_STUDY_OPT
4206 #ifdef FULL_TRIE_STUDY
4207         else if (PL_regkind[OP(scan)] == TRIE) {
4208             /* NOTE - There is similar code to this block above for handling
4209                BRANCH nodes on the initial study.  If you change stuff here
4210                check there too. */
4211             regnode *trie_node= scan;
4212             regnode *tail= regnext(scan);
4213             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4214             I32 max1 = 0, min1 = I32_MAX;
4215             struct regnode_charclass_class accum;
4216
4217             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4218                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4219             if (flags & SCF_DO_STCLASS)
4220                 cl_init_zero(pRExC_state, &accum);
4221
4222             if (!trie->jump) {
4223                 min1= trie->minlen;
4224                 max1= trie->maxlen;
4225             } else {
4226                 const regnode *nextbranch= NULL;
4227                 U32 word;
4228
4229                 for ( word=1 ; word <= trie->wordcount ; word++)
4230                 {
4231                     I32 deltanext=0, minnext=0, f = 0, fake;
4232                     struct regnode_charclass_class this_class;
4233
4234                     data_fake.flags = 0;
4235                     if (data) {
4236                         data_fake.whilem_c = data->whilem_c;
4237                         data_fake.last_closep = data->last_closep;
4238                     }
4239                     else
4240                         data_fake.last_closep = &fake;
4241                     data_fake.pos_delta = delta;
4242                     if (flags & SCF_DO_STCLASS) {
4243                         cl_init(pRExC_state, &this_class);
4244                         data_fake.start_class = &this_class;
4245                         f = SCF_DO_STCLASS_AND;
4246                     }
4247                     if (flags & SCF_WHILEM_VISITED_POS)
4248                         f |= SCF_WHILEM_VISITED_POS;
4249
4250                     if (trie->jump[word]) {
4251                         if (!nextbranch)
4252                             nextbranch = trie_node + trie->jump[0];
4253                         scan= trie_node + trie->jump[word];
4254                         /* We go from the jump point to the branch that follows
4255                            it. Note this means we need the vestigal unused branches
4256                            even though they arent otherwise used.
4257                          */
4258                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4259                             &deltanext, (regnode *)nextbranch, &data_fake,
4260                             stopparen, recursed, NULL, f,depth+1);
4261                     }
4262                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4263                         nextbranch= regnext((regnode*)nextbranch);
4264
4265                     if (min1 > (I32)(minnext + trie->minlen))
4266                         min1 = minnext + trie->minlen;
4267                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4268                         max1 = minnext + deltanext + trie->maxlen;
4269                     if (deltanext == I32_MAX)
4270                         is_inf = is_inf_internal = 1;
4271
4272                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4273                         pars++;
4274                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4275                         if ( stopmin > min + min1)
4276                             stopmin = min + min1;
4277                         flags &= ~SCF_DO_SUBSTR;
4278                         if (data)
4279                             data->flags |= SCF_SEEN_ACCEPT;
4280                     }
4281                     if (data) {
4282                         if (data_fake.flags & SF_HAS_EVAL)
4283                             data->flags |= SF_HAS_EVAL;
4284                         data->whilem_c = data_fake.whilem_c;
4285                     }
4286                     if (flags & SCF_DO_STCLASS)
4287                         cl_or(pRExC_state, &accum, &this_class);
4288                 }
4289             }
4290             if (flags & SCF_DO_SUBSTR) {
4291                 data->pos_min += min1;
4292                 data->pos_delta += max1 - min1;
4293                 if (max1 != min1 || is_inf)
4294                     data->longest = &(data->longest_float);
4295             }
4296             min += min1;
4297             delta += max1 - min1;
4298             if (flags & SCF_DO_STCLASS_OR) {
4299                 cl_or(pRExC_state, data->start_class, &accum);
4300                 if (min1) {
4301                     cl_and(data->start_class, and_withp);
4302                     flags &= ~SCF_DO_STCLASS;
4303                 }
4304             }
4305             else if (flags & SCF_DO_STCLASS_AND) {
4306                 if (min1) {
4307                     cl_and(data->start_class, &accum);
4308                     flags &= ~SCF_DO_STCLASS;
4309                 }
4310                 else {
4311                     /* Switch to OR mode: cache the old value of
4312                      * data->start_class */
4313                     INIT_AND_WITHP;
4314                     StructCopy(data->start_class, and_withp,
4315                                struct regnode_charclass_class);
4316                     flags &= ~SCF_DO_STCLASS_AND;
4317                     StructCopy(&accum, data->start_class,
4318                                struct regnode_charclass_class);
4319                     flags |= SCF_DO_STCLASS_OR;
4320                     data->start_class->flags |= ANYOF_EOS;
4321                 }
4322             }
4323             scan= tail;
4324             continue;
4325         }
4326 #else
4327         else if (PL_regkind[OP(scan)] == TRIE) {
4328             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4329             U8*bang=NULL;
4330
4331             min += trie->minlen;
4332             delta += (trie->maxlen - trie->minlen);
4333             flags &= ~SCF_DO_STCLASS; /* xxx */
4334             if (flags & SCF_DO_SUBSTR) {
4335                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4336                 data->pos_min += trie->minlen;
4337                 data->pos_delta += (trie->maxlen - trie->minlen);
4338                 if (trie->maxlen != trie->minlen)
4339                     data->longest = &(data->longest_float);
4340             }
4341             if (trie->jump) /* no more substrings -- for now /grr*/
4342                 flags &= ~SCF_DO_SUBSTR;
4343         }
4344 #endif /* old or new */
4345 #endif /* TRIE_STUDY_OPT */
4346
4347         /* Else: zero-length, ignore. */
4348         scan = regnext(scan);
4349     }
4350     if (frame) {
4351         last = frame->last;
4352         scan = frame->next;
4353         stopparen = frame->stop;
4354         frame = frame->prev;
4355         goto fake_study_recurse;
4356     }
4357
4358   finish:
4359     assert(!frame);
4360     DEBUG_STUDYDATA("pre-fin:",data,depth);
4361
4362     *scanp = scan;
4363     *deltap = is_inf_internal ? I32_MAX : delta;
4364     if (flags & SCF_DO_SUBSTR && is_inf)
4365         data->pos_delta = I32_MAX - data->pos_min;
4366     if (is_par > (I32)U8_MAX)
4367         is_par = 0;
4368     if (is_par && pars==1 && data) {
4369         data->flags |= SF_IN_PAR;
4370         data->flags &= ~SF_HAS_PAR;
4371     }
4372     else if (pars && data) {
4373         data->flags |= SF_HAS_PAR;
4374         data->flags &= ~SF_IN_PAR;
4375     }
4376     if (flags & SCF_DO_STCLASS_OR)
4377         cl_and(data->start_class, and_withp);
4378     if (flags & SCF_TRIE_RESTUDY)
4379         data->flags |=  SCF_TRIE_RESTUDY;
4380
4381     DEBUG_STUDYDATA("post-fin:",data,depth);
4382
4383     return min < stopmin ? min : stopmin;
4384 }
4385
4386 STATIC U32
4387 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4388 {
4389     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4390
4391     PERL_ARGS_ASSERT_ADD_DATA;
4392
4393     Renewc(RExC_rxi->data,
4394            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4395            char, struct reg_data);
4396     if(count)
4397         Renew(RExC_rxi->data->what, count + n, U8);
4398     else
4399         Newx(RExC_rxi->data->what, n, U8);
4400     RExC_rxi->data->count = count + n;
4401     Copy(s, RExC_rxi->data->what + count, n, U8);
4402     return count;
4403 }
4404
4405 /*XXX: todo make this not included in a non debugging perl */
4406 #ifndef PERL_IN_XSUB_RE
4407 void
4408 Perl_reginitcolors(pTHX)
4409 {
4410     dVAR;
4411     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4412     if (s) {
4413         char *t = savepv(s);
4414         int i = 0;
4415         PL_colors[0] = t;
4416         while (++i < 6) {
4417             t = strchr(t, '\t');
4418             if (t) {
4419                 *t = '\0';
4420                 PL_colors[i] = ++t;
4421             }
4422             else
4423                 PL_colors[i] = t = (char *)"";
4424         }
4425     } else {
4426         int i = 0;
4427         while (i < 6)
4428             PL_colors[i++] = (char *)"";
4429     }
4430     PL_colorset = 1;
4431 }
4432 #endif
4433
4434
4435 #ifdef TRIE_STUDY_OPT
4436 #define CHECK_RESTUDY_GOTO                                  \
4437         if (                                                \
4438               (data.flags & SCF_TRIE_RESTUDY)               \
4439               && ! restudied++                              \
4440         )     goto reStudy
4441 #else
4442 #define CHECK_RESTUDY_GOTO
4443 #endif
4444
4445 /*
4446  - pregcomp - compile a regular expression into internal code
4447  *
4448  * We can't allocate space until we know how big the compiled form will be,
4449  * but we can't compile it (and thus know how big it is) until we've got a
4450  * place to put the code.  So we cheat:  we compile it twice, once with code
4451  * generation turned off and size counting turned on, and once "for real".
4452  * This also means that we don't allocate space until we are sure that the
4453  * thing really will compile successfully, and we never have to move the
4454  * code and thus invalidate pointers into it.  (Note that it has to be in
4455  * one piece because free() must be able to free it all.) [NB: not true in perl]
4456  *
4457  * Beware that the optimization-preparation code in here knows about some
4458  * of the structure of the compiled regexp.  [I'll say.]
4459  */
4460
4461
4462
4463 #ifndef PERL_IN_XSUB_RE
4464 #define RE_ENGINE_PTR &PL_core_reg_engine
4465 #else
4466 extern const struct regexp_engine my_reg_engine;
4467 #define RE_ENGINE_PTR &my_reg_engine
4468 #endif
4469
4470 #ifndef PERL_IN_XSUB_RE
4471 REGEXP *
4472 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4473 {
4474     dVAR;
4475     HV * const table = GvHV(PL_hintgv);
4476
4477     PERL_ARGS_ASSERT_PREGCOMP;
4478
4479     /* Dispatch a request to compile a regexp to correct
4480        regexp engine. */
4481     if (table) {
4482         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4483         GET_RE_DEBUG_FLAGS_DECL;
4484         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4485             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4486             DEBUG_COMPILE_r({
4487                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4488                     SvIV(*ptr));
4489             });
4490             return CALLREGCOMP_ENG(eng, pattern, flags);
4491         }
4492     }
4493     return Perl_re_compile(aTHX_ pattern, flags);
4494 }
4495 #endif
4496
4497 REGEXP *
4498 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4499 {
4500     dVAR;
4501     REGEXP *rx;
4502     struct regexp *r;
4503     register regexp_internal *ri;
4504     STRLEN plen;
4505     char  *exp;
4506     char* xend;
4507     regnode *scan;
4508     I32 flags;
4509     I32 minlen = 0;
4510     U32 pm_flags;
4511
4512     /* these are all flags - maybe they should be turned
4513      * into a single int with different bit masks */
4514     I32 sawlookahead = 0;
4515     I32 sawplus = 0;
4516     I32 sawopen = 0;
4517     bool used_setjump = FALSE;
4518     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4519
4520     U8 jump_ret = 0;
4521     dJMPENV;
4522     scan_data_t data;
4523     RExC_state_t RExC_state;
4524     RExC_state_t * const pRExC_state = &RExC_state;
4525 #ifdef TRIE_STUDY_OPT
4526     int restudied;
4527     RExC_state_t copyRExC_state;
4528 #endif
4529     GET_RE_DEBUG_FLAGS_DECL;
4530
4531     PERL_ARGS_ASSERT_RE_COMPILE;
4532
4533     DEBUG_r(if (!PL_colorset) reginitcolors());
4534
4535     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4536     RExC_uni_semantics = 0;
4537     RExC_contains_locale = 0;
4538
4539     /****************** LONG JUMP TARGET HERE***********************/
4540     /* Longjmp back to here if have to switch in midstream to utf8 */
4541     if (! RExC_orig_utf8) {
4542         JMPENV_PUSH(jump_ret);
4543         used_setjump = TRUE;
4544     }
4545
4546     if (jump_ret == 0) {    /* First time through */
4547         exp = SvPV(pattern, plen);
4548         xend = exp + plen;
4549         /* ignore the utf8ness if the pattern is 0 length */
4550         if (plen == 0) {
4551             RExC_utf8 = RExC_orig_utf8 = 0;
4552         }
4553
4554         DEBUG_COMPILE_r({
4555             SV *dsv= sv_newmortal();
4556             RE_PV_QUOTED_DECL(s, RExC_utf8,
4557                 dsv, exp, plen, 60);
4558             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4559                            PL_colors[4],PL_colors[5],s);
4560         });
4561     }
4562     else {  /* longjumped back */
4563         STRLEN len = plen;
4564
4565         /* If the cause for the longjmp was other than changing to utf8, pop
4566          * our own setjmp, and longjmp to the correct handler */
4567         if (jump_ret != UTF8_LONGJMP) {
4568             JMPENV_POP;
4569             JMPENV_JUMP(jump_ret);
4570         }
4571
4572         GET_RE_DEBUG_FLAGS;
4573
4574         /* It's possible to write a regexp in ascii that represents Unicode
4575         codepoints outside of the byte range, such as via \x{100}. If we
4576         detect such a sequence we have to convert the entire pattern to utf8
4577         and then recompile, as our sizing calculation will have been based
4578         on 1 byte == 1 character, but we will need to use utf8 to encode
4579         at least some part of the pattern, and therefore must convert the whole
4580         thing.
4581         -- dmq */
4582         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4583             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4584         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4585         xend = exp + len;
4586         RExC_orig_utf8 = RExC_utf8 = 1;
4587         SAVEFREEPV(exp);
4588     }
4589
4590 #ifdef TRIE_STUDY_OPT
4591     restudied = 0;
4592 #endif
4593
4594     pm_flags = orig_pm_flags;
4595
4596     if (initial_charset == REGEX_LOCALE_CHARSET) {
4597         RExC_contains_locale = 1;
4598     }
4599     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4600
4601         /* Set to use unicode semantics if the pattern is in utf8 and has the
4602          * 'depends' charset specified, as it means unicode when utf8  */
4603         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4604     }
4605
4606     RExC_precomp = exp;
4607     RExC_flags = pm_flags;
4608     RExC_sawback = 0;
4609
4610     RExC_seen = 0;
4611     RExC_in_lookbehind = 0;
4612     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4613     RExC_seen_evals = 0;
4614     RExC_extralen = 0;
4615
4616     /* First pass: determine size, legality. */
4617     RExC_parse = exp;
4618     RExC_start = exp;
4619     RExC_end = xend;
4620     RExC_naughty = 0;
4621     RExC_npar = 1;
4622     RExC_nestroot = 0;
4623     RExC_size = 0L;
4624     RExC_emit = &PL_regdummy;
4625     RExC_whilem_seen = 0;
4626     RExC_open_parens = NULL;
4627     RExC_close_parens = NULL;
4628     RExC_opend = NULL;
4629     RExC_paren_names = NULL;
4630 #ifdef DEBUGGING
4631     RExC_paren_name_list = NULL;
4632 #endif
4633     RExC_recurse = NULL;
4634     RExC_recurse_count = 0;
4635
4636 #if 0 /* REGC() is (currently) a NOP at the first pass.
4637        * Clever compilers notice this and complain. --jhi */
4638     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4639 #endif
4640     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4641     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4642         RExC_precomp = NULL;
4643         return(NULL);
4644     }
4645
4646     /* Here, finished first pass.  Get rid of any added setjmp */
4647     if (used_setjump) {
4648         JMPENV_POP;
4649     }
4650
4651     DEBUG_PARSE_r({
4652         PerlIO_printf(Perl_debug_log,
4653             "Required size %"IVdf" nodes\n"
4654             "Starting second pass (creation)\n",
4655             (IV)RExC_size);
4656         RExC_lastnum=0;
4657         RExC_lastparse=NULL;
4658     });
4659
4660     /* The first pass could have found things that force Unicode semantics */
4661     if ((RExC_utf8 || RExC_uni_semantics)
4662          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4663     {
4664         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4665     }
4666
4667     /* Small enough for pointer-storage convention?
4668        If extralen==0, this means that we will not need long jumps. */
4669     if (RExC_size >= 0x10000L && RExC_extralen)
4670         RExC_size += RExC_extralen;
4671     else
4672         RExC_extralen = 0;
4673     if (RExC_whilem_seen > 15)
4674         RExC_whilem_seen = 15;
4675
4676     /* Allocate space and zero-initialize. Note, the two step process
4677        of zeroing when in debug mode, thus anything assigned has to
4678        happen after that */
4679     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4680     r = (struct regexp*)SvANY(rx);
4681     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4682          char, regexp_internal);
4683     if ( r == NULL || ri == NULL )
4684         FAIL("Regexp out of space");
4685 #ifdef DEBUGGING
4686     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4687     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4688 #else
4689     /* bulk initialize base fields with 0. */
4690     Zero(ri, sizeof(regexp_internal), char);
4691 #endif
4692
4693     /* non-zero initialization begins here */
4694     RXi_SET( r, ri );
4695     r->engine= RE_ENGINE_PTR;
4696     r->extflags = pm_flags;
4697     {
4698         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4699         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4700
4701         /* The caret is output if there are any defaults: if not all the STD
4702          * flags are set, or if no character set specifier is needed */
4703         bool has_default =
4704                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4705                     || ! has_charset);
4706         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4707         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4708                             >> RXf_PMf_STD_PMMOD_SHIFT);
4709         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4710         char *p;
4711         /* Allocate for the worst case, which is all the std flags are turned
4712          * on.  If more precision is desired, we could do a population count of
4713          * the flags set.  This could be done with a small lookup table, or by
4714          * shifting, masking and adding, or even, when available, assembly
4715          * language for a machine-language population count.
4716          * We never output a minus, as all those are defaults, so are
4717          * covered by the caret */
4718         const STRLEN wraplen = plen + has_p + has_runon
4719             + has_default       /* If needs a caret */
4720
4721                 /* If needs a character set specifier */
4722             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4723             + (sizeof(STD_PAT_MODS) - 1)
4724             + (sizeof("(?:)") - 1);
4725
4726         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4727         SvPOK_on(rx);
4728         SvFLAGS(rx) |= SvUTF8(pattern);
4729         *p++='('; *p++='?';
4730
4731         /* If a default, cover it using the caret */
4732         if (has_default) {
4733             *p++= DEFAULT_PAT_MOD;
4734         }
4735         if (has_charset) {
4736             STRLEN len;
4737             const char* const name = get_regex_charset_name(r->extflags, &len);
4738             Copy(name, p, len, char);
4739             p += len;
4740         }
4741         if (has_p)
4742             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4743         {
4744             char ch;
4745             while((ch = *fptr++)) {
4746                 if(reganch & 1)
4747                     *p++ = ch;
4748                 reganch >>= 1;
4749             }
4750         }
4751
4752         *p++ = ':';
4753         Copy(RExC_precomp, p, plen, char);
4754         assert ((RX_WRAPPED(rx) - p) < 16);
4755         r->pre_prefix = p - RX_WRAPPED(rx);
4756         p += plen;
4757         if (has_runon)
4758             *p++ = '\n';
4759         *p++ = ')';
4760         *p = 0;
4761         SvCUR_set(rx, p - SvPVX_const(rx));
4762     }
4763
4764     r->intflags = 0;
4765     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4766
4767     if (RExC_seen & REG_SEEN_RECURSE) {
4768         Newxz(RExC_open_parens, RExC_npar,regnode *);
4769         SAVEFREEPV(RExC_open_parens);
4770         Newxz(RExC_close_parens,RExC_npar,regnode *);
4771         SAVEFREEPV(RExC_close_parens);
4772     }
4773
4774     /* Useful during FAIL. */
4775 #ifdef RE_TRACK_PATTERN_OFFSETS
4776     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4777     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4778                           "%s %"UVuf" bytes for offset annotations.\n",
4779                           ri->u.offsets ? "Got" : "Couldn't get",
4780                           (UV)((2*RExC_size+1) * sizeof(U32))));
4781 #endif
4782     SetProgLen(ri,RExC_size);
4783     RExC_rx_sv = rx;
4784     RExC_rx = r;
4785     RExC_rxi = ri;
4786     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4787
4788     /* Second pass: emit code. */
4789     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4790     RExC_parse = exp;
4791     RExC_end = xend;
4792     RExC_naughty = 0;
4793     RExC_npar = 1;
4794     RExC_emit_start = ri->program;
4795     RExC_emit = ri->program;
4796     RExC_emit_bound = ri->program + RExC_size + 1;
4797
4798     /* Store the count of eval-groups for security checks: */
4799     RExC_rx->seen_evals = RExC_seen_evals;
4800     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4801     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4802         ReREFCNT_dec(rx);
4803         return(NULL);
4804     }
4805     /* XXXX To minimize changes to RE engine we always allocate
4806        3-units-long substrs field. */
4807     Newx(r->substrs, 1, struct reg_substr_data);
4808     if (RExC_recurse_count) {
4809         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4810         SAVEFREEPV(RExC_recurse);
4811     }
4812
4813 reStudy:
4814     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4815     Zero(r->substrs, 1, struct reg_substr_data);
4816
4817 #ifdef TRIE_STUDY_OPT
4818     if (!restudied) {
4819         StructCopy(&zero_scan_data, &data, scan_data_t);
4820         copyRExC_state = RExC_state;
4821     } else {
4822         U32 seen=RExC_seen;
4823         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4824
4825         RExC_state = copyRExC_state;
4826         if (seen & REG_TOP_LEVEL_BRANCHES)
4827             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4828         else
4829             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4830         if (data.last_found) {
4831             SvREFCNT_dec(data.longest_fixed);
4832             SvREFCNT_dec(data.longest_float);
4833             SvREFCNT_dec(data.last_found);
4834         }
4835         StructCopy(&zero_scan_data, &data, scan_data_t);
4836     }
4837 #else
4838     StructCopy(&zero_scan_data, &data, scan_data_t);
4839 #endif
4840
4841     /* Dig out information for optimizations. */
4842     r->extflags = RExC_flags; /* was pm_op */
4843     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4844
4845     if (UTF)
4846         SvUTF8_on(rx);  /* Unicode in it? */
4847     ri->regstclass = NULL;
4848     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4849         r->intflags |= PREGf_NAUGHTY;
4850     scan = ri->program + 1;             /* First BRANCH. */
4851
4852     /* testing for BRANCH here tells us whether there is "must appear"
4853        data in the pattern. If there is then we can use it for optimisations */
4854     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4855         I32 fake;
4856         STRLEN longest_float_length, longest_fixed_length;
4857         struct regnode_charclass_class ch_class; /* pointed to by data */
4858         int stclass_flag;
4859         I32 last_close = 0; /* pointed to by data */
4860         regnode *first= scan;
4861         regnode *first_next= regnext(first);
4862         /*
4863          * Skip introductions and multiplicators >= 1
4864          * so that we can extract the 'meat' of the pattern that must
4865          * match in the large if() sequence following.
4866          * NOTE that EXACT is NOT covered here, as it is normally
4867          * picked up by the optimiser separately.
4868          *
4869          * This is unfortunate as the optimiser isnt handling lookahead
4870          * properly currently.
4871          *
4872          */
4873         while ((OP(first) == OPEN && (sawopen = 1)) ||
4874                /* An OR of *one* alternative - should not happen now. */
4875             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4876             /* for now we can't handle lookbehind IFMATCH*/
4877             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4878             (OP(first) == PLUS) ||
4879             (OP(first) == MINMOD) ||
4880                /* An {n,m} with n>0 */
4881             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4882             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4883         {
4884                 /*
4885                  * the only op that could be a regnode is PLUS, all the rest
4886                  * will be regnode_1 or regnode_2.
4887                  *
4888                  */
4889                 if (OP(first) == PLUS)
4890                     sawplus = 1;
4891                 else
4892                     first += regarglen[OP(first)];
4893
4894                 first = NEXTOPER(first);
4895                 first_next= regnext(first);
4896         }
4897
4898         /* Starting-point info. */
4899       again:
4900         DEBUG_PEEP("first:",first,0);
4901         /* Ignore EXACT as we deal with it later. */
4902         if (PL_regkind[OP(first)] == EXACT) {
4903             if (OP(first) == EXACT)
4904                 NOOP;   /* Empty, get anchored substr later. */
4905             else
4906                 ri->regstclass = first;
4907         }
4908 #ifdef TRIE_STCLASS
4909         else if (PL_regkind[OP(first)] == TRIE &&
4910                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4911         {
4912             regnode *trie_op;
4913             /* this can happen only on restudy */
4914             if ( OP(first) == TRIE ) {
4915                 struct regnode_1 *trieop = (struct regnode_1 *)
4916                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4917                 StructCopy(first,trieop,struct regnode_1);
4918                 trie_op=(regnode *)trieop;
4919             } else {
4920                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4921                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4922                 StructCopy(first,trieop,struct regnode_charclass);
4923                 trie_op=(regnode *)trieop;
4924             }
4925             OP(trie_op)+=2;
4926             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4927             ri->regstclass = trie_op;
4928         }
4929 #endif
4930         else if (REGNODE_SIMPLE(OP(first)))
4931             ri->regstclass = first;
4932         else if (PL_regkind[OP(first)] == BOUND ||
4933                  PL_regkind[OP(first)] == NBOUND)
4934             ri->regstclass = first;
4935         else if (PL_regkind[OP(first)] == BOL) {
4936             r->extflags |= (OP(first) == MBOL
4937                            ? RXf_ANCH_MBOL
4938                            : (OP(first) == SBOL
4939                               ? RXf_ANCH_SBOL
4940                               : RXf_ANCH_BOL));
4941             first = NEXTOPER(first);
4942             goto again;
4943         }
4944         else if (OP(first) == GPOS) {
4945             r->extflags |= RXf_ANCH_GPOS;
4946             first = NEXTOPER(first);
4947             goto again;
4948         }
4949         else if ((!sawopen || !RExC_sawback) &&
4950             (OP(first) == STAR &&
4951             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4952             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4953         {
4954             /* turn .* into ^.* with an implied $*=1 */
4955             const int type =
4956                 (OP(NEXTOPER(first)) == REG_ANY)
4957                     ? RXf_ANCH_MBOL
4958                     : RXf_ANCH_SBOL;
4959             r->extflags |= type;
4960             r->intflags |= PREGf_IMPLICIT;
4961             first = NEXTOPER(first);
4962             goto again;
4963         }
4964         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4965             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4966             /* x+ must match at the 1st pos of run of x's */
4967             r->intflags |= PREGf_SKIP;
4968
4969         /* Scan is after the zeroth branch, first is atomic matcher. */
4970 #ifdef TRIE_STUDY_OPT
4971         DEBUG_PARSE_r(
4972             if (!restudied)
4973                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4974                               (IV)(first - scan + 1))
4975         );
4976 #else
4977         DEBUG_PARSE_r(
4978             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4979                 (IV)(first - scan + 1))
4980         );
4981 #endif
4982
4983
4984         /*
4985         * If there's something expensive in the r.e., find the
4986         * longest literal string that must appear and make it the
4987         * regmust.  Resolve ties in favor of later strings, since
4988         * the regstart check works with the beginning of the r.e.
4989         * and avoiding duplication strengthens checking.  Not a
4990         * strong reason, but sufficient in the absence of others.
4991         * [Now we resolve ties in favor of the earlier string if
4992         * it happens that c_offset_min has been invalidated, since the
4993         * earlier string may buy us something the later one won't.]
4994         */
4995
4996         data.longest_fixed = newSVpvs("");
4997         data.longest_float = newSVpvs("");
4998         data.last_found = newSVpvs("");
4999         data.longest = &(data.longest_fixed);
5000         first = scan;
5001         if (!ri->regstclass) {
5002             cl_init(pRExC_state, &ch_class);
5003             data.start_class = &ch_class;
5004             stclass_flag = SCF_DO_STCLASS_AND;
5005         } else                          /* XXXX Check for BOUND? */
5006             stclass_flag = 0;
5007         data.last_closep = &last_close;
5008
5009         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5010             &data, -1, NULL, NULL,
5011             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5012
5013
5014         CHECK_RESTUDY_GOTO;
5015
5016
5017         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5018              && data.last_start_min == 0 && data.last_end > 0
5019              && !RExC_seen_zerolen
5020              && !(RExC_seen & REG_SEEN_VERBARG)
5021              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5022             r->extflags |= RXf_CHECK_ALL;
5023         scan_commit(pRExC_state, &data,&minlen,0);
5024         SvREFCNT_dec(data.last_found);
5025
5026         /* Note that code very similar to this but for anchored string
5027            follows immediately below, changes may need to be made to both.
5028            Be careful.
5029          */
5030         longest_float_length = CHR_SVLEN(data.longest_float);
5031         if (longest_float_length
5032             || (data.flags & SF_FL_BEFORE_EOL
5033                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5034                     || (RExC_flags & RXf_PMf_MULTILINE))))
5035         {
5036             I32 t,ml;
5037
5038             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5039                 && data.offset_fixed == data.offset_float_min
5040                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
5041                     goto remove_float;          /* As in (a)+. */
5042
5043             /* copy the information about the longest float from the reg_scan_data
5044                over to the program. */
5045             if (SvUTF8(data.longest_float)) {
5046                 r->float_utf8 = data.longest_float;
5047                 r->float_substr = NULL;
5048             } else {
5049                 r->float_substr = data.longest_float;
5050                 r->float_utf8 = NULL;
5051             }
5052             /* float_end_shift is how many chars that must be matched that
5053                follow this item. We calculate it ahead of time as once the
5054                lookbehind offset is added in we lose the ability to correctly
5055                calculate it.*/
5056             ml = data.minlen_float ? *(data.minlen_float)
5057                                    : (I32)longest_float_length;
5058             r->float_end_shift = ml - data.offset_float_min
5059                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5060                 + data.lookbehind_float;
5061             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5062             r->float_max_offset = data.offset_float_max;
5063             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5064                 r->float_max_offset -= data.lookbehind_float;
5065
5066             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5067                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5068                            || (RExC_flags & RXf_PMf_MULTILINE)));
5069             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5070         }
5071         else {
5072           remove_float:
5073             r->float_substr = r->float_utf8 = NULL;
5074             SvREFCNT_dec(data.longest_float);
5075             longest_float_length = 0;
5076         }
5077
5078         /* Note that code very similar to this but for floating string
5079            is immediately above, changes may need to be made to both.
5080            Be careful.
5081          */
5082         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5083         if (longest_fixed_length
5084             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5085                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5086                     || (RExC_flags & RXf_PMf_MULTILINE))))
5087         {
5088             I32 t,ml;
5089
5090             /* copy the information about the longest fixed
5091                from the reg_scan_data over to the program. */
5092             if (SvUTF8(data.longest_fixed)) {
5093                 r->anchored_utf8 = data.longest_fixed;
5094                 r->anchored_substr = NULL;
5095             } else {
5096                 r->anchored_substr = data.longest_fixed;
5097                 r->anchored_utf8 = NULL;
5098             }
5099             /* fixed_end_shift is how many chars that must be matched that
5100                follow this item. We calculate it ahead of time as once the
5101                lookbehind offset is added in we lose the ability to correctly
5102                calculate it.*/
5103             ml = data.minlen_fixed ? *(data.minlen_fixed)
5104                                    : (I32)longest_fixed_length;
5105             r->anchored_end_shift = ml - data.offset_fixed
5106                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5107                 + data.lookbehind_fixed;
5108             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5109
5110             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5111                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5112                      || (RExC_flags & RXf_PMf_MULTILINE)));
5113             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5114         }
5115         else {
5116             r->anchored_substr = r->anchored_utf8 = NULL;
5117             SvREFCNT_dec(data.longest_fixed);
5118             longest_fixed_length = 0;
5119         }
5120         if (ri->regstclass
5121             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5122             ri->regstclass = NULL;
5123
5124         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5125             && stclass_flag
5126             && !(data.start_class->flags & ANYOF_EOS)
5127             && !cl_is_anything(data.start_class))
5128         {
5129             const U32 n = add_data(pRExC_state, 1, "f");
5130             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5131
5132             Newx(RExC_rxi->data->data[n], 1,
5133                 struct regnode_charclass_class);
5134             StructCopy(data.start_class,
5135                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5136                        struct regnode_charclass_class);
5137             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5138             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5139             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5140                       regprop(r, sv, (regnode*)data.start_class);
5141                       PerlIO_printf(Perl_debug_log,
5142                                     "synthetic stclass \"%s\".\n",
5143                                     SvPVX_const(sv));});
5144         }
5145
5146         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5147         if (longest_fixed_length > longest_float_length) {
5148             r->check_end_shift = r->anchored_end_shift;
5149             r->check_substr = r->anchored_substr;
5150             r->check_utf8 = r->anchored_utf8;
5151             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5152             if (r->extflags & RXf_ANCH_SINGLE)
5153                 r->extflags |= RXf_NOSCAN;
5154         }
5155         else {
5156             r->check_end_shift = r->float_end_shift;
5157             r->check_substr = r->float_substr;
5158             r->check_utf8 = r->float_utf8;
5159             r->check_offset_min = r->float_min_offset;
5160             r->check_offset_max = r->float_max_offset;
5161         }
5162         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5163            This should be changed ASAP!  */
5164         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5165             r->extflags |= RXf_USE_INTUIT;
5166             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5167                 r->extflags |= RXf_INTUIT_TAIL;
5168         }
5169         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5170         if ( (STRLEN)minlen < longest_float_length )
5171             minlen= longest_float_length;
5172         if ( (STRLEN)minlen < longest_fixed_length )
5173             minlen= longest_fixed_length;
5174         */
5175     }
5176     else {
5177         /* Several toplevels. Best we can is to set minlen. */
5178         I32 fake;
5179         struct regnode_charclass_class ch_class;
5180         I32 last_close = 0;
5181
5182         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5183
5184         scan = ri->program + 1;
5185         cl_init(pRExC_state, &ch_class);
5186         data.start_class = &ch_class;
5187         data.last_closep = &last_close;
5188
5189
5190         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5191             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5192
5193         CHECK_RESTUDY_GOTO;
5194
5195         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5196                 = r->float_substr = r->float_utf8 = NULL;
5197
5198         if (!(data.start_class->flags & ANYOF_EOS)
5199             && !cl_is_anything(data.start_class))
5200         {
5201             const U32 n = add_data(pRExC_state, 1, "f");
5202             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5203
5204             Newx(RExC_rxi->data->data[n], 1,
5205                 struct regnode_charclass_class);
5206             StructCopy(data.start_class,
5207                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5208                        struct regnode_charclass_class);
5209             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5210             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5211             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5212                       regprop(r, sv, (regnode*)data.start_class);
5213                       PerlIO_printf(Perl_debug_log,
5214                                     "synthetic stclass \"%s\".\n",
5215                                     SvPVX_const(sv));});
5216         }
5217     }
5218
5219     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5220        the "real" pattern. */
5221     DEBUG_OPTIMISE_r({
5222         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5223                       (IV)minlen, (IV)r->minlen);
5224     });
5225     r->minlenret = minlen;
5226     if (r->minlen < minlen)
5227         r->minlen = minlen;
5228
5229     if (RExC_seen & REG_SEEN_GPOS)
5230         r->extflags |= RXf_GPOS_SEEN;
5231     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5232         r->extflags |= RXf_LOOKBEHIND_SEEN;
5233     if (RExC_seen & REG_SEEN_EVAL)
5234         r->extflags |= RXf_EVAL_SEEN;
5235     if (RExC_seen & REG_SEEN_CANY)
5236         r->extflags |= RXf_CANY_SEEN;
5237     if (RExC_seen & REG_SEEN_VERBARG)
5238         r->intflags |= PREGf_VERBARG_SEEN;
5239     if (RExC_seen & REG_SEEN_CUTGROUP)
5240         r->intflags |= PREGf_CUTGROUP_SEEN;
5241     if (RExC_paren_names)
5242         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5243     else
5244         RXp_PAREN_NAMES(r) = NULL;
5245
5246 #ifdef STUPID_PATTERN_CHECKS
5247     if (RX_PRELEN(rx) == 0)
5248         r->extflags |= RXf_NULL;
5249     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5250         /* XXX: this should happen BEFORE we compile */
5251         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5252     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5253         r->extflags |= RXf_WHITE;
5254     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5255         r->extflags |= RXf_START_ONLY;
5256 #else
5257     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5258             /* XXX: this should happen BEFORE we compile */
5259             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5260     else {
5261         regnode *first = ri->program + 1;
5262         U8 fop = OP(first);
5263
5264         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5265             r->extflags |= RXf_NULL;
5266         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5267             r->extflags |= RXf_START_ONLY;
5268         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5269                              && OP(regnext(first)) == END)
5270             r->extflags |= RXf_WHITE;
5271     }
5272 #endif
5273 #ifdef DEBUGGING
5274     if (RExC_paren_names) {
5275         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5276         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5277     } else
5278 #endif
5279         ri->name_list_idx = 0;
5280
5281     if (RExC_recurse_count) {
5282         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5283             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5284             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5285         }
5286     }
5287     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5288     /* assume we don't need to swap parens around before we match */
5289
5290     DEBUG_DUMP_r({
5291         PerlIO_printf(Perl_debug_log,"Final program:\n");
5292         regdump(r);
5293     });
5294 #ifdef RE_TRACK_PATTERN_OFFSETS
5295     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5296         const U32 len = ri->u.offsets[0];
5297         U32 i;
5298         GET_RE_DEBUG_FLAGS_DECL;
5299         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5300         for (i = 1; i <= len; i++) {
5301             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5302                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5303                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5304             }
5305         PerlIO_printf(Perl_debug_log, "\n");
5306     });
5307 #endif
5308     return rx;
5309 }
5310
5311 #undef RE_ENGINE_PTR
5312
5313
5314 SV*
5315 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5316                     const U32 flags)
5317 {
5318     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5319
5320     PERL_UNUSED_ARG(value);
5321
5322     if (flags & RXapif_FETCH) {
5323         return reg_named_buff_fetch(rx, key, flags);
5324     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5325         Perl_croak_no_modify(aTHX);
5326         return NULL;
5327     } else if (flags & RXapif_EXISTS) {
5328         return reg_named_buff_exists(rx, key, flags)
5329             ? &PL_sv_yes
5330             : &PL_sv_no;
5331     } else if (flags & RXapif_REGNAMES) {
5332         return reg_named_buff_all(rx, flags);
5333     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5334         return reg_named_buff_scalar(rx, flags);
5335     } else {
5336         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5337         return NULL;
5338     }
5339 }
5340
5341 SV*
5342 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5343                          const U32 flags)
5344 {
5345     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5346     PERL_UNUSED_ARG(lastkey);
5347
5348     if (flags & RXapif_FIRSTKEY)
5349         return reg_named_buff_firstkey(rx, flags);
5350     else if (flags & RXapif_NEXTKEY)
5351         return reg_named_buff_nextkey(rx, flags);
5352     else {
5353         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5354         return NULL;
5355     }
5356 }
5357
5358 SV*
5359 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5360                           const U32 flags)
5361 {
5362     AV *retarray = NULL;
5363     SV *ret;
5364     struct regexp *const rx = (struct regexp *)SvANY(r);
5365
5366     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5367
5368     if (flags & RXapif_ALL)
5369         retarray=newAV();
5370
5371     if (rx && RXp_PAREN_NAMES(rx)) {
5372         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5373         if (he_str) {
5374             IV i;
5375             SV* sv_dat=HeVAL(he_str);
5376             I32 *nums=(I32*)SvPVX(sv_dat);
5377             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5378                 if ((I32)(rx->nparens) >= nums[i]
5379                     && rx->offs[nums[i]].start != -1
5380                     && rx->offs[nums[i]].end != -1)
5381                 {
5382                     ret = newSVpvs("");
5383                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5384                     if (!retarray)
5385                         return ret;
5386                 } else {
5387                     ret = newSVsv(&PL_sv_undef);
5388                 }
5389                 if (retarray)
5390                     av_push(retarray, ret);
5391             }
5392             if (retarray)
5393                 return newRV_noinc(MUTABLE_SV(retarray));
5394         }
5395     }
5396     return NULL;
5397 }
5398
5399 bool
5400 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5401                            const U32 flags)
5402 {
5403     struct regexp *const rx = (struct regexp *)SvANY(r);
5404
5405     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5406
5407     if (rx && RXp_PAREN_NAMES(rx)) {
5408         if (flags & RXapif_ALL) {
5409             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5410         } else {
5411             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5412             if (sv) {
5413                 SvREFCNT_dec(sv);
5414                 return TRUE;
5415             } else {
5416                 return FALSE;
5417             }
5418         }
5419     } else {
5420         return FALSE;
5421     }
5422 }
5423
5424 SV*
5425 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5426 {
5427     struct regexp *const rx = (struct regexp *)SvANY(r);
5428
5429     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5430
5431     if ( rx && RXp_PAREN_NAMES(rx) ) {
5432         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5433
5434         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5435     } else {
5436         return FALSE;
5437     }
5438 }
5439
5440 SV*
5441 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5442 {
5443     struct regexp *const rx = (struct regexp *)SvANY(r);
5444     GET_RE_DEBUG_FLAGS_DECL;
5445
5446     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5447
5448     if (rx && RXp_PAREN_NAMES(rx)) {
5449         HV *hv = RXp_PAREN_NAMES(rx);
5450         HE *temphe;
5451         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5452             IV i;
5453             IV parno = 0;
5454             SV* sv_dat = HeVAL(temphe);
5455             I32 *nums = (I32*)SvPVX(sv_dat);
5456             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5457                 if ((I32)(rx->lastparen) >= nums[i] &&
5458                     rx->offs[nums[i]].start != -1 &&
5459                     rx->offs[nums[i]].end != -1)
5460                 {
5461                     parno = nums[i];
5462                     break;
5463                 }
5464             }
5465             if (parno || flags & RXapif_ALL) {
5466                 return newSVhek(HeKEY_hek(temphe));
5467             }
5468         }
5469     }
5470     return NULL;
5471 }
5472
5473 SV*
5474 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5475 {
5476     SV *ret;
5477     AV *av;
5478     I32 length;
5479     struct regexp *const rx = (struct regexp *)SvANY(r);
5480
5481     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5482
5483     if (rx && RXp_PAREN_NAMES(rx)) {
5484         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5485             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5486         } else if (flags & RXapif_ONE) {
5487             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5488             av = MUTABLE_AV(SvRV(ret));
5489             length = av_len(av);
5490             SvREFCNT_dec(ret);
5491             return newSViv(length + 1);
5492         } else {
5493             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5494             return NULL;
5495         }
5496     }
5497     return &PL_sv_undef;
5498 }
5499
5500 SV*
5501 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5502 {
5503     struct regexp *const rx = (struct regexp *)SvANY(r);
5504     AV *av = newAV();
5505
5506     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5507
5508     if (rx && RXp_PAREN_NAMES(rx)) {
5509         HV *hv= RXp_PAREN_NAMES(rx);
5510         HE *temphe;
5511         (void)hv_iterinit(hv);
5512         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5513             IV i;
5514             IV parno = 0;
5515             SV* sv_dat = HeVAL(temphe);
5516             I32 *nums = (I32*)SvPVX(sv_dat);
5517             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5518                 if ((I32)(rx->lastparen) >= nums[i] &&
5519                     rx->offs[nums[i]].start != -1 &&
5520                     rx->offs[nums[i]].end != -1)
5521                 {
5522                     parno = nums[i];
5523                     break;
5524                 }
5525             }
5526             if (parno || flags & RXapif_ALL) {
5527                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5528             }
5529         }
5530     }
5531
5532     return newRV_noinc(MUTABLE_SV(av));
5533 }
5534
5535 void
5536 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5537                              SV * const sv)
5538 {
5539     struct regexp *const rx = (struct regexp *)SvANY(r);
5540     char *s = NULL;
5541     I32 i = 0;
5542     I32 s1, t1;
5543
5544     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5545
5546     if (!rx->subbeg) {
5547         sv_setsv(sv,&PL_sv_undef);
5548         return;
5549     }
5550     else
5551     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5552         /* $` */
5553         i = rx->offs[0].start;
5554         s = rx->subbeg;
5555     }
5556     else
5557     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5558         /* $' */
5559         s = rx->subbeg + rx->offs[0].end;
5560         i = rx->sublen - rx->offs[0].end;
5561     }
5562     else
5563     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5564         (s1 = rx->offs[paren].start) != -1 &&
5565         (t1 = rx->offs[paren].end) != -1)
5566     {
5567         /* $& $1 ... */
5568         i = t1 - s1;
5569         s = rx->subbeg + s1;
5570     } else {
5571         sv_setsv(sv,&PL_sv_undef);
5572         return;
5573     }
5574     assert(rx->sublen >= (s - rx->subbeg) + i );
5575     if (i >= 0) {
5576         const int oldtainted = PL_tainted;
5577         TAINT_NOT;
5578         sv_setpvn(sv, s, i);
5579         PL_tainted = oldtainted;
5580         if ( (rx->extflags & RXf_CANY_SEEN)
5581             ? (RXp_MATCH_UTF8(rx)
5582                         && (!i || is_utf8_string((U8*)s, i)))
5583             : (RXp_MATCH_UTF8(rx)) )
5584         {
5585             SvUTF8_on(sv);
5586         }
5587         else
5588             SvUTF8_off(sv);
5589         if (PL_tainting) {
5590             if (RXp_MATCH_TAINTED(rx)) {
5591                 if (SvTYPE(sv) >= SVt_PVMG) {
5592                     MAGIC* const mg = SvMAGIC(sv);
5593                     MAGIC* mgt;
5594                     PL_tainted = 1;
5595                     SvMAGIC_set(sv, mg->mg_moremagic);
5596                     SvTAINT(sv);
5597                     if ((mgt = SvMAGIC(sv))) {
5598                         mg->mg_moremagic = mgt;
5599                         SvMAGIC_set(sv, mg);
5600                     }
5601                 } else {
5602                     PL_tainted = 1;
5603                     SvTAINT(sv);
5604                 }
5605             } else
5606                 SvTAINTED_off(sv);
5607         }
5608     } else {
5609         sv_setsv(sv,&PL_sv_undef);
5610         return;
5611     }
5612 }
5613
5614 void
5615 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5616                                                          SV const * const value)
5617 {
5618     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5619
5620     PERL_UNUSED_ARG(rx);
5621     PERL_UNUSED_ARG(paren);
5622     PERL_UNUSED_ARG(value);
5623
5624     if (!PL_localizing)
5625         Perl_croak_no_modify(aTHX);
5626 }
5627
5628 I32
5629 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5630                               const I32 paren)
5631 {
5632     struct regexp *const rx = (struct regexp *)SvANY(r);
5633     I32 i;
5634     I32 s1, t1;
5635
5636     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5637
5638     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5639         switch (paren) {
5640       /* $` / ${^PREMATCH} */
5641       case RX_BUFF_IDX_PREMATCH:
5642         if (rx->offs[0].start != -1) {
5643                         i = rx->offs[0].start;
5644                         if (i > 0) {
5645                                 s1 = 0;
5646                                 t1 = i;
5647                                 goto getlen;
5648                         }
5649             }
5650         return 0;
5651       /* $' / ${^POSTMATCH} */
5652       case RX_BUFF_IDX_POSTMATCH:
5653             if (rx->offs[0].end != -1) {
5654                         i = rx->sublen - rx->offs[0].end;
5655                         if (i > 0) {
5656                                 s1 = rx->offs[0].end;
5657                                 t1 = rx->sublen;
5658                                 goto getlen;
5659                         }
5660             }
5661         return 0;
5662       /* $& / ${^MATCH}, $1, $2, ... */
5663       default:
5664             if (paren <= (I32)rx->nparens &&
5665             (s1 = rx->offs[paren].start) != -1 &&
5666             (t1 = rx->offs[paren].end) != -1)
5667             {
5668             i = t1 - s1;
5669             goto getlen;
5670         } else {
5671             if (ckWARN(WARN_UNINITIALIZED))
5672                 report_uninit((const SV *)sv);
5673             return 0;
5674         }
5675     }
5676   getlen:
5677     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5678         const char * const s = rx->subbeg + s1;
5679         const U8 *ep;
5680         STRLEN el;
5681
5682         i = t1 - s1;
5683         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5684                         i = el;
5685     }
5686     return i;
5687 }
5688
5689 SV*
5690 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5691 {
5692     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5693         PERL_UNUSED_ARG(rx);
5694         if (0)
5695             return NULL;
5696         else
5697             return newSVpvs("Regexp");
5698 }
5699
5700 /* Scans the name of a named buffer from the pattern.
5701  * If flags is REG_RSN_RETURN_NULL returns null.
5702  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5703  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5704  * to the parsed name as looked up in the RExC_paren_names hash.
5705  * If there is an error throws a vFAIL().. type exception.
5706  */
5707
5708 #define REG_RSN_RETURN_NULL    0
5709 #define REG_RSN_RETURN_NAME    1
5710 #define REG_RSN_RETURN_DATA    2
5711
5712 STATIC SV*
5713 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5714 {
5715     char *name_start = RExC_parse;
5716
5717     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5718
5719     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5720          /* skip IDFIRST by using do...while */
5721         if (UTF)
5722             do {
5723                 RExC_parse += UTF8SKIP(RExC_parse);
5724             } while (isALNUM_utf8((U8*)RExC_parse));
5725         else
5726             do {
5727                 RExC_parse++;
5728             } while (isALNUM(*RExC_parse));
5729     }
5730
5731     if ( flags ) {
5732         SV* sv_name
5733             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5734                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5735         if ( flags == REG_RSN_RETURN_NAME)
5736             return sv_name;
5737         else if (flags==REG_RSN_RETURN_DATA) {
5738             HE *he_str = NULL;
5739             SV *sv_dat = NULL;
5740             if ( ! sv_name )      /* should not happen*/
5741                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5742             if (RExC_paren_names)
5743                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5744             if ( he_str )
5745                 sv_dat = HeVAL(he_str);
5746             if ( ! sv_dat )
5747                 vFAIL("Reference to nonexistent named group");
5748             return sv_dat;
5749         }
5750         else {
5751             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5752         }
5753         /* NOT REACHED */
5754     }
5755     return NULL;
5756 }
5757
5758 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5759     int rem=(int)(RExC_end - RExC_parse);                       \
5760     int cut;                                                    \
5761     int num;                                                    \
5762     int iscut=0;                                                \
5763     if (rem>10) {                                               \
5764         rem=10;                                                 \
5765         iscut=1;                                                \
5766     }                                                           \
5767     cut=10-rem;                                                 \
5768     if (RExC_lastparse!=RExC_parse)                             \
5769         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5770             rem, RExC_parse,                                    \
5771             cut + 4,                                            \
5772             iscut ? "..." : "<"                                 \
5773         );                                                      \
5774     else                                                        \
5775         PerlIO_printf(Perl_debug_log,"%16s","");                \
5776                                                                 \
5777     if (SIZE_ONLY)                                              \
5778        num = RExC_size + 1;                                     \
5779     else                                                        \
5780        num=REG_NODE_NUM(RExC_emit);                             \
5781     if (RExC_lastnum!=num)                                      \
5782        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5783     else                                                        \
5784        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5785     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5786         (int)((depth*2)), "",                                   \
5787         (funcname)                                              \
5788     );                                                          \
5789     RExC_lastnum=num;                                           \
5790     RExC_lastparse=RExC_parse;                                  \
5791 })
5792
5793
5794
5795 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5796     DEBUG_PARSE_MSG((funcname));                            \
5797     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5798 })
5799 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5800     DEBUG_PARSE_MSG((funcname));                            \
5801     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5802 })
5803
5804 /* This section of code defines the inversion list object and its methods.  The
5805  * interfaces are highly subject to change, so as much as possible is static to
5806  * this file.  An inversion list is here implemented as a malloc'd C array with
5807  * some added info.  More will be coming when functionality is added later.
5808  *
5809  * Some of the methods should always be private to the implementation, and some
5810  * should eventually be made public */
5811
5812 #define INVLIST_INITIAL_LEN 10
5813 #define INVLIST_ARRAY_KEY "array"
5814 #define INVLIST_MAX_KEY "max"
5815 #define INVLIST_LEN_KEY "len"
5816
5817 PERL_STATIC_INLINE UV*
5818 S_invlist_array(pTHX_ HV* const invlist)
5819 {
5820     /* Returns the pointer to the inversion list's array.  Every time the
5821      * length changes, this needs to be called in case malloc or realloc moved
5822      * it */
5823
5824     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5825
5826     PERL_ARGS_ASSERT_INVLIST_ARRAY;
5827
5828     if (list_ptr == NULL) {
5829         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5830                                                             INVLIST_ARRAY_KEY);
5831     }
5832
5833     return INT2PTR(UV *, SvUV(*list_ptr));
5834 }
5835
5836 PERL_STATIC_INLINE void
5837 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5838 {
5839     PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5840
5841     /* Sets the array stored in the inversion list to the memory beginning with
5842      * the parameter */
5843
5844     if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5845         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5846                                                             INVLIST_ARRAY_KEY);
5847     }
5848 }
5849
5850 PERL_STATIC_INLINE UV
5851 S_invlist_len(pTHX_ HV* const invlist)
5852 {
5853     /* Returns the current number of elements in the inversion list's array */
5854
5855     SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5856
5857     PERL_ARGS_ASSERT_INVLIST_LEN;
5858
5859     if (len_ptr == NULL) {
5860         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5861                                                             INVLIST_LEN_KEY);
5862     }
5863
5864     return SvUV(*len_ptr);
5865 }
5866
5867 PERL_STATIC_INLINE UV
5868 S_invlist_max(pTHX_ HV* const invlist)
5869 {
5870     /* Returns the maximum number of elements storable in the inversion list's
5871      * array, without having to realloc() */
5872
5873     SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5874
5875     PERL_ARGS_ASSERT_INVLIST_MAX;
5876
5877     if (max_ptr == NULL) {
5878         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5879                                                             INVLIST_MAX_KEY);
5880     }
5881
5882     return SvUV(*max_ptr);
5883 }
5884
5885 PERL_STATIC_INLINE void
5886 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5887 {
5888     /* Sets the current number of elements stored in the inversion list */
5889
5890     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5891
5892     if (len != 0 && len > invlist_max(invlist)) {
5893         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5894     }
5895
5896     if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5897         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5898                                                             INVLIST_LEN_KEY);
5899     }
5900 }
5901
5902 PERL_STATIC_INLINE void
5903 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5904 {
5905
5906     /* Sets the maximum number of elements storable in the inversion list
5907      * without having to realloc() */
5908
5909     PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5910
5911     if (max < invlist_len(invlist)) {
5912         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5913     }
5914
5915     if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5916         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5917                                                             INVLIST_LEN_KEY);
5918     }
5919 }
5920
5921 #ifndef PERL_IN_XSUB_RE
5922 HV*
5923 Perl__new_invlist(pTHX_ IV initial_size)
5924 {
5925
5926     /* Return a pointer to a newly constructed inversion list, with enough
5927      * space to store 'initial_size' elements.  If that number is negative, a
5928      * system default is used instead */
5929
5930     HV* invlist = newHV();
5931     UV* list;
5932
5933     if (initial_size < 0) {
5934         initial_size = INVLIST_INITIAL_LEN;
5935     }
5936
5937     /* Allocate the initial space */
5938     Newx(list, initial_size, UV);
5939     invlist_set_array(invlist, list);
5940
5941     /* set_len has to come before set_max, as the latter inspects the len */
5942     invlist_set_len(invlist, 0);
5943     invlist_set_max(invlist, initial_size);
5944
5945     return invlist;
5946 }
5947 #endif
5948
5949 PERL_STATIC_INLINE void
5950 S_invlist_destroy(pTHX_ HV* const invlist)
5951 {
5952    /* Inversion list destructor */
5953
5954     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5955
5956     PERL_ARGS_ASSERT_INVLIST_DESTROY;
5957
5958     if (list_ptr != NULL) {
5959         UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5960         Safefree(list);
5961     }
5962 }
5963
5964 STATIC void
5965 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5966 {
5967     /* Change the maximum size of an inversion list (up or down) */
5968
5969     UV* orig_array;
5970     UV* array;
5971     const UV old_max = invlist_max(invlist);
5972
5973     PERL_ARGS_ASSERT_INVLIST_EXTEND;
5974
5975     if (old_max == new_max) {   /* If a no-op */
5976         return;
5977     }
5978
5979     array = orig_array = invlist_array(invlist);
5980     Renew(array, new_max, UV);
5981
5982     /* If the size change moved the list in memory, set the new one */
5983     if (array != orig_array) {
5984         invlist_set_array(invlist, array);
5985     }
5986
5987     invlist_set_max(invlist, new_max);
5988
5989 }
5990
5991 PERL_STATIC_INLINE void
5992 S_invlist_trim(pTHX_ HV* const invlist)
5993 {
5994     PERL_ARGS_ASSERT_INVLIST_TRIM;
5995
5996     /* Change the length of the inversion list to how many entries it currently
5997      * has */
5998
5999     invlist_extend(invlist, invlist_len(invlist));
6000 }
6001
6002 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6003  * etc */
6004
6005 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
6006
6007 #ifndef PERL_IN_XSUB_RE
6008 void
6009 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6010 {
6011    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6012     * the end of the inversion list.  The range must be above any existing
6013     * ones. */
6014
6015     UV* array = invlist_array(invlist);
6016     UV max = invlist_max(invlist);
6017     UV len = invlist_len(invlist);
6018
6019     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6020
6021     if (len > 0) {
6022
6023         /* Here, the existing list is non-empty. The current max entry in the
6024          * list is generally the first value not in the set, except when the
6025          * set extends to the end of permissible values, in which case it is
6026          * the first entry in that final set, and so this call is an attempt to
6027          * append out-of-order */
6028
6029         UV final_element = len - 1;
6030         if (array[final_element] > start
6031             || ELEMENT_IN_INVLIST_SET(final_element))
6032         {
6033             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
6034         }
6035
6036         /* Here, it is a legal append.  If the new range begins with the first
6037          * value not in the set, it is extending the set, so the new first
6038          * value not in the set is one greater than the newly extended range.
6039          * */
6040         if (array[final_element] == start) {
6041             if (end != UV_MAX) {
6042                 array[final_element] = end + 1;
6043             }
6044             else {
6045                 /* But if the end is the maximum representable on the machine,
6046                  * just let the range that this would extend have no end */
6047                 invlist_set_len(invlist, len - 1);
6048             }
6049             return;
6050         }
6051     }
6052
6053     /* Here the new range doesn't extend any existing set.  Add it */
6054
6055     len += 2;   /* Includes an element each for the start and end of range */
6056
6057     /* If overflows the existing space, extend, which may cause the array to be
6058      * moved */
6059     if (max < len) {
6060         invlist_extend(invlist, len);
6061         array = invlist_array(invlist);
6062     }
6063
6064     invlist_set_len(invlist, len);
6065
6066     /* The next item on the list starts the range, the one after that is
6067      * one past the new range.  */
6068     array[len - 2] = start;
6069     if (end != UV_MAX) {
6070         array[len - 1] = end + 1;
6071     }
6072     else {
6073         /* But if the end is the maximum representable on the machine, just let
6074          * the range have no end */
6075         invlist_set_len(invlist, len - 1);
6076     }
6077 }
6078 #endif
6079
6080 STATIC HV*
6081 S_invlist_union(pTHX_ HV* const a, HV* const b)
6082 {
6083     /* Return a new inversion list which is the union of two inversion lists.
6084      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6085      * Richard Gillam, published by Addison-Wesley, and explained at some
6086      * length there.  The preface says to incorporate its examples into your
6087      * code at your own risk.
6088      *
6089      * The algorithm is like a merge sort.
6090      *
6091      * XXX A potential performance improvement is to keep track as we go along
6092      * if only one of the inputs contributes to the result, meaning the other
6093      * is a subset of that one.  In that case, we can skip the final copy and
6094      * return the larger of the input lists */
6095
6096     UV* array_a = invlist_array(a);   /* a's array */
6097     UV* array_b = invlist_array(b);
6098     UV len_a = invlist_len(a);  /* length of a's array */
6099     UV len_b = invlist_len(b);
6100
6101     HV* u;                      /* the resulting union */
6102     UV* array_u;
6103     UV len_u;
6104
6105     UV i_a = 0;             /* current index into a's array */
6106     UV i_b = 0;
6107     UV i_u = 0;
6108
6109     /* running count, as explained in the algorithm source book; items are
6110      * stopped accumulating and are output when the count changes to/from 0.
6111      * The count is incremented when we start a range that's in the set, and
6112      * decremented when we start a range that's not in the set.  So its range
6113      * is 0 to 2.  Only when the count is zero is something not in the set.
6114      */
6115     UV count = 0;
6116
6117     PERL_ARGS_ASSERT_INVLIST_UNION;
6118
6119     /* Size the union for the worst case: that the sets are completely
6120      * disjoint */
6121     u = _new_invlist(len_a + len_b);
6122     array_u = invlist_array(u);
6123
6124     /* Go through each list item by item, stopping when exhausted one of
6125      * them */
6126     while (i_a < len_a && i_b < len_b) {
6127         UV cp;      /* The element to potentially add to the union's array */
6128         bool cp_in_set;   /* is it in the the input list's set or not */
6129
6130         /* We need to take one or the other of the two inputs for the union.
6131          * Since we are merging two sorted lists, we take the smaller of the
6132          * next items.  In case of a tie, we take the one that is in its set
6133          * first.  If we took one not in the set first, it would decrement the
6134          * count, possibly to 0 which would cause it to be output as ending the
6135          * range, and the next time through we would take the same number, and
6136          * output it again as beginning the next range.  By doing it the
6137          * opposite way, there is no possibility that the count will be
6138          * momentarily decremented to 0, and thus the two adjoining ranges will
6139          * be seamlessly merged.  (In a tie and both are in the set or both not
6140          * in the set, it doesn't matter which we take first.) */
6141         if (array_a[i_a] < array_b[i_b]
6142             || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6143         {
6144             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6145             cp= array_a[i_a++];
6146         }
6147         else {
6148             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6149             cp= array_b[i_b++];
6150         }
6151
6152         /* Here, have chosen which of the two inputs to look at.  Only output
6153          * if the running count changes to/from 0, which marks the
6154          * beginning/end of a range in that's in the set */
6155         if (cp_in_set) {
6156             if (count == 0) {
6157                 array_u[i_u++] = cp;
6158             }
6159             count++;
6160         }
6161         else {
6162             count--;
6163             if (count == 0) {
6164                 array_u[i_u++] = cp;
6165             }
6166         }
6167     }
6168
6169     /* Here, we are finished going through at least one of the lists, which
6170      * means there is something remaining in at most one.  We check if the list
6171      * that hasn't been exhausted is positioned such that we are in the middle
6172      * of a range in its set or not.  (We are in the set if the next item in
6173      * the array marks the beginning of something not in the set)   If in the
6174      * set, we decrement 'count'; if 0, there is potentially more to output.
6175      * There are four cases:
6176      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6177      *     in the union is entirely from the non-exhausted set.
6178      *  2) Both were in their sets, count is 2.  Nothing further should
6179      *     be output, as everything that remains will be in the exhausted
6180      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6181      *     that
6182      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6183      *     Nothing further should be output because the union includes
6184      *     everything from the exhausted set.  Not decrementing insures that.
6185      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6186      *     decrementing to 0 insures that we look at the remainder of the
6187      *     non-exhausted set */
6188     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6189         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6190     {
6191         count--;
6192     }
6193
6194     /* The final length is what we've output so far, plus what else is about to
6195      * be output.  (If 'count' is non-zero, then the input list we exhausted
6196      * has everything remaining up to the machine's limit in its set, and hence
6197      * in the union, so there will be no further output. */
6198     len_u = i_u;
6199     if (count == 0) {
6200         /* At most one of the subexpressions will be non-zero */
6201         len_u += (len_a - i_a) + (len_b - i_b);
6202     }
6203
6204     /* Set result to final length, which can change the pointer to array_u, so
6205      * re-find it */
6206     if (len_u != invlist_len(u)) {
6207         invlist_set_len(u, len_u);
6208         invlist_trim(u);
6209         array_u = invlist_array(u);
6210     }
6211
6212     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6213      * the other) ended with everything above it not in its set.  That means
6214      * that the remaining part of the union is precisely the same as the
6215      * non-exhausted list, so can just copy it unchanged.  (If both list were
6216      * exhausted at the same time, then the operations below will be both 0.)
6217      */
6218     if (count == 0) {
6219         IV copy_count; /* At most one will have a non-zero copy count */
6220         if ((copy_count = len_a - i_a) > 0) {
6221             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6222         }
6223         else if ((copy_count = len_b - i_b) > 0) {
6224             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6225         }
6226     }
6227
6228     return u;
6229 }
6230
6231 STATIC HV*
6232 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6233 {
6234     /* Return the intersection of two inversion lists.  The basis for this
6235      * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6236      * by Addison-Wesley, and explained at some length there.  The preface says
6237      * to incorporate its examples into your code at your own risk.
6238      *
6239      * The algorithm is like a merge sort, and is essentially the same as the
6240      * union above
6241      */
6242
6243     UV* array_a = invlist_array(a);   /* a's array */
6244     UV* array_b = invlist_array(b);
6245     UV len_a = invlist_len(a);  /* length of a's array */
6246     UV len_b = invlist_len(b);
6247
6248     HV* r;                   /* the resulting intersection */
6249     UV* array_r;
6250     UV len_r;
6251
6252     UV i_a = 0;             /* current index into a's array */
6253     UV i_b = 0;
6254     UV i_r = 0;
6255
6256     /* running count, as explained in the algorithm source book; items are
6257      * stopped accumulating and are output when the count changes to/from 2.
6258      * The count is incremented when we start a range that's in the set, and
6259      * decremented when we start a range that's not in the set.  So its range
6260      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6261      */
6262     UV count = 0;
6263
6264     PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6265
6266     /* Size the intersection for the worst case: that the intersection ends up
6267      * fragmenting everything to be completely disjoint */
6268     r= _new_invlist(len_a + len_b);
6269     array_r = invlist_array(r);
6270
6271     /* Go through each list item by item, stopping when exhausted one of
6272      * them */
6273     while (i_a < len_a && i_b < len_b) {
6274         UV cp;      /* The element to potentially add to the intersection's
6275                        array */
6276         bool cp_in_set; /* Is it in the input list's set or not */
6277
6278         /* We need to take one or the other of the two inputs for the union.
6279          * Since we are merging two sorted lists, we take the smaller of the
6280          * next items.  In case of a tie, we take the one that is not in its
6281          * set first (a difference from the union algorithm).  If we took one
6282          * in the set first, it would increment the count, possibly to 2 which
6283          * would cause it to be output as starting a range in the intersection,
6284          * and the next time through we would take that same number, and output
6285          * it again as ending the set.  By doing it the opposite of this, we
6286          * there is no possibility that the count will be momentarily
6287          * incremented to 2.  (In a tie and both are in the set or both not in
6288          * the set, it doesn't matter which we take first.) */
6289         if (array_a[i_a] < array_b[i_b]
6290             || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6291         {
6292             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6293             cp= array_a[i_a++];
6294         }
6295         else {
6296             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6297             cp= array_b[i_b++];
6298         }
6299
6300         /* Here, have chosen which of the two inputs to look at.  Only output
6301          * if the running count changes to/from 2, which marks the
6302          * beginning/end of a range that's in the intersection */
6303         if (cp_in_set) {
6304             count++;
6305             if (count == 2) {
6306                 array_r[i_r++] = cp;
6307             }
6308         }
6309         else {
6310             if (count == 2) {
6311                 array_r[i_r++] = cp;
6312             }
6313             count--;
6314         }
6315     }
6316
6317     /* Here, we are finished going through at least one of the sets, which
6318      * means there is something remaining in at most one.  See the comments in
6319      * the union code */
6320     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6321         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6322     {
6323         count--;
6324     }
6325
6326     /* The final length is what we've output so far plus what else is in the
6327      * intersection.  Only one of the subexpressions below will be non-zero */
6328     len_r = i_r;
6329     if (count == 2) {
6330         len_r += (len_a - i_a) + (len_b - i_b);
6331     }
6332
6333     /* Set result to final length, which can change the pointer to array_r, so
6334      * re-find it */
6335     if (len_r != invlist_len(r)) {
6336         invlist_set_len(r, len_r);
6337         invlist_trim(r);
6338         array_r = invlist_array(r);
6339     }
6340
6341     /* Finish outputting any remaining */
6342     if (count == 2) { /* Only one of will have a non-zero copy count */
6343         IV copy_count;
6344         if ((copy_count = len_a - i_a) > 0) {
6345             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6346         }
6347         else if ((copy_count = len_b - i_b) > 0) {
6348             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6349         }
6350     }
6351
6352     return r;
6353 }
6354
6355 STATIC HV*
6356 S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6357 {
6358     /* Add the range from 'start' to 'end' inclusive to the inversion list's
6359      * set.  A pointer to the inversion list is returned.  This may actually be
6360      * a new list, in which case the passed in one has been destroyed.  The
6361      * passed in inversion list can be NULL, in which case a new one is created
6362      * with just the one range in it */
6363
6364     HV* range_invlist;
6365     HV* added_invlist;
6366     UV len;
6367
6368     if (invlist == NULL) {
6369         invlist = _new_invlist(2);
6370         len = 0;
6371     }
6372     else {
6373         len = invlist_len(invlist);
6374     }
6375
6376     /* If comes after the final entry, can just append it to the end */
6377     if (len == 0
6378         || start >= invlist_array(invlist)
6379                                     [invlist_len(invlist) - 1])
6380     {
6381         _append_range_to_invlist(invlist, start, end);
6382         return invlist;
6383     }
6384
6385     /* Here, can't just append things, create and return a new inversion list
6386      * which is the union of this range and the existing inversion list */
6387     range_invlist = _new_invlist(2);
6388     _append_range_to_invlist(range_invlist, start, end);
6389
6390     added_invlist = invlist_union(invlist, range_invlist);
6391
6392     /* The passed in list can be freed, as well as our temporary */
6393     invlist_destroy(range_invlist);
6394     if (invlist != added_invlist) {
6395         invlist_destroy(invlist);
6396     }
6397
6398     return added_invlist;
6399 }
6400
6401 PERL_STATIC_INLINE HV*
6402 S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6403     return add_range_to_invlist(invlist, cp, cp);
6404 }
6405
6406 /* End of inversion list object */
6407
6408 /*
6409  - reg - regular expression, i.e. main body or parenthesized thing
6410  *
6411  * Caller must absorb opening parenthesis.
6412  *
6413  * Combining parenthesis handling with the base level of regular expression
6414  * is a trifle forced, but the need to tie the tails of the branches to what
6415  * follows makes it hard to avoid.
6416  */
6417 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6418 #ifdef DEBUGGING
6419 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6420 #else
6421 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6422 #endif
6423
6424 STATIC regnode *
6425 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6426     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6427 {
6428     dVAR;
6429     register regnode *ret;              /* Will be the head of the group. */
6430     register regnode *br;
6431     register regnode *lastbr;
6432     register regnode *ender = NULL;
6433     register I32 parno = 0;
6434     I32 flags;
6435     U32 oregflags = RExC_flags;
6436     bool have_branch = 0;
6437     bool is_open = 0;
6438     I32 freeze_paren = 0;
6439     I32 after_freeze = 0;
6440
6441     /* for (?g), (?gc), and (?o) warnings; warning
6442        about (?c) will warn about (?g) -- japhy    */
6443
6444 #define WASTED_O  0x01
6445 #define WASTED_G  0x02
6446 #define WASTED_C  0x04
6447 #define WASTED_GC (0x02|0x04)
6448     I32 wastedflags = 0x00;
6449
6450     char * parse_start = RExC_parse; /* MJD */
6451     char * const oregcomp_parse = RExC_parse;
6452
6453     GET_RE_DEBUG_FLAGS_DECL;
6454
6455     PERL_ARGS_ASSERT_REG;
6456     DEBUG_PARSE("reg ");
6457
6458     *flagp = 0;                         /* Tentatively. */
6459
6460
6461     /* Make an OPEN node, if parenthesized. */
6462     if (paren) {
6463         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6464             char *start_verb = RExC_parse;
6465             STRLEN verb_len = 0;
6466             char *start_arg = NULL;
6467             unsigned char op = 0;
6468             int argok = 1;
6469             int internal_argval = 0; /* internal_argval is only useful if !argok */
6470             while ( *RExC_parse && *RExC_parse != ')' ) {
6471                 if ( *RExC_parse == ':' ) {
6472                     start_arg = RExC_parse + 1;
6473                     break;
6474                 }
6475                 RExC_parse++;
6476             }
6477             ++start_verb;
6478             verb_len = RExC_parse - start_verb;
6479             if ( start_arg ) {
6480                 RExC_parse++;
6481                 while ( *RExC_parse && *RExC_parse != ')' )
6482                     RExC_parse++;
6483                 if ( *RExC_parse != ')' )
6484                     vFAIL("Unterminated verb pattern argument");
6485                 if ( RExC_parse == start_arg )
6486                     start_arg = NULL;
6487             } else {
6488                 if ( *RExC_parse != ')' )
6489                     vFAIL("Unterminated verb pattern");
6490             }
6491
6492             switch ( *start_verb ) {
6493             case 'A':  /* (*ACCEPT) */
6494                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6495                     op = ACCEPT;
6496                     internal_argval = RExC_nestroot;
6497                 }
6498                 break;
6499             case 'C':  /* (*COMMIT) */
6500                 if ( memEQs(start_verb,verb_len,"COMMIT") )
6501                     op = COMMIT;
6502                 break;
6503             case 'F':  /* (*FAIL) */
6504                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6505                     op = OPFAIL;
6506                     argok = 0;
6507                 }
6508                 break;
6509             case ':':  /* (*:NAME) */
6510             case 'M':  /* (*MARK:NAME) */
6511                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6512                     op = MARKPOINT;
6513                     argok = -1;
6514                 }
6515                 break;
6516             case 'P':  /* (*PRUNE) */
6517                 if ( memEQs(start_verb,verb_len,"PRUNE") )
6518                     op = PRUNE;
6519                 break;
6520             case 'S':   /* (*SKIP) */
6521                 if ( memEQs(start_verb,verb_len,"SKIP") )
6522                     op = SKIP;
6523                 break;
6524             case 'T':  /* (*THEN) */
6525                 /* [19:06] <TimToady> :: is then */
6526                 if ( memEQs(start_verb,verb_len,"THEN") ) {
6527                     op = CUTGROUP;
6528                     RExC_seen |= REG_SEEN_CUTGROUP;
6529                 }
6530                 break;
6531             }
6532             if ( ! op ) {
6533                 RExC_parse++;
6534                 vFAIL3("Unknown verb pattern '%.*s'",
6535                     verb_len, start_verb);
6536             }
6537             if ( argok ) {
6538                 if ( start_arg && internal_argval ) {
6539                     vFAIL3("Verb pattern '%.*s' may not have an argument",
6540                         verb_len, start_verb);
6541                 } else if ( argok < 0 && !start_arg ) {
6542                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6543                         verb_len, start_verb);
6544                 } else {
6545                     ret = reganode(pRExC_state, op, internal_argval);
6546                     if ( ! internal_argval && ! SIZE_ONLY ) {
6547                         if (start_arg) {
6548                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6549                             ARG(ret) = add_data( pRExC_state, 1, "S" );
6550                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6551                             ret->flags = 0;
6552                         } else {
6553                             ret->flags = 1;
6554                         }
6555                     }
6556                 }
6557                 if (!internal_argval)
6558                     RExC_seen |= REG_SEEN_VERBARG;
6559             } else if ( start_arg ) {
6560                 vFAIL3("Verb pattern '%.*s' may not have an argument",
6561                         verb_len, start_verb);
6562             } else {
6563                 ret = reg_node(pRExC_state, op);
6564             }
6565             nextchar(pRExC_state);
6566             return ret;
6567         } else
6568         if (*RExC_parse == '?') { /* (?...) */
6569             bool is_logical = 0;
6570             const char * const seqstart = RExC_parse;
6571             bool has_use_defaults = FALSE;
6572
6573             RExC_parse++;
6574             paren = *RExC_parse++;
6575             ret = NULL;                 /* For look-ahead/behind. */
6576             switch (paren) {
6577
6578             case 'P':   /* (?P...) variants for those used to PCRE/Python */
6579                 paren = *RExC_parse++;
6580                 if ( paren == '<')         /* (?P<...>) named capture */
6581                     goto named_capture;
6582                 else if (paren == '>') {   /* (?P>name) named recursion */
6583                     goto named_recursion;
6584                 }
6585                 else if (paren == '=') {   /* (?P=...)  named backref */
6586                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
6587                        you change this make sure you change that */
6588                     char* name_start = RExC_parse;
6589                     U32 num = 0;
6590                     SV *sv_dat = reg_scan_name(pRExC_state,
6591                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6592                     if (RExC_parse == name_start || *RExC_parse != ')')
6593                         vFAIL2("Sequence %.3s... not terminated",parse_start);
6594
6595                     if (!SIZE_ONLY) {
6596                         num = add_data( pRExC_state, 1, "S" );
6597                         RExC_rxi->data->data[num]=(void*)sv_dat;
6598                         SvREFCNT_inc_simple_void(sv_dat);
6599                     }
6600                     RExC_sawback = 1;
6601                     ret = reganode(pRExC_state,
6602                                    ((! FOLD)
6603                                      ? NREF
6604                                      : (MORE_ASCII_RESTRICTED)
6605                                        ? NREFFA
6606                                        : (AT_LEAST_UNI_SEMANTICS)
6607                                          ? NREFFU
6608                                          : (LOC)
6609                                            ? NREFFL
6610                                            : NREFF),
6611                                     num);
6612                     *flagp |= HASWIDTH;
6613
6614                     Set_Node_Offset(ret, parse_start+1);
6615                     Set_Node_Cur_Length(ret); /* MJD */
6616
6617                     nextchar(pRExC_state);
6618                     return ret;
6619                 }
6620                 RExC_parse++;
6621                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6622                 /*NOTREACHED*/
6623             case '<':           /* (?<...) */
6624                 if (*RExC_parse == '!')
6625                     paren = ',';
6626                 else if (*RExC_parse != '=')
6627               named_capture:
6628                 {               /* (?<...>) */
6629                     char *name_start;
6630                     SV *svname;
6631                     paren= '>';
6632             case '\'':          /* (?'...') */
6633                     name_start= RExC_parse;
6634                     svname = reg_scan_name(pRExC_state,
6635                         SIZE_ONLY ?  /* reverse test from the others */
6636                         REG_RSN_RETURN_NAME :
6637                         REG_RSN_RETURN_NULL);
6638                     if (RExC_parse == name_start) {
6639                         RExC_parse++;
6640                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6641                         /*NOTREACHED*/
6642                     }
6643                     if (*RExC_parse != paren)
6644                         vFAIL2("Sequence (?%c... not terminated",
6645                             paren=='>' ? '<' : paren);
6646                     if (SIZE_ONLY) {
6647                         HE *he_str;
6648                         SV *sv_dat = NULL;
6649                         if (!svname) /* shouldn't happen */
6650                             Perl_croak(aTHX_
6651                                 "panic: reg_scan_name returned NULL");
6652                         if (!RExC_paren_names) {
6653                             RExC_paren_names= newHV();
6654                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
6655 #ifdef DEBUGGING
6656                             RExC_paren_name_list= newAV();
6657                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6658 #endif
6659                         }
6660                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6661                         if ( he_str )
6662                             sv_dat = HeVAL(he_str);
6663                         if ( ! sv_dat ) {
6664                             /* croak baby croak */
6665                             Perl_croak(aTHX_
6666                                 "panic: paren_name hash element allocation failed");
6667                         } else if ( SvPOK(sv_dat) ) {
6668                             /* (?|...) can mean we have dupes so scan to check
6669                                its already been stored. Maybe a flag indicating
6670                                we are inside such a construct would be useful,
6671                                but the arrays are likely to be quite small, so
6672                                for now we punt -- dmq */
6673                             IV count = SvIV(sv_dat);
6674                             I32 *pv = (I32*)SvPVX(sv_dat);
6675                             IV i;
6676                             for ( i = 0 ; i < count ; i++ ) {
6677                                 if ( pv[i] == RExC_npar ) {
6678                                     count = 0;
6679                                     break;
6680                                 }
6681                             }
6682                             if ( count ) {
6683                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6684                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6685                                 pv[count] = RExC_npar;
6686                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6687                             }
6688                         } else {
6689                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
6690                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6691                             SvIOK_on(sv_dat);
6692                             SvIV_set(sv_dat, 1);
6693                         }
6694 #ifdef DEBUGGING
6695                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6696                             SvREFCNT_dec(svname);
6697 #endif
6698
6699                         /*sv_dump(sv_dat);*/
6700                     }
6701                     nextchar(pRExC_state);
6702                     paren = 1;
6703                     goto capturing_parens;
6704                 }
6705                 RExC_seen |= REG_SEEN_LOOKBEHIND;
6706                 RExC_in_lookbehind++;
6707                 RExC_parse++;
6708             case '=':           /* (?=...) */
6709                 RExC_seen_zerolen++;
6710                 break;
6711             case '!':           /* (?!...) */
6712                 RExC_seen_zerolen++;
6713                 if (*RExC_parse == ')') {
6714                     ret=reg_node(pRExC_state, OPFAIL);
6715                     nextchar(pRExC_state);
6716                     return ret;
6717                 }
6718                 break;
6719             case '|':           /* (?|...) */
6720                 /* branch reset, behave like a (?:...) except that
6721                    buffers in alternations share the same numbers */
6722                 paren = ':';
6723                 after_freeze = freeze_paren = RExC_npar;
6724                 break;
6725             case ':':           /* (?:...) */
6726             case '>':           /* (?>...) */
6727                 break;
6728             case '$':           /* (?$...) */
6729             case '@':           /* (?@...) */
6730                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6731                 break;
6732             case '#':           /* (?#...) */
6733                 while (*RExC_parse && *RExC_parse != ')')
6734                     RExC_parse++;
6735                 if (*RExC_parse != ')')
6736                     FAIL("Sequence (?#... not terminated");
6737                 nextchar(pRExC_state);
6738                 *flagp = TRYAGAIN;
6739                 return NULL;
6740             case '0' :           /* (?0) */
6741             case 'R' :           /* (?R) */
6742                 if (*RExC_parse != ')')
6743                     FAIL("Sequence (?R) not terminated");
6744                 ret = reg_node(pRExC_state, GOSTART);
6745                 *flagp |= POSTPONED;
6746                 nextchar(pRExC_state);
6747                 return ret;
6748                 /*notreached*/
6749             { /* named and numeric backreferences */
6750                 I32 num;
6751             case '&':            /* (?&NAME) */
6752                 parse_start = RExC_parse - 1;
6753               named_recursion:
6754                 {
6755                     SV *sv_dat = reg_scan_name(pRExC_state,
6756                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6757                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6758                 }
6759                 goto gen_recurse_regop;
6760                 /* NOT REACHED */
6761             case '+':
6762                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6763                     RExC_parse++;
6764                     vFAIL("Illegal pattern");
6765                 }
6766                 goto parse_recursion;
6767                 /* NOT REACHED*/
6768             case '-': /* (?-1) */
6769                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6770                     RExC_parse--; /* rewind to let it be handled later */
6771                     goto parse_flags;
6772                 }
6773                 /*FALLTHROUGH */
6774             case '1': case '2': case '3': case '4': /* (?1) */
6775             case '5': case '6': case '7': case '8': case '9':
6776                 RExC_parse--;
6777               parse_recursion:
6778                 num = atoi(RExC_parse);
6779                 parse_start = RExC_parse - 1; /* MJD */
6780                 if (*RExC_parse == '-')
6781                     RExC_parse++;
6782                 while (isDIGIT(*RExC_parse))
6783                         RExC_parse++;
6784                 if (*RExC_parse!=')')
6785                     vFAIL("Expecting close bracket");
6786
6787               gen_recurse_regop:
6788                 if ( paren == '-' ) {
6789                     /*
6790                     Diagram of capture buffer numbering.
6791                     Top line is the normal capture buffer numbers
6792                     Bottom line is the negative indexing as from
6793                     the X (the (?-2))
6794
6795                     +   1 2    3 4 5 X          6 7
6796                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6797                     -   5 4    3 2 1 X          x x
6798
6799                     */
6800                     num = RExC_npar + num;
6801                     if (num < 1)  {
6802                         RExC_parse++;
6803                         vFAIL("Reference to nonexistent group");
6804                     }
6805                 } else if ( paren == '+' ) {
6806                     num = RExC_npar + num - 1;
6807                 }
6808
6809                 ret = reganode(pRExC_state, GOSUB, num);
6810                 if (!SIZE_ONLY) {
6811                     if (num > (I32)RExC_rx->nparens) {
6812                         RExC_parse++;
6813                         vFAIL("Reference to nonexistent group");
6814                     }
6815                     ARG2L_SET( ret, RExC_recurse_count++);
6816                     RExC_emit++;
6817                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6818                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6819                 } else {
6820                     RExC_size++;
6821                 }
6822                 RExC_seen |= REG_SEEN_RECURSE;
6823                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6824                 Set_Node_Offset(ret, parse_start); /* MJD */
6825
6826                 *flagp |= POSTPONED;
6827                 nextchar(pRExC_state);
6828                 return ret;
6829             } /* named and numeric backreferences */
6830             /* NOT REACHED */
6831
6832             case '?':           /* (??...) */
6833                 is_logical = 1;
6834                 if (*RExC_parse != '{') {
6835                     RExC_parse++;
6836                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6837                     /*NOTREACHED*/
6838                 }
6839                 *flagp |= POSTPONED;
6840                 paren = *RExC_parse++;
6841                 /* FALL THROUGH */
6842             case '{':           /* (?{...}) */
6843             {
6844                 I32 count = 1;
6845                 U32 n = 0;
6846                 char c;
6847                 char *s = RExC_parse;
6848
6849                 RExC_seen_zerolen++;
6850                 RExC_seen |= REG_SEEN_EVAL;
6851                 while (count && (c = *RExC_parse)) {
6852                     if (c == '\\') {
6853                         if (RExC_parse[1])
6854                             RExC_parse++;
6855                     }
6856                     else if (c == '{')
6857                         count++;
6858                     else if (c == '}')
6859                         count--;
6860                     RExC_parse++;
6861                 }
6862                 if (*RExC_parse != ')') {
6863                     RExC_parse = s;
6864                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6865                 }
6866                 if (!SIZE_ONLY) {
6867                     PAD *pad;
6868                     OP_4tree *sop, *rop;
6869                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6870
6871                     ENTER;
6872                     Perl_save_re_context(aTHX);
6873                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6874                     sop->op_private |= OPpREFCOUNTED;
6875                     /* re_dup will OpREFCNT_inc */
6876                     OpREFCNT_set(sop, 1);
6877                     LEAVE;
6878
6879                     n = add_data(pRExC_state, 3, "nop");
6880                     RExC_rxi->data->data[n] = (void*)rop;
6881                     RExC_rxi->data->data[n+1] = (void*)sop;
6882                     RExC_rxi->data->data[n+2] = (void*)pad;
6883                     SvREFCNT_dec(sv);
6884                 }
6885                 else {                                          /* First pass */
6886                     if (PL_reginterp_cnt < ++RExC_seen_evals
6887                         && IN_PERL_RUNTIME)
6888                         /* No compiled RE interpolated, has runtime
6889                            components ===> unsafe.  */
6890                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6891                     if (PL_tainting && PL_tainted)
6892                         FAIL("Eval-group in insecure regular expression");
6893 #if PERL_VERSION > 8
6894                     if (IN_PERL_COMPILETIME)
6895                         PL_cv_has_eval = 1;
6896 #endif
6897                 }
6898
6899                 nextchar(pRExC_state);
6900                 if (is_logical) {
6901                     ret = reg_node(pRExC_state, LOGICAL);
6902                     if (!SIZE_ONLY)
6903                         ret->flags = 2;
6904                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6905                     /* deal with the length of this later - MJD */
6906                     return ret;
6907                 }
6908                 ret = reganode(pRExC_state, EVAL, n);
6909                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6910                 Set_Node_Offset(ret, parse_start);
6911                 return ret;
6912             }
6913             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6914             {
6915                 int is_define= 0;
6916                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6917                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6918                         || RExC_parse[1] == '<'
6919                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6920                         I32 flag;
6921
6922                         ret = reg_node(pRExC_state, LOGICAL);
6923                         if (!SIZE_ONLY)
6924                             ret->flags = 1;
6925                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6926                         goto insert_if;
6927                     }
6928                 }
6929                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6930                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6931                 {
6932                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6933                     char *name_start= RExC_parse++;
6934                     U32 num = 0;
6935                     SV *sv_dat=reg_scan_name(pRExC_state,
6936                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6937                     if (RExC_parse == name_start || *RExC_parse != ch)
6938                         vFAIL2("Sequence (?(%c... not terminated",
6939                             (ch == '>' ? '<' : ch));
6940                     RExC_parse++;
6941                     if (!SIZE_ONLY) {
6942                         num = add_data( pRExC_state, 1, "S" );
6943                         RExC_rxi->data->data[num]=(void*)sv_dat;
6944                         SvREFCNT_inc_simple_void(sv_dat);
6945                     }
6946                     ret = reganode(pRExC_state,NGROUPP,num);
6947                     goto insert_if_check_paren;
6948                 }
6949                 else if (RExC_parse[0] == 'D' &&
6950                          RExC_parse[1] == 'E' &&
6951                          RExC_parse[2] == 'F' &&
6952                          RExC_parse[3] == 'I' &&
6953                          RExC_parse[4] == 'N' &&
6954                          RExC_parse[5] == 'E')
6955                 {
6956                     ret = reganode(pRExC_state,DEFINEP,0);
6957                     RExC_parse +=6 ;
6958                     is_define = 1;
6959                     goto insert_if_check_paren;
6960                 }
6961                 else if (RExC_parse[0] == 'R') {
6962                     RExC_parse++;
6963                     parno = 0;
6964                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6965                         parno = atoi(RExC_parse++);
6966                         while (isDIGIT(*RExC_parse))
6967                             RExC_parse++;
6968                     } else if (RExC_parse[0] == '&') {
6969                         SV *sv_dat;
6970                         RExC_parse++;
6971                         sv_dat = reg_scan_name(pRExC_state,
6972                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6973                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6974                     }
6975                     ret = reganode(pRExC_state,INSUBP,parno);
6976                     goto insert_if_check_paren;
6977                 }
6978                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6979                     /* (?(1)...) */
6980                     char c;
6981                     parno = atoi(RExC_parse++);
6982
6983                     while (isDIGIT(*RExC_parse))
6984                         RExC_parse++;
6985                     ret = reganode(pRExC_state, GROUPP, parno);
6986
6987                  insert_if_check_paren:
6988                     if ((c = *nextchar(pRExC_state)) != ')')
6989                         vFAIL("Switch condition not recognized");
6990                   insert_if:
6991                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6992                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6993                     if (br == NULL)
6994                         br = reganode(pRExC_state, LONGJMP, 0);
6995                     else
6996                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6997                     c = *nextchar(pRExC_state);
6998                     if (flags&HASWIDTH)
6999                         *flagp |= HASWIDTH;
7000                     if (c == '|') {
7001                         if (is_define)
7002                             vFAIL("(?(DEFINE)....) does not allow branches");
7003                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7004                         regbranch(pRExC_state, &flags, 1,depth+1);
7005                         REGTAIL(pRExC_state, ret, lastbr);
7006                         if (flags&HASWIDTH)
7007                             *flagp |= HASWIDTH;
7008                         c = *nextchar(pRExC_state);
7009                     }
7010                     else
7011                         lastbr = NULL;
7012                     if (c != ')')
7013                         vFAIL("Switch (?(condition)... contains too many branches");
7014                     ender = reg_node(pRExC_state, TAIL);
7015                     REGTAIL(pRExC_state, br, ender);
7016                     if (lastbr) {
7017                         REGTAIL(pRExC_state, lastbr, ender);
7018                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7019                     }
7020                     else
7021                         REGTAIL(pRExC_state, ret, ender);
7022                     RExC_size++; /* XXX WHY do we need this?!!
7023                                     For large programs it seems to be required
7024                                     but I can't figure out why. -- dmq*/
7025                     return ret;
7026                 }
7027                 else {
7028                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7029                 }
7030             }
7031             case 0:
7032                 RExC_parse--; /* for vFAIL to print correctly */
7033                 vFAIL("Sequence (? incomplete");
7034                 break;
7035             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
7036                                        that follow */
7037                 has_use_defaults = TRUE;
7038                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7039                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7040                                                 ? REGEX_UNICODE_CHARSET
7041                                                 : REGEX_DEPENDS_CHARSET);
7042                 goto parse_flags;
7043             default:
7044                 --RExC_parse;
7045                 parse_flags:      /* (?i) */
7046             {
7047                 U32 posflags = 0, negflags = 0;
7048                 U32 *flagsp = &posflags;
7049                 bool has_charset_modifier = 0;
7050                 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7051                                     ? REGEX_UNICODE_CHARSET
7052                                     : REGEX_DEPENDS_CHARSET;
7053
7054                 while (*RExC_parse) {
7055                     /* && strchr("iogcmsx", *RExC_parse) */
7056                     /* (?g), (?gc) and (?o) are useless here
7057                        and must be globally applied -- japhy */
7058                     switch (*RExC_parse) {
7059                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7060                     case LOCALE_PAT_MOD:
7061                         if (has_charset_modifier || flagsp == &negflags) {
7062                             goto fail_modifiers;
7063                         }
7064                         cs = REGEX_LOCALE_CHARSET;
7065                         has_charset_modifier = 1;
7066                         RExC_contains_locale = 1;
7067                         break;
7068                     case UNICODE_PAT_MOD:
7069                         if (has_charset_modifier || flagsp == &negflags) {
7070                             goto fail_modifiers;
7071                         }
7072                         cs = REGEX_UNICODE_CHARSET;
7073                         has_charset_modifier = 1;
7074                         break;
7075                     case ASCII_RESTRICT_PAT_MOD:
7076                         if (has_charset_modifier || flagsp == &negflags) {
7077                             goto fail_modifiers;
7078                         }
7079                         if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
7080                             /* Doubled modifier implies more restricted */
7081                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7082                             RExC_parse++;
7083                         }
7084                         else {
7085                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
7086                         }
7087                         has_charset_modifier = 1;
7088                         break;
7089                     case DEPENDS_PAT_MOD:
7090                         if (has_use_defaults
7091                             || has_charset_modifier
7092                             || flagsp == &negflags)
7093                         {
7094                             goto fail_modifiers;
7095                         }
7096
7097                         /* The dual charset means unicode semantics if the
7098                          * pattern (or target, not known until runtime) are
7099                          * utf8, or something in the pattern indicates unicode
7100                          * semantics */
7101                         cs = (RExC_utf8 || RExC_uni_semantics)
7102                              ? REGEX_UNICODE_CHARSET
7103                              : REGEX_DEPENDS_CHARSET;
7104                         has_charset_modifier = 1;
7105                         break;
7106                     case ONCE_PAT_MOD: /* 'o' */
7107                     case GLOBAL_PAT_MOD: /* 'g' */
7108                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7109                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7110                             if (! (wastedflags & wflagbit) ) {
7111                                 wastedflags |= wflagbit;
7112                                 vWARN5(
7113                                     RExC_parse + 1,
7114                                     "Useless (%s%c) - %suse /%c modifier",
7115                                     flagsp == &negflags ? "?-" : "?",
7116                                     *RExC_parse,
7117                                     flagsp == &negflags ? "don't " : "",
7118                                     *RExC_parse
7119                                 );
7120                             }
7121                         }
7122                         break;
7123
7124                     case CONTINUE_PAT_MOD: /* 'c' */
7125                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7126                             if (! (wastedflags & WASTED_C) ) {
7127                                 wastedflags |= WASTED_GC;
7128                                 vWARN3(
7129                                     RExC_parse + 1,
7130                                     "Useless (%sc) - %suse /gc modifier",
7131                                     flagsp == &negflags ? "?-" : "?",
7132                                     flagsp == &negflags ? "don't " : ""
7133                                 );
7134                             }
7135                         }
7136                         break;
7137                     case KEEPCOPY_PAT_MOD: /* 'p' */
7138                         if (flagsp == &negflags) {
7139                             if (SIZE_ONLY)
7140                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7141                         } else {
7142                             *flagsp |= RXf_PMf_KEEPCOPY;
7143                         }
7144                         break;
7145                     case '-':
7146                         /* A flag is a default iff it is following a minus, so
7147                          * if there is a minus, it means will be trying to
7148                          * re-specify a default which is an error */
7149                         if (has_use_defaults || flagsp == &negflags) {
7150             fail_modifiers:
7151                             RExC_parse++;
7152                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7153                             /*NOTREACHED*/
7154                         }
7155                         flagsp = &negflags;
7156                         wastedflags = 0;  /* reset so (?g-c) warns twice */
7157                         break;
7158                     case ':':
7159                         paren = ':';
7160                         /*FALLTHROUGH*/
7161                     case ')':
7162                         RExC_flags |= posflags;
7163                         RExC_flags &= ~negflags;
7164                         set_regex_charset(&RExC_flags, cs);
7165                         if (paren != ':') {
7166                             oregflags |= posflags;
7167                             oregflags &= ~negflags;
7168                             set_regex_charset(&oregflags, cs);
7169                         }
7170                         nextchar(pRExC_state);
7171                         if (paren != ':') {
7172                             *flagp = TRYAGAIN;
7173                             return NULL;
7174                         } else {
7175                             ret = NULL;
7176                             goto parse_rest;
7177                         }
7178                         /*NOTREACHED*/
7179                     default:
7180                         RExC_parse++;
7181                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7182                         /*NOTREACHED*/
7183                     }
7184                     ++RExC_parse;
7185                 }
7186             }} /* one for the default block, one for the switch */
7187         }
7188         else {                  /* (...) */
7189           capturing_parens:
7190             parno = RExC_npar;
7191             RExC_npar++;
7192
7193             ret = reganode(pRExC_state, OPEN, parno);
7194             if (!SIZE_ONLY ){
7195                 if (!RExC_nestroot)
7196                     RExC_nestroot = parno;
7197                 if (RExC_seen & REG_SEEN_RECURSE
7198                     && !RExC_open_parens[parno-1])
7199                 {
7200                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7201                         "Setting open paren #%"IVdf" to %d\n",
7202                         (IV)parno, REG_NODE_NUM(ret)));
7203                     RExC_open_parens[parno-1]= ret;
7204                 }
7205             }
7206             Set_Node_Length(ret, 1); /* MJD */
7207             Set_Node_Offset(ret, RExC_parse); /* MJD */
7208             is_open = 1;
7209         }
7210     }
7211     else                        /* ! paren */
7212         ret = NULL;
7213
7214    parse_rest:
7215     /* Pick up the branches, linking them together. */
7216     parse_start = RExC_parse;   /* MJD */
7217     br = regbranch(pRExC_state, &flags, 1,depth+1);
7218
7219     /*     branch_len = (paren != 0); */
7220
7221     if (br == NULL)
7222         return(NULL);
7223     if (*RExC_parse == '|') {
7224         if (!SIZE_ONLY && RExC_extralen) {
7225             reginsert(pRExC_state, BRANCHJ, br, depth+1);
7226         }
7227         else {                  /* MJD */
7228             reginsert(pRExC_state, BRANCH, br, depth+1);
7229             Set_Node_Length(br, paren != 0);
7230             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7231         }
7232         have_branch = 1;
7233         if (SIZE_ONLY)
7234             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
7235     }
7236     else if (paren == ':') {
7237         *flagp |= flags&SIMPLE;
7238     }
7239     if (is_open) {                              /* Starts with OPEN. */
7240         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
7241     }
7242     else if (paren != '?')              /* Not Conditional */
7243         ret = br;
7244     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7245     lastbr = br;
7246     while (*RExC_parse == '|') {
7247         if (!SIZE_ONLY && RExC_extralen) {
7248             ender = reganode(pRExC_state, LONGJMP,0);
7249             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7250         }
7251         if (SIZE_ONLY)
7252             RExC_extralen += 2;         /* Account for LONGJMP. */
7253         nextchar(pRExC_state);
7254         if (freeze_paren) {
7255             if (RExC_npar > after_freeze)
7256                 after_freeze = RExC_npar;
7257             RExC_npar = freeze_paren;
7258         }
7259         br = regbranch(pRExC_state, &flags, 0, depth+1);
7260
7261         if (br == NULL)
7262             return(NULL);
7263         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
7264         lastbr = br;
7265         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7266     }
7267
7268     if (have_branch || paren != ':') {
7269         /* Make a closing node, and hook it on the end. */
7270         switch (paren) {
7271         case ':':
7272             ender = reg_node(pRExC_state, TAIL);
7273             break;
7274         case 1:
7275             ender = reganode(pRExC_state, CLOSE, parno);
7276             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7277                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7278                         "Setting close paren #%"IVdf" to %d\n",
7279                         (IV)parno, REG_NODE_NUM(ender)));
7280                 RExC_close_parens[parno-1]= ender;
7281                 if (RExC_nestroot == parno)
7282                     RExC_nestroot = 0;
7283             }
7284             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7285             Set_Node_Length(ender,1); /* MJD */
7286             break;
7287         case '<':
7288         case ',':
7289         case '=':
7290         case '!':
7291             *flagp &= ~HASWIDTH;
7292             /* FALL THROUGH */
7293         case '>':
7294             ender = reg_node(pRExC_state, SUCCEED);
7295             break;
7296         case 0:
7297             ender = reg_node(pRExC_state, END);
7298             if (!SIZE_ONLY) {
7299                 assert(!RExC_opend); /* there can only be one! */
7300                 RExC_opend = ender;
7301             }
7302             break;
7303         }
7304         REGTAIL(pRExC_state, lastbr, ender);
7305
7306         if (have_branch && !SIZE_ONLY) {
7307             if (depth==1)
7308                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7309
7310             /* Hook the tails of the branches to the closing node. */
7311             for (br = ret; br; br = regnext(br)) {
7312                 const U8 op = PL_regkind[OP(br)];
7313                 if (op == BRANCH) {
7314                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7315                 }
7316                 else if (op == BRANCHJ) {
7317                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7318                 }
7319             }
7320         }
7321     }
7322
7323     {
7324         const char *p;
7325         static const char parens[] = "=!<,>";
7326
7327         if (paren && (p = strchr(parens, paren))) {
7328             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7329             int flag = (p - parens) > 1;
7330
7331             if (paren == '>')
7332                 node = SUSPEND, flag = 0;
7333             reginsert(pRExC_state, node,ret, depth+1);
7334             Set_Node_Cur_Length(ret);
7335             Set_Node_Offset(ret, parse_start + 1);
7336             ret->flags = flag;
7337             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7338         }
7339     }
7340
7341     /* Check for proper termination. */
7342     if (paren) {
7343         RExC_flags = oregflags;
7344         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7345             RExC_parse = oregcomp_parse;
7346             vFAIL("Unmatched (");
7347         }
7348     }
7349     else if (!paren && RExC_parse < RExC_end) {
7350         if (*RExC_parse == ')') {
7351             RExC_parse++;
7352             vFAIL("Unmatched )");
7353         }
7354         else
7355             FAIL("Junk on end of regexp");      /* "Can't happen". */
7356         /* NOTREACHED */
7357     }
7358
7359     if (RExC_in_lookbehind) {
7360         RExC_in_lookbehind--;
7361     }
7362     if (after_freeze > RExC_npar)
7363         RExC_npar = after_freeze;
7364     return(ret);
7365 }
7366
7367 /*
7368  - regbranch - one alternative of an | operator
7369  *
7370  * Implements the concatenation operator.
7371  */
7372 STATIC regnode *
7373 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7374 {
7375     dVAR;
7376     register regnode *ret;
7377     register regnode *chain = NULL;
7378     register regnode *latest;
7379     I32 flags = 0, c = 0;
7380     GET_RE_DEBUG_FLAGS_DECL;
7381
7382     PERL_ARGS_ASSERT_REGBRANCH;
7383
7384     DEBUG_PARSE("brnc");
7385
7386     if (first)
7387         ret = NULL;
7388     else {
7389         if (!SIZE_ONLY && RExC_extralen)
7390             ret = reganode(pRExC_state, BRANCHJ,0);
7391         else {
7392             ret = reg_node(pRExC_state, BRANCH);
7393             Set_Node_Length(ret, 1);
7394         }
7395     }
7396
7397     if (!first && SIZE_ONLY)
7398         RExC_extralen += 1;                     /* BRANCHJ */
7399
7400     *flagp = WORST;                     /* Tentatively. */
7401
7402     RExC_parse--;
7403     nextchar(pRExC_state);
7404     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7405         flags &= ~TRYAGAIN;
7406         latest = regpiece(pRExC_state, &flags,depth+1);
7407         if (latest == NULL) {
7408             if (flags & TRYAGAIN)
7409                 continue;
7410             return(NULL);
7411         }
7412         else if (ret == NULL)
7413             ret = latest;
7414         *flagp |= flags&(HASWIDTH|POSTPONED);
7415         if (chain == NULL)      /* First piece. */
7416             *flagp |= flags&SPSTART;
7417         else {
7418             RExC_naughty++;
7419             REGTAIL(pRExC_state, chain, latest);
7420         }
7421         chain = latest;
7422         c++;
7423     }
7424     if (chain == NULL) {        /* Loop ran zero times. */
7425         chain = reg_node(pRExC_state, NOTHING);
7426         if (ret == NULL)
7427             ret = chain;
7428     }
7429     if (c == 1) {
7430         *flagp |= flags&SIMPLE;
7431     }
7432
7433     return ret;
7434 }
7435
7436 /*
7437  - regpiece - something followed by possible [*+?]
7438  *
7439  * Note that the branching code sequences used for ? and the general cases
7440  * of * and + are somewhat optimized:  they use the same NOTHING node as
7441  * both the endmarker for their branch list and the body of the last branch.
7442  * It might seem that this node could be dispensed with entirely, but the
7443  * endmarker role is not redundant.
7444  */
7445 STATIC regnode *
7446 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7447 {
7448     dVAR;
7449     register regnode *ret;
7450     register char op;
7451     register char *next;
7452     I32 flags;
7453     const char * const origparse = RExC_parse;
7454     I32 min;
7455     I32 max = REG_INFTY;
7456     char *parse_start;
7457     const char *maxpos = NULL;
7458     GET_RE_DEBUG_FLAGS_DECL;
7459
7460     PERL_ARGS_ASSERT_REGPIECE;
7461
7462     DEBUG_PARSE("piec");
7463
7464     ret = regatom(pRExC_state, &flags,depth+1);
7465     if (ret == NULL) {
7466         if (flags & TRYAGAIN)
7467             *flagp |= TRYAGAIN;
7468         return(NULL);
7469     }
7470
7471     op = *RExC_parse;
7472
7473     if (op == '{' && regcurly(RExC_parse)) {
7474         maxpos = NULL;
7475         parse_start = RExC_parse; /* MJD */
7476         next = RExC_parse + 1;
7477         while (isDIGIT(*next) || *next == ',') {
7478             if (*next == ',') {
7479                 if (maxpos)
7480                     break;
7481                 else
7482                     maxpos = next;
7483             }
7484             next++;
7485         }
7486         if (*next == '}') {             /* got one */
7487             if (!maxpos)
7488                 maxpos = next;
7489             RExC_parse++;
7490             min = atoi(RExC_parse);
7491             if (*maxpos == ',')
7492                 maxpos++;
7493             else
7494                 maxpos = RExC_parse;
7495             max = atoi(maxpos);
7496             if (!max && *maxpos != '0')
7497                 max = REG_INFTY;                /* meaning "infinity" */
7498             else if (max >= REG_INFTY)
7499                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7500             RExC_parse = next;
7501             nextchar(pRExC_state);
7502
7503         do_curly:
7504             if ((flags&SIMPLE)) {
7505                 RExC_naughty += 2 + RExC_naughty / 2;
7506                 reginsert(pRExC_state, CURLY, ret, depth+1);
7507                 Set_Node_Offset(ret, parse_start+1); /* MJD */
7508                 Set_Node_Cur_Length(ret);
7509             }
7510             else {
7511                 regnode * const w = reg_node(pRExC_state, WHILEM);
7512
7513                 w->flags = 0;
7514                 REGTAIL(pRExC_state, ret, w);
7515                 if (!SIZE_ONLY && RExC_extralen) {
7516                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
7517                     reginsert(pRExC_state, NOTHING,ret, depth+1);
7518                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
7519                 }
7520                 reginsert(pRExC_state, CURLYX,ret, depth+1);
7521                                 /* MJD hk */
7522                 Set_Node_Offset(ret, parse_start+1);
7523                 Set_Node_Length(ret,
7524                                 op == '{' ? (RExC_parse - parse_start) : 1);
7525
7526                 if (!SIZE_ONLY && RExC_extralen)
7527                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
7528                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7529                 if (SIZE_ONLY)
7530                     RExC_whilem_seen++, RExC_extralen += 3;
7531                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
7532             }
7533             ret->flags = 0;
7534
7535             if (min > 0)
7536                 *flagp = WORST;
7537             if (max > 0)
7538                 *flagp |= HASWIDTH;
7539             if (max < min)
7540                 vFAIL("Can't do {n,m} with n > m");
7541             if (!SIZE_ONLY) {
7542                 ARG1_SET(ret, (U16)min);
7543                 ARG2_SET(ret, (U16)max);
7544             }
7545
7546             goto nest_check;
7547         }
7548     }
7549
7550     if (!ISMULT1(op)) {
7551         *flagp = flags;
7552         return(ret);
7553     }
7554
7555 #if 0                           /* Now runtime fix should be reliable. */
7556
7557     /* if this is reinstated, don't forget to put this back into perldiag:
7558
7559             =item Regexp *+ operand could be empty at {#} in regex m/%s/
7560
7561            (F) The part of the regexp subject to either the * or + quantifier
7562            could match an empty string. The {#} shows in the regular
7563            expression about where the problem was discovered.
7564
7565     */
7566
7567     if (!(flags&HASWIDTH) && op != '?')
7568       vFAIL("Regexp *+ operand could be empty");
7569 #endif
7570
7571     parse_start = RExC_parse;
7572     nextchar(pRExC_state);
7573
7574     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7575
7576     if (op == '*' && (flags&SIMPLE)) {
7577         reginsert(pRExC_state, STAR, ret, depth+1);
7578         ret->flags = 0;
7579         RExC_naughty += 4;
7580     }
7581     else if (op == '*') {
7582         min = 0;
7583         goto do_curly;
7584     }
7585     else if (op == '+' && (flags&SIMPLE)) {
7586         reginsert(pRExC_state, PLUS, ret, depth+1);
7587         ret->flags = 0;
7588         RExC_naughty += 3;
7589     }
7590     else if (op == '+') {
7591         min = 1;
7592         goto do_curly;
7593     }
7594     else if (op == '?') {
7595         min = 0; max = 1;
7596         goto do_curly;
7597     }
7598   nest_check:
7599     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7600         ckWARN3reg(RExC_parse,
7601                    "%.*s matches null string many times",
7602                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7603                    origparse);
7604     }
7605
7606     if (RExC_parse < RExC_end && *RExC_parse == '?') {
7607         nextchar(pRExC_state);
7608         reginsert(pRExC_state, MINMOD, ret, depth+1);
7609         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7610     }
7611 #ifndef REG_ALLOW_MINMOD_SUSPEND
7612     else
7613 #endif
7614     if (RExC_parse < RExC_end && *RExC_parse == '+') {
7615         regnode *ender;
7616         nextchar(pRExC_state);
7617         ender = reg_node(pRExC_state, SUCCEED);
7618         REGTAIL(pRExC_state, ret, ender);
7619         reginsert(pRExC_state, SUSPEND, ret, depth+1);
7620         ret->flags = 0;
7621         ender = reg_node(pRExC_state, TAIL);
7622         REGTAIL(pRExC_state, ret, ender);
7623         /*ret= ender;*/
7624     }
7625
7626     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7627         RExC_parse++;
7628         vFAIL("Nested quantifiers");
7629     }
7630
7631     return(ret);
7632 }
7633
7634
7635 /* reg_namedseq(pRExC_state,UVp)
7636
7637    This is expected to be called by a parser routine that has
7638    recognized '\N' and needs to handle the rest. RExC_parse is
7639    expected to point at the first char following the N at the time
7640    of the call.
7641
7642    The \N may be inside (indicated by valuep not being NULL) or outside a
7643    character class.
7644
7645    \N may begin either a named sequence, or if outside a character class, mean
7646    to match a non-newline.  For non single-quoted regexes, the tokenizer has
7647    attempted to decide which, and in the case of a named sequence converted it
7648    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7649    where c1... are the characters in the sequence.  For single-quoted regexes,
7650    the tokenizer passes the \N sequence through unchanged; this code will not
7651    attempt to determine this nor expand those.  The net effect is that if the
7652    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7653    signals that this \N occurrence means to match a non-newline.
7654
7655    Only the \N{U+...} form should occur in a character class, for the same
7656    reason that '.' inside a character class means to just match a period: it
7657    just doesn't make sense.
7658
7659    If valuep is non-null then it is assumed that we are parsing inside
7660    of a charclass definition and the first codepoint in the resolved
7661    string is returned via *valuep and the routine will return NULL.
7662    In this mode if a multichar string is returned from the charnames
7663    handler, a warning will be issued, and only the first char in the
7664    sequence will be examined. If the string returned is zero length
7665    then the value of *valuep is undefined and NON-NULL will
7666    be returned to indicate failure. (This will NOT be a valid pointer
7667    to a regnode.)
7668
7669    If valuep is null then it is assumed that we are parsing normal text and a
7670    new EXACT node is inserted into the program containing the resolved string,
7671    and a pointer to the new node is returned.  But if the string is zero length
7672    a NOTHING node is emitted instead.
7673
7674    On success RExC_parse is set to the char following the endbrace.
7675    Parsing failures will generate a fatal error via vFAIL(...)
7676  */
7677 STATIC regnode *
7678 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7679 {
7680     char * endbrace;    /* '}' following the name */
7681     regnode *ret = NULL;
7682 #ifdef DEBUGGING
7683     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
7684 #endif
7685     char* p;
7686
7687     GET_RE_DEBUG_FLAGS_DECL;
7688
7689     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7690
7691     GET_RE_DEBUG_FLAGS;
7692
7693     /* The [^\n] meaning of \N ignores spaces and comments under the /x
7694      * modifier.  The other meaning does not */
7695     p = (RExC_flags & RXf_PMf_EXTENDED)
7696         ? regwhite( pRExC_state, RExC_parse )
7697         : RExC_parse;
7698
7699     /* Disambiguate between \N meaning a named character versus \N meaning
7700      * [^\n].  The former is assumed when it can't be the latter. */
7701     if (*p != '{' || regcurly(p)) {
7702         RExC_parse = p;
7703         if (valuep) {
7704             /* no bare \N in a charclass */
7705             vFAIL("\\N in a character class must be a named character: \\N{...}");
7706         }
7707         nextchar(pRExC_state);
7708         ret = reg_node(pRExC_state, REG_ANY);
7709         *flagp |= HASWIDTH|SIMPLE;
7710         RExC_naughty++;
7711         RExC_parse--;
7712         Set_Node_Length(ret, 1); /* MJD */
7713         return ret;
7714     }
7715
7716     /* Here, we have decided it should be a named sequence */
7717
7718     /* The test above made sure that the next real character is a '{', but
7719      * under the /x modifier, it could be separated by space (or a comment and
7720      * \n) and this is not allowed (for consistency with \x{...} and the
7721      * tokenizer handling of \N{NAME}). */
7722     if (*RExC_parse != '{') {
7723         vFAIL("Missing braces on \\N{}");
7724     }
7725
7726     RExC_parse++;       /* Skip past the '{' */
7727
7728     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7729         || ! (endbrace == RExC_parse            /* nothing between the {} */
7730               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
7731                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7732     {
7733         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
7734         vFAIL("\\N{NAME} must be resolved by the lexer");
7735     }
7736
7737     if (endbrace == RExC_parse) {   /* empty: \N{} */
7738         if (! valuep) {
7739             RExC_parse = endbrace + 1;
7740             return reg_node(pRExC_state,NOTHING);
7741         }
7742
7743         if (SIZE_ONLY) {
7744             ckWARNreg(RExC_parse,
7745                     "Ignoring zero length \\N{} in character class"
7746             );
7747             RExC_parse = endbrace + 1;
7748         }
7749         *valuep = 0;
7750         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7751     }
7752
7753     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
7754     RExC_parse += 2;    /* Skip past the 'U+' */
7755
7756     if (valuep) {   /* In a bracketed char class */
7757         /* We only pay attention to the first char of
7758         multichar strings being returned. I kinda wonder
7759         if this makes sense as it does change the behaviour
7760         from earlier versions, OTOH that behaviour was broken
7761         as well. XXX Solution is to recharacterize as
7762         [rest-of-class]|multi1|multi2... */
7763
7764         STRLEN length_of_hex;
7765         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7766             | PERL_SCAN_DISALLOW_PREFIX
7767             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7768
7769         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7770         if (endchar < endbrace) {
7771             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7772         }
7773
7774         length_of_hex = (STRLEN)(endchar - RExC_parse);
7775         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7776
7777         /* The tokenizer should have guaranteed validity, but it's possible to
7778          * bypass it by using single quoting, so check */
7779         if (length_of_hex == 0
7780             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7781         {
7782             RExC_parse += length_of_hex;        /* Includes all the valid */
7783             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7784                             ? UTF8SKIP(RExC_parse)
7785                             : 1;
7786             /* Guard against malformed utf8 */
7787             if (RExC_parse >= endchar) RExC_parse = endchar;
7788             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7789         }
7790
7791         RExC_parse = endbrace + 1;
7792         if (endchar == endbrace) return NULL;
7793
7794         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7795     }
7796     else {      /* Not a char class */
7797         char *s;            /* String to put in generated EXACT node */
7798         STRLEN len = 0;     /* Its current byte length */
7799         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7800                                stream */
7801         ret = reg_node(pRExC_state,
7802                            (U8) ((! FOLD) ? EXACT
7803                                           : (LOC)
7804                                              ? EXACTFL
7805                                              : (MORE_ASCII_RESTRICTED)
7806                                                ? EXACTFA
7807                                                : (AT_LEAST_UNI_SEMANTICS)
7808                                                  ? EXACTFU
7809                                                  : EXACTF));
7810         s= STRING(ret);
7811
7812         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7813          * the input which is of the form now 'c1.c2.c3...}' until find the
7814          * ending brace or exceed length 255.  The characters that exceed this
7815          * limit are dropped.  The limit could be relaxed should it become
7816          * desirable by reparsing this as (?:\N{NAME}), so could generate
7817          * multiple EXACT nodes, as is done for just regular input.  But this
7818          * is primarily a named character, and not intended to be a huge long
7819          * string, so 255 bytes should be good enough */
7820         while (1) {
7821             STRLEN length_of_hex;
7822             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7823                             | PERL_SCAN_DISALLOW_PREFIX
7824                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7825             UV cp;  /* Ord of current character */
7826             bool use_this_char_fold = FOLD;
7827
7828             /* Code points are separated by dots.  If none, there is only one
7829              * code point, and is terminated by the brace */
7830             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7831
7832             /* The values are Unicode even on EBCDIC machines */
7833             length_of_hex = (STRLEN)(endchar - RExC_parse);
7834             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7835             if ( length_of_hex == 0
7836                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7837             {
7838                 RExC_parse += length_of_hex;        /* Includes all the valid */
7839                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7840                                 ? UTF8SKIP(RExC_parse)
7841                                 : 1;
7842                 /* Guard against malformed utf8 */
7843                 if (RExC_parse >= endchar) RExC_parse = endchar;
7844                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7845             }
7846
7847             /* XXX ? Change to ANYOF node
7848             if (FOLD
7849                 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
7850                 && is_TRICKYFOLD_cp(cp))
7851             {
7852             }
7853             */
7854
7855             /* Under /aa, we can't mix ASCII with non- in a fold.  If we are
7856              * folding, and the source isn't ASCII, look through all the
7857              * characters it folds to.  If any one of them is ASCII, forbid
7858              * this fold.  (cp is uni, so the 127 below is correct even for
7859              * EBCDIC).  Similarly under locale rules, we don't mix under 256
7860              * with above 255.  XXX It really doesn't make sense to have \N{}
7861              * which means a Unicode rules under locale.  I (khw) think this
7862              * should be warned about, but the counter argument is that people
7863              * who have programmed around Perl's earlier lack of specifying the
7864              * rules and used \N{} to force Unicode things in a local
7865              * environment shouldn't get suddenly a warning */
7866             if (use_this_char_fold) {
7867                 if (LOC && cp < 256) {  /* Fold not known until run-time */
7868                     use_this_char_fold = FALSE;
7869                 }
7870                 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7871                          || (cp > 255 && LOC))
7872                 {
7873                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7874                 U8* s = tmpbuf;
7875                 U8* e;
7876                 STRLEN foldlen;
7877
7878                 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7879                 e = s + foldlen;
7880
7881                 while (s < e) {
7882                     if (isASCII(*s)
7883                         || (LOC && (UTF8_IS_INVARIANT(*s)
7884                                     || UTF8_IS_DOWNGRADEABLE_START(*s))))
7885                     {
7886                         use_this_char_fold = FALSE;
7887                         break;
7888                     }
7889                     s += UTF8SKIP(s);
7890                 }
7891                 }
7892             }
7893
7894             if (! use_this_char_fold) { /* Not folding, just append to the
7895                                            string */
7896                 STRLEN unilen;
7897
7898                 /* Quit before adding this character if would exceed limit */
7899                 if (len + UNISKIP(cp) > U8_MAX) break;
7900
7901                 unilen = reguni(pRExC_state, cp, s);
7902                 if (unilen > 0) {
7903                     s   += unilen;
7904                     len += unilen;
7905                 }
7906             } else {    /* Folding, output the folded equivalent */
7907                 STRLEN foldlen,numlen;
7908                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7909                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7910
7911                 /* Quit before exceeding size limit */
7912                 if (len + foldlen > U8_MAX) break;
7913
7914                 for (foldbuf = tmpbuf;
7915                     foldlen;
7916                     foldlen -= numlen)
7917                 {
7918                     cp = utf8_to_uvchr(foldbuf, &numlen);
7919                     if (numlen > 0) {
7920                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7921                         s       += unilen;
7922                         len     += unilen;
7923                         /* In EBCDIC the numlen and unilen can differ. */
7924                         foldbuf += numlen;
7925                         if (numlen >= foldlen)
7926                             break;
7927                     }
7928                     else
7929                         break; /* "Can't happen." */
7930                 }
7931             }
7932
7933             /* Point to the beginning of the next character in the sequence. */
7934             RExC_parse = endchar + 1;
7935
7936             /* Quit if no more characters */
7937             if (RExC_parse >= endbrace) break;
7938         }
7939
7940
7941         if (SIZE_ONLY) {
7942             if (RExC_parse < endbrace) {
7943                 ckWARNreg(RExC_parse - 1,
7944                           "Using just the first characters returned by \\N{}");
7945             }
7946
7947             RExC_size += STR_SZ(len);
7948         } else {
7949             STR_LEN(ret) = len;
7950             RExC_emit += STR_SZ(len);
7951         }
7952
7953         RExC_parse = endbrace + 1;
7954
7955         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7956                                with malformed in t/re/pat_advanced.t */
7957         RExC_parse --;
7958         Set_Node_Cur_Length(ret); /* MJD */
7959         nextchar(pRExC_state);
7960     }
7961
7962     return ret;
7963 }
7964
7965
7966 /*
7967  * reg_recode
7968  *
7969  * It returns the code point in utf8 for the value in *encp.
7970  *    value: a code value in the source encoding
7971  *    encp:  a pointer to an Encode object
7972  *
7973  * If the result from Encode is not a single character,
7974  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7975  */
7976 STATIC UV
7977 S_reg_recode(pTHX_ const char value, SV **encp)
7978 {
7979     STRLEN numlen = 1;
7980     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7981     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7982     const STRLEN newlen = SvCUR(sv);
7983     UV uv = UNICODE_REPLACEMENT;
7984
7985     PERL_ARGS_ASSERT_REG_RECODE;
7986
7987     if (newlen)
7988         uv = SvUTF8(sv)
7989              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7990              : *(U8*)s;
7991
7992     if (!newlen || numlen != newlen) {
7993         uv = UNICODE_REPLACEMENT;
7994         *encp = NULL;
7995     }
7996     return uv;
7997 }
7998
7999
8000 /*
8001  - regatom - the lowest level
8002
8003    Try to identify anything special at the start of the pattern. If there
8004    is, then handle it as required. This may involve generating a single regop,
8005    such as for an assertion; or it may involve recursing, such as to
8006    handle a () structure.
8007
8008    If the string doesn't start with something special then we gobble up
8009    as much literal text as we can.
8010
8011    Once we have been able to handle whatever type of thing started the
8012    sequence, we return.
8013
8014    Note: we have to be careful with escapes, as they can be both literal
8015    and special, and in the case of \10 and friends can either, depending
8016    on context. Specifically there are two separate switches for handling
8017    escape sequences, with the one for handling literal escapes requiring
8018    a dummy entry for all of the special escapes that are actually handled
8019    by the other.
8020 */
8021
8022 STATIC regnode *
8023 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8024 {
8025     dVAR;
8026     register regnode *ret = NULL;
8027     I32 flags;
8028     char *parse_start = RExC_parse;
8029     U8 op;
8030     GET_RE_DEBUG_FLAGS_DECL;
8031     DEBUG_PARSE("atom");
8032     *flagp = WORST;             /* Tentatively. */
8033
8034     PERL_ARGS_ASSERT_REGATOM;
8035
8036 tryagain:
8037     switch ((U8)*RExC_parse) {
8038     case '^':
8039         RExC_seen_zerolen++;
8040         nextchar(pRExC_state);
8041         if (RExC_flags & RXf_PMf_MULTILINE)
8042             ret = reg_node(pRExC_state, MBOL);
8043         else if (RExC_flags & RXf_PMf_SINGLELINE)
8044             ret = reg_node(pRExC_state, SBOL);
8045         else
8046             ret = reg_node(pRExC_state, BOL);
8047         Set_Node_Length(ret, 1); /* MJD */
8048         break;
8049     case '$':
8050         nextchar(pRExC_state);
8051         if (*RExC_parse)
8052             RExC_seen_zerolen++;
8053         if (RExC_flags & RXf_PMf_MULTILINE)
8054             ret = reg_node(pRExC_state, MEOL);
8055         else if (RExC_flags & RXf_PMf_SINGLELINE)
8056             ret = reg_node(pRExC_state, SEOL);
8057         else
8058             ret = reg_node(pRExC_state, EOL);
8059         Set_Node_Length(ret, 1); /* MJD */
8060         break;
8061     case '.':
8062         nextchar(pRExC_state);
8063         if (RExC_flags & RXf_PMf_SINGLELINE)
8064             ret = reg_node(pRExC_state, SANY);
8065         else
8066             ret = reg_node(pRExC_state, REG_ANY);
8067         *flagp |= HASWIDTH|SIMPLE;
8068         RExC_naughty++;
8069         Set_Node_Length(ret, 1); /* MJD */
8070         break;
8071     case '[':
8072     {
8073         char * const oregcomp_parse = ++RExC_parse;
8074         ret = regclass(pRExC_state,depth+1);
8075         if (*RExC_parse != ']') {
8076             RExC_parse = oregcomp_parse;
8077             vFAIL("Unmatched [");
8078         }
8079         nextchar(pRExC_state);
8080         *flagp |= HASWIDTH|SIMPLE;
8081         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8082         break;
8083     }
8084     case '(':
8085         nextchar(pRExC_state);
8086         ret = reg(pRExC_state, 1, &flags,depth+1);
8087         if (ret == NULL) {
8088                 if (flags & TRYAGAIN) {
8089                     if (RExC_parse == RExC_end) {
8090                          /* Make parent create an empty node if needed. */
8091                         *flagp |= TRYAGAIN;
8092                         return(NULL);
8093                     }
8094                     goto tryagain;
8095                 }
8096                 return(NULL);
8097         }
8098         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8099         break;
8100     case '|':
8101     case ')':
8102         if (flags & TRYAGAIN) {
8103             *flagp |= TRYAGAIN;
8104             return NULL;
8105         }
8106         vFAIL("Internal urp");
8107                                 /* Supposed to be caught earlier. */
8108         break;
8109     case '{':
8110         if (!regcurly(RExC_parse)) {
8111             RExC_parse++;
8112             goto defchar;
8113         }
8114         /* FALL THROUGH */
8115     case '?':
8116     case '+':
8117     case '*':
8118         RExC_parse++;
8119         vFAIL("Quantifier follows nothing");
8120         break;
8121     case LATIN_SMALL_LETTER_SHARP_S:
8122     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8123     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8124 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8125 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
8126     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8127 #endif
8128         do_foldchar:
8129         if (!LOC && FOLD) {
8130             U32 len,cp;
8131             len=0; /* silence a spurious compiler warning */
8132             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
8133                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8134                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8135                 ret = reganode(pRExC_state, FOLDCHAR, cp);
8136                 Set_Node_Length(ret, 1); /* MJD */
8137                 nextchar(pRExC_state); /* kill whitespace under /x */
8138                 return ret;
8139             }
8140         }
8141         goto outer_default;
8142     case '\\':
8143         /* Special Escapes
8144
8145            This switch handles escape sequences that resolve to some kind
8146            of special regop and not to literal text. Escape sequnces that
8147            resolve to literal text are handled below in the switch marked
8148            "Literal Escapes".
8149
8150            Every entry in this switch *must* have a corresponding entry
8151            in the literal escape switch. However, the opposite is not
8152            required, as the default for this switch is to jump to the
8153            literal text handling code.
8154         */
8155         switch ((U8)*++RExC_parse) {
8156         case LATIN_SMALL_LETTER_SHARP_S:
8157         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8158         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8159                    goto do_foldchar;
8160         /* Special Escapes */
8161         case 'A':
8162             RExC_seen_zerolen++;
8163             ret = reg_node(pRExC_state, SBOL);
8164             *flagp |= SIMPLE;
8165             goto finish_meta_pat;
8166         case 'G':
8167             ret = reg_node(pRExC_state, GPOS);
8168             RExC_seen |= REG_SEEN_GPOS;
8169             *flagp |= SIMPLE;
8170             goto finish_meta_pat;
8171         case 'K':
8172             RExC_seen_zerolen++;
8173             ret = reg_node(pRExC_state, KEEPS);
8174             *flagp |= SIMPLE;
8175             /* XXX:dmq : disabling in-place substitution seems to
8176              * be necessary here to avoid cases of memory corruption, as
8177              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8178              */
8179             RExC_seen |= REG_SEEN_LOOKBEHIND;
8180             goto finish_meta_pat;
8181         case 'Z':
8182             ret = reg_node(pRExC_state, SEOL);
8183             *flagp |= SIMPLE;
8184             RExC_seen_zerolen++;                /* Do not optimize RE away */
8185             goto finish_meta_pat;
8186         case 'z':
8187             ret = reg_node(pRExC_state, EOS);
8188             *flagp |= SIMPLE;
8189             RExC_seen_zerolen++;                /* Do not optimize RE away */
8190             goto finish_meta_pat;
8191         case 'C':
8192             ret = reg_node(pRExC_state, CANY);
8193             RExC_seen |= REG_SEEN_CANY;
8194             *flagp |= HASWIDTH|SIMPLE;
8195             goto finish_meta_pat;
8196         case 'X':
8197             ret = reg_node(pRExC_state, CLUMP);
8198             *flagp |= HASWIDTH;
8199             goto finish_meta_pat;
8200         case 'w':
8201             switch (get_regex_charset(RExC_flags)) {
8202                 case REGEX_LOCALE_CHARSET:
8203                     op = ALNUML;
8204                     break;
8205                 case REGEX_UNICODE_CHARSET:
8206                     op = ALNUMU;
8207                     break;
8208                 case REGEX_ASCII_RESTRICTED_CHARSET:
8209                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8210                     op = ALNUMA;
8211                     break;
8212                 case REGEX_DEPENDS_CHARSET:
8213                     op = ALNUM;
8214                     break;
8215                 default:
8216                     goto bad_charset;
8217             }
8218             ret = reg_node(pRExC_state, op);
8219             *flagp |= HASWIDTH|SIMPLE;
8220             goto finish_meta_pat;
8221         case 'W':
8222             switch (get_regex_charset(RExC_flags)) {
8223                 case REGEX_LOCALE_CHARSET:
8224                     op = NALNUML;
8225                     break;
8226                 case REGEX_UNICODE_CHARSET:
8227                     op = NALNUMU;
8228                     break;
8229                 case REGEX_ASCII_RESTRICTED_CHARSET:
8230                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8231                     op = NALNUMA;
8232                     break;
8233                 case REGEX_DEPENDS_CHARSET:
8234                     op = NALNUM;
8235                     break;
8236                 default:
8237                     goto bad_charset;
8238             }
8239             ret = reg_node(pRExC_state, op);
8240             *flagp |= HASWIDTH|SIMPLE;
8241             goto finish_meta_pat;
8242         case 'b':
8243             RExC_seen_zerolen++;
8244             RExC_seen |= REG_SEEN_LOOKBEHIND;
8245             switch (get_regex_charset(RExC_flags)) {
8246                 case REGEX_LOCALE_CHARSET:
8247                     op = BOUNDL;
8248                     break;
8249                 case REGEX_UNICODE_CHARSET:
8250                     op = BOUNDU;
8251                     break;
8252                 case REGEX_ASCII_RESTRICTED_CHARSET:
8253                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8254                     op = BOUNDA;
8255                     break;
8256                 case REGEX_DEPENDS_CHARSET:
8257                     op = BOUND;
8258                     break;
8259                 default:
8260                     goto bad_charset;
8261             }
8262             ret = reg_node(pRExC_state, op);
8263             FLAGS(ret) = get_regex_charset(RExC_flags);
8264             *flagp |= SIMPLE;
8265             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8266                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8267             }
8268             goto finish_meta_pat;
8269         case 'B':
8270             RExC_seen_zerolen++;
8271             RExC_seen |= REG_SEEN_LOOKBEHIND;
8272             switch (get_regex_charset(RExC_flags)) {
8273                 case REGEX_LOCALE_CHARSET:
8274                     op = NBOUNDL;
8275                     break;
8276                 case REGEX_UNICODE_CHARSET:
8277                     op = NBOUNDU;
8278                     break;
8279                 case REGEX_ASCII_RESTRICTED_CHARSET:
8280                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8281                     op = NBOUNDA;
8282                     break;
8283                 case REGEX_DEPENDS_CHARSET:
8284                     op = NBOUND;
8285                     break;
8286                 default:
8287                     goto bad_charset;
8288             }
8289             ret = reg_node(pRExC_state, op);
8290             FLAGS(ret) = get_regex_charset(RExC_flags);
8291             *flagp |= SIMPLE;
8292             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8293                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8294             }
8295             goto finish_meta_pat;
8296         case 's':
8297             switch (get_regex_charset(RExC_flags)) {
8298                 case REGEX_LOCALE_CHARSET:
8299                     op = SPACEL;
8300                     break;
8301                 case REGEX_UNICODE_CHARSET:
8302                     op = SPACEU;
8303                     break;
8304                 case REGEX_ASCII_RESTRICTED_CHARSET:
8305                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8306                     op = SPACEA;
8307                     break;
8308                 case REGEX_DEPENDS_CHARSET:
8309                     op = SPACE;
8310                     break;
8311                 default:
8312                     goto bad_charset;
8313             }
8314             ret = reg_node(pRExC_state, op);
8315             *flagp |= HASWIDTH|SIMPLE;
8316             goto finish_meta_pat;
8317         case 'S':
8318             switch (get_regex_charset(RExC_flags)) {
8319                 case REGEX_LOCALE_CHARSET:
8320                     op = NSPACEL;
8321                     break;
8322                 case REGEX_UNICODE_CHARSET:
8323                     op = NSPACEU;
8324                     break;
8325                 case REGEX_ASCII_RESTRICTED_CHARSET:
8326                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8327                     op = NSPACEA;
8328                     break;
8329                 case REGEX_DEPENDS_CHARSET:
8330                     op = NSPACE;
8331                     break;
8332                 default:
8333                     goto bad_charset;
8334             }
8335             ret = reg_node(pRExC_state, op);
8336             *flagp |= HASWIDTH|SIMPLE;
8337             goto finish_meta_pat;
8338         case 'd':
8339             switch (get_regex_charset(RExC_flags)) {
8340                 case REGEX_LOCALE_CHARSET:
8341                     op = DIGITL;
8342                     break;
8343                 case REGEX_ASCII_RESTRICTED_CHARSET:
8344                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8345                     op = DIGITA;
8346                     break;
8347                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8348                 case REGEX_UNICODE_CHARSET:
8349                     op = DIGIT;
8350                     break;
8351                 default:
8352                     goto bad_charset;
8353             }
8354             ret = reg_node(pRExC_state, op);
8355             *flagp |= HASWIDTH|SIMPLE;
8356             goto finish_meta_pat;
8357         case 'D':
8358             switch (get_regex_charset(RExC_flags)) {
8359                 case REGEX_LOCALE_CHARSET:
8360                     op = NDIGITL;
8361                     break;
8362                 case REGEX_ASCII_RESTRICTED_CHARSET:
8363                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8364                     op = NDIGITA;
8365                     break;
8366                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8367                 case REGEX_UNICODE_CHARSET:
8368                     op = NDIGIT;
8369                     break;
8370                 default:
8371                     goto bad_charset;
8372             }
8373             ret = reg_node(pRExC_state, op);
8374             *flagp |= HASWIDTH|SIMPLE;
8375             goto finish_meta_pat;
8376         case 'R':
8377             ret = reg_node(pRExC_state, LNBREAK);
8378             *flagp |= HASWIDTH|SIMPLE;
8379             goto finish_meta_pat;
8380         case 'h':
8381             ret = reg_node(pRExC_state, HORIZWS);
8382             *flagp |= HASWIDTH|SIMPLE;
8383             goto finish_meta_pat;
8384         case 'H':
8385             ret = reg_node(pRExC_state, NHORIZWS);
8386             *flagp |= HASWIDTH|SIMPLE;
8387             goto finish_meta_pat;
8388         case 'v':
8389             ret = reg_node(pRExC_state, VERTWS);
8390             *flagp |= HASWIDTH|SIMPLE;
8391             goto finish_meta_pat;
8392         case 'V':
8393             ret = reg_node(pRExC_state, NVERTWS);
8394             *flagp |= HASWIDTH|SIMPLE;
8395          finish_meta_pat:
8396             nextchar(pRExC_state);
8397             Set_Node_Length(ret, 2); /* MJD */
8398             break;
8399         case 'p':
8400         case 'P':
8401             {
8402                 char* const oldregxend = RExC_end;
8403 #ifdef DEBUGGING
8404                 char* parse_start = RExC_parse - 2;
8405 #endif
8406
8407                 if (RExC_parse[1] == '{') {
8408                   /* a lovely hack--pretend we saw [\pX] instead */
8409                     RExC_end = strchr(RExC_parse, '}');
8410                     if (!RExC_end) {
8411                         const U8 c = (U8)*RExC_parse;
8412                         RExC_parse += 2;
8413                         RExC_end = oldregxend;
8414                         vFAIL2("Missing right brace on \\%c{}", c);
8415                     }
8416                     RExC_end++;
8417                 }
8418                 else {
8419                     RExC_end = RExC_parse + 2;
8420                     if (RExC_end > oldregxend)
8421                         RExC_end = oldregxend;
8422                 }
8423                 RExC_parse--;
8424
8425                 ret = regclass(pRExC_state,depth+1);
8426
8427                 RExC_end = oldregxend;
8428                 RExC_parse--;
8429
8430                 Set_Node_Offset(ret, parse_start + 2);
8431                 Set_Node_Cur_Length(ret);
8432                 nextchar(pRExC_state);
8433                 *flagp |= HASWIDTH|SIMPLE;
8434             }
8435             break;
8436         case 'N':
8437             /* Handle \N and \N{NAME} here and not below because it can be
8438             multicharacter. join_exact() will join them up later on.
8439             Also this makes sure that things like /\N{BLAH}+/ and
8440             \N{BLAH} being multi char Just Happen. dmq*/
8441             ++RExC_parse;
8442             ret= reg_namedseq(pRExC_state, NULL, flagp);
8443             break;
8444         case 'k':    /* Handle \k<NAME> and \k'NAME' */
8445         parse_named_seq:
8446         {
8447             char ch= RExC_parse[1];
8448             if (ch != '<' && ch != '\'' && ch != '{') {
8449                 RExC_parse++;
8450                 vFAIL2("Sequence %.2s... not terminated",parse_start);
8451             } else {
8452                 /* this pretty much dupes the code for (?P=...) in reg(), if
8453                    you change this make sure you change that */
8454                 char* name_start = (RExC_parse += 2);
8455                 U32 num = 0;
8456                 SV *sv_dat = reg_scan_name(pRExC_state,
8457                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8458                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8459                 if (RExC_parse == name_start || *RExC_parse != ch)
8460                     vFAIL2("Sequence %.3s... not terminated",parse_start);
8461
8462                 if (!SIZE_ONLY) {
8463                     num = add_data( pRExC_state, 1, "S" );
8464                     RExC_rxi->data->data[num]=(void*)sv_dat;
8465                     SvREFCNT_inc_simple_void(sv_dat);
8466                 }
8467
8468                 RExC_sawback = 1;
8469                 ret = reganode(pRExC_state,
8470                                ((! FOLD)
8471                                  ? NREF
8472                                  : (MORE_ASCII_RESTRICTED)
8473                                    ? NREFFA
8474                                    : (AT_LEAST_UNI_SEMANTICS)
8475                                      ? NREFFU
8476                                      : (LOC)
8477                                        ? NREFFL
8478                                        : NREFF),
8479                                 num);
8480                 *flagp |= HASWIDTH;
8481
8482                 /* override incorrect value set in reganode MJD */
8483                 Set_Node_Offset(ret, parse_start+1);
8484                 Set_Node_Cur_Length(ret); /* MJD */
8485                 nextchar(pRExC_state);
8486
8487             }
8488             break;
8489         }
8490         case 'g':
8491         case '1': case '2': case '3': case '4':
8492         case '5': case '6': case '7': case '8': case '9':
8493             {
8494                 I32 num;
8495                 bool isg = *RExC_parse == 'g';
8496                 bool isrel = 0;
8497                 bool hasbrace = 0;
8498                 if (isg) {
8499                     RExC_parse++;
8500                     if (*RExC_parse == '{') {
8501                         RExC_parse++;
8502                         hasbrace = 1;
8503                     }
8504                     if (*RExC_parse == '-') {
8505                         RExC_parse++;
8506                         isrel = 1;
8507                     }
8508                     if (hasbrace && !isDIGIT(*RExC_parse)) {
8509                         if (isrel) RExC_parse--;
8510                         RExC_parse -= 2;
8511                         goto parse_named_seq;
8512                 }   }
8513                 num = atoi(RExC_parse);
8514                 if (isg && num == 0)
8515                     vFAIL("Reference to invalid group 0");
8516                 if (isrel) {
8517                     num = RExC_npar - num;
8518                     if (num < 1)
8519                         vFAIL("Reference to nonexistent or unclosed group");
8520                 }
8521                 if (!isg && num > 9 && num >= RExC_npar)
8522                     goto defchar;
8523                 else {
8524                     char * const parse_start = RExC_parse - 1; /* MJD */
8525                     while (isDIGIT(*RExC_parse))
8526                         RExC_parse++;
8527                     if (parse_start == RExC_parse - 1)
8528                         vFAIL("Unterminated \\g... pattern");
8529                     if (hasbrace) {
8530                         if (*RExC_parse != '}')
8531                             vFAIL("Unterminated \\g{...} pattern");
8532                         RExC_parse++;
8533                     }
8534                     if (!SIZE_ONLY) {
8535                         if (num > (I32)RExC_rx->nparens)
8536                             vFAIL("Reference to nonexistent group");
8537                     }
8538                     RExC_sawback = 1;
8539                     ret = reganode(pRExC_state,
8540                                    ((! FOLD)
8541                                      ? REF
8542                                      : (MORE_ASCII_RESTRICTED)
8543                                        ? REFFA
8544                                        : (AT_LEAST_UNI_SEMANTICS)
8545                                          ? REFFU
8546                                          : (LOC)
8547                                            ? REFFL
8548                                            : REFF),
8549                                     num);
8550                     *flagp |= HASWIDTH;
8551
8552                     /* override incorrect value set in reganode MJD */
8553                     Set_Node_Offset(ret, parse_start+1);
8554                     Set_Node_Cur_Length(ret); /* MJD */
8555                     RExC_parse--;
8556                     nextchar(pRExC_state);
8557                 }
8558             }
8559             break;
8560         case '\0':
8561             if (RExC_parse >= RExC_end)
8562                 FAIL("Trailing \\");
8563             /* FALL THROUGH */
8564         default:
8565             /* Do not generate "unrecognized" warnings here, we fall
8566                back into the quick-grab loop below */
8567             parse_start--;
8568             goto defchar;
8569         }
8570         break;
8571
8572     case '#':
8573         if (RExC_flags & RXf_PMf_EXTENDED) {
8574             if ( reg_skipcomment( pRExC_state ) )
8575                 goto tryagain;
8576         }
8577         /* FALL THROUGH */
8578
8579     default:
8580         outer_default:{
8581             register STRLEN len;
8582             register UV ender;
8583             register char *p;
8584             char *s;
8585             STRLEN foldlen;
8586             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8587             regnode * orig_emit;
8588
8589             parse_start = RExC_parse - 1;
8590
8591             RExC_parse++;
8592
8593         defchar:
8594             ender = 0;
8595             orig_emit = RExC_emit; /* Save the original output node position in
8596                                       case we need to output a different node
8597                                       type */
8598             ret = reg_node(pRExC_state,
8599                            (U8) ((! FOLD) ? EXACT
8600                                           : (LOC)
8601                                              ? EXACTFL
8602                                              : (MORE_ASCII_RESTRICTED)
8603                                                ? EXACTFA
8604                                                : (AT_LEAST_UNI_SEMANTICS)
8605                                                  ? EXACTFU
8606                                                  : EXACTF)
8607                     );
8608             s = STRING(ret);
8609             for (len = 0, p = RExC_parse - 1;
8610               len < 127 && p < RExC_end;
8611               len++)
8612             {
8613                 char * const oldp = p;
8614
8615                 if (RExC_flags & RXf_PMf_EXTENDED)
8616                     p = regwhite( pRExC_state, p );
8617                 switch ((U8)*p) {
8618                 case LATIN_SMALL_LETTER_SHARP_S:
8619                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8620                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8621                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8622                                 goto normal_default;
8623                 case '^':
8624                 case '$':
8625                 case '.':
8626                 case '[':
8627                 case '(':
8628                 case ')':
8629                 case '|':
8630                     goto loopdone;
8631                 case '\\':
8632                     /* Literal Escapes Switch
8633
8634                        This switch is meant to handle escape sequences that
8635                        resolve to a literal character.
8636
8637                        Every escape sequence that represents something
8638                        else, like an assertion or a char class, is handled
8639                        in the switch marked 'Special Escapes' above in this
8640                        routine, but also has an entry here as anything that
8641                        isn't explicitly mentioned here will be treated as
8642                        an unescaped equivalent literal.
8643                     */
8644
8645                     switch ((U8)*++p) {
8646                     /* These are all the special escapes. */
8647                     case LATIN_SMALL_LETTER_SHARP_S:
8648                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8649                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8650                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8651                                 goto normal_default;
8652                     case 'A':             /* Start assertion */
8653                     case 'b': case 'B':   /* Word-boundary assertion*/
8654                     case 'C':             /* Single char !DANGEROUS! */
8655                     case 'd': case 'D':   /* digit class */
8656                     case 'g': case 'G':   /* generic-backref, pos assertion */
8657                     case 'h': case 'H':   /* HORIZWS */
8658                     case 'k': case 'K':   /* named backref, keep marker */
8659                     case 'N':             /* named char sequence */
8660                     case 'p': case 'P':   /* Unicode property */
8661                               case 'R':   /* LNBREAK */
8662                     case 's': case 'S':   /* space class */
8663                     case 'v': case 'V':   /* VERTWS */
8664                     case 'w': case 'W':   /* word class */
8665                     case 'X':             /* eXtended Unicode "combining character sequence" */
8666                     case 'z': case 'Z':   /* End of line/string assertion */
8667                         --p;
8668                         goto loopdone;
8669
8670                     /* Anything after here is an escape that resolves to a
8671                        literal. (Except digits, which may or may not)
8672                      */
8673                     case 'n':
8674                         ender = '\n';
8675                         p++;
8676                         break;
8677                     case 'r':
8678                         ender = '\r';
8679                         p++;
8680                         break;
8681                     case 't':
8682                         ender = '\t';
8683                         p++;
8684                         break;
8685                     case 'f':
8686                         ender = '\f';
8687                         p++;
8688                         break;
8689                     case 'e':
8690                           ender = ASCII_TO_NATIVE('\033');
8691                         p++;
8692                         break;
8693                     case 'a':
8694                           ender = ASCII_TO_NATIVE('\007');
8695                         p++;
8696                         break;
8697                     case 'o':
8698                         {
8699                             STRLEN brace_len = len;
8700                             UV result;
8701                             const char* error_msg;
8702
8703                             bool valid = grok_bslash_o(p,
8704                                                        &result,
8705                                                        &brace_len,
8706                                                        &error_msg,
8707                                                        1);
8708                             p += brace_len;
8709                             if (! valid) {
8710                                 RExC_parse = p; /* going to die anyway; point
8711                                                    to exact spot of failure */
8712                                 vFAIL(error_msg);
8713                             }
8714                             else
8715                             {
8716                                 ender = result;
8717                             }
8718                             if (PL_encoding && ender < 0x100) {
8719                                 goto recode_encoding;
8720                             }
8721                             if (ender > 0xff) {
8722                                 REQUIRE_UTF8;
8723                             }
8724                             break;
8725                         }
8726                     case 'x':
8727                         if (*++p == '{') {
8728                             char* const e = strchr(p, '}');
8729
8730                             if (!e) {
8731                                 RExC_parse = p + 1;
8732                                 vFAIL("Missing right brace on \\x{}");
8733                             }
8734                             else {
8735                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8736                                     | PERL_SCAN_DISALLOW_PREFIX;
8737                                 STRLEN numlen = e - p - 1;
8738                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8739                                 if (ender > 0xff)
8740                                     REQUIRE_UTF8;
8741                                 p = e + 1;
8742                             }
8743                         }
8744                         else {
8745                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8746                             STRLEN numlen = 2;
8747                             ender = grok_hex(p, &numlen, &flags, NULL);
8748                             p += numlen;
8749                         }
8750                         if (PL_encoding && ender < 0x100)
8751                             goto recode_encoding;
8752                         break;
8753                     case 'c':
8754                         p++;
8755                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8756                         break;
8757                     case '0': case '1': case '2': case '3':case '4':
8758                     case '5': case '6': case '7': case '8':case '9':
8759                         if (*p == '0' ||
8760                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8761                         {
8762                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8763                             STRLEN numlen = 3;
8764                             ender = grok_oct(p, &numlen, &flags, NULL);
8765                             if (ender > 0xff) {
8766                                 REQUIRE_UTF8;
8767                             }
8768                             p += numlen;
8769                         }
8770                         else {
8771                             --p;
8772                             goto loopdone;
8773                         }
8774                         if (PL_encoding && ender < 0x100)
8775                             goto recode_encoding;
8776                         break;
8777                     recode_encoding:
8778                         {
8779                             SV* enc = PL_encoding;
8780                             ender = reg_recode((const char)(U8)ender, &enc);
8781                             if (!enc && SIZE_ONLY)
8782                                 ckWARNreg(p, "Invalid escape in the specified encoding");
8783                             REQUIRE_UTF8;
8784                         }
8785                         break;
8786                     case '\0':
8787                         if (p >= RExC_end)
8788                             FAIL("Trailing \\");
8789                         /* FALL THROUGH */
8790                     default:
8791                         if (!SIZE_ONLY&& isALPHA(*p)) {
8792                             /* Include any { following the alpha to emphasize
8793                              * that it could be part of an escape at some point
8794                              * in the future */
8795                             int len = (*(p + 1) == '{') ? 2 : 1;
8796                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8797                         }
8798                         goto normal_default;
8799                     }
8800                     break;
8801                 default:
8802                   normal_default:
8803                     if (UTF8_IS_START(*p) && UTF) {
8804                         STRLEN numlen;
8805                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8806                                                &numlen, UTF8_ALLOW_DEFAULT);
8807                         p += numlen;
8808                     }
8809                     else
8810                         ender = (U8) *p++;
8811                     break;
8812                 } /* End of switch on the literal */
8813
8814                 /* Certain characters are problematic because their folded
8815                  * length is so different from their original length that it
8816                  * isn't handleable by the optimizer.  They are therefore not
8817                  * placed in an EXACTish node; and are here handled specially.
8818                  * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8819                  * putting it in a special node keeps regexec from having to
8820                  * deal with a non-utf8 multi-char fold */
8821                 if (FOLD
8822                     && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
8823                     && is_TRICKYFOLD_cp(ender))
8824                 {
8825                     /* If is in middle of outputting characters into an
8826                      * EXACTish node, go output what we have so far, and
8827                      * position the parse so that this will be called again
8828                      * immediately */
8829                     if (len) {
8830                         p  = oldp;
8831                         goto loopdone;
8832                     }
8833                     else {
8834
8835                         /* Here we are ready to output our tricky fold
8836                          * character.  What's done is to pretend it's in a
8837                          * [bracketed] class, and let the code that deals with
8838                          * those handle it, as that code has all the
8839                          * intelligence necessary.  First save the current
8840                          * parse state, get rid of the already allocated EXACT
8841                          * node that the ANYOFV node will replace, and point
8842                          * the parse to a buffer which we fill with the
8843                          * character we want the regclass code to think is
8844                          * being parsed */
8845                         char* const oldregxend = RExC_end;
8846                         char tmpbuf[2];
8847                         RExC_emit = orig_emit;
8848                         RExC_parse = tmpbuf;
8849                         if (UTF) {
8850                             tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8851                             tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8852                             RExC_end = RExC_parse + 2;
8853                         }
8854                         else {
8855                             tmpbuf[0] = (char) ender;
8856                             RExC_end = RExC_parse + 1;
8857                         }
8858
8859                         ret = regclass(pRExC_state,depth+1);
8860
8861                         /* Here, have parsed the buffer.  Reset the parse to
8862                          * the actual input, and return */
8863                         RExC_end = oldregxend;
8864                         RExC_parse = p - 1;
8865
8866                         Set_Node_Offset(ret, RExC_parse);
8867                         Set_Node_Cur_Length(ret);
8868                         nextchar(pRExC_state);
8869                         *flagp |= HASWIDTH|SIMPLE;
8870                         return ret;
8871                     }
8872                 }
8873
8874                 if ( RExC_flags & RXf_PMf_EXTENDED)
8875                     p = regwhite( pRExC_state, p );
8876                 if (UTF && FOLD) {
8877                     /* Prime the casefolded buffer.  Locale rules, which apply
8878                      * only to code points < 256, aren't known until execution,
8879                      * so for them, just output the original character using
8880                      * utf8 */
8881                     if (LOC && ender < 256) {
8882                         if (UNI_IS_INVARIANT(ender)) {
8883                             *tmpbuf = (U8) ender;
8884                             foldlen = 1;
8885                         } else {
8886                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8887                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8888                             foldlen = 2;
8889                         }
8890                     }
8891                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
8892                                                  */
8893                         ender = toLOWER(ender);
8894                         *tmpbuf = (U8) ender;
8895                         foldlen = 1;
8896                     }
8897                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8898
8899                         /* Locale and /aa require more selectivity about the
8900                          * fold, so are handled below.  Otherwise, here, just
8901                          * use the fold */
8902                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8903                     }
8904                     else {
8905                         /* Under locale rules or /aa we are not to mix,
8906                          * respectively, ords < 256 or ASCII with non-.  So
8907                          * reject folds that mix them, using only the
8908                          * non-folded code point.  So do the fold to a
8909                          * temporary, and inspect each character in it. */
8910                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8911                         U8* s = trialbuf;
8912                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8913                         U8* e = s + foldlen;
8914                         bool fold_ok = TRUE;
8915
8916                         while (s < e) {
8917                             if (isASCII(*s)
8918                                 || (LOC && (UTF8_IS_INVARIANT(*s)
8919                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
8920                             {
8921                                 fold_ok = FALSE;
8922                                 break;
8923                             }
8924                             s += UTF8SKIP(s);
8925                         }
8926                         if (fold_ok) {
8927                             Copy(trialbuf, tmpbuf, foldlen, U8);
8928                             ender = tmpender;
8929                         }
8930                         else {
8931                             uvuni_to_utf8(tmpbuf, ender);
8932                             foldlen = UNISKIP(ender);
8933                         }
8934                     }
8935                 }
8936                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8937                     if (len)
8938                         p = oldp;
8939                     else if (UTF) {
8940                          if (FOLD) {
8941                               /* Emit all the Unicode characters. */
8942                               STRLEN numlen;
8943                               for (foldbuf = tmpbuf;
8944                                    foldlen;
8945                                    foldlen -= numlen) {
8946                                    ender = utf8_to_uvchr(foldbuf, &numlen);
8947                                    if (numlen > 0) {
8948                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
8949                                         s       += unilen;
8950                                         len     += unilen;
8951                                         /* In EBCDIC the numlen
8952                                          * and unilen can differ. */
8953                                         foldbuf += numlen;
8954                                         if (numlen >= foldlen)
8955                                              break;
8956                                    }
8957                                    else
8958                                         break; /* "Can't happen." */
8959                               }
8960                          }
8961                          else {
8962                               const STRLEN unilen = reguni(pRExC_state, ender, s);
8963                               if (unilen > 0) {
8964                                    s   += unilen;
8965                                    len += unilen;
8966                               }
8967                          }
8968                     }
8969                     else {
8970                         len++;
8971                         REGC((char)ender, s++);
8972                     }
8973                     break;
8974                 }
8975                 if (UTF) {
8976                      if (FOLD) {
8977                           /* Emit all the Unicode characters. */
8978                           STRLEN numlen;
8979                           for (foldbuf = tmpbuf;
8980                                foldlen;
8981                                foldlen -= numlen) {
8982                                ender = utf8_to_uvchr(foldbuf, &numlen);
8983                                if (numlen > 0) {
8984                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
8985                                     len     += unilen;
8986                                     s       += unilen;
8987                                     /* In EBCDIC the numlen
8988                                      * and unilen can differ. */
8989                                     foldbuf += numlen;
8990                                     if (numlen >= foldlen)
8991                                          break;
8992                                }
8993                                else
8994                                     break;
8995                           }
8996                      }
8997                      else {
8998                           const STRLEN unilen = reguni(pRExC_state, ender, s);
8999                           if (unilen > 0) {
9000                                s   += unilen;
9001                                len += unilen;
9002                           }
9003                      }
9004                      len--;
9005                 }
9006                 else
9007                     REGC((char)ender, s++);
9008             }
9009         loopdone:   /* Jumped to when encounters something that shouldn't be in
9010                        the node */
9011             RExC_parse = p - 1;
9012             Set_Node_Cur_Length(ret); /* MJD */
9013             nextchar(pRExC_state);
9014             {
9015                 /* len is STRLEN which is unsigned, need to copy to signed */
9016                 IV iv = len;
9017                 if (iv < 0)
9018                     vFAIL("Internal disaster");
9019             }
9020             if (len > 0)
9021                 *flagp |= HASWIDTH;
9022             if (len == 1 && UNI_IS_INVARIANT(ender))
9023                 *flagp |= SIMPLE;
9024
9025             if (SIZE_ONLY)
9026                 RExC_size += STR_SZ(len);
9027             else {
9028                 STR_LEN(ret) = len;
9029                 RExC_emit += STR_SZ(len);
9030             }
9031         }
9032         break;
9033     }
9034
9035     return(ret);
9036
9037 /* Jumped to when an unrecognized character set is encountered */
9038 bad_charset:
9039     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9040     return(NULL);
9041 }
9042
9043 STATIC char *
9044 S_regwhite( RExC_state_t *pRExC_state, char *p )
9045 {
9046     const char *e = RExC_end;
9047
9048     PERL_ARGS_ASSERT_REGWHITE;
9049
9050     while (p < e) {
9051         if (isSPACE(*p))
9052             ++p;
9053         else if (*p == '#') {
9054             bool ended = 0;
9055             do {
9056                 if (*p++ == '\n') {
9057                     ended = 1;
9058                     break;
9059                 }
9060             } while (p < e);
9061             if (!ended)
9062                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9063         }
9064         else
9065             break;
9066     }
9067     return p;
9068 }
9069
9070 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9071    Character classes ([:foo:]) can also be negated ([:^foo:]).
9072    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9073    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9074    but trigger failures because they are currently unimplemented. */
9075
9076 #define POSIXCC_DONE(c)   ((c) == ':')
9077 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9078 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9079
9080 STATIC I32
9081 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9082 {
9083     dVAR;
9084     I32 namedclass = OOB_NAMEDCLASS;
9085
9086     PERL_ARGS_ASSERT_REGPPOSIXCC;
9087
9088     if (value == '[' && RExC_parse + 1 < RExC_end &&
9089         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9090         POSIXCC(UCHARAT(RExC_parse))) {
9091         const char c = UCHARAT(RExC_parse);
9092         char* const s = RExC_parse++;
9093
9094         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9095             RExC_parse++;
9096         if (RExC_parse == RExC_end)
9097             /* Grandfather lone [:, [=, [. */
9098             RExC_parse = s;
9099         else {
9100             const char* const t = RExC_parse++; /* skip over the c */
9101             assert(*t == c);
9102
9103             if (UCHARAT(RExC_parse) == ']') {
9104                 const char *posixcc = s + 1;
9105                 RExC_parse++; /* skip over the ending ] */
9106
9107                 if (*s == ':') {
9108                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9109                     const I32 skip = t - posixcc;
9110
9111                     /* Initially switch on the length of the name.  */
9112                     switch (skip) {
9113                     case 4:
9114                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9115                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9116                         break;
9117                     case 5:
9118                         /* Names all of length 5.  */
9119                         /* alnum alpha ascii blank cntrl digit graph lower
9120                            print punct space upper  */
9121                         /* Offset 4 gives the best switch position.  */
9122                         switch (posixcc[4]) {
9123                         case 'a':
9124                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9125                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9126                             break;
9127                         case 'e':
9128                             if (memEQ(posixcc, "spac", 4)) /* space */
9129                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9130                             break;
9131                         case 'h':
9132                             if (memEQ(posixcc, "grap", 4)) /* graph */
9133                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9134                             break;
9135                         case 'i':
9136                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9137                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9138                             break;
9139                         case 'k':
9140                             if (memEQ(posixcc, "blan", 4)) /* blank */
9141                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9142                             break;
9143                         case 'l':
9144                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9145                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9146                             break;
9147                         case 'm':
9148                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9149                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9150                             break;
9151                         case 'r':
9152                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9153                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9154                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9155                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9156                             break;
9157                         case 't':
9158                             if (memEQ(posixcc, "digi", 4)) /* digit */
9159                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9160                             else if (memEQ(posixcc, "prin", 4)) /* print */
9161                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9162                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9163                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9164                             break;
9165                         }
9166                         break;
9167                     case 6:
9168                         if (memEQ(posixcc, "xdigit", 6))
9169                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9170                         break;
9171                     }
9172
9173                     if (namedclass == OOB_NAMEDCLASS)
9174                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9175                                       t - s - 1, s + 1);
9176                     assert (posixcc[skip] == ':');
9177                     assert (posixcc[skip+1] == ']');
9178                 } else if (!SIZE_ONLY) {
9179                     /* [[=foo=]] and [[.foo.]] are still future. */
9180
9181                     /* adjust RExC_parse so the warning shows after
9182                        the class closes */
9183                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9184                         RExC_parse++;
9185                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9186                 }
9187             } else {
9188                 /* Maternal grandfather:
9189                  * "[:" ending in ":" but not in ":]" */
9190                 RExC_parse = s;
9191             }
9192         }
9193     }
9194
9195     return namedclass;
9196 }
9197
9198 STATIC void
9199 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9200 {
9201     dVAR;
9202
9203     PERL_ARGS_ASSERT_CHECKPOSIXCC;
9204
9205     if (POSIXCC(UCHARAT(RExC_parse))) {
9206         const char *s = RExC_parse;
9207         const char  c = *s++;
9208
9209         while (isALNUM(*s))
9210             s++;
9211         if (*s && c == *s && s[1] == ']') {
9212             ckWARN3reg(s+2,
9213                        "POSIX syntax [%c %c] belongs inside character classes",
9214                        c, c);
9215
9216             /* [[=foo=]] and [[.foo.]] are still future. */
9217             if (POSIXCC_NOTYET(c)) {
9218                 /* adjust RExC_parse so the error shows after
9219                    the class closes */
9220                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9221                     NOOP;
9222                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9223             }
9224         }
9225     }
9226 }
9227
9228 /* No locale test, and always Unicode semantics */
9229 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
9230 ANYOF_##NAME:                                                                  \
9231         for (value = 0; value < 256; value++)                                  \
9232             if (TEST)                                                          \
9233             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9234     yesno = '+';                                                               \
9235     what = WORD;                                                               \
9236     break;                                                                     \
9237 case ANYOF_N##NAME:                                                            \
9238         for (value = 0; value < 256; value++)                                  \
9239             if (!TEST)                                                         \
9240             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9241     yesno = '!';                                                               \
9242     what = WORD;                                                               \
9243     break
9244
9245 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9246  * there are two tests passed in, to use depending on that. There aren't any
9247  * cases where the label is different from the name, so no need for that
9248  * parameter */
9249 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
9250 ANYOF_##NAME:                                                                  \
9251     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
9252     else if (UNI_SEMANTICS) {                                                  \
9253         for (value = 0; value < 256; value++) {                                \
9254             if (TEST_8(value)) stored +=                                       \
9255                       set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9256         }                                                                      \
9257     }                                                                          \
9258     else {                                                                     \
9259         for (value = 0; value < 128; value++) {                                \
9260             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
9261                 set_regclass_bit(pRExC_state, ret,                     \
9262                                    (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
9263         }                                                                      \
9264     }                                                                          \
9265     yesno = '+';                                                               \
9266     what = WORD;                                                               \
9267     break;                                                                     \
9268 case ANYOF_N##NAME:                                                            \
9269     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
9270     else if (UNI_SEMANTICS) {                                                  \
9271         for (value = 0; value < 256; value++) {                                \
9272             if (! TEST_8(value)) stored +=                                     \
9273                     set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
9274         }                                                                      \
9275     }                                                                          \
9276     else {                                                                     \
9277         for (value = 0; value < 128; value++) {                                \
9278             if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
9279                         pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
9280         }                                                                      \
9281         if (AT_LEAST_ASCII_RESTRICTED) {                                       \
9282             for (value = 128; value < 256; value++) {                          \
9283              stored += set_regclass_bit(                                     \
9284                            pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9285             }                                                                  \
9286             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
9287         }                                                                      \
9288         else {                                                                 \
9289             /* For a non-ut8 target string with DEPENDS semantics, all above   \
9290              * ASCII Latin1 code points match the complement of any of the     \
9291              * classes.  But in utf8, they have their Unicode semantics, so    \
9292              * can't just set them in the bitmap, or else regexec.c will think \
9293              * they matched when they shouldn't. */                            \
9294             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
9295         }                                                                      \
9296     }                                                                          \
9297     yesno = '!';                                                               \
9298     what = WORD;                                                               \
9299     break
9300
9301 STATIC U8
9302 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9303 {
9304
9305     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9306      * Locale folding is done at run-time, so this function should not be
9307      * called for nodes that are for locales.
9308      *
9309      * This function sets the bit corresponding to the fold of the input
9310      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
9311      * 'F' is 'f'.
9312      *
9313      * It also knows about the characters that are in the bitmap that have
9314      * folds that are matchable only outside it, and sets the appropriate lists
9315      * and flags.
9316      *
9317      * It returns the number of bits that actually changed from 0 to 1 */
9318
9319     U8 stored = 0;
9320     U8 fold;
9321
9322     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9323
9324     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9325                                     : PL_fold[value];
9326
9327     /* It assumes the bit for 'value' has already been set */
9328     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9329         ANYOF_BITMAP_SET(node, fold);
9330         stored++;
9331     }
9332     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9333         /* Certain Latin1 characters have matches outside the bitmap.  To get
9334          * here, 'value' is one of those characters.   None of these matches is
9335          * valid for ASCII characters under /aa, which have been excluded by
9336          * the 'if' above.  The matches fall into three categories:
9337          * 1) They are singly folded-to or -from an above 255 character, as
9338          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9339          *    WITH DIAERESIS;
9340          * 2) They are part of a multi-char fold with another character in the
9341          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9342          * 3) They are part of a multi-char fold with a character not in the
9343          *    bitmap, such as various ligatures.
9344          * We aren't dealing fully with multi-char folds, except we do deal
9345          * with the pattern containing a character that has a multi-char fold
9346          * (not so much the inverse).
9347          * For types 1) and 3), the matches only happen when the target string
9348          * is utf8; that's not true for 2), and we set a flag for it.
9349          *
9350          * The code below adds to the passed in inversion list the single fold
9351          * closures for 'value'.  The values are hard-coded here so that an
9352          * innocent-looking character class, like /[ks]/i won't have to go out
9353          * to disk to find the possible matches.  XXX It would be better to
9354          * generate these via regen, in case a new version of the Unicode
9355          * standard adds new mappings, though that is not really likely. */
9356         switch (value) {
9357             case 'k':
9358             case 'K':
9359                 /* KELVIN SIGN */
9360                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9361                 break;
9362             case 's':
9363             case 'S':
9364                 /* LATIN SMALL LETTER LONG S */
9365                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9366                 break;
9367             case MICRO_SIGN:
9368                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9369                                                  GREEK_SMALL_LETTER_MU);
9370                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9371                                                  GREEK_CAPITAL_LETTER_MU);
9372                 break;
9373             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9374             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9375                 /* ANGSTROM SIGN */
9376                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9377                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
9378                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9379                                                      PL_fold_latin1[value]);
9380                 }
9381                 break;
9382             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9383                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9384                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9385                 break;
9386             case LATIN_SMALL_LETTER_SHARP_S:
9387                 /* 0x1E9E is LATIN CAPITAL LETTER SHARP S */
9388                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x1E9E);
9389
9390                 /* Under /a, /d, and /u, this can match the two chars "ss" */
9391                 if (! MORE_ASCII_RESTRICTED) {
9392                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
9393
9394                     /* And under /u or /a, it can match even if the target is
9395                      * not utf8 */
9396                     if (AT_LEAST_UNI_SEMANTICS) {
9397                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9398                     }
9399                 }
9400                 break;
9401             case 'F': case 'f':
9402             case 'I': case 'i':
9403             case 'L': case 'l':
9404             case 'T': case 't':
9405                 /* These all are targets of multi-character folds, which can
9406                  * occur with only non-Latin1 characters in the fold, so they
9407                  * can match if the target string isn't UTF-8 */
9408                 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9409                 break;
9410             case 'A': case 'a':
9411             case 'H': case 'h':
9412             case 'J': case 'j':
9413             case 'N': case 'n':
9414             case 'W': case 'w':
9415             case 'Y': case 'y':
9416                 /* These all are targets of multi-character folds, which occur
9417                  * only with a non-Latin1 character as part of the fold, so
9418                  * they can't match unless the target string is in UTF-8, so no
9419                  * action here is necessary */
9420                 break;
9421             default:
9422                 /* Use deprecated warning to increase the chances of this
9423                  * being output */
9424                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9425                 break;
9426         }
9427     }
9428     else if (DEPENDS_SEMANTICS
9429             && ! isASCII(value)
9430             && PL_fold_latin1[value] != value)
9431     {
9432            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9433             * folds only when the target string is in UTF-8.  We add the fold
9434             * here to the list of things to match outside the bitmap, which
9435             * won't be looked at unless it is UTF8 (or else if something else
9436             * says to look even if not utf8, but those things better not happen
9437             * under DEPENDS semantics. */
9438         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9439     }
9440
9441     return stored;
9442 }
9443
9444
9445 PERL_STATIC_INLINE U8
9446 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9447 {
9448     /* This inline function sets a bit in the bitmap if not already set, and if
9449      * appropriate, its fold, returning the number of bits that actually
9450      * changed from 0 to 1 */
9451
9452     U8 stored;
9453
9454     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9455
9456     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
9457         return 0;
9458     }
9459
9460     ANYOF_BITMAP_SET(node, value);
9461     stored = 1;
9462
9463     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
9464         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9465     }
9466
9467     return stored;
9468 }
9469
9470 STATIC void
9471 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9472 {
9473     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9474      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
9475      * the multi-character folds of characters in the node */
9476     SV *sv;
9477
9478     PERL_ARGS_ASSERT_ADD_ALTERNATE;
9479
9480     if (! *alternate_ptr) {
9481         *alternate_ptr = newAV();
9482     }
9483     sv = newSVpvn_utf8((char*)string, len, TRUE);
9484     av_push(*alternate_ptr, sv);
9485     return;
9486 }
9487
9488 /*
9489    parse a class specification and produce either an ANYOF node that
9490    matches the pattern or perhaps will be optimized into an EXACTish node
9491    instead. The node contains a bit map for the first 256 characters, with the
9492    corresponding bit set if that character is in the list.  For characters
9493    above 255, a range list is used */
9494
9495 STATIC regnode *
9496 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9497 {
9498     dVAR;
9499     register UV nextvalue;
9500     register IV prevvalue = OOB_UNICODE;
9501     register IV range = 0;
9502     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9503     register regnode *ret;
9504     STRLEN numlen;
9505     IV namedclass;
9506     char *rangebegin = NULL;
9507     bool need_class = 0;
9508     SV *listsv = NULL;
9509     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9510                                       than just initialized.  */
9511     UV n;
9512
9513     /* code points this node matches that can't be stored in the bitmap */
9514     HV* nonbitmap = NULL;
9515
9516     /* The items that are to match that aren't stored in the bitmap, but are a
9517      * result of things that are stored there.  This is the fold closure of
9518      * such a character, either because it has DEPENDS semantics and shouldn't
9519      * be matched unless the target string is utf8, or is a code point that is
9520      * too large for the bit map, as for example, the fold of the MICRO SIGN is
9521      * above 255.  This all is solely for performance reasons.  By having this
9522      * code know the outside-the-bitmap folds that the bitmapped characters are
9523      * involved with, we don't have to go out to disk to find the list of
9524      * matches, unless the character class includes code points that aren't
9525      * storable in the bit map.  That means that a character class with an 's'
9526      * in it, for example, doesn't need to go out to disk to find everything
9527      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
9528      * empty unless there is something whose fold we don't know about, and will
9529      * have to go out to the disk to find. */
9530     HV* l1_fold_invlist = NULL;
9531
9532     /* List of multi-character folds that are matched by this node */
9533     AV* unicode_alternate  = NULL;
9534 #ifdef EBCDIC
9535     UV literal_endpoint = 0;
9536 #endif
9537     UV stored = 0;  /* how many chars stored in the bitmap */
9538
9539     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9540         case we need to change the emitted regop to an EXACT. */
9541     const char * orig_parse = RExC_parse;
9542     GET_RE_DEBUG_FLAGS_DECL;
9543
9544     PERL_ARGS_ASSERT_REGCLASS;
9545 #ifndef DEBUGGING
9546     PERL_UNUSED_ARG(depth);
9547 #endif
9548
9549     DEBUG_PARSE("clas");
9550
9551     /* Assume we are going to generate an ANYOF node. */
9552     ret = reganode(pRExC_state, ANYOF, 0);
9553
9554
9555     if (!SIZE_ONLY) {
9556         ANYOF_FLAGS(ret) = 0;
9557     }
9558
9559     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
9560         RExC_naughty++;
9561         RExC_parse++;
9562         if (!SIZE_ONLY)
9563             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9564     }
9565
9566     if (SIZE_ONLY) {
9567         RExC_size += ANYOF_SKIP;
9568         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9569     }
9570     else {
9571         RExC_emit += ANYOF_SKIP;
9572         if (LOC) {
9573             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9574         }
9575         ANYOF_BITMAP_ZERO(ret);
9576         listsv = newSVpvs("# comment\n");
9577         initial_listsv_len = SvCUR(listsv);
9578     }
9579
9580     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9581
9582     if (!SIZE_ONLY && POSIXCC(nextvalue))
9583         checkposixcc(pRExC_state);
9584
9585     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9586     if (UCHARAT(RExC_parse) == ']')
9587         goto charclassloop;
9588
9589 parseit:
9590     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9591
9592     charclassloop:
9593
9594         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9595
9596         if (!range)
9597             rangebegin = RExC_parse;
9598         if (UTF) {
9599             value = utf8n_to_uvchr((U8*)RExC_parse,
9600                                    RExC_end - RExC_parse,
9601                                    &numlen, UTF8_ALLOW_DEFAULT);
9602             RExC_parse += numlen;
9603         }
9604         else
9605             value = UCHARAT(RExC_parse++);
9606
9607         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9608         if (value == '[' && POSIXCC(nextvalue))
9609             namedclass = regpposixcc(pRExC_state, value);
9610         else if (value == '\\') {
9611             if (UTF) {
9612                 value = utf8n_to_uvchr((U8*)RExC_parse,
9613                                    RExC_end - RExC_parse,
9614                                    &numlen, UTF8_ALLOW_DEFAULT);
9615                 RExC_parse += numlen;
9616             }
9617             else
9618                 value = UCHARAT(RExC_parse++);
9619             /* Some compilers cannot handle switching on 64-bit integer
9620              * values, therefore value cannot be an UV.  Yes, this will
9621              * be a problem later if we want switch on Unicode.
9622              * A similar issue a little bit later when switching on
9623              * namedclass. --jhi */
9624             switch ((I32)value) {
9625             case 'w':   namedclass = ANYOF_ALNUM;       break;
9626             case 'W':   namedclass = ANYOF_NALNUM;      break;
9627             case 's':   namedclass = ANYOF_SPACE;       break;
9628             case 'S':   namedclass = ANYOF_NSPACE;      break;
9629             case 'd':   namedclass = ANYOF_DIGIT;       break;
9630             case 'D':   namedclass = ANYOF_NDIGIT;      break;
9631             case 'v':   namedclass = ANYOF_VERTWS;      break;
9632             case 'V':   namedclass = ANYOF_NVERTWS;     break;
9633             case 'h':   namedclass = ANYOF_HORIZWS;     break;
9634             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
9635             case 'N':  /* Handle \N{NAME} in class */
9636                 {
9637                     /* We only pay attention to the first char of
9638                     multichar strings being returned. I kinda wonder
9639                     if this makes sense as it does change the behaviour
9640                     from earlier versions, OTOH that behaviour was broken
9641                     as well. */
9642                     UV v; /* value is register so we cant & it /grrr */
9643                     if (reg_namedseq(pRExC_state, &v, NULL)) {
9644                         goto parseit;
9645                     }
9646                     value= v;
9647                 }
9648                 break;
9649             case 'p':
9650             case 'P':
9651                 {
9652                 char *e;
9653                 if (RExC_parse >= RExC_end)
9654                     vFAIL2("Empty \\%c{}", (U8)value);
9655                 if (*RExC_parse == '{') {
9656                     const U8 c = (U8)value;
9657                     e = strchr(RExC_parse++, '}');
9658                     if (!e)
9659                         vFAIL2("Missing right brace on \\%c{}", c);
9660                     while (isSPACE(UCHARAT(RExC_parse)))
9661                         RExC_parse++;
9662                     if (e == RExC_parse)
9663                         vFAIL2("Empty \\%c{}", c);
9664                     n = e - RExC_parse;
9665                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9666                         n--;
9667                 }
9668                 else {
9669                     e = RExC_parse;
9670                     n = 1;
9671                 }
9672                 if (!SIZE_ONLY) {
9673                     if (UCHARAT(RExC_parse) == '^') {
9674                          RExC_parse++;
9675                          n--;
9676                          value = value == 'p' ? 'P' : 'p'; /* toggle */
9677                          while (isSPACE(UCHARAT(RExC_parse))) {
9678                               RExC_parse++;
9679                               n--;
9680                          }
9681                     }
9682
9683                     /* Add the property name to the list.  If /i matching, give
9684                      * a different name which consists of the normal name
9685                      * sandwiched between two underscores and '_i'.  The design
9686                      * is discussed in the commit message for this. */
9687                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9688                                         (value=='p' ? '+' : '!'),
9689                                         (FOLD) ? "__" : "",
9690                                         (int)n,
9691                                         RExC_parse,
9692                                         (FOLD) ? "_i" : ""
9693                                     );
9694                 }
9695                 RExC_parse = e + 1;
9696
9697                 /* The \p could match something in the Latin1 range, hence
9698                  * something that isn't utf8 */
9699                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9700                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
9701
9702                 /* \p means they want Unicode semantics */
9703                 RExC_uni_semantics = 1;
9704                 }
9705                 break;
9706             case 'n':   value = '\n';                   break;
9707             case 'r':   value = '\r';                   break;
9708             case 't':   value = '\t';                   break;
9709             case 'f':   value = '\f';                   break;
9710             case 'b':   value = '\b';                   break;
9711             case 'e':   value = ASCII_TO_NATIVE('\033');break;
9712             case 'a':   value = ASCII_TO_NATIVE('\007');break;
9713             case 'o':
9714                 RExC_parse--;   /* function expects to be pointed at the 'o' */
9715                 {
9716                     const char* error_msg;
9717                     bool valid = grok_bslash_o(RExC_parse,
9718                                                &value,
9719                                                &numlen,
9720                                                &error_msg,
9721                                                SIZE_ONLY);
9722                     RExC_parse += numlen;
9723                     if (! valid) {
9724                         vFAIL(error_msg);
9725                     }
9726                 }
9727                 if (PL_encoding && value < 0x100) {
9728                     goto recode_encoding;
9729                 }
9730                 break;
9731             case 'x':
9732                 if (*RExC_parse == '{') {
9733                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9734                         | PERL_SCAN_DISALLOW_PREFIX;
9735                     char * const e = strchr(RExC_parse++, '}');
9736                     if (!e)
9737                         vFAIL("Missing right brace on \\x{}");
9738
9739                     numlen = e - RExC_parse;
9740                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9741                     RExC_parse = e + 1;
9742                 }
9743                 else {
9744                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9745                     numlen = 2;
9746                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9747                     RExC_parse += numlen;
9748                 }
9749                 if (PL_encoding && value < 0x100)
9750                     goto recode_encoding;
9751                 break;
9752             case 'c':
9753                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9754                 break;
9755             case '0': case '1': case '2': case '3': case '4':
9756             case '5': case '6': case '7':
9757                 {
9758                     /* Take 1-3 octal digits */
9759                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9760                     numlen = 3;
9761                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9762                     RExC_parse += numlen;
9763                     if (PL_encoding && value < 0x100)
9764                         goto recode_encoding;
9765                     break;
9766                 }
9767             recode_encoding:
9768                 {
9769                     SV* enc = PL_encoding;
9770                     value = reg_recode((const char)(U8)value, &enc);
9771                     if (!enc && SIZE_ONLY)
9772                         ckWARNreg(RExC_parse,
9773                                   "Invalid escape in the specified encoding");
9774                     break;
9775                 }
9776             default:
9777                 /* Allow \_ to not give an error */
9778                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9779                     ckWARN2reg(RExC_parse,
9780                                "Unrecognized escape \\%c in character class passed through",
9781                                (int)value);
9782                 }
9783                 break;
9784             }
9785         } /* end of \blah */
9786 #ifdef EBCDIC
9787         else
9788             literal_endpoint++;
9789 #endif
9790
9791         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9792
9793             /* What matches in a locale is not known until runtime, so need to
9794              * (one time per class) allocate extra space to pass to regexec.
9795              * The space will contain a bit for each named class that is to be
9796              * matched against.  This isn't needed for \p{} and pseudo-classes,
9797              * as they are not affected by locale, and hence are dealt with
9798              * separately */
9799             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9800                 need_class = 1;
9801                 if (SIZE_ONLY) {
9802                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9803                 }
9804                 else {
9805                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9806                     ANYOF_CLASS_ZERO(ret);
9807                 }
9808                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9809             }
9810
9811             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
9812              * literal, as is the character that began the false range, i.e.
9813              * the 'a' in the examples */
9814             if (range) {
9815                 if (!SIZE_ONLY) {
9816                     const int w =
9817                         RExC_parse >= rangebegin ?
9818                         RExC_parse - rangebegin : 0;
9819                     ckWARN4reg(RExC_parse,
9820                                "False [] range \"%*.*s\"",
9821                                w, w, rangebegin);
9822
9823                     stored +=
9824                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9825                     if (prevvalue < 256) {
9826                         stored +=
9827                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9828                     }
9829                     else {
9830                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9831                     }
9832                 }
9833
9834                 range = 0; /* this was not a true range */
9835             }
9836
9837
9838
9839             if (!SIZE_ONLY) {
9840                 const char *what = NULL;
9841                 char yesno = 0;
9842
9843                 /* Possible truncation here but in some 64-bit environments
9844                  * the compiler gets heartburn about switch on 64-bit values.
9845                  * A similar issue a little earlier when switching on value.
9846                  * --jhi */
9847                 switch ((I32)namedclass) {
9848
9849                 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9850                 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9851                 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9852                 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9853                 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9854                 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9855                 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9856                 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9857                 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9858                 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9859                 /* \s, \w match all unicode if utf8. */
9860                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9861                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9862                 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9863                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9864                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9865                 case ANYOF_ASCII:
9866                     if (LOC)
9867                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9868                     else {
9869                         for (value = 0; value < 128; value++)
9870                             stored +=
9871                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9872                     }
9873                     yesno = '+';
9874                     what = NULL;        /* Doesn't match outside ascii, so
9875                                            don't want to add +utf8:: */
9876                     break;
9877                 case ANYOF_NASCII:
9878                     if (LOC)
9879                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9880                     else {
9881                         for (value = 128; value < 256; value++)
9882                             stored +=
9883                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9884                     }
9885                     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9886                     yesno = '!';
9887                     what = "ASCII";
9888                     break;
9889                 case ANYOF_DIGIT:
9890                     if (LOC)
9891                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9892                     else {
9893                         /* consecutive digits assumed */
9894                         for (value = '0'; value <= '9'; value++)
9895                             stored +=
9896                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9897                     }
9898                     yesno = '+';
9899                     what = "Digit";
9900                     break;
9901                 case ANYOF_NDIGIT:
9902                     if (LOC)
9903                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9904                     else {
9905                         /* consecutive digits assumed */
9906                         for (value = 0; value < '0'; value++)
9907                             stored +=
9908                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9909                         for (value = '9' + 1; value < 256; value++)
9910                             stored +=
9911                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9912                     }
9913                     yesno = '!';
9914                     what = "Digit";
9915                     if (AT_LEAST_ASCII_RESTRICTED ) {
9916                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9917                     }
9918                     break;
9919                 case ANYOF_MAX:
9920                     /* this is to handle \p and \P */
9921                     break;
9922                 default:
9923                     vFAIL("Invalid [::] class");
9924                     break;
9925                 }
9926                 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9927                     /* Strings such as "+utf8::isWord\n" */
9928                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9929                 }
9930
9931                 continue;
9932             }
9933         } /* end of namedclass \blah */
9934
9935         if (range) {
9936             if (prevvalue > (IV)value) /* b-a */ {
9937                 const int w = RExC_parse - rangebegin;
9938                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9939                 range = 0; /* not a valid range */
9940             }
9941         }
9942         else {
9943             prevvalue = value; /* save the beginning of the range */
9944             if (RExC_parse+1 < RExC_end
9945                 && *RExC_parse == '-'
9946                 && RExC_parse[1] != ']')
9947             {
9948                 RExC_parse++;
9949
9950                 /* a bad range like \w-, [:word:]- ? */
9951                 if (namedclass > OOB_NAMEDCLASS) {
9952                     if (ckWARN(WARN_REGEXP)) {
9953                         const int w =
9954                             RExC_parse >= rangebegin ?
9955                             RExC_parse - rangebegin : 0;
9956                         vWARN4(RExC_parse,
9957                                "False [] range \"%*.*s\"",
9958                                w, w, rangebegin);
9959                     }
9960                     if (!SIZE_ONLY)
9961                         stored +=
9962                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9963                 } else
9964                     range = 1;  /* yeah, it's a range! */
9965                 continue;       /* but do it the next time */
9966             }
9967         }
9968
9969         /* non-Latin1 code point implies unicode semantics.  Must be set in
9970          * pass1 so is there for the whole of pass 2 */
9971         if (value > 255) {
9972             RExC_uni_semantics = 1;
9973         }
9974
9975         /* now is the next time */
9976         if (!SIZE_ONLY) {
9977             if (prevvalue < 256) {
9978                 const IV ceilvalue = value < 256 ? value : 255;
9979                 IV i;
9980 #ifdef EBCDIC
9981                 /* In EBCDIC [\x89-\x91] should include
9982                  * the \x8e but [i-j] should not. */
9983                 if (literal_endpoint == 2 &&
9984                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9985                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9986                 {
9987                     if (isLOWER(prevvalue)) {
9988                         for (i = prevvalue; i <= ceilvalue; i++)
9989                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9990                                 stored +=
9991                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9992                             }
9993                     } else {
9994                         for (i = prevvalue; i <= ceilvalue; i++)
9995                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9996                                 stored +=
9997                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9998                             }
9999                     }
10000                 }
10001                 else
10002 #endif
10003                       for (i = prevvalue; i <= ceilvalue; i++) {
10004                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10005                       }
10006           }
10007           if (value > 255) {
10008             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
10009             const UV natvalue      = NATIVE_TO_UNI(value);
10010             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
10011         }
10012 #ifdef EBCDIC
10013             literal_endpoint = 0;
10014 #endif
10015         }
10016
10017         range = 0; /* this range (if it was one) is done now */
10018     }
10019
10020
10021
10022     if (SIZE_ONLY)
10023         return ret;
10024     /****** !SIZE_ONLY AFTER HERE *********/
10025
10026     /* If folding and there are code points above 255, we calculate all
10027      * characters that could fold to or from the ones already on the list */
10028     if (FOLD && nonbitmap) {
10029         UV i;
10030
10031         HV* fold_intersection;
10032         UV* fold_list;
10033
10034         /* This is a list of all the characters that participate in folds
10035             * (except marks, etc in multi-char folds */
10036         if (! PL_utf8_foldable) {
10037             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10038             PL_utf8_foldable = _swash_to_invlist(swash);
10039         }
10040
10041         /* This is a hash that for a particular fold gives all characters
10042             * that are involved in it */
10043         if (! PL_utf8_foldclosures) {
10044
10045             /* If we were unable to find any folds, then we likely won't be
10046              * able to find the closures.  So just create an empty list.
10047              * Folding will effectively be restricted to the non-Unicode rules
10048              * hard-coded into Perl.  (This case happens legitimately during
10049              * compilation of Perl itself before the Unicode tables are
10050              * generated) */
10051             if (invlist_len(PL_utf8_foldable) == 0) {
10052                 PL_utf8_foldclosures = _new_invlist(0);
10053             } else {
10054                 /* If the folds haven't been read in, call a fold function
10055                     * to force that */
10056                 if (! PL_utf8_tofold) {
10057                     U8 dummy[UTF8_MAXBYTES+1];
10058                     STRLEN dummy_len;
10059                     to_utf8_fold((U8*) "A", dummy, &dummy_len);
10060                 }
10061                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10062             }
10063         }
10064
10065         /* Only the characters in this class that participate in folds need
10066             * be checked.  Get the intersection of this class and all the
10067             * possible characters that are foldable.  This can quickly narrow
10068             * down a large class */
10069         fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10070
10071         /* Now look at the foldable characters in this class individually */
10072         fold_list = invlist_array(fold_intersection);
10073         for (i = 0; i < invlist_len(fold_intersection); i++) {
10074             UV j;
10075
10076             /* The next entry is the beginning of the range that is in the
10077              * class */
10078             UV start = fold_list[i++];
10079
10080
10081             /* The next entry is the beginning of the next range, which
10082                 * isn't in the class, so the end of the current range is one
10083                 * less than that */
10084             UV end = fold_list[i] - 1;
10085
10086             /* Look at every character in the range */
10087             for (j = start; j <= end; j++) {
10088
10089                 /* Get its fold */
10090                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10091                 STRLEN foldlen;
10092                 const UV f = to_uni_fold(j, foldbuf, &foldlen);
10093
10094                 if (foldlen > (STRLEN)UNISKIP(f)) {
10095
10096                     /* Any multicharacter foldings (disallowed in
10097                         * lookbehind patterns) require the following
10098                         * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10099                         * E folds into "pq" and F folds into "rst", all other
10100                         * characters fold to single characters.  We save away
10101                         * these multicharacter foldings, to be later saved as
10102                         * part of the additional "s" data. */
10103                     if (! RExC_in_lookbehind) {
10104                         U8* loc = foldbuf;
10105                         U8* e = foldbuf + foldlen;
10106
10107                         /* If any of the folded characters of this are in
10108                             * the Latin1 range, tell the regex engine that
10109                             * this can match a non-utf8 target string.  The
10110                             * only multi-byte fold whose source is in the
10111                             * Latin1 range (U+00DF) applies only when the
10112                             * target string is utf8, or under unicode rules */
10113                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10114                             while (loc < e) {
10115
10116                                 /* Can't mix ascii with non- under /aa */
10117                                 if (MORE_ASCII_RESTRICTED
10118                                     && (isASCII(*loc) != isASCII(j)))
10119                                 {
10120                                     goto end_multi_fold;
10121                                 }
10122                                 if (UTF8_IS_INVARIANT(*loc)
10123                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
10124                                 {
10125                                     /* Can't mix above and below 256 under
10126                                         * LOC */
10127                                     if (LOC) {
10128                                         goto end_multi_fold;
10129                                     }
10130                                     ANYOF_FLAGS(ret)
10131                                             |= ANYOF_NONBITMAP_NON_UTF8;
10132                                     break;
10133                                 }
10134                                 loc += UTF8SKIP(loc);
10135                             }
10136                         }
10137
10138                         add_alternate(&unicode_alternate, foldbuf, foldlen);
10139                     end_multi_fold: ;
10140                     }
10141                 }
10142                 else {
10143                     /* Single character fold.  Add everything in its fold
10144                         * closure to the list that this node should match */
10145                     SV** listp;
10146
10147                     /* The fold closures data structure is a hash with the
10148                         * keys being every character that is folded to, like
10149                         * 'k', and the values each an array of everything that
10150                         * folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
10151                     if ((listp = hv_fetch(PL_utf8_foldclosures,
10152                                     (char *) foldbuf, foldlen, FALSE)))
10153                     {
10154                         AV* list = (AV*) *listp;
10155                         IV k;
10156                         for (k = 0; k <= av_len(list); k++) {
10157                             SV** c_p = av_fetch(list, k, FALSE);
10158                             UV c;
10159                             if (c_p == NULL) {
10160                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10161                             }
10162                             c = SvUV(*c_p);
10163
10164                             /* /aa doesn't allow folds between ASCII and
10165                                 * non-; /l doesn't allow them between above
10166                                 * and below 256 */
10167                             if ((MORE_ASCII_RESTRICTED
10168                                  && (isASCII(c) != isASCII(j)))
10169                                     || (LOC && ((c < 256) != (j < 256))))
10170                             {
10171                                 continue;
10172                             }
10173
10174                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10175                                 stored += set_regclass_bit(pRExC_state,
10176                                         ret,
10177                                         (U8) c,
10178                                         &l1_fold_invlist, &unicode_alternate);
10179                             }
10180                                 /* It may be that the code point is already
10181                                     * in this range or already in the bitmap,
10182                                     * in which case we need do nothing */
10183                             else if ((c < start || c > end)
10184                                         && (c > 255
10185                                             || ! ANYOF_BITMAP_TEST(ret, c)))
10186                             {
10187                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10188                             }
10189                         }
10190                     }
10191                 }
10192             }
10193         }
10194         invlist_destroy(fold_intersection);
10195     }
10196
10197     /* Combine the two lists into one. */
10198     if (l1_fold_invlist) {
10199         if (nonbitmap) {
10200             nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10201         }
10202         else {
10203             nonbitmap = l1_fold_invlist;
10204         }
10205     }
10206
10207     /* Here, we have calculated what code points should be in the character
10208      * class.   Now we can see about various optimizations.  Fold calculation
10209      * needs to take place before inversion.  Otherwise /[^k]/i would invert to
10210      * include K, which under /i would match k. */
10211
10212     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
10213      * set the FOLD flag yet, so this this does optimize those.  It doesn't
10214      * optimize locale.  Doing so perhaps could be done as long as there is
10215      * nothing like \w in it; some thought also would have to be given to the
10216      * interaction with above 0x100 chars */
10217     if (! LOC
10218         && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10219         && ! unicode_alternate
10220         && ! nonbitmap
10221         && SvCUR(listsv) == initial_listsv_len)
10222     {
10223         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10224             ANYOF_BITMAP(ret)[value] ^= 0xFF;
10225         stored = 256 - stored;
10226
10227         /* The inversion means that everything above 255 is matched; and at the
10228          * same time we clear the invert flag */
10229         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10230     }
10231
10232     /* Folding in the bitmap is taken care of above, but not for locale (for
10233      * which we have to wait to see what folding is in effect at runtime), and
10234      * for things not in the bitmap.  Set run-time fold flag for these */
10235     if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10236         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10237     }
10238
10239     /* A single character class can be "optimized" into an EXACTish node.
10240      * Note that since we don't currently count how many characters there are
10241      * outside the bitmap, we are XXX missing optimization possibilities for
10242      * them.  This optimization can't happen unless this is a truly single
10243      * character class, which means that it can't be an inversion into a
10244      * many-character class, and there must be no possibility of there being
10245      * things outside the bitmap.  'stored' (only) for locales doesn't include
10246      * \w, etc, so have to make a special test that they aren't present
10247      *
10248      * Similarly A 2-character class of the very special form like [bB] can be
10249      * optimized into an EXACTFish node, but only for non-locales, and for
10250      * characters which only have the two folds; so things like 'fF' and 'Ii'
10251      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10252      * FI'. */
10253     if (! nonbitmap
10254         && ! unicode_alternate
10255         && SvCUR(listsv) == initial_listsv_len
10256         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10257         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10258                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10259             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10260                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10261                                  /* If the latest code point has a fold whose
10262                                   * bit is set, it must be the only other one */
10263                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10264                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10265     {
10266         /* Note that the information needed to decide to do this optimization
10267          * is not currently available until the 2nd pass, and that the actually
10268          * used EXACTish node takes less space than the calculated ANYOF node,
10269          * and hence the amount of space calculated in the first pass is larger
10270          * than actually used, so this optimization doesn't gain us any space.
10271          * But an EXACT node is faster than an ANYOF node, and can be combined
10272          * with any adjacent EXACT nodes later by the optimizer for further
10273          * gains.  The speed of executing an EXACTF is similar to an ANYOF
10274          * node, so the optimization advantage comes from the ability to join
10275          * it to adjacent EXACT nodes */
10276
10277         const char * cur_parse= RExC_parse;
10278         U8 op;
10279         RExC_emit = (regnode *)orig_emit;
10280         RExC_parse = (char *)orig_parse;
10281
10282         if (stored == 1) {
10283
10284             /* A locale node with one point can be folded; all the other cases
10285              * with folding will have two points, since we calculate them above
10286              */
10287             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10288                  op = EXACTFL;
10289             }
10290             else {
10291                 op = EXACT;
10292             }
10293         }   /* else 2 chars in the bit map: the folds of each other */
10294         else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10295
10296             /* To join adjacent nodes, they must be the exact EXACTish type.
10297              * Try to use the most likely type, by using EXACTFU if the regex
10298              * calls for them, or is required because the character is
10299              * non-ASCII */
10300             op = EXACTFU;
10301         }
10302         else {    /* Otherwise, more likely to be EXACTF type */
10303             op = EXACTF;
10304         }
10305
10306         ret = reg_node(pRExC_state, op);
10307         RExC_parse = (char *)cur_parse;
10308         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10309             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10310             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10311             STR_LEN(ret)= 2;
10312             RExC_emit += STR_SZ(2);
10313         }
10314         else {
10315             *STRING(ret)= (char)value;
10316             STR_LEN(ret)= 1;
10317             RExC_emit += STR_SZ(1);
10318         }
10319         SvREFCNT_dec(listsv);
10320         return ret;
10321     }
10322
10323     if (nonbitmap) {
10324         UV* nonbitmap_array = invlist_array(nonbitmap);
10325         UV nonbitmap_len = invlist_len(nonbitmap);
10326         UV i;
10327
10328         /*  Here have the full list of items to match that aren't in the
10329          *  bitmap.  Convert to the structure that the rest of the code is
10330          *  expecting.   XXX That rest of the code should convert to this
10331          *  structure */
10332         for (i = 0; i < nonbitmap_len; i++) {
10333
10334             /* The next entry is the beginning of the range that is in the
10335              * class */
10336             UV start = nonbitmap_array[i++];
10337             UV end;
10338
10339             /* The next entry is the beginning of the next range, which isn't
10340              * in the class, so the end of the current range is one less than
10341              * that.  But if there is no next range, it means that the range
10342              * begun by 'start' extends to infinity, which for this platform
10343              * ends at UV_MAX */
10344             if (i == nonbitmap_len) {
10345                 end = UV_MAX;
10346             }
10347             else {
10348                 end = nonbitmap_array[i] - 1;
10349             }
10350
10351             if (start == end) {
10352                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10353             }
10354             else {
10355                 /* The \t sets the whole range */
10356                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10357                         /* XXX EBCDIC */
10358                                    start, end);
10359             }
10360         }
10361         invlist_destroy(nonbitmap);
10362     }
10363
10364     if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10365         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10366         SvREFCNT_dec(listsv);
10367         SvREFCNT_dec(unicode_alternate);
10368     }
10369     else {
10370
10371         AV * const av = newAV();
10372         SV *rv;
10373         /* The 0th element stores the character class description
10374          * in its textual form: used later (regexec.c:Perl_regclass_swash())
10375          * to initialize the appropriate swash (which gets stored in
10376          * the 1st element), and also useful for dumping the regnode.
10377          * The 2nd element stores the multicharacter foldings,
10378          * used later (regexec.c:S_reginclass()). */
10379         av_store(av, 0, listsv);
10380         av_store(av, 1, NULL);
10381         av_store(av, 2, MUTABLE_SV(unicode_alternate));
10382         if (unicode_alternate) { /* This node is variable length */
10383             OP(ret) = ANYOFV;
10384         }
10385         rv = newRV_noinc(MUTABLE_SV(av));
10386         n = add_data(pRExC_state, 1, "s");
10387         RExC_rxi->data->data[n] = (void*)rv;
10388         ARG_SET(ret, n);
10389     }
10390     return ret;
10391 }
10392 #undef _C_C_T_
10393
10394
10395 /* reg_skipcomment()
10396
10397    Absorbs an /x style # comments from the input stream.
10398    Returns true if there is more text remaining in the stream.
10399    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10400    terminates the pattern without including a newline.
10401
10402    Note its the callers responsibility to ensure that we are
10403    actually in /x mode
10404
10405 */
10406
10407 STATIC bool
10408 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10409 {
10410     bool ended = 0;
10411
10412     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10413
10414     while (RExC_parse < RExC_end)
10415         if (*RExC_parse++ == '\n') {
10416             ended = 1;
10417             break;
10418         }
10419     if (!ended) {
10420         /* we ran off the end of the pattern without ending
10421            the comment, so we have to add an \n when wrapping */
10422         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10423         return 0;
10424     } else
10425         return 1;
10426 }
10427
10428 /* nextchar()
10429
10430    Advances the parse position, and optionally absorbs
10431    "whitespace" from the inputstream.
10432
10433    Without /x "whitespace" means (?#...) style comments only,
10434    with /x this means (?#...) and # comments and whitespace proper.
10435
10436    Returns the RExC_parse point from BEFORE the scan occurs.
10437
10438    This is the /x friendly way of saying RExC_parse++.
10439 */
10440
10441 STATIC char*
10442 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10443 {
10444     char* const retval = RExC_parse++;
10445
10446     PERL_ARGS_ASSERT_NEXTCHAR;
10447
10448     for (;;) {
10449         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10450                 RExC_parse[2] == '#') {
10451             while (*RExC_parse != ')') {
10452                 if (RExC_parse == RExC_end)
10453                     FAIL("Sequence (?#... not terminated");
10454                 RExC_parse++;
10455             }
10456             RExC_parse++;
10457             continue;
10458         }
10459         if (RExC_flags & RXf_PMf_EXTENDED) {
10460             if (isSPACE(*RExC_parse)) {
10461                 RExC_parse++;
10462                 continue;
10463             }
10464             else if (*RExC_parse == '#') {
10465                 if ( reg_skipcomment( pRExC_state ) )
10466                     continue;
10467             }
10468         }
10469         return retval;
10470     }
10471 }
10472
10473 /*
10474 - reg_node - emit a node
10475 */
10476 STATIC regnode *                        /* Location. */
10477 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10478 {
10479     dVAR;
10480     register regnode *ptr;
10481     regnode * const ret = RExC_emit;
10482     GET_RE_DEBUG_FLAGS_DECL;
10483
10484     PERL_ARGS_ASSERT_REG_NODE;
10485
10486     if (SIZE_ONLY) {
10487         SIZE_ALIGN(RExC_size);
10488         RExC_size += 1;
10489         return(ret);
10490     }
10491     if (RExC_emit >= RExC_emit_bound)
10492         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10493
10494     NODE_ALIGN_FILL(ret);
10495     ptr = ret;
10496     FILL_ADVANCE_NODE(ptr, op);
10497     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
10498 #ifdef RE_TRACK_PATTERN_OFFSETS
10499     if (RExC_offsets) {         /* MJD */
10500         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10501               "reg_node", __LINE__,
10502               PL_reg_name[op],
10503               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10504                 ? "Overwriting end of array!\n" : "OK",
10505               (UV)(RExC_emit - RExC_emit_start),
10506               (UV)(RExC_parse - RExC_start),
10507               (UV)RExC_offsets[0]));
10508         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10509     }
10510 #endif
10511     RExC_emit = ptr;
10512     return(ret);
10513 }
10514
10515 /*
10516 - reganode - emit a node with an argument
10517 */
10518 STATIC regnode *                        /* Location. */
10519 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10520 {
10521     dVAR;
10522     register regnode *ptr;
10523     regnode * const ret = RExC_emit;
10524     GET_RE_DEBUG_FLAGS_DECL;
10525
10526     PERL_ARGS_ASSERT_REGANODE;
10527
10528     if (SIZE_ONLY) {
10529         SIZE_ALIGN(RExC_size);
10530         RExC_size += 2;
10531         /*
10532            We can't do this:
10533
10534            assert(2==regarglen[op]+1);
10535
10536            Anything larger than this has to allocate the extra amount.
10537            If we changed this to be:
10538
10539            RExC_size += (1 + regarglen[op]);
10540
10541            then it wouldn't matter. Its not clear what side effect
10542            might come from that so its not done so far.
10543            -- dmq
10544         */
10545         return(ret);
10546     }
10547     if (RExC_emit >= RExC_emit_bound)
10548         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10549
10550     NODE_ALIGN_FILL(ret);
10551     ptr = ret;
10552     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10553     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
10554 #ifdef RE_TRACK_PATTERN_OFFSETS
10555     if (RExC_offsets) {         /* MJD */
10556         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10557               "reganode",
10558               __LINE__,
10559               PL_reg_name[op],
10560               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10561               "Overwriting end of array!\n" : "OK",
10562               (UV)(RExC_emit - RExC_emit_start),
10563               (UV)(RExC_parse - RExC_start),
10564               (UV)RExC_offsets[0]));
10565         Set_Cur_Node_Offset;
10566     }
10567 #endif
10568     RExC_emit = ptr;
10569     return(ret);
10570 }
10571
10572 /*
10573 - reguni - emit (if appropriate) a Unicode character
10574 */
10575 STATIC STRLEN
10576 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10577 {
10578     dVAR;
10579
10580     PERL_ARGS_ASSERT_REGUNI;
10581
10582     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10583 }
10584
10585 /*
10586 - reginsert - insert an operator in front of already-emitted operand
10587 *
10588 * Means relocating the operand.
10589 */
10590 STATIC void
10591 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10592 {
10593     dVAR;
10594     register regnode *src;
10595     register regnode *dst;
10596     register regnode *place;
10597     const int offset = regarglen[(U8)op];
10598     const int size = NODE_STEP_REGNODE + offset;
10599     GET_RE_DEBUG_FLAGS_DECL;
10600
10601     PERL_ARGS_ASSERT_REGINSERT;
10602     PERL_UNUSED_ARG(depth);
10603 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10604     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10605     if (SIZE_ONLY) {
10606         RExC_size += size;
10607         return;
10608     }
10609
10610     src = RExC_emit;
10611     RExC_emit += size;
10612     dst = RExC_emit;
10613     if (RExC_open_parens) {
10614         int paren;
10615         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10616         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10617             if ( RExC_open_parens[paren] >= opnd ) {
10618                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10619                 RExC_open_parens[paren] += size;
10620             } else {
10621                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10622             }
10623             if ( RExC_close_parens[paren] >= opnd ) {
10624                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10625                 RExC_close_parens[paren] += size;
10626             } else {
10627                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10628             }
10629         }
10630     }
10631
10632     while (src > opnd) {
10633         StructCopy(--src, --dst, regnode);
10634 #ifdef RE_TRACK_PATTERN_OFFSETS
10635         if (RExC_offsets) {     /* MJD 20010112 */
10636             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10637                   "reg_insert",
10638                   __LINE__,
10639                   PL_reg_name[op],
10640                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10641                     ? "Overwriting end of array!\n" : "OK",
10642                   (UV)(src - RExC_emit_start),
10643                   (UV)(dst - RExC_emit_start),
10644                   (UV)RExC_offsets[0]));
10645             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10646             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10647         }
10648 #endif
10649     }
10650
10651
10652     place = opnd;               /* Op node, where operand used to be. */
10653 #ifdef RE_TRACK_PATTERN_OFFSETS
10654     if (RExC_offsets) {         /* MJD */
10655         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10656               "reginsert",
10657               __LINE__,
10658               PL_reg_name[op],
10659               (UV)(place - RExC_emit_start) > RExC_offsets[0]
10660               ? "Overwriting end of array!\n" : "OK",
10661               (UV)(place - RExC_emit_start),
10662               (UV)(RExC_parse - RExC_start),
10663               (UV)RExC_offsets[0]));
10664         Set_Node_Offset(place, RExC_parse);
10665         Set_Node_Length(place, 1);
10666     }
10667 #endif
10668     src = NEXTOPER(place);
10669     FILL_ADVANCE_NODE(place, op);
10670     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
10671     Zero(src, offset, regnode);
10672 }
10673
10674 /*
10675 - regtail - set the next-pointer at the end of a node chain of p to val.
10676 - SEE ALSO: regtail_study
10677 */
10678 /* TODO: All three parms should be const */
10679 STATIC void
10680 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10681 {
10682     dVAR;
10683     register regnode *scan;
10684     GET_RE_DEBUG_FLAGS_DECL;
10685
10686     PERL_ARGS_ASSERT_REGTAIL;
10687 #ifndef DEBUGGING
10688     PERL_UNUSED_ARG(depth);
10689 #endif
10690
10691     if (SIZE_ONLY)
10692         return;
10693
10694     /* Find last node. */
10695     scan = p;
10696     for (;;) {
10697         regnode * const temp = regnext(scan);
10698         DEBUG_PARSE_r({
10699             SV * const mysv=sv_newmortal();
10700             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10701             regprop(RExC_rx, mysv, scan);
10702             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10703                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10704                     (temp == NULL ? "->" : ""),
10705                     (temp == NULL ? PL_reg_name[OP(val)] : "")
10706             );
10707         });
10708         if (temp == NULL)
10709             break;
10710         scan = temp;
10711     }
10712
10713     if (reg_off_by_arg[OP(scan)]) {
10714         ARG_SET(scan, val - scan);
10715     }
10716     else {
10717         NEXT_OFF(scan) = val - scan;
10718     }
10719 }
10720
10721 #ifdef DEBUGGING
10722 /*
10723 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10724 - Look for optimizable sequences at the same time.
10725 - currently only looks for EXACT chains.
10726
10727 This is experimental code. The idea is to use this routine to perform
10728 in place optimizations on branches and groups as they are constructed,
10729 with the long term intention of removing optimization from study_chunk so
10730 that it is purely analytical.
10731
10732 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10733 to control which is which.
10734
10735 */
10736 /* TODO: All four parms should be const */
10737
10738 STATIC U8
10739 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10740 {
10741     dVAR;
10742     register regnode *scan;
10743     U8 exact = PSEUDO;
10744 #ifdef EXPERIMENTAL_INPLACESCAN
10745     I32 min = 0;
10746 #endif
10747     GET_RE_DEBUG_FLAGS_DECL;
10748
10749     PERL_ARGS_ASSERT_REGTAIL_STUDY;
10750
10751
10752     if (SIZE_ONLY)
10753         return exact;
10754
10755     /* Find last node. */
10756
10757     scan = p;
10758     for (;;) {
10759         regnode * const temp = regnext(scan);
10760 #ifdef EXPERIMENTAL_INPLACESCAN
10761         if (PL_regkind[OP(scan)] == EXACT)
10762             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10763                 return EXACT;
10764 #endif
10765         if ( exact ) {
10766             switch (OP(scan)) {
10767                 case EXACT:
10768                 case EXACTF:
10769                 case EXACTFA:
10770                 case EXACTFU:
10771                 case EXACTFL:
10772                         if( exact == PSEUDO )
10773                             exact= OP(scan);
10774                         else if ( exact != OP(scan) )
10775                             exact= 0;
10776                 case NOTHING:
10777                     break;
10778                 default:
10779                     exact= 0;
10780             }
10781         }
10782         DEBUG_PARSE_r({
10783             SV * const mysv=sv_newmortal();
10784             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10785             regprop(RExC_rx, mysv, scan);
10786             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10787                 SvPV_nolen_const(mysv),
10788                 REG_NODE_NUM(scan),
10789                 PL_reg_name[exact]);
10790         });
10791         if (temp == NULL)
10792             break;
10793         scan = temp;
10794     }
10795     DEBUG_PARSE_r({
10796         SV * const mysv_val=sv_newmortal();
10797         DEBUG_PARSE_MSG("");
10798         regprop(RExC_rx, mysv_val, val);
10799         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10800                       SvPV_nolen_const(mysv_val),
10801                       (IV)REG_NODE_NUM(val),
10802                       (IV)(val - scan)
10803         );
10804     });
10805     if (reg_off_by_arg[OP(scan)]) {
10806         ARG_SET(scan, val - scan);
10807     }
10808     else {
10809         NEXT_OFF(scan) = val - scan;
10810     }
10811
10812     return exact;
10813 }
10814 #endif
10815
10816 /*
10817  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10818  */
10819 #ifdef DEBUGGING
10820 static void
10821 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10822 {
10823     int bit;
10824     int set=0;
10825     regex_charset cs;
10826
10827     for (bit=0; bit<32; bit++) {
10828         if (flags & (1<<bit)) {
10829             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
10830                 continue;
10831             }
10832             if (!set++ && lead)
10833                 PerlIO_printf(Perl_debug_log, "%s",lead);
10834             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10835         }
10836     }
10837     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10838             if (!set++ && lead) {
10839                 PerlIO_printf(Perl_debug_log, "%s",lead);
10840             }
10841             switch (cs) {
10842                 case REGEX_UNICODE_CHARSET:
10843                     PerlIO_printf(Perl_debug_log, "UNICODE");
10844                     break;
10845                 case REGEX_LOCALE_CHARSET:
10846                     PerlIO_printf(Perl_debug_log, "LOCALE");
10847                     break;
10848                 case REGEX_ASCII_RESTRICTED_CHARSET:
10849                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10850                     break;
10851                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10852                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10853                     break;
10854                 default:
10855                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10856                     break;
10857             }
10858     }
10859     if (lead)  {
10860         if (set)
10861             PerlIO_printf(Perl_debug_log, "\n");
10862         else
10863             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10864     }
10865 }
10866 #endif
10867
10868 void
10869 Perl_regdump(pTHX_ const regexp *r)
10870 {
10871 #ifdef DEBUGGING
10872     dVAR;
10873     SV * const sv = sv_newmortal();
10874     SV *dsv= sv_newmortal();
10875     RXi_GET_DECL(r,ri);
10876     GET_RE_DEBUG_FLAGS_DECL;
10877
10878     PERL_ARGS_ASSERT_REGDUMP;
10879
10880     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10881
10882     /* Header fields of interest. */
10883     if (r->anchored_substr) {
10884         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10885             RE_SV_DUMPLEN(r->anchored_substr), 30);
10886         PerlIO_printf(Perl_debug_log,
10887                       "anchored %s%s at %"IVdf" ",
10888                       s, RE_SV_TAIL(r->anchored_substr),
10889                       (IV)r->anchored_offset);
10890     } else if (r->anchored_utf8) {
10891         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10892             RE_SV_DUMPLEN(r->anchored_utf8), 30);
10893         PerlIO_printf(Perl_debug_log,
10894                       "anchored utf8 %s%s at %"IVdf" ",
10895                       s, RE_SV_TAIL(r->anchored_utf8),
10896                       (IV)r->anchored_offset);
10897     }
10898     if (r->float_substr) {
10899         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10900             RE_SV_DUMPLEN(r->float_substr), 30);
10901         PerlIO_printf(Perl_debug_log,
10902                       "floating %s%s at %"IVdf"..%"UVuf" ",
10903                       s, RE_SV_TAIL(r->float_substr),
10904                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10905     } else if (r->float_utf8) {
10906         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10907             RE_SV_DUMPLEN(r->float_utf8), 30);
10908         PerlIO_printf(Perl_debug_log,
10909                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10910                       s, RE_SV_TAIL(r->float_utf8),
10911                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10912     }
10913     if (r->check_substr || r->check_utf8)
10914         PerlIO_printf(Perl_debug_log,
10915                       (const char *)
10916                       (r->check_substr == r->float_substr
10917                        && r->check_utf8 == r->float_utf8
10918                        ? "(checking floating" : "(checking anchored"));
10919     if (r->extflags & RXf_NOSCAN)
10920         PerlIO_printf(Perl_debug_log, " noscan");
10921     if (r->extflags & RXf_CHECK_ALL)
10922         PerlIO_printf(Perl_debug_log, " isall");
10923     if (r->check_substr || r->check_utf8)
10924         PerlIO_printf(Perl_debug_log, ") ");
10925
10926     if (ri->regstclass) {
10927         regprop(r, sv, ri->regstclass);
10928         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10929     }
10930     if (r->extflags & RXf_ANCH) {
10931         PerlIO_printf(Perl_debug_log, "anchored");
10932         if (r->extflags & RXf_ANCH_BOL)
10933             PerlIO_printf(Perl_debug_log, "(BOL)");
10934         if (r->extflags & RXf_ANCH_MBOL)
10935             PerlIO_printf(Perl_debug_log, "(MBOL)");
10936         if (r->extflags & RXf_ANCH_SBOL)
10937             PerlIO_printf(Perl_debug_log, "(SBOL)");
10938         if (r->extflags & RXf_ANCH_GPOS)
10939             PerlIO_printf(Perl_debug_log, "(GPOS)");
10940         PerlIO_putc(Perl_debug_log, ' ');
10941     }
10942     if (r->extflags & RXf_GPOS_SEEN)
10943         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10944     if (r->intflags & PREGf_SKIP)
10945         PerlIO_printf(Perl_debug_log, "plus ");
10946     if (r->intflags & PREGf_IMPLICIT)
10947         PerlIO_printf(Perl_debug_log, "implicit ");
10948     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10949     if (r->extflags & RXf_EVAL_SEEN)
10950         PerlIO_printf(Perl_debug_log, "with eval ");
10951     PerlIO_printf(Perl_debug_log, "\n");
10952     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10953 #else
10954     PERL_ARGS_ASSERT_REGDUMP;
10955     PERL_UNUSED_CONTEXT;
10956     PERL_UNUSED_ARG(r);
10957 #endif  /* DEBUGGING */
10958 }
10959
10960 /*
10961 - regprop - printable representation of opcode
10962 */
10963 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10964 STMT_START { \
10965         if (do_sep) {                           \
10966             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10967             if (flags & ANYOF_INVERT)           \
10968                 /*make sure the invert info is in each */ \
10969                 sv_catpvs(sv, "^");             \
10970             do_sep = 0;                         \
10971         }                                       \
10972 } STMT_END
10973
10974 void
10975 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10976 {
10977 #ifdef DEBUGGING
10978     dVAR;
10979     register int k;
10980     RXi_GET_DECL(prog,progi);
10981     GET_RE_DEBUG_FLAGS_DECL;
10982
10983     PERL_ARGS_ASSERT_REGPROP;
10984
10985     sv_setpvs(sv, "");
10986
10987     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
10988         /* It would be nice to FAIL() here, but this may be called from
10989            regexec.c, and it would be hard to supply pRExC_state. */
10990         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10991     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10992
10993     k = PL_regkind[OP(o)];
10994
10995     if (k == EXACT) {
10996         sv_catpvs(sv, " ");
10997         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10998          * is a crude hack but it may be the best for now since
10999          * we have no flag "this EXACTish node was UTF-8"
11000          * --jhi */
11001         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
11002                   PERL_PV_ESCAPE_UNI_DETECT |
11003                   PERL_PV_ESCAPE_NONASCII   |
11004                   PERL_PV_PRETTY_ELLIPSES   |
11005                   PERL_PV_PRETTY_LTGT       |
11006                   PERL_PV_PRETTY_NOCLEAR
11007                   );
11008     } else if (k == TRIE) {
11009         /* print the details of the trie in dumpuntil instead, as
11010          * progi->data isn't available here */
11011         const char op = OP(o);
11012         const U32 n = ARG(o);
11013         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11014                (reg_ac_data *)progi->data->data[n] :
11015                NULL;
11016         const reg_trie_data * const trie
11017             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11018
11019         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11020         DEBUG_TRIE_COMPILE_r(
11021             Perl_sv_catpvf(aTHX_ sv,
11022                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11023                 (UV)trie->startstate,
11024                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11025                 (UV)trie->wordcount,
11026                 (UV)trie->minlen,
11027                 (UV)trie->maxlen,
11028                 (UV)TRIE_CHARCOUNT(trie),
11029                 (UV)trie->uniquecharcount
11030             )
11031         );
11032         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11033             int i;
11034             int rangestart = -1;
11035             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11036             sv_catpvs(sv, "[");
11037             for (i = 0; i <= 256; i++) {
11038                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
11039                     if (rangestart == -1)
11040                         rangestart = i;
11041                 } else if (rangestart != -1) {
11042                     if (i <= rangestart + 3)
11043                         for (; rangestart < i; rangestart++)
11044                             put_byte(sv, rangestart);
11045                     else {
11046                         put_byte(sv, rangestart);
11047                         sv_catpvs(sv, "-");
11048                         put_byte(sv, i - 1);
11049                     }
11050                     rangestart = -1;
11051                 }
11052             }
11053             sv_catpvs(sv, "]");
11054         }
11055
11056     } else if (k == CURLY) {
11057         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11058             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11059         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11060     }
11061     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
11062         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11063     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11064         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
11065         if ( RXp_PAREN_NAMES(prog) ) {
11066             if ( k != REF || (OP(o) < NREF)) {
11067                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11068                 SV **name= av_fetch(list, ARG(o), 0 );
11069                 if (name)
11070                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11071             }
11072             else {
11073                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11074                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11075                 I32 *nums=(I32*)SvPVX(sv_dat);
11076                 SV **name= av_fetch(list, nums[0], 0 );
11077                 I32 n;
11078                 if (name) {
11079                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
11080                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11081                                     (n ? "," : ""), (IV)nums[n]);
11082                     }
11083                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11084                 }
11085             }
11086         }
11087     } else if (k == GOSUB)
11088         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11089     else if (k == VERB) {
11090         if (!o->flags)
11091             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11092                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11093     } else if (k == LOGICAL)
11094         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
11095     else if (k == FOLDCHAR)
11096         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11097     else if (k == ANYOF) {
11098         int i, rangestart = -1;
11099         const U8 flags = ANYOF_FLAGS(o);
11100         int do_sep = 0;
11101
11102         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11103         static const char * const anyofs[] = {
11104             "\\w",
11105             "\\W",
11106             "\\s",
11107             "\\S",
11108             "\\d",
11109             "\\D",
11110             "[:alnum:]",
11111             "[:^alnum:]",
11112             "[:alpha:]",
11113             "[:^alpha:]",
11114             "[:ascii:]",
11115             "[:^ascii:]",
11116             "[:cntrl:]",
11117             "[:^cntrl:]",
11118             "[:graph:]",
11119             "[:^graph:]",
11120             "[:lower:]",
11121             "[:^lower:]",
11122             "[:print:]",
11123             "[:^print:]",
11124             "[:punct:]",
11125             "[:^punct:]",
11126             "[:upper:]",
11127             "[:^upper:]",
11128             "[:xdigit:]",
11129             "[:^xdigit:]",
11130             "[:space:]",
11131             "[:^space:]",
11132             "[:blank:]",
11133             "[:^blank:]"
11134         };
11135
11136         if (flags & ANYOF_LOCALE)
11137             sv_catpvs(sv, "{loc}");
11138         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11139             sv_catpvs(sv, "{i}");
11140         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11141         if (flags & ANYOF_INVERT)
11142             sv_catpvs(sv, "^");
11143
11144         /* output what the standard cp 0-255 bitmap matches */
11145         for (i = 0; i <= 256; i++) {
11146             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11147                 if (rangestart == -1)
11148                     rangestart = i;
11149             } else if (rangestart != -1) {
11150                 if (i <= rangestart + 3)
11151                     for (; rangestart < i; rangestart++)
11152                         put_byte(sv, rangestart);
11153                 else {
11154                     put_byte(sv, rangestart);
11155                     sv_catpvs(sv, "-");
11156                     put_byte(sv, i - 1);
11157                 }
11158                 do_sep = 1;
11159                 rangestart = -1;
11160             }
11161         }
11162
11163         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11164         /* output any special charclass tests (used entirely under use locale) */
11165         if (ANYOF_CLASS_TEST_ANY_SET(o))
11166             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11167                 if (ANYOF_CLASS_TEST(o,i)) {
11168                     sv_catpv(sv, anyofs[i]);
11169                     do_sep = 1;
11170                 }
11171
11172         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11173
11174         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11175             sv_catpvs(sv, "{non-utf8-latin1-all}");
11176         }
11177
11178         /* output information about the unicode matching */
11179         if (flags & ANYOF_UNICODE_ALL)
11180             sv_catpvs(sv, "{unicode_all}");
11181         else if (ANYOF_NONBITMAP(o))
11182             sv_catpvs(sv, "{unicode}");
11183         if (flags & ANYOF_NONBITMAP_NON_UTF8)
11184             sv_catpvs(sv, "{outside bitmap}");
11185
11186         if (ANYOF_NONBITMAP(o)) {
11187             SV *lv;
11188             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11189
11190             if (lv) {
11191                 if (sw) {
11192                     U8 s[UTF8_MAXBYTES_CASE+1];
11193
11194                     for (i = 0; i <= 256; i++) { /* just the first 256 */
11195                         uvchr_to_utf8(s, i);
11196
11197                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
11198                             if (rangestart == -1)
11199                                 rangestart = i;
11200                         } else if (rangestart != -1) {
11201                             if (i <= rangestart + 3)
11202                                 for (; rangestart < i; rangestart++) {
11203                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
11204                                     U8 *p;
11205                                     for(p = s; p < e; p++)
11206                                         put_byte(sv, *p);
11207                                 }
11208                             else {
11209                                 const U8 *e = uvchr_to_utf8(s,rangestart);
11210                                 U8 *p;
11211                                 for (p = s; p < e; p++)
11212                                     put_byte(sv, *p);
11213                                 sv_catpvs(sv, "-");
11214                                 e = uvchr_to_utf8(s, i-1);
11215                                 for (p = s; p < e; p++)
11216                                     put_byte(sv, *p);
11217                                 }
11218                                 rangestart = -1;
11219                             }
11220                         }
11221
11222                     sv_catpvs(sv, "..."); /* et cetera */
11223                 }
11224
11225                 {
11226                     char *s = savesvpv(lv);
11227                     char * const origs = s;
11228
11229                     while (*s && *s != '\n')
11230                         s++;
11231
11232                     if (*s == '\n') {
11233                         const char * const t = ++s;
11234
11235                         while (*s) {
11236                             if (*s == '\n')
11237                                 *s = ' ';
11238                             s++;
11239                         }
11240                         if (s[-1] == ' ')
11241                             s[-1] = 0;
11242
11243                         sv_catpv(sv, t);
11244                     }
11245
11246                     Safefree(origs);
11247                 }
11248             }
11249         }
11250
11251         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11252     }
11253     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11254         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11255 #else
11256     PERL_UNUSED_CONTEXT;
11257     PERL_UNUSED_ARG(sv);
11258     PERL_UNUSED_ARG(o);
11259     PERL_UNUSED_ARG(prog);
11260 #endif  /* DEBUGGING */
11261 }
11262
11263 SV *
11264 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11265 {                               /* Assume that RE_INTUIT is set */
11266     dVAR;
11267     struct regexp *const prog = (struct regexp *)SvANY(r);
11268     GET_RE_DEBUG_FLAGS_DECL;
11269
11270     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11271     PERL_UNUSED_CONTEXT;
11272
11273     DEBUG_COMPILE_r(
11274         {
11275             const char * const s = SvPV_nolen_const(prog->check_substr
11276                       ? prog->check_substr : prog->check_utf8);
11277
11278             if (!PL_colorset) reginitcolors();
11279             PerlIO_printf(Perl_debug_log,
11280                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11281                       PL_colors[4],
11282                       prog->check_substr ? "" : "utf8 ",
11283                       PL_colors[5],PL_colors[0],
11284                       s,
11285                       PL_colors[1],
11286                       (strlen(s) > 60 ? "..." : ""));
11287         } );
11288
11289     return prog->check_substr ? prog->check_substr : prog->check_utf8;
11290 }
11291
11292 /*
11293    pregfree()
11294
11295    handles refcounting and freeing the perl core regexp structure. When
11296    it is necessary to actually free the structure the first thing it
11297    does is call the 'free' method of the regexp_engine associated to
11298    the regexp, allowing the handling of the void *pprivate; member
11299    first. (This routine is not overridable by extensions, which is why
11300    the extensions free is called first.)
11301
11302    See regdupe and regdupe_internal if you change anything here.
11303 */
11304 #ifndef PERL_IN_XSUB_RE
11305 void
11306 Perl_pregfree(pTHX_ REGEXP *r)
11307 {
11308     SvREFCNT_dec(r);
11309 }
11310
11311 void
11312 Perl_pregfree2(pTHX_ REGEXP *rx)
11313 {
11314     dVAR;
11315     struct regexp *const r = (struct regexp *)SvANY(rx);
11316     GET_RE_DEBUG_FLAGS_DECL;
11317
11318     PERL_ARGS_ASSERT_PREGFREE2;
11319
11320     if (r->mother_re) {
11321         ReREFCNT_dec(r->mother_re);
11322     } else {
11323         CALLREGFREE_PVT(rx); /* free the private data */
11324         SvREFCNT_dec(RXp_PAREN_NAMES(r));
11325     }
11326     if (r->substrs) {
11327         SvREFCNT_dec(r->anchored_substr);
11328         SvREFCNT_dec(r->anchored_utf8);
11329         SvREFCNT_dec(r->float_substr);
11330         SvREFCNT_dec(r->float_utf8);
11331         Safefree(r->substrs);
11332     }
11333     RX_MATCH_COPY_FREE(rx);
11334 #ifdef PERL_OLD_COPY_ON_WRITE
11335     SvREFCNT_dec(r->saved_copy);
11336 #endif
11337     Safefree(r->offs);
11338 }
11339
11340 /*  reg_temp_copy()
11341
11342     This is a hacky workaround to the structural issue of match results
11343     being stored in the regexp structure which is in turn stored in
11344     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11345     could be PL_curpm in multiple contexts, and could require multiple
11346     result sets being associated with the pattern simultaneously, such
11347     as when doing a recursive match with (??{$qr})
11348
11349     The solution is to make a lightweight copy of the regexp structure
11350     when a qr// is returned from the code executed by (??{$qr}) this
11351     lightweight copy doesn't actually own any of its data except for
11352     the starp/end and the actual regexp structure itself.
11353
11354 */
11355
11356
11357 REGEXP *
11358 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11359 {
11360     struct regexp *ret;
11361     struct regexp *const r = (struct regexp *)SvANY(rx);
11362     register const I32 npar = r->nparens+1;
11363
11364     PERL_ARGS_ASSERT_REG_TEMP_COPY;
11365
11366     if (!ret_x)
11367         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11368     ret = (struct regexp *)SvANY(ret_x);
11369
11370     (void)ReREFCNT_inc(rx);
11371     /* We can take advantage of the existing "copied buffer" mechanism in SVs
11372        by pointing directly at the buffer, but flagging that the allocated
11373        space in the copy is zero. As we've just done a struct copy, it's now
11374        a case of zero-ing that, rather than copying the current length.  */
11375     SvPV_set(ret_x, RX_WRAPPED(rx));
11376     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11377     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11378            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11379     SvLEN_set(ret_x, 0);
11380     SvSTASH_set(ret_x, NULL);
11381     SvMAGIC_set(ret_x, NULL);
11382     Newx(ret->offs, npar, regexp_paren_pair);
11383     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11384     if (r->substrs) {
11385         Newx(ret->substrs, 1, struct reg_substr_data);
11386         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11387
11388         SvREFCNT_inc_void(ret->anchored_substr);
11389         SvREFCNT_inc_void(ret->anchored_utf8);
11390         SvREFCNT_inc_void(ret->float_substr);
11391         SvREFCNT_inc_void(ret->float_utf8);
11392
11393         /* check_substr and check_utf8, if non-NULL, point to either their
11394            anchored or float namesakes, and don't hold a second reference.  */
11395     }
11396     RX_MATCH_COPIED_off(ret_x);
11397 #ifdef PERL_OLD_COPY_ON_WRITE
11398     ret->saved_copy = NULL;
11399 #endif
11400     ret->mother_re = rx;
11401
11402     return ret_x;
11403 }
11404 #endif
11405
11406 /* regfree_internal()
11407
11408    Free the private data in a regexp. This is overloadable by
11409    extensions. Perl takes care of the regexp structure in pregfree(),
11410    this covers the *pprivate pointer which technically perl doesn't
11411    know about, however of course we have to handle the
11412    regexp_internal structure when no extension is in use.
11413
11414    Note this is called before freeing anything in the regexp
11415    structure.
11416  */
11417
11418 void
11419 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11420 {
11421     dVAR;
11422     struct regexp *const r = (struct regexp *)SvANY(rx);
11423     RXi_GET_DECL(r,ri);
11424     GET_RE_DEBUG_FLAGS_DECL;
11425
11426     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11427
11428     DEBUG_COMPILE_r({
11429         if (!PL_colorset)
11430             reginitcolors();
11431         {
11432             SV *dsv= sv_newmortal();
11433             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11434                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11435             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11436                 PL_colors[4],PL_colors[5],s);
11437         }
11438     });
11439 #ifdef RE_TRACK_PATTERN_OFFSETS
11440     if (ri->u.offsets)
11441         Safefree(ri->u.offsets);             /* 20010421 MJD */
11442 #endif
11443     if (ri->data) {
11444         int n = ri->data->count;
11445         PAD* new_comppad = NULL;
11446         PAD* old_comppad;
11447         PADOFFSET refcnt;
11448
11449         while (--n >= 0) {
11450           /* If you add a ->what type here, update the comment in regcomp.h */
11451             switch (ri->data->what[n]) {
11452             case 'a':
11453             case 's':
11454             case 'S':
11455             case 'u':
11456                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11457                 break;
11458             case 'f':
11459                 Safefree(ri->data->data[n]);
11460                 break;
11461             case 'p':
11462                 new_comppad = MUTABLE_AV(ri->data->data[n]);
11463                 break;
11464             case 'o':
11465                 if (new_comppad == NULL)
11466                     Perl_croak(aTHX_ "panic: pregfree comppad");
11467                 PAD_SAVE_LOCAL(old_comppad,
11468                     /* Watch out for global destruction's random ordering. */
11469                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11470                 );
11471                 OP_REFCNT_LOCK;
11472                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11473                 OP_REFCNT_UNLOCK;
11474                 if (!refcnt)
11475                     op_free((OP_4tree*)ri->data->data[n]);
11476
11477                 PAD_RESTORE_LOCAL(old_comppad);
11478                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11479                 new_comppad = NULL;
11480                 break;
11481             case 'n':
11482                 break;
11483             case 'T':
11484                 { /* Aho Corasick add-on structure for a trie node.
11485                      Used in stclass optimization only */
11486                     U32 refcount;
11487                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11488                     OP_REFCNT_LOCK;
11489                     refcount = --aho->refcount;
11490                     OP_REFCNT_UNLOCK;
11491                     if ( !refcount ) {
11492                         PerlMemShared_free(aho->states);
11493                         PerlMemShared_free(aho->fail);
11494                          /* do this last!!!! */
11495                         PerlMemShared_free(ri->data->data[n]);
11496                         PerlMemShared_free(ri->regstclass);
11497                     }
11498                 }
11499                 break;
11500             case 't':
11501                 {
11502                     /* trie structure. */
11503                     U32 refcount;
11504                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11505                     OP_REFCNT_LOCK;
11506                     refcount = --trie->refcount;
11507                     OP_REFCNT_UNLOCK;
11508                     if ( !refcount ) {
11509                         PerlMemShared_free(trie->charmap);
11510                         PerlMemShared_free(trie->states);
11511                         PerlMemShared_free(trie->trans);
11512                         if (trie->bitmap)
11513                             PerlMemShared_free(trie->bitmap);
11514                         if (trie->jump)
11515                             PerlMemShared_free(trie->jump);
11516                         PerlMemShared_free(trie->wordinfo);
11517                         /* do this last!!!! */
11518                         PerlMemShared_free(ri->data->data[n]);
11519                     }
11520                 }
11521                 break;
11522             default:
11523                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11524             }
11525         }
11526         Safefree(ri->data->what);
11527         Safefree(ri->data);
11528     }
11529
11530     Safefree(ri);
11531 }
11532
11533 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11534 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11535 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
11536
11537 /*
11538    re_dup - duplicate a regexp.
11539
11540    This routine is expected to clone a given regexp structure. It is only
11541    compiled under USE_ITHREADS.
11542
11543    After all of the core data stored in struct regexp is duplicated
11544    the regexp_engine.dupe method is used to copy any private data
11545    stored in the *pprivate pointer. This allows extensions to handle
11546    any duplication it needs to do.
11547
11548    See pregfree() and regfree_internal() if you change anything here.
11549 */
11550 #if defined(USE_ITHREADS)
11551 #ifndef PERL_IN_XSUB_RE
11552 void
11553 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11554 {
11555     dVAR;
11556     I32 npar;
11557     const struct regexp *r = (const struct regexp *)SvANY(sstr);
11558     struct regexp *ret = (struct regexp *)SvANY(dstr);
11559
11560     PERL_ARGS_ASSERT_RE_DUP_GUTS;
11561
11562     npar = r->nparens+1;
11563     Newx(ret->offs, npar, regexp_paren_pair);
11564     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11565     if(ret->swap) {
11566         /* no need to copy these */
11567         Newx(ret->swap, npar, regexp_paren_pair);
11568     }
11569
11570     if (ret->substrs) {
11571         /* Do it this way to avoid reading from *r after the StructCopy().
11572            That way, if any of the sv_dup_inc()s dislodge *r from the L1
11573            cache, it doesn't matter.  */
11574         const bool anchored = r->check_substr
11575             ? r->check_substr == r->anchored_substr
11576             : r->check_utf8 == r->anchored_utf8;
11577         Newx(ret->substrs, 1, struct reg_substr_data);
11578         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11579
11580         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11581         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11582         ret->float_substr = sv_dup_inc(ret->float_substr, param);
11583         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11584
11585         /* check_substr and check_utf8, if non-NULL, point to either their
11586            anchored or float namesakes, and don't hold a second reference.  */
11587
11588         if (ret->check_substr) {
11589             if (anchored) {
11590                 assert(r->check_utf8 == r->anchored_utf8);
11591                 ret->check_substr = ret->anchored_substr;
11592                 ret->check_utf8 = ret->anchored_utf8;
11593             } else {
11594                 assert(r->check_substr == r->float_substr);
11595                 assert(r->check_utf8 == r->float_utf8);
11596                 ret->check_substr = ret->float_substr;
11597                 ret->check_utf8 = ret->float_utf8;
11598             }
11599         } else if (ret->check_utf8) {
11600             if (anchored) {
11601                 ret->check_utf8 = ret->anchored_utf8;
11602             } else {
11603                 ret->check_utf8 = ret->float_utf8;
11604             }
11605         }
11606     }
11607
11608     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11609
11610     if (ret->pprivate)
11611         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11612
11613     if (RX_MATCH_COPIED(dstr))
11614         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
11615     else
11616         ret->subbeg = NULL;
11617 #ifdef PERL_OLD_COPY_ON_WRITE
11618     ret->saved_copy = NULL;
11619 #endif
11620
11621     if (ret->mother_re) {
11622         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11623             /* Our storage points directly to our mother regexp, but that's
11624                1: a buffer in a different thread
11625                2: something we no longer hold a reference on
11626                so we need to copy it locally.  */
11627             /* Note we need to sue SvCUR() on our mother_re, because it, in
11628                turn, may well be pointing to its own mother_re.  */
11629             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11630                                    SvCUR(ret->mother_re)+1));
11631             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11632         }
11633         ret->mother_re      = NULL;
11634     }
11635     ret->gofs = 0;
11636 }
11637 #endif /* PERL_IN_XSUB_RE */
11638
11639 /*
11640    regdupe_internal()
11641
11642    This is the internal complement to regdupe() which is used to copy
11643    the structure pointed to by the *pprivate pointer in the regexp.
11644    This is the core version of the extension overridable cloning hook.
11645    The regexp structure being duplicated will be copied by perl prior
11646    to this and will be provided as the regexp *r argument, however
11647    with the /old/ structures pprivate pointer value. Thus this routine
11648    may override any copying normally done by perl.
11649
11650    It returns a pointer to the new regexp_internal structure.
11651 */
11652
11653 void *
11654 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11655 {
11656     dVAR;
11657     struct regexp *const r = (struct regexp *)SvANY(rx);
11658     regexp_internal *reti;
11659     int len, npar;
11660     RXi_GET_DECL(r,ri);
11661
11662     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11663
11664     npar = r->nparens+1;
11665     len = ProgLen(ri);
11666
11667     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11668     Copy(ri->program, reti->program, len+1, regnode);
11669
11670
11671     reti->regstclass = NULL;
11672
11673     if (ri->data) {
11674         struct reg_data *d;
11675         const int count = ri->data->count;
11676         int i;
11677
11678         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11679                 char, struct reg_data);
11680         Newx(d->what, count, U8);
11681
11682         d->count = count;
11683         for (i = 0; i < count; i++) {
11684             d->what[i] = ri->data->what[i];
11685             switch (d->what[i]) {
11686                 /* legal options are one of: sSfpontTua
11687                    see also regcomp.h and pregfree() */
11688             case 'a': /* actually an AV, but the dup function is identical.  */
11689             case 's':
11690             case 'S':
11691             case 'p': /* actually an AV, but the dup function is identical.  */
11692             case 'u': /* actually an HV, but the dup function is identical.  */
11693                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11694                 break;
11695             case 'f':
11696                 /* This is cheating. */
11697                 Newx(d->data[i], 1, struct regnode_charclass_class);
11698                 StructCopy(ri->data->data[i], d->data[i],
11699                             struct regnode_charclass_class);
11700                 reti->regstclass = (regnode*)d->data[i];
11701                 break;
11702             case 'o':
11703                 /* Compiled op trees are readonly and in shared memory,
11704                    and can thus be shared without duplication. */
11705                 OP_REFCNT_LOCK;
11706                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11707                 OP_REFCNT_UNLOCK;
11708                 break;
11709             case 'T':
11710                 /* Trie stclasses are readonly and can thus be shared
11711                  * without duplication. We free the stclass in pregfree
11712                  * when the corresponding reg_ac_data struct is freed.
11713                  */
11714                 reti->regstclass= ri->regstclass;
11715                 /* Fall through */
11716             case 't':
11717                 OP_REFCNT_LOCK;
11718                 ((reg_trie_data*)ri->data->data[i])->refcount++;
11719                 OP_REFCNT_UNLOCK;
11720                 /* Fall through */
11721             case 'n':
11722                 d->data[i] = ri->data->data[i];
11723                 break;
11724             default:
11725                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11726             }
11727         }
11728
11729         reti->data = d;
11730     }
11731     else
11732         reti->data = NULL;
11733
11734     reti->name_list_idx = ri->name_list_idx;
11735
11736 #ifdef RE_TRACK_PATTERN_OFFSETS
11737     if (ri->u.offsets) {
11738         Newx(reti->u.offsets, 2*len+1, U32);
11739         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11740     }
11741 #else
11742     SetProgLen(reti,len);
11743 #endif
11744
11745     return (void*)reti;
11746 }
11747
11748 #endif    /* USE_ITHREADS */
11749
11750 #ifndef PERL_IN_XSUB_RE
11751
11752 /*
11753  - regnext - dig the "next" pointer out of a node
11754  */
11755 regnode *
11756 Perl_regnext(pTHX_ register regnode *p)
11757 {
11758     dVAR;
11759     register I32 offset;
11760
11761     if (!p)
11762         return(NULL);
11763
11764     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
11765         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11766     }
11767
11768     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11769     if (offset == 0)
11770         return(NULL);
11771
11772     return(p+offset);
11773 }
11774 #endif
11775
11776 STATIC void
11777 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11778 {
11779     va_list args;
11780     STRLEN l1 = strlen(pat1);
11781     STRLEN l2 = strlen(pat2);
11782     char buf[512];
11783     SV *msv;
11784     const char *message;
11785
11786     PERL_ARGS_ASSERT_RE_CROAK2;
11787
11788     if (l1 > 510)
11789         l1 = 510;
11790     if (l1 + l2 > 510)
11791         l2 = 510 - l1;
11792     Copy(pat1, buf, l1 , char);
11793     Copy(pat2, buf + l1, l2 , char);
11794     buf[l1 + l2] = '\n';
11795     buf[l1 + l2 + 1] = '\0';
11796 #ifdef I_STDARG
11797     /* ANSI variant takes additional second argument */
11798     va_start(args, pat2);
11799 #else
11800     va_start(args);
11801 #endif
11802     msv = vmess(buf, &args);
11803     va_end(args);
11804     message = SvPV_const(msv,l1);
11805     if (l1 > 512)
11806         l1 = 512;
11807     Copy(message, buf, l1 , char);
11808     buf[l1-1] = '\0';                   /* Overwrite \n */
11809     Perl_croak(aTHX_ "%s", buf);
11810 }
11811
11812 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
11813
11814 #ifndef PERL_IN_XSUB_RE
11815 void
11816 Perl_save_re_context(pTHX)
11817 {
11818     dVAR;
11819
11820     struct re_save_state *state;
11821
11822     SAVEVPTR(PL_curcop);
11823     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11824
11825     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11826     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11827     SSPUSHUV(SAVEt_RE_STATE);
11828
11829     Copy(&PL_reg_state, state, 1, struct re_save_state);
11830
11831     PL_reg_start_tmp = 0;
11832     PL_reg_start_tmpl = 0;
11833     PL_reg_oldsaved = NULL;
11834     PL_reg_oldsavedlen = 0;
11835     PL_reg_maxiter = 0;
11836     PL_reg_leftiter = 0;
11837     PL_reg_poscache = NULL;
11838     PL_reg_poscache_size = 0;
11839 #ifdef PERL_OLD_COPY_ON_WRITE
11840     PL_nrs = NULL;
11841 #endif
11842
11843     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11844     if (PL_curpm) {
11845         const REGEXP * const rx = PM_GETRE(PL_curpm);
11846         if (rx) {
11847             U32 i;
11848             for (i = 1; i <= RX_NPARENS(rx); i++) {
11849                 char digits[TYPE_CHARS(long)];
11850                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11851                 GV *const *const gvp
11852                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11853
11854                 if (gvp) {
11855                     GV * const gv = *gvp;
11856                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11857                         save_scalar(gv);
11858                 }
11859             }
11860         }
11861     }
11862 }
11863 #endif
11864
11865 static void
11866 clear_re(pTHX_ void *r)
11867 {
11868     dVAR;
11869     ReREFCNT_dec((REGEXP *)r);
11870 }
11871
11872 #ifdef DEBUGGING
11873
11874 STATIC void
11875 S_put_byte(pTHX_ SV *sv, int c)
11876 {
11877     PERL_ARGS_ASSERT_PUT_BYTE;
11878
11879     /* Our definition of isPRINT() ignores locales, so only bytes that are
11880        not part of UTF-8 are considered printable. I assume that the same
11881        holds for UTF-EBCDIC.
11882        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11883        which Wikipedia says:
11884
11885        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11886        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11887        identical, to the ASCII delete (DEL) or rubout control character.
11888        ) So the old condition can be simplified to !isPRINT(c)  */
11889     if (!isPRINT(c)) {
11890         if (c < 256) {
11891             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11892         }
11893         else {
11894             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11895         }
11896     }
11897     else {
11898         const char string = c;
11899         if (c == '-' || c == ']' || c == '\\' || c == '^')
11900             sv_catpvs(sv, "\\");
11901         sv_catpvn(sv, &string, 1);
11902     }
11903 }
11904
11905
11906 #define CLEAR_OPTSTART \
11907     if (optstart) STMT_START { \
11908             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11909             optstart=NULL; \
11910     } STMT_END
11911
11912 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11913
11914 STATIC const regnode *
11915 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11916             const regnode *last, const regnode *plast,
11917             SV* sv, I32 indent, U32 depth)
11918 {
11919     dVAR;
11920     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
11921     register const regnode *next;
11922     const regnode *optstart= NULL;
11923
11924     RXi_GET_DECL(r,ri);
11925     GET_RE_DEBUG_FLAGS_DECL;
11926
11927     PERL_ARGS_ASSERT_DUMPUNTIL;
11928
11929 #ifdef DEBUG_DUMPUNTIL
11930     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11931         last ? last-start : 0,plast ? plast-start : 0);
11932 #endif
11933
11934     if (plast && plast < last)
11935         last= plast;
11936
11937     while (PL_regkind[op] != END && (!last || node < last)) {
11938         /* While that wasn't END last time... */
11939         NODE_ALIGN(node);
11940         op = OP(node);
11941         if (op == CLOSE || op == WHILEM)
11942             indent--;
11943         next = regnext((regnode *)node);
11944
11945         /* Where, what. */
11946         if (OP(node) == OPTIMIZED) {
11947             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11948                 optstart = node;
11949             else
11950                 goto after_print;
11951         } else
11952             CLEAR_OPTSTART;
11953
11954         regprop(r, sv, node);
11955         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11956                       (int)(2*indent + 1), "", SvPVX_const(sv));
11957
11958         if (OP(node) != OPTIMIZED) {
11959             if (next == NULL)           /* Next ptr. */
11960                 PerlIO_printf(Perl_debug_log, " (0)");
11961             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11962                 PerlIO_printf(Perl_debug_log, " (FAIL)");
11963             else
11964                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11965             (void)PerlIO_putc(Perl_debug_log, '\n');
11966         }
11967
11968       after_print:
11969         if (PL_regkind[(U8)op] == BRANCHJ) {
11970             assert(next);
11971             {
11972                 register const regnode *nnode = (OP(next) == LONGJMP
11973                                              ? regnext((regnode *)next)
11974                                              : next);
11975                 if (last && nnode > last)
11976                     nnode = last;
11977                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11978             }
11979         }
11980         else if (PL_regkind[(U8)op] == BRANCH) {
11981             assert(next);
11982             DUMPUNTIL(NEXTOPER(node), next);
11983         }
11984         else if ( PL_regkind[(U8)op]  == TRIE ) {
11985             const regnode *this_trie = node;
11986             const char op = OP(node);
11987             const U32 n = ARG(node);
11988             const reg_ac_data * const ac = op>=AHOCORASICK ?
11989                (reg_ac_data *)ri->data->data[n] :
11990                NULL;
11991             const reg_trie_data * const trie =
11992                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11993 #ifdef DEBUGGING
11994             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11995 #endif
11996             const regnode *nextbranch= NULL;
11997             I32 word_idx;
11998             sv_setpvs(sv, "");
11999             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
12000                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
12001
12002                 PerlIO_printf(Perl_debug_log, "%*s%s ",
12003                    (int)(2*(indent+3)), "",
12004                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12005                             PL_colors[0], PL_colors[1],
12006                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12007                             PERL_PV_PRETTY_ELLIPSES    |
12008                             PERL_PV_PRETTY_LTGT
12009                             )
12010                             : "???"
12011                 );
12012                 if (trie->jump) {
12013                     U16 dist= trie->jump[word_idx+1];
12014                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12015                                   (UV)((dist ? this_trie + dist : next) - start));
12016                     if (dist) {
12017                         if (!nextbranch)
12018                             nextbranch= this_trie + trie->jump[0];
12019                         DUMPUNTIL(this_trie + dist, nextbranch);
12020                     }
12021                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12022                         nextbranch= regnext((regnode *)nextbranch);
12023                 } else {
12024                     PerlIO_printf(Perl_debug_log, "\n");
12025                 }
12026             }
12027             if (last && next > last)
12028                 node= last;
12029             else
12030                 node= next;
12031         }
12032         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
12033             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12034                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12035         }
12036         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
12037             assert(next);
12038             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
12039         }
12040         else if ( op == PLUS || op == STAR) {
12041             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
12042         }
12043         else if (PL_regkind[(U8)op] == ANYOF) {
12044             /* arglen 1 + class block */
12045             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
12046                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
12047             node = NEXTOPER(node);
12048         }
12049         else if (PL_regkind[(U8)op] == EXACT) {
12050             /* Literal string, where present. */
12051             node += NODE_SZ_STR(node) - 1;
12052             node = NEXTOPER(node);
12053         }
12054         else {
12055             node = NEXTOPER(node);
12056             node += regarglen[(U8)op];
12057         }
12058         if (op == CURLYX || op == OPEN)
12059             indent++;
12060     }
12061     CLEAR_OPTSTART;
12062 #ifdef DEBUG_DUMPUNTIL
12063     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12064 #endif
12065     return node;
12066 }
12067
12068 #endif  /* DEBUGGING */
12069
12070 /*
12071  * Local variables:
12072  * c-indentation-style: bsd
12073  * c-basic-offset: 4
12074  * indent-tabs-mode: t
12075  * End:
12076  *
12077  * ex: set ts=8 sts=4 sw=4 noet:
12078  */