src/5013011/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     I32         in_lookbehind;
 145     I32         contains_locale;
 146 #if ADD_TO_REGEXEC
 147     char        *starttry;              /* -Dr: where regtry was called. */
 148 #define RExC_starttry   (pRExC_state->starttry)
 149 #endif
 150 #ifdef DEBUGGING
 151     const char  *lastparse;
 152     I32         lastnum;
 153     AV          *paren_name_list;       /* idx -> name */
 154 #define RExC_lastparse  (pRExC_state->lastparse)
 155 #define RExC_lastnum    (pRExC_state->lastnum)
 156 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 157 #endif
 158 } RExC_state_t;
 159
 160 #define RExC_flags      (pRExC_state->flags)
 161 #define RExC_precomp    (pRExC_state->precomp)
 162 #define RExC_rx_sv      (pRExC_state->rx_sv)
 163 #define RExC_rx         (pRExC_state->rx)
 164 #define RExC_rxi        (pRExC_state->rxi)
 165 #define RExC_start      (pRExC_state->start)
 166 #define RExC_end        (pRExC_state->end)
 167 #define RExC_parse      (pRExC_state->parse)
 168 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 169 #ifdef RE_TRACK_PATTERN_OFFSETS
 170 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 171 #endif
 172 #define RExC_emit       (pRExC_state->emit)
 173 #define RExC_emit_start (pRExC_state->emit_start)
 174 #define RExC_emit_bound (pRExC_state->emit_bound)
 175 #define RExC_naughty    (pRExC_state->naughty)
 176 #define RExC_sawback    (pRExC_state->sawback)
 177 #define RExC_seen       (pRExC_state->seen)
 178 #define RExC_size       (pRExC_state->size)
 179 #define RExC_npar       (pRExC_state->npar)
 180 #define RExC_nestroot   (pRExC_state->nestroot)
 181 #define RExC_extralen   (pRExC_state->extralen)
 182 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 183 #define RExC_seen_evals (pRExC_state->seen_evals)
 184 #define RExC_utf8       (pRExC_state->utf8)
 185 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 186 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 187 #define RExC_open_parens        (pRExC_state->open_parens)
 188 #define RExC_close_parens       (pRExC_state->close_parens)
 189 #define RExC_opend      (pRExC_state->opend)
 190 #define RExC_paren_names        (pRExC_state->paren_names)
 191 #define RExC_recurse    (pRExC_state->recurse)
 192 #define RExC_recurse_count      (pRExC_state->recurse_count)
 193 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 194 #define RExC_contains_locale    (pRExC_state->contains_locale)
 195
 196
 197 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 198 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 199         ((*s) == '{' && regcurly(s)))
 200
 201 #ifdef SPSTART
 202 #undef SPSTART          /* dratted cpp namespace... */
 203 #endif
 204 /*
 205  * Flags to be passed up and down.
 206  */
 207 #define WORST           0       /* Worst case. */
 208 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 209
 210 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 211  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 212 #define SIMPLE          0x02
 213 #define SPSTART         0x04    /* Starts with * or +. */
 214 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 215 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 216
 217 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 218
 219 /* whether trie related optimizations are enabled */
 220 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 221 #define TRIE_STUDY_OPT
 222 #define FULL_TRIE_STUDY
 223 #define TRIE_STCLASS
 224 #endif
 225
 226
 227
 228 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 229 #define PBITVAL(paren) (1 << ((paren) & 7))
 230 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 231 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 232 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 233
 234 /* If not already in utf8, do a longjmp back to the beginning */
 235 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 236 #define REQUIRE_UTF8    STMT_START {                                       \
 237                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 238                         } STMT_END
 239
 240 /* About scan_data_t.
 241
 242   During optimisation we recurse through the regexp program performing
 243   various inplace (keyhole style) optimisations. In addition study_chunk
 244   and scan_commit populate this data structure with information about
 245   what strings MUST appear in the pattern. We look for the longest
 246   string that must appear at a fixed location, and we look for the
 247   longest string that may appear at a floating location. So for instance
 248   in the pattern:
 249
 250     /FOO[xX]A.*B[xX]BAR/
 251
 252   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 253   strings (because they follow a .* construct). study_chunk will identify
 254   both FOO and BAR as being the longest fixed and floating strings respectively.
 255
 256   The strings can be composites, for instance
 257
 258      /(f)(o)(o)/
 259
 260   will result in a composite fixed substring 'foo'.
 261
 262   For each string some basic information is maintained:
 263
 264   - offset or min_offset
 265     This is the position the string must appear at, or not before.
 266     It also implicitly (when combined with minlenp) tells us how many
 267     characters must match before the string we are searching for.
 268     Likewise when combined with minlenp and the length of the string it
 269     tells us how many characters must appear after the string we have
 270     found.
 271
 272   - max_offset
 273     Only used for floating strings. This is the rightmost point that
 274     the string can appear at. If set to I32 max it indicates that the
 275     string can occur infinitely far to the right.
 276
 277   - minlenp
 278     A pointer to the minimum length of the pattern that the string
 279     was found inside. This is important as in the case of positive
 280     lookahead or positive lookbehind we can have multiple patterns
 281     involved. Consider
 282
 283     /(?=FOO).*F/
 284
 285     The minimum length of the pattern overall is 3, the minimum length
 286     of the lookahead part is 3, but the minimum length of the part that
 287     will actually match is 1. So 'FOO's minimum length is 3, but the
 288     minimum length for the F is 1. This is important as the minimum length
 289     is used to determine offsets in front of and behind the string being
 290     looked for.  Since strings can be composites this is the length of the
 291     pattern at the time it was committed with a scan_commit. Note that
 292     the length is calculated by study_chunk, so that the minimum lengths
 293     are not known until the full pattern has been compiled, thus the
 294     pointer to the value.
 295
 296   - lookbehind
 297
 298     In the case of lookbehind the string being searched for can be
 299     offset past the start point of the final matching string.
 300     If this value was just blithely removed from the min_offset it would
 301     invalidate some of the calculations for how many chars must match
 302     before or after (as they are derived from min_offset and minlen and
 303     the length of the string being searched for).
 304     When the final pattern is compiled and the data is moved from the
 305     scan_data_t structure into the regexp structure the information
 306     about lookbehind is factored in, with the information that would
 307     have been lost precalculated in the end_shift field for the
 308     associated string.
 309
 310   The fields pos_min and pos_delta are used to store the minimum offset
 311   and the delta to the maximum offset at the current point in the pattern.
 312
 313 */
 314
 315 typedef struct scan_data_t {
 316     /*I32 len_min;      unused */
 317     /*I32 len_delta;    unused */
 318     I32 pos_min;
 319     I32 pos_delta;
 320     SV *last_found;
 321     I32 last_end;           /* min value, <0 unless valid. */
 322     I32 last_start_min;
 323     I32 last_start_max;
 324     SV **longest;           /* Either &l_fixed, or &l_float. */
 325     SV *longest_fixed;      /* longest fixed string found in pattern */
 326     I32 offset_fixed;       /* offset where it starts */
 327     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 328     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 329     SV *longest_float;      /* longest floating string found in pattern */
 330     I32 offset_float_min;   /* earliest point in string it can appear */
 331     I32 offset_float_max;   /* latest point in string it can appear */
 332     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 333     I32 lookbehind_float;   /* is the position of the string modified by LB */
 334     I32 flags;
 335     I32 whilem_c;
 336     I32 *last_closep;
 337     struct regnode_charclass_class *start_class;
 338 } scan_data_t;
 339
 340 /*
 341  * Forward declarations for pregcomp()'s friends.
 342  */
 343
 344 static const scan_data_t zero_scan_data =
 345   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 346
 347 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 348 #define SF_BEFORE_SEOL          0x0001
 349 #define SF_BEFORE_MEOL          0x0002
 350 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 351 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 352
 353 #ifdef NO_UNARY_PLUS
 354 #  define SF_FIX_SHIFT_EOL      (0+2)
 355 #  define SF_FL_SHIFT_EOL               (0+4)
 356 #else
 357 #  define SF_FIX_SHIFT_EOL      (+2)
 358 #  define SF_FL_SHIFT_EOL               (+4)
 359 #endif
 360
 361 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 362 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 363
 364 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 365 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 366 #define SF_IS_INF               0x0040
 367 #define SF_HAS_PAR              0x0080
 368 #define SF_IN_PAR               0x0100
 369 #define SF_HAS_EVAL             0x0200
 370 #define SCF_DO_SUBSTR           0x0400
 371 #define SCF_DO_STCLASS_AND      0x0800
 372 #define SCF_DO_STCLASS_OR       0x1000
 373 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 374 #define SCF_WHILEM_VISITED_POS  0x2000
 375
 376 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 377 #define SCF_SEEN_ACCEPT         0x8000
 378
 379 #define UTF cBOOL(RExC_utf8)
 380 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 381 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 382 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 383 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 384 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 385 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 386 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 387
 388 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 389
 390 #define OOB_UNICODE             12345678
 391 #define OOB_NAMEDCLASS          -1
 392
 393 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 394 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 395
 396
 397 /* length of regex to show in messages that don't mark a position within */
 398 #define RegexLengthToShowInErrorMessages 127
 399
 400 /*
 401  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 402  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 403  * op/pragma/warn/regcomp.
 404  */
 405 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 406 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 407
 408 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 409
 410 /*
 411  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 412  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 413  * "...".
 414  */
 415 #define _FAIL(code) STMT_START {                                        \
 416     const char *ellipses = "";                                          \
 417     IV len = RExC_end - RExC_precomp;                                   \
 418                                                                         \
 419     if (!SIZE_ONLY)                                                     \
 420         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 421     if (len > RegexLengthToShowInErrorMessages) {                       \
 422         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 423         len = RegexLengthToShowInErrorMessages - 10;                    \
 424         ellipses = "...";                                               \
 425     }                                                                   \
 426     code;                                                               \
 427 } STMT_END
 428
 429 #define FAIL(msg) _FAIL(                            \
 430     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 431             msg, (int)len, RExC_precomp, ellipses))
 432
 433 #define FAIL2(msg,arg) _FAIL(                       \
 434     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 435             arg, (int)len, RExC_precomp, ellipses))
 436
 437 /*
 438  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 439  */
 440 #define Simple_vFAIL(m) STMT_START {                                    \
 441     const IV offset = RExC_parse - RExC_precomp;                        \
 442     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 443             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 444 } STMT_END
 445
 446 /*
 447  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 448  */
 449 #define vFAIL(m) STMT_START {                           \
 450     if (!SIZE_ONLY)                                     \
 451         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 452     Simple_vFAIL(m);                                    \
 453 } STMT_END
 454
 455 /*
 456  * Like Simple_vFAIL(), but accepts two arguments.
 457  */
 458 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 459     const IV offset = RExC_parse - RExC_precomp;                        \
 460     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 461             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 462 } STMT_END
 463
 464 /*
 465  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 466  */
 467 #define vFAIL2(m,a1) STMT_START {                       \
 468     if (!SIZE_ONLY)                                     \
 469         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 470     Simple_vFAIL2(m, a1);                               \
 471 } STMT_END
 472
 473
 474 /*
 475  * Like Simple_vFAIL(), but accepts three arguments.
 476  */
 477 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 478     const IV offset = RExC_parse - RExC_precomp;                \
 479     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 480             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 481 } STMT_END
 482
 483 /*
 484  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 485  */
 486 #define vFAIL3(m,a1,a2) STMT_START {                    \
 487     if (!SIZE_ONLY)                                     \
 488         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 489     Simple_vFAIL3(m, a1, a2);                           \
 490 } STMT_END
 491
 492 /*
 493  * Like Simple_vFAIL(), but accepts four arguments.
 494  */
 495 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 496     const IV offset = RExC_parse - RExC_precomp;                \
 497     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 498             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 499 } STMT_END
 500
 501 #define ckWARNreg(loc,m) STMT_START {                                   \
 502     const IV offset = loc - RExC_precomp;                               \
 503     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 504             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 505 } STMT_END
 506
 507 #define ckWARNregdep(loc,m) STMT_START {                                \
 508     const IV offset = loc - RExC_precomp;                               \
 509     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 510             m REPORT_LOCATION,                                          \
 511             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 512 } STMT_END
 513
 514 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 515     const IV offset = loc - RExC_precomp;                               \
 516     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 517             m REPORT_LOCATION,                                          \
 518             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 519 } STMT_END
 520
 521 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 522     const IV offset = loc - RExC_precomp;                               \
 523     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 524             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 525 } STMT_END
 526
 527 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 528     const IV offset = loc - RExC_precomp;                               \
 529     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 530             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 531 } STMT_END
 532
 533 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 534     const IV offset = loc - RExC_precomp;                               \
 535     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 536             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 537 } STMT_END
 538
 539 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 540     const IV offset = loc - RExC_precomp;                               \
 541     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 542             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 543 } STMT_END
 544
 545 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 546     const IV offset = loc - RExC_precomp;                               \
 547     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 548             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 549 } STMT_END
 550
 551 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 552     const IV offset = loc - RExC_precomp;                               \
 553     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 554             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 555 } STMT_END
 556
 557
 558 /* Allow for side effects in s */
 559 #define REGC(c,s) STMT_START {                  \
 560     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 561 } STMT_END
 562
 563 /* Macros for recording node offsets.   20001227 mjd@plover.com
 564  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 565  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 566  * Element 0 holds the number n.
 567  * Position is 1 indexed.
 568  */
 569 #ifndef RE_TRACK_PATTERN_OFFSETS
 570 #define Set_Node_Offset_To_R(node,byte)
 571 #define Set_Node_Offset(node,byte)
 572 #define Set_Cur_Node_Offset
 573 #define Set_Node_Length_To_R(node,len)
 574 #define Set_Node_Length(node,len)
 575 #define Set_Node_Cur_Length(node)
 576 #define Node_Offset(n)
 577 #define Node_Length(n)
 578 #define Set_Node_Offset_Length(node,offset,len)
 579 #define ProgLen(ri) ri->u.proglen
 580 #define SetProgLen(ri,x) ri->u.proglen = x
 581 #else
 582 #define ProgLen(ri) ri->u.offsets[0]
 583 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 584 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 585     if (! SIZE_ONLY) {                                                  \
 586         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 587                     __LINE__, (int)(node), (int)(byte)));               \
 588         if((node) < 0) {                                                \
 589             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 590         } else {                                                        \
 591             RExC_offsets[2*(node)-1] = (byte);                          \
 592         }                                                               \
 593     }                                                                   \
 594 } STMT_END
 595
 596 #define Set_Node_Offset(node,byte) \
 597     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 598 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 599
 600 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 601     if (! SIZE_ONLY) {                                                  \
 602         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 603                 __LINE__, (int)(node), (int)(len)));                    \
 604         if((node) < 0) {                                                \
 605             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 606         } else {                                                        \
 607             RExC_offsets[2*(node)] = (len);                             \
 608         }                                                               \
 609     }                                                                   \
 610 } STMT_END
 611
 612 #define Set_Node_Length(node,len) \
 613     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 614 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 615 #define Set_Node_Cur_Length(node) \
 616     Set_Node_Length(node, RExC_parse - parse_start)
 617
 618 /* Get offsets and lengths */
 619 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 620 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 621
 622 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 623     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 624     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 625 } STMT_END
 626 #endif
 627
 628 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 629 #define EXPERIMENTAL_INPLACESCAN
 630 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 631
 632 #define DEBUG_STUDYDATA(str,data,depth)                              \
 633 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 634     PerlIO_printf(Perl_debug_log,                                    \
 635         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 636         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 637         (int)(depth)*2, "",                                          \
 638         (IV)((data)->pos_min),                                       \
 639         (IV)((data)->pos_delta),                                     \
 640         (UV)((data)->flags),                                         \
 641         (IV)((data)->whilem_c),                                      \
 642         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 643         is_inf ? "INF " : ""                                         \
 644     );                                                               \
 645     if ((data)->last_found)                                          \
 646         PerlIO_printf(Perl_debug_log,                                \
 647             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 648             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 649             SvPVX_const((data)->last_found),                         \
 650             (IV)((data)->last_end),                                  \
 651             (IV)((data)->last_start_min),                            \
 652             (IV)((data)->last_start_max),                            \
 653             ((data)->longest &&                                      \
 654              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 655             SvPVX_const((data)->longest_fixed),                      \
 656             (IV)((data)->offset_fixed),                              \
 657             ((data)->longest &&                                      \
 658              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 659             SvPVX_const((data)->longest_float),                      \
 660             (IV)((data)->offset_float_min),                          \
 661             (IV)((data)->offset_float_max)                           \
 662         );                                                           \
 663     PerlIO_printf(Perl_debug_log,"\n");                              \
 664 });
 665
 666 static void clear_re(pTHX_ void *r);
 667
 668 /* Mark that we cannot extend a found fixed substring at this point.
 669    Update the longest found anchored substring and the longest found
 670    floating substrings if needed. */
 671
 672 STATIC void
 673 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 674 {
 675     const STRLEN l = CHR_SVLEN(data->last_found);
 676     const STRLEN old_l = CHR_SVLEN(*data->longest);
 677     GET_RE_DEBUG_FLAGS_DECL;
 678
 679     PERL_ARGS_ASSERT_SCAN_COMMIT;
 680
 681     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 682         SvSetMagicSV(*data->longest, data->last_found);
 683         if (*data->longest == data->longest_fixed) {
 684             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 685             if (data->flags & SF_BEFORE_EOL)
 686                 data->flags
 687                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 688             else
 689                 data->flags &= ~SF_FIX_BEFORE_EOL;
 690             data->minlen_fixed=minlenp;
 691             data->lookbehind_fixed=0;
 692         }
 693         else { /* *data->longest == data->longest_float */
 694             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 695             data->offset_float_max = (l
 696                                       ? data->last_start_max
 697                                       : data->pos_min + data->pos_delta);
 698             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 699                 data->offset_float_max = I32_MAX;
 700             if (data->flags & SF_BEFORE_EOL)
 701                 data->flags
 702                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 703             else
 704                 data->flags &= ~SF_FL_BEFORE_EOL;
 705             data->minlen_float=minlenp;
 706             data->lookbehind_float=0;
 707         }
 708     }
 709     SvCUR_set(data->last_found, 0);
 710     {
 711         SV * const sv = data->last_found;
 712         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 713             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 714             if (mg)
 715                 mg->mg_len = 0;
 716         }
 717     }
 718     data->last_end = -1;
 719     data->flags &= ~SF_BEFORE_EOL;
 720     DEBUG_STUDYDATA("commit: ",data,0);
 721 }
 722
 723 /* Can match anything (initialization) */
 724 STATIC void
 725 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 726 {
 727     PERL_ARGS_ASSERT_CL_ANYTHING;
 728
 729     ANYOF_BITMAP_SETALL(cl);
 730     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 731                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL
 732                     /* Even though no bitmap is in use here, we need to set
 733                      * the flag below so an AND with a node that does have one
 734                      * doesn't lose that one.  The flag should get cleared if
 735                      * the other one doesn't; and the code in regexec.c is
 736                      * structured so this being set when not needed does no
 737                      * harm.  It seemed a little cleaner to set it here than do
 738                      * a special case in cl_and() */
 739                 |ANYOF_NONBITMAP_NON_UTF8;
 740
 741     /* If any portion of the regex is to operate under locale rules,
 742      * initialization includes it.  The reason this isn't done for all regexes
 743      * is that the optimizer was written under the assumption that locale was
 744      * all-or-nothing.  Given the complexity and lack of documentation in the
 745      * optimizer, and that there are inadequate test cases for locale, so many
 746      * parts of it may not work properly, it is safest to avoid locale unless
 747      * necessary. */
 748     if (RExC_contains_locale) {
 749         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 750         cl->flags |= ANYOF_LOCALE;
 751     }
 752     else {
 753         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 754     }
 755 }
 756
 757 /* Can match anything (initialization) */
 758 STATIC int
 759 S_cl_is_anything(const struct regnode_charclass_class *cl)
 760 {
 761     int value;
 762
 763     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 764
 765     for (value = 0; value <= ANYOF_MAX; value += 2)
 766         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 767             return 1;
 768     if (!(cl->flags & ANYOF_UNICODE_ALL))
 769         return 0;
 770     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 771         return 0;
 772     return 1;
 773 }
 774
 775 /* Can match anything (initialization) */
 776 STATIC void
 777 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 778 {
 779     PERL_ARGS_ASSERT_CL_INIT;
 780
 781     Zero(cl, 1, struct regnode_charclass_class);
 782     cl->type = ANYOF;
 783     cl_anything(pRExC_state, cl);
 784     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 785 }
 786
 787 /* These two functions currently do the exact same thing */
 788 #define cl_init_zero            S_cl_init
 789
 790 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 791  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 792  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 793 STATIC void
 794 S_cl_and(struct regnode_charclass_class *cl,
 795         const struct regnode_charclass_class *and_with)
 796 {
 797     PERL_ARGS_ASSERT_CL_AND;
 798
 799     assert(and_with->type == ANYOF);
 800
 801     /* I (khw) am not sure all these restrictions are necessary XXX */
 802     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 803         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 804         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 805         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 806         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 807         int i;
 808
 809         if (and_with->flags & ANYOF_INVERT)
 810             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 811                 cl->bitmap[i] &= ~and_with->bitmap[i];
 812         else
 813             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 814                 cl->bitmap[i] &= and_with->bitmap[i];
 815     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 816
 817     if (and_with->flags & ANYOF_INVERT) {
 818
 819         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 820          * aren't affected by the inversion.  Those that are affected are
 821          * handled individually below */
 822         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 823         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 824         cl->flags |= affected_flags;
 825
 826         /* We currently don't know how to deal with things that aren't in the
 827          * bitmap, but we know that the intersection is no greater than what
 828          * is already in cl, so let there be false positives that get sorted
 829          * out after the synthetic start class succeeds, and the node is
 830          * matched for real. */
 831
 832         /* The inversion of these two flags indicate that the resulting
 833          * intersection doesn't have them */
 834         if (and_with->flags & ANYOF_UNICODE_ALL) {
 835             cl->flags &= ~ANYOF_UNICODE_ALL;
 836         }
 837         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 838             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 839         }
 840     }
 841     else {   /* and'd node is not inverted */
 842         if (! ANYOF_NONBITMAP(and_with)) {
 843
 844             /* Here 'and_with' doesn't match anything outside the bitmap
 845              * (except possibly ANYOF_UNICODE_ALL), which means the
 846              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 847              * which case we don't know what the intersection is, but it's no
 848              * greater than what cl already has, so can just leave it alone,
 849              * with possible false positives */
 850             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 851                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 852                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 853             }
 854         }
 855         else if (! ANYOF_NONBITMAP(cl)) {
 856
 857             /* Here, 'and_with' does match something outside the bitmap, and cl
 858              * doesn't have a list of things to match outside the bitmap.  If
 859              * cl can match all code points above 255, the intersection will
 860              * be those above-255 code points that 'and_with' matches.  There
 861              * may be false positives from code points in 'and_with' that are
 862              * outside the bitmap but below 256, but those get sorted out
 863              * after the synthetic start class succeeds).  If cl can't match
 864              * all Unicode code points, it means here that it can't match *
 865              * anything outside the bitmap, so we leave the bitmap empty */
 866             if (cl->flags & ANYOF_UNICODE_ALL) {
 867                 ARG_SET(cl, ARG(and_with));
 868             }
 869         }
 870         else {
 871             /* Here, both 'and_with' and cl match something outside the
 872              * bitmap.  Currently we do not do the intersection, so just match
 873              * whatever cl had at the beginning.  */
 874         }
 875
 876
 877         /* Take the intersection of the two sets of flags */
 878         cl->flags &= and_with->flags;
 879     }
 880 }
 881
 882 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 883  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 884  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 885 STATIC void
 886 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 887 {
 888     PERL_ARGS_ASSERT_CL_OR;
 889
 890     if (or_with->flags & ANYOF_INVERT) {
 891
 892         /* Here, the or'd node is to be inverted.  This means we take the
 893          * complement of everything not in the bitmap, but currently we don't
 894          * know what that is, so give up and match anything */
 895         if (ANYOF_NONBITMAP(or_with)) {
 896             cl_anything(pRExC_state, cl);
 897         }
 898         /* We do not use
 899          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 900          *   <= (B1 | !B2) | (CL1 | !CL2)
 901          * which is wasteful if CL2 is small, but we ignore CL2:
 902          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 903          * XXXX Can we handle case-fold?  Unclear:
 904          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 905          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 906          */
 907         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 908              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 909              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 910             int i;
 911
 912             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 913                 cl->bitmap[i] |= ~or_with->bitmap[i];
 914         } /* XXXX: logic is complicated otherwise */
 915         else {
 916             cl_anything(pRExC_state, cl);
 917         }
 918
 919         /* And, we can just take the union of the flags that aren't affected
 920          * by the inversion */
 921         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 922
 923         /* For the remaining flags:
 924             ANYOF_UNICODE_ALL and inverted means to not match anything above
 925                     255, which means that the union with cl should just be
 926                     what cl has in it, so can ignore this flag
 927             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 928                     is 127-255 to match them, but then invert that, so the
 929                     union with cl should just be what cl has in it, so can
 930                     ignore this flag
 931          */
 932     } else {    /* 'or_with' is not inverted */
 933         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 934         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 935              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 936                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 937             int i;
 938
 939             /* OR char bitmap and class bitmap separately */
 940             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 941                 cl->bitmap[i] |= or_with->bitmap[i];
 942             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 943                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 944                     cl->classflags[i] |= or_with->classflags[i];
 945                 cl->flags |= ANYOF_CLASS;
 946             }
 947         }
 948         else { /* XXXX: logic is complicated, leave it along for a moment. */
 949             cl_anything(pRExC_state, cl);
 950         }
 951
 952         if (ANYOF_NONBITMAP(or_with)) {
 953
 954             /* Use the added node's outside-the-bit-map match if there isn't a
 955              * conflict.  If there is a conflict (both nodes match something
 956              * outside the bitmap, but what they match outside is not the same
 957              * pointer, and hence not easily compared until XXX we extend
 958              * inversion lists this far), give up and allow the start class to
 959              * match everything outside the bitmap.  If that stuff is all above
 960              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 961             if (! ANYOF_NONBITMAP(cl)) {
 962                 ARG_SET(cl, ARG(or_with));
 963             }
 964             else if (ARG(cl) != ARG(or_with)) {
 965
 966                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
 967                     cl_anything(pRExC_state, cl);
 968                 }
 969                 else {
 970                     cl->flags |= ANYOF_UNICODE_ALL;
 971                 }
 972             }
 973
 974         /* Take the union */
 975         cl->flags |= or_with->flags;
 976         }
 977     }
 978 }
 979
 980 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 981 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 982 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 983 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 984
 985
 986 #ifdef DEBUGGING
 987 /*
 988    dump_trie(trie,widecharmap,revcharmap)
 989    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 990    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 991
 992    These routines dump out a trie in a somewhat readable format.
 993    The _interim_ variants are used for debugging the interim
 994    tables that are used to generate the final compressed
 995    representation which is what dump_trie expects.
 996
 997    Part of the reason for their existence is to provide a form
 998    of documentation as to how the different representations function.
 999
1000 */
1001
1002 /*
1003   Dumps the final compressed table form of the trie to Perl_debug_log.
1004   Used for debugging make_trie().
1005 */
1006
1007 STATIC void
1008 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1009             AV *revcharmap, U32 depth)
1010 {
1011     U32 state;
1012     SV *sv=sv_newmortal();
1013     int colwidth= widecharmap ? 6 : 4;
1014     U16 word;
1015     GET_RE_DEBUG_FLAGS_DECL;
1016
1017     PERL_ARGS_ASSERT_DUMP_TRIE;
1018
1019     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1020         (int)depth * 2 + 2,"",
1021         "Match","Base","Ofs" );
1022
1023     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1024         SV ** const tmp = av_fetch( revcharmap, state, 0);
1025         if ( tmp ) {
1026             PerlIO_printf( Perl_debug_log, "%*s",
1027                 colwidth,
1028                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1029                             PL_colors[0], PL_colors[1],
1030                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031                             PERL_PV_ESCAPE_FIRSTCHAR
1032                 )
1033             );
1034         }
1035     }
1036     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1037         (int)depth * 2 + 2,"");
1038
1039     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1040         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1041     PerlIO_printf( Perl_debug_log, "\n");
1042
1043     for( state = 1 ; state < trie->statecount ; state++ ) {
1044         const U32 base = trie->states[ state ].trans.base;
1045
1046         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1047
1048         if ( trie->states[ state ].wordnum ) {
1049             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1050         } else {
1051             PerlIO_printf( Perl_debug_log, "%6s", "" );
1052         }
1053
1054         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1055
1056         if ( base ) {
1057             U32 ofs = 0;
1058
1059             while( ( base + ofs  < trie->uniquecharcount ) ||
1060                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1061                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1062                     ofs++;
1063
1064             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1065
1066             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1067                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1068                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1069                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1070                 {
1071                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1072                     colwidth,
1073                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1074                 } else {
1075                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1076                 }
1077             }
1078
1079             PerlIO_printf( Perl_debug_log, "]");
1080
1081         }
1082         PerlIO_printf( Perl_debug_log, "\n" );
1083     }
1084     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1085     for (word=1; word <= trie->wordcount; word++) {
1086         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1087             (int)word, (int)(trie->wordinfo[word].prev),
1088             (int)(trie->wordinfo[word].len));
1089     }
1090     PerlIO_printf(Perl_debug_log, "\n" );
1091 }
1092 /*
1093   Dumps a fully constructed but uncompressed trie in list form.
1094   List tries normally only are used for construction when the number of
1095   possible chars (trie->uniquecharcount) is very high.
1096   Used for debugging make_trie().
1097 */
1098 STATIC void
1099 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1100                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1101                          U32 depth)
1102 {
1103     U32 state;
1104     SV *sv=sv_newmortal();
1105     int colwidth= widecharmap ? 6 : 4;
1106     GET_RE_DEBUG_FLAGS_DECL;
1107
1108     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1109
1110     /* print out the table precompression.  */
1111     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1112         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1113         "------:-----+-----------------\n" );
1114
1115     for( state=1 ; state < next_alloc ; state ++ ) {
1116         U16 charid;
1117
1118         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1119             (int)depth * 2 + 2,"", (UV)state  );
1120         if ( ! trie->states[ state ].wordnum ) {
1121             PerlIO_printf( Perl_debug_log, "%5s| ","");
1122         } else {
1123             PerlIO_printf( Perl_debug_log, "W%4x| ",
1124                 trie->states[ state ].wordnum
1125             );
1126         }
1127         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1128             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1129             if ( tmp ) {
1130                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1131                     colwidth,
1132                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1133                             PL_colors[0], PL_colors[1],
1134                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1135                             PERL_PV_ESCAPE_FIRSTCHAR
1136                     ) ,
1137                     TRIE_LIST_ITEM(state,charid).forid,
1138                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1139                 );
1140                 if (!(charid % 10))
1141                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1142                         (int)((depth * 2) + 14), "");
1143             }
1144         }
1145         PerlIO_printf( Perl_debug_log, "\n");
1146     }
1147 }
1148
1149 /*
1150   Dumps a fully constructed but uncompressed trie in table form.
1151   This is the normal DFA style state transition table, with a few
1152   twists to facilitate compression later.
1153   Used for debugging make_trie().
1154 */
1155 STATIC void
1156 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1157                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1158                           U32 depth)
1159 {
1160     U32 state;
1161     U16 charid;
1162     SV *sv=sv_newmortal();
1163     int colwidth= widecharmap ? 6 : 4;
1164     GET_RE_DEBUG_FLAGS_DECL;
1165
1166     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1167
1168     /*
1169        print out the table precompression so that we can do a visual check
1170        that they are identical.
1171      */
1172
1173     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1174
1175     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1176         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1177         if ( tmp ) {
1178             PerlIO_printf( Perl_debug_log, "%*s",
1179                 colwidth,
1180                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1181                             PL_colors[0], PL_colors[1],
1182                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1183                             PERL_PV_ESCAPE_FIRSTCHAR
1184                 )
1185             );
1186         }
1187     }
1188
1189     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1190
1191     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1192         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1193     }
1194
1195     PerlIO_printf( Perl_debug_log, "\n" );
1196
1197     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1198
1199         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1200             (int)depth * 2 + 2,"",
1201             (UV)TRIE_NODENUM( state ) );
1202
1203         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1205             if (v)
1206                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1207             else
1208                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1209         }
1210         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1211             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1212         } else {
1213             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1214             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1215         }
1216     }
1217 }
1218
1219 #endif
1220
1221
1222 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1223   startbranch: the first branch in the whole branch sequence
1224   first      : start branch of sequence of branch-exact nodes.
1225                May be the same as startbranch
1226   last       : Thing following the last branch.
1227                May be the same as tail.
1228   tail       : item following the branch sequence
1229   count      : words in the sequence
1230   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1231   depth      : indent depth
1232
1233 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1234
1235 A trie is an N'ary tree where the branches are determined by digital
1236 decomposition of the key. IE, at the root node you look up the 1st character and
1237 follow that branch repeat until you find the end of the branches. Nodes can be
1238 marked as "accepting" meaning they represent a complete word. Eg:
1239
1240   /he|she|his|hers/
1241
1242 would convert into the following structure. Numbers represent states, letters
1243 following numbers represent valid transitions on the letter from that state, if
1244 the number is in square brackets it represents an accepting state, otherwise it
1245 will be in parenthesis.
1246
1247       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1248       |    |
1249       |   (2)
1250       |    |
1251      (1)   +-i->(6)-+-s->[7]
1252       |
1253       +-s->(3)-+-h->(4)-+-e->[5]
1254
1255       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1256
1257 This shows that when matching against the string 'hers' we will begin at state 1
1258 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1259 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1260 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1261 single traverse. We store a mapping from accepting to state to which word was
1262 matched, and then when we have multiple possibilities we try to complete the
1263 rest of the regex in the order in which they occured in the alternation.
1264
1265 The only prior NFA like behaviour that would be changed by the TRIE support is
1266 the silent ignoring of duplicate alternations which are of the form:
1267
1268  / (DUPE|DUPE) X? (?{ ... }) Y /x
1269
1270 Thus EVAL blocks following a trie may be called a different number of times with
1271 and without the optimisation. With the optimisations dupes will be silently
1272 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1273 the following demonstrates:
1274
1275  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1276
1277 which prints out 'word' three times, but
1278
1279  'words'=~/(word|word|word)(?{ print $1 })S/
1280
1281 which doesnt print it out at all. This is due to other optimisations kicking in.
1282
1283 Example of what happens on a structural level:
1284
1285 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1286
1287    1: CURLYM[1] {1,32767}(18)
1288    5:   BRANCH(8)
1289    6:     EXACT <ac>(16)
1290    8:   BRANCH(11)
1291    9:     EXACT <ad>(16)
1292   11:   BRANCH(14)
1293   12:     EXACT <ab>(16)
1294   16:   SUCCEED(0)
1295   17:   NOTHING(18)
1296   18: END(0)
1297
1298 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1299 and should turn into:
1300
1301    1: CURLYM[1] {1,32767}(18)
1302    5:   TRIE(16)
1303         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1304           <ac>
1305           <ad>
1306           <ab>
1307   16:   SUCCEED(0)
1308   17:   NOTHING(18)
1309   18: END(0)
1310
1311 Cases where tail != last would be like /(?foo|bar)baz/:
1312
1313    1: BRANCH(4)
1314    2:   EXACT <foo>(8)
1315    4: BRANCH(7)
1316    5:   EXACT <bar>(8)
1317    7: TAIL(8)
1318    8: EXACT <baz>(10)
1319   10: END(0)
1320
1321 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1322 and would end up looking like:
1323
1324     1: TRIE(8)
1325       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1326         <foo>
1327         <bar>
1328    7: TAIL(8)
1329    8: EXACT <baz>(10)
1330   10: END(0)
1331
1332     d = uvuni_to_utf8_flags(d, uv, 0);
1333
1334 is the recommended Unicode-aware way of saying
1335
1336     *(d++) = uv;
1337 */
1338
1339 #define TRIE_STORE_REVCHAR                                                 \
1340     STMT_START {                                                           \
1341         if (UTF) {                                                         \
1342             SV *zlopp = newSV(2);                                          \
1343             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1344             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1345             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1346             SvPOK_on(zlopp);                                               \
1347             SvUTF8_on(zlopp);                                              \
1348             av_push(revcharmap, zlopp);                                    \
1349         } else {                                                           \
1350             char ooooff = (char)uvc;                                               \
1351             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1352         }                                                                  \
1353         } STMT_END
1354
1355 #define TRIE_READ_CHAR STMT_START {                                           \
1356     wordlen++;                                                                \
1357     if ( UTF ) {                                                              \
1358         if ( folder ) {                                                       \
1359             if ( foldlen > 0 ) {                                              \
1360                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1361                foldlen -= len;                                                \
1362                scan += len;                                                   \
1363                len = 0;                                                       \
1364             } else {                                                          \
1365                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1366                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1367                 foldlen -= UNISKIP( uvc );                                    \
1368                 scan = foldbuf + UNISKIP( uvc );                              \
1369             }                                                                 \
1370         } else {                                                              \
1371             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1372         }                                                                     \
1373     } else {                                                                  \
1374         uvc = (U32)*uc;                                                       \
1375         len = 1;                                                              \
1376     }                                                                         \
1377 } STMT_END
1378
1379
1380
1381 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1382     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1383         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1384         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1385     }                                                           \
1386     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1387     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1388     TRIE_LIST_CUR( state )++;                                   \
1389 } STMT_END
1390
1391 #define TRIE_LIST_NEW(state) STMT_START {                       \
1392     Newxz( trie->states[ state ].trans.list,               \
1393         4, reg_trie_trans_le );                                 \
1394      TRIE_LIST_CUR( state ) = 1;                                \
1395      TRIE_LIST_LEN( state ) = 4;                                \
1396 } STMT_END
1397
1398 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1399     U16 dupe= trie->states[ state ].wordnum;                    \
1400     regnode * const noper_next = regnext( noper );              \
1401                                                                 \
1402     DEBUG_r({                                                   \
1403         /* store the word for dumping */                        \
1404         SV* tmp;                                                \
1405         if (OP(noper) != NOTHING)                               \
1406             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1407         else                                                    \
1408             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1409         av_push( trie_words, tmp );                             \
1410     });                                                         \
1411                                                                 \
1412     curword++;                                                  \
1413     trie->wordinfo[curword].prev   = 0;                         \
1414     trie->wordinfo[curword].len    = wordlen;                   \
1415     trie->wordinfo[curword].accept = state;                     \
1416                                                                 \
1417     if ( noper_next < tail ) {                                  \
1418         if (!trie->jump)                                        \
1419             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1420         trie->jump[curword] = (U16)(noper_next - convert);      \
1421         if (!jumper)                                            \
1422             jumper = noper_next;                                \
1423         if (!nextbranch)                                        \
1424             nextbranch= regnext(cur);                           \
1425     }                                                           \
1426                                                                 \
1427     if ( dupe ) {                                               \
1428         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1429         /* chain, so that when the bits of chain are later    */\
1430         /* linked together, the dups appear in the chain      */\
1431         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1432         trie->wordinfo[dupe].prev = curword;                    \
1433     } else {                                                    \
1434         /* we haven't inserted this word yet.                */ \
1435         trie->states[ state ].wordnum = curword;                \
1436     }                                                           \
1437 } STMT_END
1438
1439
1440 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1441      ( ( base + charid >=  ucharcount                                   \
1442          && base + charid < ubound                                      \
1443          && state == trie->trans[ base - ucharcount + charid ].check    \
1444          && trie->trans[ base - ucharcount + charid ].next )            \
1445            ? trie->trans[ base - ucharcount + charid ].next             \
1446            : ( state==1 ? special : 0 )                                 \
1447       )
1448
1449 #define MADE_TRIE       1
1450 #define MADE_JUMP_TRIE  2
1451 #define MADE_EXACT_TRIE 4
1452
1453 STATIC I32
1454 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1455 {
1456     dVAR;
1457     /* first pass, loop through and scan words */
1458     reg_trie_data *trie;
1459     HV *widecharmap = NULL;
1460     AV *revcharmap = newAV();
1461     regnode *cur;
1462     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1463     STRLEN len = 0;
1464     UV uvc = 0;
1465     U16 curword = 0;
1466     U32 next_alloc = 0;
1467     regnode *jumper = NULL;
1468     regnode *nextbranch = NULL;
1469     regnode *convert = NULL;
1470     U32 *prev_states; /* temp array mapping each state to previous one */
1471     /* we just use folder as a flag in utf8 */
1472     const U8 * folder = NULL;
1473
1474 #ifdef DEBUGGING
1475     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1476     AV *trie_words = NULL;
1477     /* along with revcharmap, this only used during construction but both are
1478      * useful during debugging so we store them in the struct when debugging.
1479      */
1480 #else
1481     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1482     STRLEN trie_charcount=0;
1483 #endif
1484     SV *re_trie_maxbuff;
1485     GET_RE_DEBUG_FLAGS_DECL;
1486
1487     PERL_ARGS_ASSERT_MAKE_TRIE;
1488 #ifndef DEBUGGING
1489     PERL_UNUSED_ARG(depth);
1490 #endif
1491
1492     switch (flags) {
1493         case EXACTFA:
1494         case EXACTFU: folder = PL_fold_latin1; break;
1495         case EXACTF:  folder = PL_fold; break;
1496         case EXACTFL: folder = PL_fold_locale; break;
1497     }
1498
1499     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1500     trie->refcount = 1;
1501     trie->startstate = 1;
1502     trie->wordcount = word_count;
1503     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1504     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1505     if (!(UTF && folder))
1506         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1507     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1508                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1509
1510     DEBUG_r({
1511         trie_words = newAV();
1512     });
1513
1514     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1515     if (!SvIOK(re_trie_maxbuff)) {
1516         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1517     }
1518     DEBUG_OPTIMISE_r({
1519                 PerlIO_printf( Perl_debug_log,
1520                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1521                   (int)depth * 2 + 2, "",
1522                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1523                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1524                   (int)depth);
1525     });
1526
1527    /* Find the node we are going to overwrite */
1528     if ( first == startbranch && OP( last ) != BRANCH ) {
1529         /* whole branch chain */
1530         convert = first;
1531     } else {
1532         /* branch sub-chain */
1533         convert = NEXTOPER( first );
1534     }
1535
1536     /*  -- First loop and Setup --
1537
1538        We first traverse the branches and scan each word to determine if it
1539        contains widechars, and how many unique chars there are, this is
1540        important as we have to build a table with at least as many columns as we
1541        have unique chars.
1542
1543        We use an array of integers to represent the character codes 0..255
1544        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1545        native representation of the character value as the key and IV's for the
1546        coded index.
1547
1548        *TODO* If we keep track of how many times each character is used we can
1549        remap the columns so that the table compression later on is more
1550        efficient in terms of memory by ensuring the most common value is in the
1551        middle and the least common are on the outside.  IMO this would be better
1552        than a most to least common mapping as theres a decent chance the most
1553        common letter will share a node with the least common, meaning the node
1554        will not be compressible. With a middle is most common approach the worst
1555        case is when we have the least common nodes twice.
1556
1557      */
1558
1559     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1560         regnode * const noper = NEXTOPER( cur );
1561         const U8 *uc = (U8*)STRING( noper );
1562         const U8 * const e  = uc + STR_LEN( noper );
1563         STRLEN foldlen = 0;
1564         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1565         const U8 *scan = (U8*)NULL;
1566         U32 wordlen      = 0;         /* required init */
1567         STRLEN chars = 0;
1568         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1569
1570         if (OP(noper) == NOTHING) {
1571             trie->minlen= 0;
1572             continue;
1573         }
1574         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1575             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1576                                           regardless of encoding */
1577
1578         for ( ; uc < e ; uc += len ) {
1579             TRIE_CHARCOUNT(trie)++;
1580             TRIE_READ_CHAR;
1581             chars++;
1582             if ( uvc < 256 ) {
1583                 if ( !trie->charmap[ uvc ] ) {
1584                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1585                     if ( folder )
1586                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1587                     TRIE_STORE_REVCHAR;
1588                 }
1589                 if ( set_bit ) {
1590                     /* store the codepoint in the bitmap, and its folded
1591                      * equivalent. */
1592                     TRIE_BITMAP_SET(trie,uvc);
1593
1594                     /* store the folded codepoint */
1595                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1596
1597                     if ( !UTF ) {
1598                         /* store first byte of utf8 representation of
1599                            variant codepoints */
1600                         if (! UNI_IS_INVARIANT(uvc)) {
1601                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1602                         }
1603                     }
1604                     set_bit = 0; /* We've done our bit :-) */
1605                 }
1606             } else {
1607                 SV** svpp;
1608                 if ( !widecharmap )
1609                     widecharmap = newHV();
1610
1611                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1612
1613                 if ( !svpp )
1614                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1615
1616                 if ( !SvTRUE( *svpp ) ) {
1617                     sv_setiv( *svpp, ++trie->uniquecharcount );
1618                     TRIE_STORE_REVCHAR;
1619                 }
1620             }
1621         }
1622         if( cur == first ) {
1623             trie->minlen=chars;
1624             trie->maxlen=chars;
1625         } else if (chars < trie->minlen) {
1626             trie->minlen=chars;
1627         } else if (chars > trie->maxlen) {
1628             trie->maxlen=chars;
1629         }
1630
1631     } /* end first pass */
1632     DEBUG_TRIE_COMPILE_r(
1633         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1634                 (int)depth * 2 + 2,"",
1635                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1636                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1637                 (int)trie->minlen, (int)trie->maxlen )
1638     );
1639
1640     /*
1641         We now know what we are dealing with in terms of unique chars and
1642         string sizes so we can calculate how much memory a naive
1643         representation using a flat table  will take. If it's over a reasonable
1644         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1645         conservative but potentially much slower representation using an array
1646         of lists.
1647
1648         At the end we convert both representations into the same compressed
1649         form that will be used in regexec.c for matching with. The latter
1650         is a form that cannot be used to construct with but has memory
1651         properties similar to the list form and access properties similar
1652         to the table form making it both suitable for fast searches and
1653         small enough that its feasable to store for the duration of a program.
1654
1655         See the comment in the code where the compressed table is produced
1656         inplace from the flat tabe representation for an explanation of how
1657         the compression works.
1658
1659     */
1660
1661
1662     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1663     prev_states[1] = 0;
1664
1665     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1666         /*
1667             Second Pass -- Array Of Lists Representation
1668
1669             Each state will be represented by a list of charid:state records
1670             (reg_trie_trans_le) the first such element holds the CUR and LEN
1671             points of the allocated array. (See defines above).
1672
1673             We build the initial structure using the lists, and then convert
1674             it into the compressed table form which allows faster lookups
1675             (but cant be modified once converted).
1676         */
1677
1678         STRLEN transcount = 1;
1679
1680         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1681             "%*sCompiling trie using list compiler\n",
1682             (int)depth * 2 + 2, ""));
1683
1684         trie->states = (reg_trie_state *)
1685             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1686                                   sizeof(reg_trie_state) );
1687         TRIE_LIST_NEW(1);
1688         next_alloc = 2;
1689
1690         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1691
1692             regnode * const noper = NEXTOPER( cur );
1693             U8 *uc           = (U8*)STRING( noper );
1694             const U8 * const e = uc + STR_LEN( noper );
1695             U32 state        = 1;         /* required init */
1696             U16 charid       = 0;         /* sanity init */
1697             U8 *scan         = (U8*)NULL; /* sanity init */
1698             STRLEN foldlen   = 0;         /* required init */
1699             U32 wordlen      = 0;         /* required init */
1700             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1701
1702             if (OP(noper) != NOTHING) {
1703                 for ( ; uc < e ; uc += len ) {
1704
1705                     TRIE_READ_CHAR;
1706
1707                     if ( uvc < 256 ) {
1708                         charid = trie->charmap[ uvc ];
1709                     } else {
1710                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1711                         if ( !svpp ) {
1712                             charid = 0;
1713                         } else {
1714                             charid=(U16)SvIV( *svpp );
1715                         }
1716                     }
1717                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1718                     if ( charid ) {
1719
1720                         U16 check;
1721                         U32 newstate = 0;
1722
1723                         charid--;
1724                         if ( !trie->states[ state ].trans.list ) {
1725                             TRIE_LIST_NEW( state );
1726                         }
1727                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1728                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1729                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1730                                 break;
1731                             }
1732                         }
1733                         if ( ! newstate ) {
1734                             newstate = next_alloc++;
1735                             prev_states[newstate] = state;
1736                             TRIE_LIST_PUSH( state, charid, newstate );
1737                             transcount++;
1738                         }
1739                         state = newstate;
1740                     } else {
1741                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1742                     }
1743                 }
1744             }
1745             TRIE_HANDLE_WORD(state);
1746
1747         } /* end second pass */
1748
1749         /* next alloc is the NEXT state to be allocated */
1750         trie->statecount = next_alloc;
1751         trie->states = (reg_trie_state *)
1752             PerlMemShared_realloc( trie->states,
1753                                    next_alloc
1754                                    * sizeof(reg_trie_state) );
1755
1756         /* and now dump it out before we compress it */
1757         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1758                                                          revcharmap, next_alloc,
1759                                                          depth+1)
1760         );
1761
1762         trie->trans = (reg_trie_trans *)
1763             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1764         {
1765             U32 state;
1766             U32 tp = 0;
1767             U32 zp = 0;
1768
1769
1770             for( state=1 ; state < next_alloc ; state ++ ) {
1771                 U32 base=0;
1772
1773                 /*
1774                 DEBUG_TRIE_COMPILE_MORE_r(
1775                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1776                 );
1777                 */
1778
1779                 if (trie->states[state].trans.list) {
1780                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1781                     U16 maxid=minid;
1782                     U16 idx;
1783
1784                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1785                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1786                         if ( forid < minid ) {
1787                             minid=forid;
1788                         } else if ( forid > maxid ) {
1789                             maxid=forid;
1790                         }
1791                     }
1792                     if ( transcount < tp + maxid - minid + 1) {
1793                         transcount *= 2;
1794                         trie->trans = (reg_trie_trans *)
1795                             PerlMemShared_realloc( trie->trans,
1796                                                      transcount
1797                                                      * sizeof(reg_trie_trans) );
1798                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1799                     }
1800                     base = trie->uniquecharcount + tp - minid;
1801                     if ( maxid == minid ) {
1802                         U32 set = 0;
1803                         for ( ; zp < tp ; zp++ ) {
1804                             if ( ! trie->trans[ zp ].next ) {
1805                                 base = trie->uniquecharcount + zp - minid;
1806                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1807                                 trie->trans[ zp ].check = state;
1808                                 set = 1;
1809                                 break;
1810                             }
1811                         }
1812                         if ( !set ) {
1813                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1814                             trie->trans[ tp ].check = state;
1815                             tp++;
1816                             zp = tp;
1817                         }
1818                     } else {
1819                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1820                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1821                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1822                             trie->trans[ tid ].check = state;
1823                         }
1824                         tp += ( maxid - minid + 1 );
1825                     }
1826                     Safefree(trie->states[ state ].trans.list);
1827                 }
1828                 /*
1829                 DEBUG_TRIE_COMPILE_MORE_r(
1830                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1831                 );
1832                 */
1833                 trie->states[ state ].trans.base=base;
1834             }
1835             trie->lasttrans = tp + 1;
1836         }
1837     } else {
1838         /*
1839            Second Pass -- Flat Table Representation.
1840
1841            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1842            We know that we will need Charcount+1 trans at most to store the data
1843            (one row per char at worst case) So we preallocate both structures
1844            assuming worst case.
1845
1846            We then construct the trie using only the .next slots of the entry
1847            structs.
1848
1849            We use the .check field of the first entry of the node temporarily to
1850            make compression both faster and easier by keeping track of how many non
1851            zero fields are in the node.
1852
1853            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1854            transition.
1855
1856            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1857            number representing the first entry of the node, and state as a
1858            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1859            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1860            are 2 entrys per node. eg:
1861
1862              A B       A B
1863           1. 2 4    1. 3 7
1864           2. 0 3    3. 0 5
1865           3. 0 0    5. 0 0
1866           4. 0 0    7. 0 0
1867
1868            The table is internally in the right hand, idx form. However as we also
1869            have to deal with the states array which is indexed by nodenum we have to
1870            use TRIE_NODENUM() to convert.
1871
1872         */
1873         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1874             "%*sCompiling trie using table compiler\n",
1875             (int)depth * 2 + 2, ""));
1876
1877         trie->trans = (reg_trie_trans *)
1878             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1879                                   * trie->uniquecharcount + 1,
1880                                   sizeof(reg_trie_trans) );
1881         trie->states = (reg_trie_state *)
1882             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1883                                   sizeof(reg_trie_state) );
1884         next_alloc = trie->uniquecharcount + 1;
1885
1886
1887         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1888
1889             regnode * const noper   = NEXTOPER( cur );
1890             const U8 *uc     = (U8*)STRING( noper );
1891             const U8 * const e = uc + STR_LEN( noper );
1892
1893             U32 state        = 1;         /* required init */
1894
1895             U16 charid       = 0;         /* sanity init */
1896             U32 accept_state = 0;         /* sanity init */
1897             U8 *scan         = (U8*)NULL; /* sanity init */
1898
1899             STRLEN foldlen   = 0;         /* required init */
1900             U32 wordlen      = 0;         /* required init */
1901             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1902
1903             if ( OP(noper) != NOTHING ) {
1904                 for ( ; uc < e ; uc += len ) {
1905
1906                     TRIE_READ_CHAR;
1907
1908                     if ( uvc < 256 ) {
1909                         charid = trie->charmap[ uvc ];
1910                     } else {
1911                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1912                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1913                     }
1914                     if ( charid ) {
1915                         charid--;
1916                         if ( !trie->trans[ state + charid ].next ) {
1917                             trie->trans[ state + charid ].next = next_alloc;
1918                             trie->trans[ state ].check++;
1919                             prev_states[TRIE_NODENUM(next_alloc)]
1920                                     = TRIE_NODENUM(state);
1921                             next_alloc += trie->uniquecharcount;
1922                         }
1923                         state = trie->trans[ state + charid ].next;
1924                     } else {
1925                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1926                     }
1927                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1928                 }
1929             }
1930             accept_state = TRIE_NODENUM( state );
1931             TRIE_HANDLE_WORD(accept_state);
1932
1933         } /* end second pass */
1934
1935         /* and now dump it out before we compress it */
1936         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1937                                                           revcharmap,
1938                                                           next_alloc, depth+1));
1939
1940         {
1941         /*
1942            * Inplace compress the table.*
1943
1944            For sparse data sets the table constructed by the trie algorithm will
1945            be mostly 0/FAIL transitions or to put it another way mostly empty.
1946            (Note that leaf nodes will not contain any transitions.)
1947
1948            This algorithm compresses the tables by eliminating most such
1949            transitions, at the cost of a modest bit of extra work during lookup:
1950
1951            - Each states[] entry contains a .base field which indicates the
1952            index in the state[] array wheres its transition data is stored.
1953
1954            - If .base is 0 there are no valid transitions from that node.
1955
1956            - If .base is nonzero then charid is added to it to find an entry in
1957            the trans array.
1958
1959            -If trans[states[state].base+charid].check!=state then the
1960            transition is taken to be a 0/Fail transition. Thus if there are fail
1961            transitions at the front of the node then the .base offset will point
1962            somewhere inside the previous nodes data (or maybe even into a node
1963            even earlier), but the .check field determines if the transition is
1964            valid.
1965
1966            XXX - wrong maybe?
1967            The following process inplace converts the table to the compressed
1968            table: We first do not compress the root node 1,and mark all its
1969            .check pointers as 1 and set its .base pointer as 1 as well. This
1970            allows us to do a DFA construction from the compressed table later,
1971            and ensures that any .base pointers we calculate later are greater
1972            than 0.
1973
1974            - We set 'pos' to indicate the first entry of the second node.
1975
1976            - We then iterate over the columns of the node, finding the first and
1977            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1978            and set the .check pointers accordingly, and advance pos
1979            appropriately and repreat for the next node. Note that when we copy
1980            the next pointers we have to convert them from the original
1981            NODEIDX form to NODENUM form as the former is not valid post
1982            compression.
1983
1984            - If a node has no transitions used we mark its base as 0 and do not
1985            advance the pos pointer.
1986
1987            - If a node only has one transition we use a second pointer into the
1988            structure to fill in allocated fail transitions from other states.
1989            This pointer is independent of the main pointer and scans forward
1990            looking for null transitions that are allocated to a state. When it
1991            finds one it writes the single transition into the "hole".  If the
1992            pointer doesnt find one the single transition is appended as normal.
1993
1994            - Once compressed we can Renew/realloc the structures to release the
1995            excess space.
1996
1997            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1998            specifically Fig 3.47 and the associated pseudocode.
1999
2000            demq
2001         */
2002         const U32 laststate = TRIE_NODENUM( next_alloc );
2003         U32 state, charid;
2004         U32 pos = 0, zp=0;
2005         trie->statecount = laststate;
2006
2007         for ( state = 1 ; state < laststate ; state++ ) {
2008             U8 flag = 0;
2009             const U32 stateidx = TRIE_NODEIDX( state );
2010             const U32 o_used = trie->trans[ stateidx ].check;
2011             U32 used = trie->trans[ stateidx ].check;
2012             trie->trans[ stateidx ].check = 0;
2013
2014             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2015                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2016                     if ( trie->trans[ stateidx + charid ].next ) {
2017                         if (o_used == 1) {
2018                             for ( ; zp < pos ; zp++ ) {
2019                                 if ( ! trie->trans[ zp ].next ) {
2020                                     break;
2021                                 }
2022                             }
2023                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2024                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2025                             trie->trans[ zp ].check = state;
2026                             if ( ++zp > pos ) pos = zp;
2027                             break;
2028                         }
2029                         used--;
2030                     }
2031                     if ( !flag ) {
2032                         flag = 1;
2033                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2034                     }
2035                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2036                     trie->trans[ pos ].check = state;
2037                     pos++;
2038                 }
2039             }
2040         }
2041         trie->lasttrans = pos + 1;
2042         trie->states = (reg_trie_state *)
2043             PerlMemShared_realloc( trie->states, laststate
2044                                    * sizeof(reg_trie_state) );
2045         DEBUG_TRIE_COMPILE_MORE_r(
2046                 PerlIO_printf( Perl_debug_log,
2047                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2048                     (int)depth * 2 + 2,"",
2049                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2050                     (IV)next_alloc,
2051                     (IV)pos,
2052                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2053             );
2054
2055         } /* end table compress */
2056     }
2057     DEBUG_TRIE_COMPILE_MORE_r(
2058             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2059                 (int)depth * 2 + 2, "",
2060                 (UV)trie->statecount,
2061                 (UV)trie->lasttrans)
2062     );
2063     /* resize the trans array to remove unused space */
2064     trie->trans = (reg_trie_trans *)
2065         PerlMemShared_realloc( trie->trans, trie->lasttrans
2066                                * sizeof(reg_trie_trans) );
2067
2068     {   /* Modify the program and insert the new TRIE node */
2069         U8 nodetype =(U8)(flags & 0xFF);
2070         char *str=NULL;
2071
2072 #ifdef DEBUGGING
2073         regnode *optimize = NULL;
2074 #ifdef RE_TRACK_PATTERN_OFFSETS
2075
2076         U32 mjd_offset = 0;
2077         U32 mjd_nodelen = 0;
2078 #endif /* RE_TRACK_PATTERN_OFFSETS */
2079 #endif /* DEBUGGING */
2080         /*
2081            This means we convert either the first branch or the first Exact,
2082            depending on whether the thing following (in 'last') is a branch
2083            or not and whther first is the startbranch (ie is it a sub part of
2084            the alternation or is it the whole thing.)
2085            Assuming its a sub part we convert the EXACT otherwise we convert
2086            the whole branch sequence, including the first.
2087          */
2088         /* Find the node we are going to overwrite */
2089         if ( first != startbranch || OP( last ) == BRANCH ) {
2090             /* branch sub-chain */
2091             NEXT_OFF( first ) = (U16)(last - first);
2092 #ifdef RE_TRACK_PATTERN_OFFSETS
2093             DEBUG_r({
2094                 mjd_offset= Node_Offset((convert));
2095                 mjd_nodelen= Node_Length((convert));
2096             });
2097 #endif
2098             /* whole branch chain */
2099         }
2100 #ifdef RE_TRACK_PATTERN_OFFSETS
2101         else {
2102             DEBUG_r({
2103                 const  regnode *nop = NEXTOPER( convert );
2104                 mjd_offset= Node_Offset((nop));
2105                 mjd_nodelen= Node_Length((nop));
2106             });
2107         }
2108         DEBUG_OPTIMISE_r(
2109             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2110                 (int)depth * 2 + 2, "",
2111                 (UV)mjd_offset, (UV)mjd_nodelen)
2112         );
2113 #endif
2114         /* But first we check to see if there is a common prefix we can
2115            split out as an EXACT and put in front of the TRIE node.  */
2116         trie->startstate= 1;
2117         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2118             U32 state;
2119             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2120                 U32 ofs = 0;
2121                 I32 idx = -1;
2122                 U32 count = 0;
2123                 const U32 base = trie->states[ state ].trans.base;
2124
2125                 if ( trie->states[state].wordnum )
2126                         count = 1;
2127
2128                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2129                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2130                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2131                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2132                     {
2133                         if ( ++count > 1 ) {
2134                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2135                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2136                             if ( state == 1 ) break;
2137                             if ( count == 2 ) {
2138                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2139                                 DEBUG_OPTIMISE_r(
2140                                     PerlIO_printf(Perl_debug_log,
2141                                         "%*sNew Start State=%"UVuf" Class: [",
2142                                         (int)depth * 2 + 2, "",
2143                                         (UV)state));
2144                                 if (idx >= 0) {
2145                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2146                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2147
2148                                     TRIE_BITMAP_SET(trie,*ch);
2149                                     if ( folder )
2150                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2151                                     DEBUG_OPTIMISE_r(
2152                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2153                                     );
2154                                 }
2155                             }
2156                             TRIE_BITMAP_SET(trie,*ch);
2157                             if ( folder )
2158                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2159                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2160                         }
2161                         idx = ofs;
2162                     }
2163                 }
2164                 if ( count == 1 ) {
2165                     SV **tmp = av_fetch( revcharmap, idx, 0);
2166                     STRLEN len;
2167                     char *ch = SvPV( *tmp, len );
2168                     DEBUG_OPTIMISE_r({
2169                         SV *sv=sv_newmortal();
2170                         PerlIO_printf( Perl_debug_log,
2171                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2172                             (int)depth * 2 + 2, "",
2173                             (UV)state, (UV)idx,
2174                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2175                                 PL_colors[0], PL_colors[1],
2176                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2177                                 PERL_PV_ESCAPE_FIRSTCHAR
2178                             )
2179                         );
2180                     });
2181                     if ( state==1 ) {
2182                         OP( convert ) = nodetype;
2183                         str=STRING(convert);
2184                         STR_LEN(convert)=0;
2185                     }
2186                     STR_LEN(convert) += len;
2187                     while (len--)
2188                         *str++ = *ch++;
2189                 } else {
2190 #ifdef DEBUGGING
2191                     if (state>1)
2192                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2193 #endif
2194                     break;
2195                 }
2196             }
2197             trie->prefixlen = (state-1);
2198             if (str) {
2199                 regnode *n = convert+NODE_SZ_STR(convert);
2200                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2201                 trie->startstate = state;
2202                 trie->minlen -= (state - 1);
2203                 trie->maxlen -= (state - 1);
2204 #ifdef DEBUGGING
2205                /* At least the UNICOS C compiler choked on this
2206                 * being argument to DEBUG_r(), so let's just have
2207                 * it right here. */
2208                if (
2209 #ifdef PERL_EXT_RE_BUILD
2210                    1
2211 #else
2212                    DEBUG_r_TEST
2213 #endif
2214                    ) {
2215                    regnode *fix = convert;
2216                    U32 word = trie->wordcount;
2217                    mjd_nodelen++;
2218                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2219                    while( ++fix < n ) {
2220                        Set_Node_Offset_Length(fix, 0, 0);
2221                    }
2222                    while (word--) {
2223                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2224                        if (tmp) {
2225                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2226                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2227                            else
2228                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2229                        }
2230                    }
2231                }
2232 #endif
2233                 if (trie->maxlen) {
2234                     convert = n;
2235                 } else {
2236                     NEXT_OFF(convert) = (U16)(tail - convert);
2237                     DEBUG_r(optimize= n);
2238                 }
2239             }
2240         }
2241         if (!jumper)
2242             jumper = last;
2243         if ( trie->maxlen ) {
2244             NEXT_OFF( convert ) = (U16)(tail - convert);
2245             ARG_SET( convert, data_slot );
2246             /* Store the offset to the first unabsorbed branch in
2247                jump[0], which is otherwise unused by the jump logic.
2248                We use this when dumping a trie and during optimisation. */
2249             if (trie->jump)
2250                 trie->jump[0] = (U16)(nextbranch - convert);
2251
2252             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2253              *   and there is a bitmap
2254              *   and the first "jump target" node we found leaves enough room
2255              * then convert the TRIE node into a TRIEC node, with the bitmap
2256              * embedded inline in the opcode - this is hypothetically faster.
2257              */
2258             if ( !trie->states[trie->startstate].wordnum
2259                  && trie->bitmap
2260                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2261             {
2262                 OP( convert ) = TRIEC;
2263                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2264                 PerlMemShared_free(trie->bitmap);
2265                 trie->bitmap= NULL;
2266             } else
2267                 OP( convert ) = TRIE;
2268
2269             /* store the type in the flags */
2270             convert->flags = nodetype;
2271             DEBUG_r({
2272             optimize = convert
2273                       + NODE_STEP_REGNODE
2274                       + regarglen[ OP( convert ) ];
2275             });
2276             /* XXX We really should free up the resource in trie now,
2277                    as we won't use them - (which resources?) dmq */
2278         }
2279         /* needed for dumping*/
2280         DEBUG_r(if (optimize) {
2281             regnode *opt = convert;
2282
2283             while ( ++opt < optimize) {
2284                 Set_Node_Offset_Length(opt,0,0);
2285             }
2286             /*
2287                 Try to clean up some of the debris left after the
2288                 optimisation.
2289              */
2290             while( optimize < jumper ) {
2291                 mjd_nodelen += Node_Length((optimize));
2292                 OP( optimize ) = OPTIMIZED;
2293                 Set_Node_Offset_Length(optimize,0,0);
2294                 optimize++;
2295             }
2296             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2297         });
2298     } /* end node insert */
2299
2300     /*  Finish populating the prev field of the wordinfo array.  Walk back
2301      *  from each accept state until we find another accept state, and if
2302      *  so, point the first word's .prev field at the second word. If the
2303      *  second already has a .prev field set, stop now. This will be the
2304      *  case either if we've already processed that word's accept state,
2305      *  or that state had multiple words, and the overspill words were
2306      *  already linked up earlier.
2307      */
2308     {
2309         U16 word;
2310         U32 state;
2311         U16 prev;
2312
2313         for (word=1; word <= trie->wordcount; word++) {
2314             prev = 0;
2315             if (trie->wordinfo[word].prev)
2316                 continue;
2317             state = trie->wordinfo[word].accept;
2318             while (state) {
2319                 state = prev_states[state];
2320                 if (!state)
2321                     break;
2322                 prev = trie->states[state].wordnum;
2323                 if (prev)
2324                     break;
2325             }
2326             trie->wordinfo[word].prev = prev;
2327         }
2328         Safefree(prev_states);
2329     }
2330
2331
2332     /* and now dump out the compressed format */
2333     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2334
2335     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2336 #ifdef DEBUGGING
2337     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2338     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2339 #else
2340     SvREFCNT_dec(revcharmap);
2341 #endif
2342     return trie->jump
2343            ? MADE_JUMP_TRIE
2344            : trie->startstate>1
2345              ? MADE_EXACT_TRIE
2346              : MADE_TRIE;
2347 }
2348
2349 STATIC void
2350 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2351 {
2352 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2353
2354    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2355    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2356    ISBN 0-201-10088-6
2357
2358    We find the fail state for each state in the trie, this state is the longest proper
2359    suffix of the current state's 'word' that is also a proper prefix of another word in our
2360    trie. State 1 represents the word '' and is thus the default fail state. This allows
2361    the DFA not to have to restart after its tried and failed a word at a given point, it
2362    simply continues as though it had been matching the other word in the first place.
2363    Consider
2364       'abcdgu'=~/abcdefg|cdgu/
2365    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2366    fail, which would bring us to the state representing 'd' in the second word where we would
2367    try 'g' and succeed, proceeding to match 'cdgu'.
2368  */
2369  /* add a fail transition */
2370     const U32 trie_offset = ARG(source);
2371     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2372     U32 *q;
2373     const U32 ucharcount = trie->uniquecharcount;
2374     const U32 numstates = trie->statecount;
2375     const U32 ubound = trie->lasttrans + ucharcount;
2376     U32 q_read = 0;
2377     U32 q_write = 0;
2378     U32 charid;
2379     U32 base = trie->states[ 1 ].trans.base;
2380     U32 *fail;
2381     reg_ac_data *aho;
2382     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2383     GET_RE_DEBUG_FLAGS_DECL;
2384
2385     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2386 #ifndef DEBUGGING
2387     PERL_UNUSED_ARG(depth);
2388 #endif
2389
2390
2391     ARG_SET( stclass, data_slot );
2392     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2393     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2394     aho->trie=trie_offset;
2395     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2396     Copy( trie->states, aho->states, numstates, reg_trie_state );
2397     Newxz( q, numstates, U32);
2398     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2399     aho->refcount = 1;
2400     fail = aho->fail;
2401     /* initialize fail[0..1] to be 1 so that we always have
2402        a valid final fail state */
2403     fail[ 0 ] = fail[ 1 ] = 1;
2404
2405     for ( charid = 0; charid < ucharcount ; charid++ ) {
2406         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2407         if ( newstate ) {
2408             q[ q_write ] = newstate;
2409             /* set to point at the root */
2410             fail[ q[ q_write++ ] ]=1;
2411         }
2412     }
2413     while ( q_read < q_write) {
2414         const U32 cur = q[ q_read++ % numstates ];
2415         base = trie->states[ cur ].trans.base;
2416
2417         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2418             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2419             if (ch_state) {
2420                 U32 fail_state = cur;
2421                 U32 fail_base;
2422                 do {
2423                     fail_state = fail[ fail_state ];
2424                     fail_base = aho->states[ fail_state ].trans.base;
2425                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2426
2427                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2428                 fail[ ch_state ] = fail_state;
2429                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2430                 {
2431                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2432                 }
2433                 q[ q_write++ % numstates] = ch_state;
2434             }
2435         }
2436     }
2437     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2438        when we fail in state 1, this allows us to use the
2439        charclass scan to find a valid start char. This is based on the principle
2440        that theres a good chance the string being searched contains lots of stuff
2441        that cant be a start char.
2442      */
2443     fail[ 0 ] = fail[ 1 ] = 0;
2444     DEBUG_TRIE_COMPILE_r({
2445         PerlIO_printf(Perl_debug_log,
2446                       "%*sStclass Failtable (%"UVuf" states): 0",
2447                       (int)(depth * 2), "", (UV)numstates
2448         );
2449         for( q_read=1; q_read<numstates; q_read++ ) {
2450             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2451         }
2452         PerlIO_printf(Perl_debug_log, "\n");
2453     });
2454     Safefree(q);
2455     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2456 }
2457
2458
2459 /*
2460  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2461  * These need to be revisited when a newer toolchain becomes available.
2462  */
2463 #if defined(__sparc64__) && defined(__GNUC__)
2464 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2465 #       undef  SPARC64_GCC_WORKAROUND
2466 #       define SPARC64_GCC_WORKAROUND 1
2467 #   endif
2468 #endif
2469
2470 #define DEBUG_PEEP(str,scan,depth) \
2471     DEBUG_OPTIMISE_r({if (scan){ \
2472        SV * const mysv=sv_newmortal(); \
2473        regnode *Next = regnext(scan); \
2474        regprop(RExC_rx, mysv, scan); \
2475        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2476        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2477        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2478    }});
2479
2480
2481
2482
2483
2484 #define JOIN_EXACT(scan,min,flags) \
2485     if (PL_regkind[OP(scan)] == EXACT) \
2486         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2487
2488 STATIC U32
2489 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2490     /* Merge several consecutive EXACTish nodes into one. */
2491     regnode *n = regnext(scan);
2492     U32 stringok = 1;
2493     regnode *next = scan + NODE_SZ_STR(scan);
2494     U32 merged = 0;
2495     U32 stopnow = 0;
2496 #ifdef DEBUGGING
2497     regnode *stop = scan;
2498     GET_RE_DEBUG_FLAGS_DECL;
2499 #else
2500     PERL_UNUSED_ARG(depth);
2501 #endif
2502
2503     PERL_ARGS_ASSERT_JOIN_EXACT;
2504 #ifndef EXPERIMENTAL_INPLACESCAN
2505     PERL_UNUSED_ARG(flags);
2506     PERL_UNUSED_ARG(val);
2507 #endif
2508     DEBUG_PEEP("join",scan,depth);
2509
2510     /* Skip NOTHING, merge EXACT*. */
2511     while (n &&
2512            ( PL_regkind[OP(n)] == NOTHING ||
2513              (stringok && (OP(n) == OP(scan))))
2514            && NEXT_OFF(n)
2515            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2516
2517         if (OP(n) == TAIL || n > next)
2518             stringok = 0;
2519         if (PL_regkind[OP(n)] == NOTHING) {
2520             DEBUG_PEEP("skip:",n,depth);
2521             NEXT_OFF(scan) += NEXT_OFF(n);
2522             next = n + NODE_STEP_REGNODE;
2523 #ifdef DEBUGGING
2524             if (stringok)
2525                 stop = n;
2526 #endif
2527             n = regnext(n);
2528         }
2529         else if (stringok) {
2530             const unsigned int oldl = STR_LEN(scan);
2531             regnode * const nnext = regnext(n);
2532
2533             DEBUG_PEEP("merg",n,depth);
2534
2535             merged++;
2536             if (oldl + STR_LEN(n) > U8_MAX)
2537                 break;
2538             NEXT_OFF(scan) += NEXT_OFF(n);
2539             STR_LEN(scan) += STR_LEN(n);
2540             next = n + NODE_SZ_STR(n);
2541             /* Now we can overwrite *n : */
2542             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2543 #ifdef DEBUGGING
2544             stop = next - 1;
2545 #endif
2546             n = nnext;
2547             if (stopnow) break;
2548         }
2549
2550 #ifdef EXPERIMENTAL_INPLACESCAN
2551         if (flags && !NEXT_OFF(n)) {
2552             DEBUG_PEEP("atch", val, depth);
2553             if (reg_off_by_arg[OP(n)]) {
2554                 ARG_SET(n, val - n);
2555             }
2556             else {
2557                 NEXT_OFF(n) = val - n;
2558             }
2559             stopnow = 1;
2560         }
2561 #endif
2562     }
2563 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2564 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2565 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2566 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2567
2568     if (UTF
2569         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2570         && ( STR_LEN(scan) >= 6 ) )
2571     {
2572     /*
2573     Two problematic code points in Unicode casefolding of EXACT nodes:
2574
2575     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2576     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2577
2578     which casefold to
2579
2580     Unicode                      UTF-8
2581
2582     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2583     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2584
2585     This means that in case-insensitive matching (or "loose matching",
2586     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2587     length of the above casefolded versions) can match a target string
2588     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2589     This would rather mess up the minimum length computation.
2590
2591     What we'll do is to look for the tail four bytes, and then peek
2592     at the preceding two bytes to see whether we need to decrease
2593     the minimum length by four (six minus two).
2594
2595     Thanks to the design of UTF-8, there cannot be false matches:
2596     A sequence of valid UTF-8 bytes cannot be a subsequence of
2597     another valid sequence of UTF-8 bytes.
2598
2599     */
2600          char * const s0 = STRING(scan), *s, *t;
2601          char * const s1 = s0 + STR_LEN(scan) - 1;
2602          char * const s2 = s1 - 4;
2603 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2604          const char t0[] = "\xaf\x49\xaf\x42";
2605 #else
2606          const char t0[] = "\xcc\x88\xcc\x81";
2607 #endif
2608          const char * const t1 = t0 + 3;
2609
2610          for (s = s0 + 2;
2611               s < s2 && (t = ninstr(s, s1, t0, t1));
2612               s = t + 4) {
2613 #ifdef EBCDIC
2614               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2615                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2616 #else
2617               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2618                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2619 #endif
2620                    *min -= 4;
2621          }
2622     }
2623
2624 #ifdef DEBUGGING
2625     /* Allow dumping */
2626     n = scan + NODE_SZ_STR(scan);
2627     while (n <= stop) {
2628         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2629             OP(n) = OPTIMIZED;
2630             NEXT_OFF(n) = 0;
2631         }
2632         n++;
2633     }
2634 #endif
2635     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2636     return stopnow;
2637 }
2638
2639 /* REx optimizer.  Converts nodes into quicker variants "in place".
2640    Finds fixed substrings.  */
2641
2642 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2643    to the position after last scanned or to NULL. */
2644
2645 #define INIT_AND_WITHP \
2646     assert(!and_withp); \
2647     Newx(and_withp,1,struct regnode_charclass_class); \
2648     SAVEFREEPV(and_withp)
2649
2650 /* this is a chain of data about sub patterns we are processing that
2651    need to be handled separately/specially in study_chunk. Its so
2652    we can simulate recursion without losing state.  */
2653 struct scan_frame;
2654 typedef struct scan_frame {
2655     regnode *last;  /* last node to process in this frame */
2656     regnode *next;  /* next node to process when last is reached */
2657     struct scan_frame *prev; /*previous frame*/
2658     I32 stop; /* what stopparen do we use */
2659 } scan_frame;
2660
2661
2662 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2663
2664 #define CASE_SYNST_FNC(nAmE)                                       \
2665 case nAmE:                                                         \
2666     if (flags & SCF_DO_STCLASS_AND) {                              \
2667             for (value = 0; value < 256; value++)                  \
2668                 if (!is_ ## nAmE ## _cp(value))                       \
2669                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2670     }                                                              \
2671     else {                                                         \
2672             for (value = 0; value < 256; value++)                  \
2673                 if (is_ ## nAmE ## _cp(value))                        \
2674                     ANYOF_BITMAP_SET(data->start_class, value);    \
2675     }                                                              \
2676     break;                                                         \
2677 case N ## nAmE:                                                    \
2678     if (flags & SCF_DO_STCLASS_AND) {                              \
2679             for (value = 0; value < 256; value++)                   \
2680                 if (is_ ## nAmE ## _cp(value))                         \
2681                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2682     }                                                               \
2683     else {                                                          \
2684             for (value = 0; value < 256; value++)                   \
2685                 if (!is_ ## nAmE ## _cp(value))                        \
2686                     ANYOF_BITMAP_SET(data->start_class, value);     \
2687     }                                                               \
2688     break
2689
2690
2691
2692 STATIC I32
2693 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2694                         I32 *minlenp, I32 *deltap,
2695                         regnode *last,
2696                         scan_data_t *data,
2697                         I32 stopparen,
2698                         U8* recursed,
2699                         struct regnode_charclass_class *and_withp,
2700                         U32 flags, U32 depth)
2701                         /* scanp: Start here (read-write). */
2702                         /* deltap: Write maxlen-minlen here. */
2703                         /* last: Stop before this one. */
2704                         /* data: string data about the pattern */
2705                         /* stopparen: treat close N as END */
2706                         /* recursed: which subroutines have we recursed into */
2707                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2708 {
2709     dVAR;
2710     I32 min = 0, pars = 0, code;
2711     regnode *scan = *scanp, *next;
2712     I32 delta = 0;
2713     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2714     int is_inf_internal = 0;            /* The studied chunk is infinite */
2715     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2716     scan_data_t data_fake;
2717     SV *re_trie_maxbuff = NULL;
2718     regnode *first_non_open = scan;
2719     I32 stopmin = I32_MAX;
2720     scan_frame *frame = NULL;
2721     GET_RE_DEBUG_FLAGS_DECL;
2722
2723     PERL_ARGS_ASSERT_STUDY_CHUNK;
2724
2725 #ifdef DEBUGGING
2726     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2727 #endif
2728
2729     if ( depth == 0 ) {
2730         while (first_non_open && OP(first_non_open) == OPEN)
2731             first_non_open=regnext(first_non_open);
2732     }
2733
2734
2735   fake_study_recurse:
2736     while ( scan && OP(scan) != END && scan < last ){
2737         /* Peephole optimizer: */
2738         DEBUG_STUDYDATA("Peep:", data,depth);
2739         DEBUG_PEEP("Peep",scan,depth);
2740         JOIN_EXACT(scan,&min,0);
2741
2742         /* Follow the next-chain of the current node and optimize
2743            away all the NOTHINGs from it.  */
2744         if (OP(scan) != CURLYX) {
2745             const int max = (reg_off_by_arg[OP(scan)]
2746                        ? I32_MAX
2747                        /* I32 may be smaller than U16 on CRAYs! */
2748                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2749             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2750             int noff;
2751             regnode *n = scan;
2752
2753             /* Skip NOTHING and LONGJMP. */
2754             while ((n = regnext(n))
2755                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2756                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2757                    && off + noff < max)
2758                 off += noff;
2759             if (reg_off_by_arg[OP(scan)])
2760                 ARG(scan) = off;
2761             else
2762                 NEXT_OFF(scan) = off;
2763         }
2764
2765
2766
2767         /* The principal pseudo-switch.  Cannot be a switch, since we
2768            look into several different things.  */
2769         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2770                    || OP(scan) == IFTHEN) {
2771             next = regnext(scan);
2772             code = OP(scan);
2773             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2774
2775             if (OP(next) == code || code == IFTHEN) {
2776                 /* NOTE - There is similar code to this block below for handling
2777                    TRIE nodes on a re-study.  If you change stuff here check there
2778                    too. */
2779                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2780                 struct regnode_charclass_class accum;
2781                 regnode * const startbranch=scan;
2782
2783                 if (flags & SCF_DO_SUBSTR)
2784                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2785                 if (flags & SCF_DO_STCLASS)
2786                     cl_init_zero(pRExC_state, &accum);
2787
2788                 while (OP(scan) == code) {
2789                     I32 deltanext, minnext, f = 0, fake;
2790                     struct regnode_charclass_class this_class;
2791
2792                     num++;
2793                     data_fake.flags = 0;
2794                     if (data) {
2795                         data_fake.whilem_c = data->whilem_c;
2796                         data_fake.last_closep = data->last_closep;
2797                     }
2798                     else
2799                         data_fake.last_closep = &fake;
2800
2801                     data_fake.pos_delta = delta;
2802                     next = regnext(scan);
2803                     scan = NEXTOPER(scan);
2804                     if (code != BRANCH)
2805                         scan = NEXTOPER(scan);
2806                     if (flags & SCF_DO_STCLASS) {
2807                         cl_init(pRExC_state, &this_class);
2808                         data_fake.start_class = &this_class;
2809                         f = SCF_DO_STCLASS_AND;
2810                     }
2811                     if (flags & SCF_WHILEM_VISITED_POS)
2812                         f |= SCF_WHILEM_VISITED_POS;
2813
2814                     /* we suppose the run is continuous, last=next...*/
2815                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2816                                           next, &data_fake,
2817                                           stopparen, recursed, NULL, f,depth+1);
2818                     if (min1 > minnext)
2819                         min1 = minnext;
2820                     if (max1 < minnext + deltanext)
2821                         max1 = minnext + deltanext;
2822                     if (deltanext == I32_MAX)
2823                         is_inf = is_inf_internal = 1;
2824                     scan = next;
2825                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2826                         pars++;
2827                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2828                         if ( stopmin > minnext)
2829                             stopmin = min + min1;
2830                         flags &= ~SCF_DO_SUBSTR;
2831                         if (data)
2832                             data->flags |= SCF_SEEN_ACCEPT;
2833                     }
2834                     if (data) {
2835                         if (data_fake.flags & SF_HAS_EVAL)
2836                             data->flags |= SF_HAS_EVAL;
2837                         data->whilem_c = data_fake.whilem_c;
2838                     }
2839                     if (flags & SCF_DO_STCLASS)
2840                         cl_or(pRExC_state, &accum, &this_class);
2841                 }
2842                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2843                     min1 = 0;
2844                 if (flags & SCF_DO_SUBSTR) {
2845                     data->pos_min += min1;
2846                     data->pos_delta += max1 - min1;
2847                     if (max1 != min1 || is_inf)
2848                         data->longest = &(data->longest_float);
2849                 }
2850                 min += min1;
2851                 delta += max1 - min1;
2852                 if (flags & SCF_DO_STCLASS_OR) {
2853                     cl_or(pRExC_state, data->start_class, &accum);
2854                     if (min1) {
2855                         cl_and(data->start_class, and_withp);
2856                         flags &= ~SCF_DO_STCLASS;
2857                     }
2858                 }
2859                 else if (flags & SCF_DO_STCLASS_AND) {
2860                     if (min1) {
2861                         cl_and(data->start_class, &accum);
2862                         flags &= ~SCF_DO_STCLASS;
2863                     }
2864                     else {
2865                         /* Switch to OR mode: cache the old value of
2866                          * data->start_class */
2867                         INIT_AND_WITHP;
2868                         StructCopy(data->start_class, and_withp,
2869                                    struct regnode_charclass_class);
2870                         flags &= ~SCF_DO_STCLASS_AND;
2871                         StructCopy(&accum, data->start_class,
2872                                    struct regnode_charclass_class);
2873                         flags |= SCF_DO_STCLASS_OR;
2874                         data->start_class->flags |= ANYOF_EOS;
2875                     }
2876                 }
2877
2878                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2879                 /* demq.
2880
2881                    Assuming this was/is a branch we are dealing with: 'scan' now
2882                    points at the item that follows the branch sequence, whatever
2883                    it is. We now start at the beginning of the sequence and look
2884                    for subsequences of
2885
2886                    BRANCH->EXACT=>x1
2887                    BRANCH->EXACT=>x2
2888                    tail
2889
2890                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2891
2892                    If we can find such a subsequence we need to turn the first
2893                    element into a trie and then add the subsequent branch exact
2894                    strings to the trie.
2895
2896                    We have two cases
2897
2898                      1. patterns where the whole set of branches can be converted.
2899
2900                      2. patterns where only a subset can be converted.
2901
2902                    In case 1 we can replace the whole set with a single regop
2903                    for the trie. In case 2 we need to keep the start and end
2904                    branches so
2905
2906                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2907                      becomes BRANCH TRIE; BRANCH X;
2908
2909                   There is an additional case, that being where there is a
2910                   common prefix, which gets split out into an EXACT like node
2911                   preceding the TRIE node.
2912
2913                   If x(1..n)==tail then we can do a simple trie, if not we make
2914                   a "jump" trie, such that when we match the appropriate word
2915                   we "jump" to the appropriate tail node. Essentially we turn
2916                   a nested if into a case structure of sorts.
2917
2918                 */
2919
2920                     int made=0;
2921                     if (!re_trie_maxbuff) {
2922                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2923                         if (!SvIOK(re_trie_maxbuff))
2924                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2925                     }
2926                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2927                         regnode *cur;
2928                         regnode *first = (regnode *)NULL;
2929                         regnode *last = (regnode *)NULL;
2930                         regnode *tail = scan;
2931                         U8 optype = 0;
2932                         U32 count=0;
2933
2934 #ifdef DEBUGGING
2935                         SV * const mysv = sv_newmortal();       /* for dumping */
2936 #endif
2937                         /* var tail is used because there may be a TAIL
2938                            regop in the way. Ie, the exacts will point to the
2939                            thing following the TAIL, but the last branch will
2940                            point at the TAIL. So we advance tail. If we
2941                            have nested (?:) we may have to move through several
2942                            tails.
2943                          */
2944
2945                         while ( OP( tail ) == TAIL ) {
2946                             /* this is the TAIL generated by (?:) */
2947                             tail = regnext( tail );
2948                         }
2949
2950
2951                         DEBUG_OPTIMISE_r({
2952                             regprop(RExC_rx, mysv, tail );
2953                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2954                                 (int)depth * 2 + 2, "",
2955                                 "Looking for TRIE'able sequences. Tail node is: ",
2956                                 SvPV_nolen_const( mysv )
2957                             );
2958                         });
2959
2960                         /*
2961
2962                            step through the branches, cur represents each
2963                            branch, noper is the first thing to be matched
2964                            as part of that branch and noper_next is the
2965                            regnext() of that node. if noper is an EXACT
2966                            and noper_next is the same as scan (our current
2967                            position in the regex) then the EXACT branch is
2968                            a possible optimization target. Once we have
2969                            two or more consecutive such branches we can
2970                            create a trie of the EXACT's contents and stich
2971                            it in place. If the sequence represents all of
2972                            the branches we eliminate the whole thing and
2973                            replace it with a single TRIE. If it is a
2974                            subsequence then we need to stitch it in. This
2975                            means the first branch has to remain, and needs
2976                            to be repointed at the item on the branch chain
2977                            following the last branch optimized. This could
2978                            be either a BRANCH, in which case the
2979                            subsequence is internal, or it could be the
2980                            item following the branch sequence in which
2981                            case the subsequence is at the end.
2982
2983                         */
2984
2985                         /* dont use tail as the end marker for this traverse */
2986                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2987                             regnode * const noper = NEXTOPER( cur );
2988 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2989                             regnode * const noper_next = regnext( noper );
2990 #endif
2991
2992                             DEBUG_OPTIMISE_r({
2993                                 regprop(RExC_rx, mysv, cur);
2994                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2995                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2996
2997                                 regprop(RExC_rx, mysv, noper);
2998                                 PerlIO_printf( Perl_debug_log, " -> %s",
2999                                     SvPV_nolen_const(mysv));
3000
3001                                 if ( noper_next ) {
3002                                   regprop(RExC_rx, mysv, noper_next );
3003                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3004                                     SvPV_nolen_const(mysv));
3005                                 }
3006                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3007                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3008                             });
3009                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3010                                          : PL_regkind[ OP( noper ) ] == EXACT )
3011                                   || OP(noper) == NOTHING )
3012 #ifdef NOJUMPTRIE
3013                                   && noper_next == tail
3014 #endif
3015                                   && count < U16_MAX)
3016                             {
3017                                 count++;
3018                                 if ( !first || optype == NOTHING ) {
3019                                     if (!first) first = cur;
3020                                     optype = OP( noper );
3021                                 } else {
3022                                     last = cur;
3023                                 }
3024                             } else {
3025 /*
3026     Currently we do not believe that the trie logic can
3027     handle case insensitive matching properly when the
3028     pattern is not unicode (thus forcing unicode semantics).
3029
3030     If/when this is fixed the following define can be swapped
3031     in below to fully enable trie logic.
3032
3033     XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3034     not /aa
3035
3036 #define TRIE_TYPE_IS_SAFE 1
3037
3038 */
3039 #define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
3040
3041                                 if ( last && TRIE_TYPE_IS_SAFE ) {
3042                                     make_trie( pRExC_state,
3043                                             startbranch, first, cur, tail, count,
3044                                             optype, depth+1 );
3045                                 }
3046                                 if ( PL_regkind[ OP( noper ) ] == EXACT
3047 #ifdef NOJUMPTRIE
3048                                      && noper_next == tail
3049 #endif
3050                                 ){
3051                                     count = 1;
3052                                     first = cur;
3053                                     optype = OP( noper );
3054                                 } else {
3055                                     count = 0;
3056                                     first = NULL;
3057                                     optype = 0;
3058                                 }
3059                                 last = NULL;
3060                             }
3061                         }
3062                         DEBUG_OPTIMISE_r({
3063                             regprop(RExC_rx, mysv, cur);
3064                             PerlIO_printf( Perl_debug_log,
3065                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3066                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3067
3068                         });
3069
3070                         if ( last && TRIE_TYPE_IS_SAFE ) {
3071                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3072 #ifdef TRIE_STUDY_OPT
3073                             if ( ((made == MADE_EXACT_TRIE &&
3074                                  startbranch == first)
3075                                  || ( first_non_open == first )) &&
3076                                  depth==0 ) {
3077                                 flags |= SCF_TRIE_RESTUDY;
3078                                 if ( startbranch == first
3079                                      && scan == tail )
3080                                 {
3081                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3082                                 }
3083                             }
3084 #endif
3085                         }
3086                     }
3087
3088                 } /* do trie */
3089
3090             }
3091             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3092                 scan = NEXTOPER(NEXTOPER(scan));
3093             } else                      /* single branch is optimized. */
3094                 scan = NEXTOPER(scan);
3095             continue;
3096         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3097             scan_frame *newframe = NULL;
3098             I32 paren;
3099             regnode *start;
3100             regnode *end;
3101
3102             if (OP(scan) != SUSPEND) {
3103             /* set the pointer */
3104                 if (OP(scan) == GOSUB) {
3105                     paren = ARG(scan);
3106                     RExC_recurse[ARG2L(scan)] = scan;
3107                     start = RExC_open_parens[paren-1];
3108                     end   = RExC_close_parens[paren-1];
3109                 } else {
3110                     paren = 0;
3111                     start = RExC_rxi->program + 1;
3112                     end   = RExC_opend;
3113                 }
3114                 if (!recursed) {
3115                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3116                     SAVEFREEPV(recursed);
3117                 }
3118                 if (!PAREN_TEST(recursed,paren+1)) {
3119                     PAREN_SET(recursed,paren+1);
3120                     Newx(newframe,1,scan_frame);
3121                 } else {
3122                     if (flags & SCF_DO_SUBSTR) {
3123                         SCAN_COMMIT(pRExC_state,data,minlenp);
3124                         data->longest = &(data->longest_float);
3125                     }
3126                     is_inf = is_inf_internal = 1;
3127                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3128                         cl_anything(pRExC_state, data->start_class);
3129                     flags &= ~SCF_DO_STCLASS;
3130                 }
3131             } else {
3132                 Newx(newframe,1,scan_frame);
3133                 paren = stopparen;
3134                 start = scan+2;
3135                 end = regnext(scan);
3136             }
3137             if (newframe) {
3138                 assert(start);
3139                 assert(end);
3140                 SAVEFREEPV(newframe);
3141                 newframe->next = regnext(scan);
3142                 newframe->last = last;
3143                 newframe->stop = stopparen;
3144                 newframe->prev = frame;
3145
3146                 frame = newframe;
3147                 scan =  start;
3148                 stopparen = paren;
3149                 last = end;
3150
3151                 continue;
3152             }
3153         }
3154         else if (OP(scan) == EXACT) {
3155             I32 l = STR_LEN(scan);
3156             UV uc;
3157             if (UTF) {
3158                 const U8 * const s = (U8*)STRING(scan);
3159                 l = utf8_length(s, s + l);
3160                 uc = utf8_to_uvchr(s, NULL);
3161             } else {
3162                 uc = *((U8*)STRING(scan));
3163             }
3164             min += l;
3165             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3166                 /* The code below prefers earlier match for fixed
3167                    offset, later match for variable offset.  */
3168                 if (data->last_end == -1) { /* Update the start info. */
3169                     data->last_start_min = data->pos_min;
3170                     data->last_start_max = is_inf
3171                         ? I32_MAX : data->pos_min + data->pos_delta;
3172                 }
3173                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3174                 if (UTF)
3175                     SvUTF8_on(data->last_found);
3176                 {
3177                     SV * const sv = data->last_found;
3178                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3179                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3180                     if (mg && mg->mg_len >= 0)
3181                         mg->mg_len += utf8_length((U8*)STRING(scan),
3182                                                   (U8*)STRING(scan)+STR_LEN(scan));
3183                 }
3184                 data->last_end = data->pos_min + l;
3185                 data->pos_min += l; /* As in the first entry. */
3186                 data->flags &= ~SF_BEFORE_EOL;
3187             }
3188             if (flags & SCF_DO_STCLASS_AND) {
3189                 /* Check whether it is compatible with what we know already! */
3190                 int compat = 1;
3191
3192
3193                 /* If compatible, we or it in below.  It is compatible if is
3194                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3195                  * it's for a locale.  Even if there isn't unicode semantics
3196                  * here, at runtime there may be because of matching against a
3197                  * utf8 string, so accept a possible false positive for
3198                  * latin1-range folds */
3199                 if (uc >= 0x100 ||
3200                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3201                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3202                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3203                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3204                     )
3205                 {
3206                     compat = 0;
3207                 }
3208                 ANYOF_CLASS_ZERO(data->start_class);
3209                 ANYOF_BITMAP_ZERO(data->start_class);
3210                 if (compat)
3211                     ANYOF_BITMAP_SET(data->start_class, uc);
3212                 else if (uc >= 0x100) {
3213                     int i;
3214
3215                     /* Some Unicode code points fold to the Latin1 range; as
3216                      * XXX temporary code, instead of figuring out if this is
3217                      * one, just assume it is and set all the start class bits
3218                      * that could be some such above 255 code point's fold
3219                      * which will generate fals positives.  As the code
3220                      * elsewhere that does compute the fold settles down, it
3221                      * can be extracted out and re-used here */
3222                     for (i = 0; i < 256; i++){
3223                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3224                             ANYOF_BITMAP_SET(data->start_class, i);
3225                         }
3226                     }
3227                 }
3228                 data->start_class->flags &= ~ANYOF_EOS;
3229                 if (uc < 0x100)
3230                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3231             }
3232             else if (flags & SCF_DO_STCLASS_OR) {
3233                 /* false positive possible if the class is case-folded */
3234                 if (uc < 0x100)
3235                     ANYOF_BITMAP_SET(data->start_class, uc);
3236                 else
3237                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3238                 data->start_class->flags &= ~ANYOF_EOS;
3239                 cl_and(data->start_class, and_withp);
3240             }
3241             flags &= ~SCF_DO_STCLASS;
3242         }
3243         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3244             I32 l = STR_LEN(scan);
3245             UV uc = *((U8*)STRING(scan));
3246
3247             /* Search for fixed substrings supports EXACT only. */
3248             if (flags & SCF_DO_SUBSTR) {
3249                 assert(data);
3250                 SCAN_COMMIT(pRExC_state, data, minlenp);
3251             }
3252             if (UTF) {
3253                 const U8 * const s = (U8 *)STRING(scan);
3254                 l = utf8_length(s, s + l);
3255                 uc = utf8_to_uvchr(s, NULL);
3256             }
3257             min += l;
3258             if (flags & SCF_DO_SUBSTR)
3259                 data->pos_min += l;
3260             if (flags & SCF_DO_STCLASS_AND) {
3261                 /* Check whether it is compatible with what we know already! */
3262                 int compat = 1;
3263                 if (uc >= 0x100 ||
3264                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3265                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3266                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3267                 {
3268                     compat = 0;
3269                 }
3270                 ANYOF_CLASS_ZERO(data->start_class);
3271                 ANYOF_BITMAP_ZERO(data->start_class);
3272                 if (compat) {
3273                     ANYOF_BITMAP_SET(data->start_class, uc);
3274                     data->start_class->flags &= ~ANYOF_EOS;
3275                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3276                     if (OP(scan) == EXACTFL) {
3277                         /* XXX This set is probably no longer necessary, and
3278                          * probably wrong as LOCALE now is on in the initial
3279                          * state */
3280                         data->start_class->flags |= ANYOF_LOCALE;
3281                     }
3282                     else {
3283
3284                         /* Also set the other member of the fold pair.  In case
3285                          * that unicode semantics is called for at runtime, use
3286                          * the full latin1 fold.  (Can't do this for locale,
3287                          * because not known until runtime */
3288                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3289                     }
3290                 }
3291                 else if (uc >= 0x100) {
3292                     int i;
3293                     for (i = 0; i < 256; i++){
3294                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3295                             ANYOF_BITMAP_SET(data->start_class, i);
3296                         }
3297                     }
3298                 }
3299             }
3300             else if (flags & SCF_DO_STCLASS_OR) {
3301                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3302                     /* false positive possible if the class is case-folded.
3303                        Assume that the locale settings are the same... */
3304                     if (uc < 0x100) {
3305                         ANYOF_BITMAP_SET(data->start_class, uc);
3306                         if (OP(scan) != EXACTFL) {
3307
3308                             /* And set the other member of the fold pair, but
3309                              * can't do that in locale because not known until
3310                              * run-time */
3311                             ANYOF_BITMAP_SET(data->start_class,
3312                                              PL_fold_latin1[uc]);
3313                         }
3314                     }
3315                     data->start_class->flags &= ~ANYOF_EOS;
3316                 }
3317                 cl_and(data->start_class, and_withp);
3318             }
3319             flags &= ~SCF_DO_STCLASS;
3320         }
3321         else if (REGNODE_VARIES(OP(scan))) {
3322             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3323             I32 f = flags, pos_before = 0;
3324             regnode * const oscan = scan;
3325             struct regnode_charclass_class this_class;
3326             struct regnode_charclass_class *oclass = NULL;
3327             I32 next_is_eval = 0;
3328
3329             switch (PL_regkind[OP(scan)]) {
3330             case WHILEM:                /* End of (?:...)* . */
3331                 scan = NEXTOPER(scan);
3332                 goto finish;
3333             case PLUS:
3334                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3335                     next = NEXTOPER(scan);
3336                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3337                         mincount = 1;
3338                         maxcount = REG_INFTY;
3339                         next = regnext(scan);
3340                         scan = NEXTOPER(scan);
3341                         goto do_curly;
3342                     }
3343                 }
3344                 if (flags & SCF_DO_SUBSTR)
3345                     data->pos_min++;
3346                 min++;
3347                 /* Fall through. */
3348             case STAR:
3349                 if (flags & SCF_DO_STCLASS) {
3350                     mincount = 0;
3351                     maxcount = REG_INFTY;
3352                     next = regnext(scan);
3353                     scan = NEXTOPER(scan);
3354                     goto do_curly;
3355                 }
3356                 is_inf = is_inf_internal = 1;
3357                 scan = regnext(scan);
3358                 if (flags & SCF_DO_SUBSTR) {
3359                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3360                     data->longest = &(data->longest_float);
3361                 }
3362                 goto optimize_curly_tail;
3363             case CURLY:
3364                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3365                     && (scan->flags == stopparen))
3366                 {
3367                     mincount = 1;
3368                     maxcount = 1;
3369                 } else {
3370                     mincount = ARG1(scan);
3371                     maxcount = ARG2(scan);
3372                 }
3373                 next = regnext(scan);
3374                 if (OP(scan) == CURLYX) {
3375                     I32 lp = (data ? *(data->last_closep) : 0);
3376                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3377                 }
3378                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3379                 next_is_eval = (OP(scan) == EVAL);
3380               do_curly:
3381                 if (flags & SCF_DO_SUBSTR) {
3382                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3383                     pos_before = data->pos_min;
3384                 }
3385                 if (data) {
3386                     fl = data->flags;
3387                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3388                     if (is_inf)
3389                         data->flags |= SF_IS_INF;
3390                 }
3391                 if (flags & SCF_DO_STCLASS) {
3392                     cl_init(pRExC_state, &this_class);
3393                     oclass = data->start_class;
3394                     data->start_class = &this_class;
3395                     f |= SCF_DO_STCLASS_AND;
3396                     f &= ~SCF_DO_STCLASS_OR;
3397                 }
3398                 /* Exclude from super-linear cache processing any {n,m}
3399                    regops for which the combination of input pos and regex
3400                    pos is not enough information to determine if a match
3401                    will be possible.
3402
3403                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3404                    regex pos at the \s*, the prospects for a match depend not
3405                    only on the input position but also on how many (bar\s*)
3406                    repeats into the {4,8} we are. */
3407                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3408                     f &= ~SCF_WHILEM_VISITED_POS;
3409
3410                 /* This will finish on WHILEM, setting scan, or on NULL: */
3411                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3412                                       last, data, stopparen, recursed, NULL,
3413                                       (mincount == 0
3414                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3415
3416                 if (flags & SCF_DO_STCLASS)
3417                     data->start_class = oclass;
3418                 if (mincount == 0 || minnext == 0) {
3419                     if (flags & SCF_DO_STCLASS_OR) {
3420                         cl_or(pRExC_state, data->start_class, &this_class);
3421                     }
3422                     else if (flags & SCF_DO_STCLASS_AND) {
3423                         /* Switch to OR mode: cache the old value of
3424                          * data->start_class */
3425                         INIT_AND_WITHP;
3426                         StructCopy(data->start_class, and_withp,
3427                                    struct regnode_charclass_class);
3428                         flags &= ~SCF_DO_STCLASS_AND;
3429                         StructCopy(&this_class, data->start_class,
3430                                    struct regnode_charclass_class);
3431                         flags |= SCF_DO_STCLASS_OR;
3432                         data->start_class->flags |= ANYOF_EOS;
3433                     }
3434                 } else {                /* Non-zero len */
3435                     if (flags & SCF_DO_STCLASS_OR) {
3436                         cl_or(pRExC_state, data->start_class, &this_class);
3437                         cl_and(data->start_class, and_withp);
3438                     }
3439                     else if (flags & SCF_DO_STCLASS_AND)
3440                         cl_and(data->start_class, &this_class);
3441                     flags &= ~SCF_DO_STCLASS;
3442                 }
3443                 if (!scan)              /* It was not CURLYX, but CURLY. */
3444                     scan = next;
3445                 if ( /* ? quantifier ok, except for (?{ ... }) */
3446                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3447                     && (minnext == 0) && (deltanext == 0)
3448                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3449                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3450                 {
3451                     ckWARNreg(RExC_parse,
3452                               "Quantifier unexpected on zero-length expression");
3453                 }
3454
3455                 min += minnext * mincount;
3456                 is_inf_internal |= ((maxcount == REG_INFTY
3457                                      && (minnext + deltanext) > 0)
3458                                     || deltanext == I32_MAX);
3459                 is_inf |= is_inf_internal;
3460                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3461
3462                 /* Try powerful optimization CURLYX => CURLYN. */
3463                 if (  OP(oscan) == CURLYX && data
3464                       && data->flags & SF_IN_PAR
3465                       && !(data->flags & SF_HAS_EVAL)
3466                       && !deltanext && minnext == 1 ) {
3467                     /* Try to optimize to CURLYN.  */
3468                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3469                     regnode * const nxt1 = nxt;
3470 #ifdef DEBUGGING
3471                     regnode *nxt2;
3472 #endif
3473
3474                     /* Skip open. */
3475                     nxt = regnext(nxt);
3476                     if (!REGNODE_SIMPLE(OP(nxt))
3477                         && !(PL_regkind[OP(nxt)] == EXACT
3478                              && STR_LEN(nxt) == 1))
3479                         goto nogo;
3480 #ifdef DEBUGGING
3481                     nxt2 = nxt;
3482 #endif
3483                     nxt = regnext(nxt);
3484                     if (OP(nxt) != CLOSE)
3485                         goto nogo;
3486                     if (RExC_open_parens) {
3487                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3488                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3489                     }
3490                     /* Now we know that nxt2 is the only contents: */
3491                     oscan->flags = (U8)ARG(nxt);
3492                     OP(oscan) = CURLYN;
3493                     OP(nxt1) = NOTHING; /* was OPEN. */
3494
3495 #ifdef DEBUGGING
3496                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3497                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3498                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3499                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3500                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3501                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3502 #endif
3503                 }
3504               nogo:
3505
3506                 /* Try optimization CURLYX => CURLYM. */
3507                 if (  OP(oscan) == CURLYX && data
3508                       && !(data->flags & SF_HAS_PAR)
3509                       && !(data->flags & SF_HAS_EVAL)
3510                       && !deltanext     /* atom is fixed width */
3511                       && minnext != 0   /* CURLYM can't handle zero width */
3512                 ) {
3513                     /* XXXX How to optimize if data == 0? */
3514                     /* Optimize to a simpler form.  */
3515                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3516                     regnode *nxt2;
3517
3518                     OP(oscan) = CURLYM;
3519                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3520                             && (OP(nxt2) != WHILEM))
3521                         nxt = nxt2;
3522                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3523                     /* Need to optimize away parenths. */
3524                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3525                         /* Set the parenth number.  */
3526                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3527
3528                         oscan->flags = (U8)ARG(nxt);
3529                         if (RExC_open_parens) {
3530                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3531                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3532                         }
3533                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3534                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3535
3536 #ifdef DEBUGGING
3537                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3538                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3539                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3540                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3541 #endif
3542 #if 0
3543                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3544                             regnode *nnxt = regnext(nxt1);
3545                             if (nnxt == nxt) {
3546                                 if (reg_off_by_arg[OP(nxt1)])
3547                                     ARG_SET(nxt1, nxt2 - nxt1);
3548                                 else if (nxt2 - nxt1 < U16_MAX)
3549                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3550                                 else
3551                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3552                             }
3553                             nxt1 = nnxt;
3554                         }
3555 #endif
3556                         /* Optimize again: */
3557                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3558                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3559                     }
3560                     else
3561                         oscan->flags = 0;
3562                 }
3563                 else if ((OP(oscan) == CURLYX)
3564                          && (flags & SCF_WHILEM_VISITED_POS)
3565                          /* See the comment on a similar expression above.
3566                             However, this time it's not a subexpression
3567                             we care about, but the expression itself. */
3568                          && (maxcount == REG_INFTY)
3569                          && data && ++data->whilem_c < 16) {
3570                     /* This stays as CURLYX, we can put the count/of pair. */
3571                     /* Find WHILEM (as in regexec.c) */
3572                     regnode *nxt = oscan + NEXT_OFF(oscan);
3573
3574                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3575                         nxt += ARG(nxt);
3576                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3577                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3578                 }
3579                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3580                     pars++;
3581                 if (flags & SCF_DO_SUBSTR) {
3582                     SV *last_str = NULL;
3583                     int counted = mincount != 0;
3584
3585                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3586 #if defined(SPARC64_GCC_WORKAROUND)
3587                         I32 b = 0;
3588                         STRLEN l = 0;
3589                         const char *s = NULL;
3590                         I32 old = 0;
3591
3592                         if (pos_before >= data->last_start_min)
3593                             b = pos_before;
3594                         else
3595                             b = data->last_start_min;
3596
3597                         l = 0;
3598                         s = SvPV_const(data->last_found, l);
3599                         old = b - data->last_start_min;
3600
3601 #else
3602                         I32 b = pos_before >= data->last_start_min
3603                             ? pos_before : data->last_start_min;
3604                         STRLEN l;
3605                         const char * const s = SvPV_const(data->last_found, l);
3606                         I32 old = b - data->last_start_min;
3607 #endif
3608
3609                         if (UTF)
3610                             old = utf8_hop((U8*)s, old) - (U8*)s;
3611                         l -= old;
3612                         /* Get the added string: */
3613                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3614                         if (deltanext == 0 && pos_before == b) {
3615                             /* What was added is a constant string */
3616                             if (mincount > 1) {
3617                                 SvGROW(last_str, (mincount * l) + 1);
3618                                 repeatcpy(SvPVX(last_str) + l,
3619                                           SvPVX_const(last_str), l, mincount - 1);
3620                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3621                                 /* Add additional parts. */
3622                                 SvCUR_set(data->last_found,
3623                                           SvCUR(data->last_found) - l);
3624                                 sv_catsv(data->last_found, last_str);
3625                                 {
3626                                     SV * sv = data->last_found;
3627                                     MAGIC *mg =
3628                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3629                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3630                                     if (mg && mg->mg_len >= 0)
3631                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3632                                 }
3633                                 data->last_end += l * (mincount - 1);
3634                             }
3635                         } else {
3636                             /* start offset must point into the last copy */
3637                             data->last_start_min += minnext * (mincount - 1);
3638                             data->last_start_max += is_inf ? I32_MAX
3639                                 : (maxcount - 1) * (minnext + data->pos_delta);
3640                         }
3641                     }
3642                     /* It is counted once already... */
3643                     data->pos_min += minnext * (mincount - counted);
3644                     data->pos_delta += - counted * deltanext +
3645                         (minnext + deltanext) * maxcount - minnext * mincount;
3646                     if (mincount != maxcount) {
3647                          /* Cannot extend fixed substrings found inside
3648                             the group.  */
3649                         SCAN_COMMIT(pRExC_state,data,minlenp);
3650                         if (mincount && last_str) {
3651                             SV * const sv = data->last_found;
3652                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3653                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3654
3655                             if (mg)
3656                                 mg->mg_len = -1;
3657                             sv_setsv(sv, last_str);
3658                             data->last_end = data->pos_min;
3659                             data->last_start_min =
3660                                 data->pos_min - CHR_SVLEN(last_str);
3661                             data->last_start_max = is_inf
3662                                 ? I32_MAX
3663                                 : data->pos_min + data->pos_delta
3664                                 - CHR_SVLEN(last_str);
3665                         }
3666                         data->longest = &(data->longest_float);
3667                     }
3668                     SvREFCNT_dec(last_str);
3669                 }
3670                 if (data && (fl & SF_HAS_EVAL))
3671                     data->flags |= SF_HAS_EVAL;
3672               optimize_curly_tail:
3673                 if (OP(oscan) != CURLYX) {
3674                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3675                            && NEXT_OFF(next))
3676                         NEXT_OFF(oscan) += NEXT_OFF(next);
3677                 }
3678                 continue;
3679             default:                    /* REF, ANYOFV, and CLUMP only? */
3680                 if (flags & SCF_DO_SUBSTR) {
3681                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3682                     data->longest = &(data->longest_float);
3683                 }
3684                 is_inf = is_inf_internal = 1;
3685                 if (flags & SCF_DO_STCLASS_OR)
3686                     cl_anything(pRExC_state, data->start_class);
3687                 flags &= ~SCF_DO_STCLASS;
3688                 break;
3689             }
3690         }
3691         else if (OP(scan) == LNBREAK) {
3692             if (flags & SCF_DO_STCLASS) {
3693                 int value = 0;
3694                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3695                 if (flags & SCF_DO_STCLASS_AND) {
3696                     for (value = 0; value < 256; value++)
3697                         if (!is_VERTWS_cp(value))
3698                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3699                 }
3700                 else {
3701                     for (value = 0; value < 256; value++)
3702                         if (is_VERTWS_cp(value))
3703                             ANYOF_BITMAP_SET(data->start_class, value);
3704                 }
3705                 if (flags & SCF_DO_STCLASS_OR)
3706                     cl_and(data->start_class, and_withp);
3707                 flags &= ~SCF_DO_STCLASS;
3708             }
3709             min += 1;
3710             delta += 1;
3711             if (flags & SCF_DO_SUBSTR) {
3712                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3713                 data->pos_min += 1;
3714                 data->pos_delta += 1;
3715                 data->longest = &(data->longest_float);
3716             }
3717         }
3718         else if (OP(scan) == FOLDCHAR) {
3719             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3720             flags &= ~SCF_DO_STCLASS;
3721             min += 1;
3722             delta += d;
3723             if (flags & SCF_DO_SUBSTR) {
3724                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3725                 data->pos_min += 1;
3726                 data->pos_delta += d;
3727                 data->longest = &(data->longest_float);
3728             }
3729         }
3730         else if (REGNODE_SIMPLE(OP(scan))) {
3731             int value = 0;
3732
3733             if (flags & SCF_DO_SUBSTR) {
3734                 SCAN_COMMIT(pRExC_state,data,minlenp);
3735                 data->pos_min++;
3736             }
3737             min++;
3738             if (flags & SCF_DO_STCLASS) {
3739                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3740
3741                 /* Some of the logic below assumes that switching
3742                    locale on will only add false positives. */
3743                 switch (PL_regkind[OP(scan)]) {
3744                 case SANY:
3745                 default:
3746                   do_default:
3747                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3748                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3749                         cl_anything(pRExC_state, data->start_class);
3750                     break;
3751                 case REG_ANY:
3752                     if (OP(scan) == SANY)
3753                         goto do_default;
3754                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3755                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3756                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3757                         cl_anything(pRExC_state, data->start_class);
3758                     }
3759                     if (flags & SCF_DO_STCLASS_AND || !value)
3760                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3761                     break;
3762                 case ANYOF:
3763                     if (flags & SCF_DO_STCLASS_AND)
3764                         cl_and(data->start_class,
3765                                (struct regnode_charclass_class*)scan);
3766                     else
3767                         cl_or(pRExC_state, data->start_class,
3768                               (struct regnode_charclass_class*)scan);
3769                     break;
3770                 case ALNUM:
3771                     if (flags & SCF_DO_STCLASS_AND) {
3772                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3773                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3774                             if (OP(scan) == ALNUMU) {
3775                                 for (value = 0; value < 256; value++) {
3776                                     if (!isWORDCHAR_L1(value)) {
3777                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3778                                     }
3779                                 }
3780                             } else {
3781                                 for (value = 0; value < 256; value++) {
3782                                     if (!isALNUM(value)) {
3783                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3784                                     }
3785                                 }
3786                             }
3787                         }
3788                     }
3789                     else {
3790                         if (data->start_class->flags & ANYOF_LOCALE)
3791                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3792
3793                         /* Even if under locale, set the bits for non-locale
3794                          * in case it isn't a true locale-node.  This will
3795                          * create false positives if it truly is locale */
3796                         if (OP(scan) == ALNUMU) {
3797                             for (value = 0; value < 256; value++) {
3798                                 if (isWORDCHAR_L1(value)) {
3799                                     ANYOF_BITMAP_SET(data->start_class, value);
3800                                 }
3801                             }
3802                         } else {
3803                             for (value = 0; value < 256; value++) {
3804                                 if (isALNUM(value)) {
3805                                     ANYOF_BITMAP_SET(data->start_class, value);
3806                                 }
3807                             }
3808                         }
3809                     }
3810                     break;
3811                 case NALNUM:
3812                     if (flags & SCF_DO_STCLASS_AND) {
3813                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3814                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3815                             if (OP(scan) == NALNUMU) {
3816                                 for (value = 0; value < 256; value++) {
3817                                     if (isWORDCHAR_L1(value)) {
3818                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3819                                     }
3820                                 }
3821                             } else {
3822                                 for (value = 0; value < 256; value++) {
3823                                     if (isALNUM(value)) {
3824                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3825                                     }
3826                                 }
3827                             }
3828                         }
3829                     }
3830                     else {
3831                         if (data->start_class->flags & ANYOF_LOCALE)
3832                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3833
3834                         /* Even if under locale, set the bits for non-locale in
3835                          * case it isn't a true locale-node.  This will create
3836                          * false positives if it truly is locale */
3837                         if (OP(scan) == NALNUMU) {
3838                             for (value = 0; value < 256; value++) {
3839                                 if (! isWORDCHAR_L1(value)) {
3840                                     ANYOF_BITMAP_SET(data->start_class, value);
3841                                 }
3842                             }
3843                         } else {
3844                             for (value = 0; value < 256; value++) {
3845                                 if (! isALNUM(value)) {
3846                                     ANYOF_BITMAP_SET(data->start_class, value);
3847                                 }
3848                             }
3849                         }
3850                     }
3851                     break;
3852                 case SPACE:
3853                     if (flags & SCF_DO_STCLASS_AND) {
3854                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3855                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3856                             if (OP(scan) == SPACEU) {
3857                                 for (value = 0; value < 256; value++) {
3858                                     if (!isSPACE_L1(value)) {
3859                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3860                                     }
3861                                 }
3862                             } else {
3863                                 for (value = 0; value < 256; value++) {
3864                                     if (!isSPACE(value)) {
3865                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3866                                     }
3867                                 }
3868                             }
3869                         }
3870                     }
3871                     else {
3872                         if (data->start_class->flags & ANYOF_LOCALE) {
3873                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3874                         }
3875                         if (OP(scan) == SPACEU) {
3876                             for (value = 0; value < 256; value++) {
3877                                 if (isSPACE_L1(value)) {
3878                                     ANYOF_BITMAP_SET(data->start_class, value);
3879                                 }
3880                             }
3881                         } else {
3882                             for (value = 0; value < 256; value++) {
3883                                 if (isSPACE(value)) {
3884                                     ANYOF_BITMAP_SET(data->start_class, value);
3885                                 }
3886                             }
3887                         }
3888                     }
3889                     break;
3890                 case NSPACE:
3891                     if (flags & SCF_DO_STCLASS_AND) {
3892                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3893                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3894                             if (OP(scan) == NSPACEU) {
3895                                 for (value = 0; value < 256; value++) {
3896                                     if (isSPACE_L1(value)) {
3897                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3898                                     }
3899                                 }
3900                             } else {
3901                                 for (value = 0; value < 256; value++) {
3902                                     if (isSPACE(value)) {
3903                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3904                                     }
3905                                 }
3906                             }
3907                         }
3908                     }
3909                     else {
3910                         if (data->start_class->flags & ANYOF_LOCALE)
3911                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3912                         if (OP(scan) == NSPACEU) {
3913                             for (value = 0; value < 256; value++) {
3914                                 if (!isSPACE_L1(value)) {
3915                                     ANYOF_BITMAP_SET(data->start_class, value);
3916                                 }
3917                             }
3918                         }
3919                         else {
3920                             for (value = 0; value < 256; value++) {
3921                                 if (!isSPACE(value)) {
3922                                     ANYOF_BITMAP_SET(data->start_class, value);
3923                                 }
3924                             }
3925                         }
3926                     }
3927                     break;
3928                 case DIGIT:
3929                     if (flags & SCF_DO_STCLASS_AND) {
3930                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3931                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3932                             for (value = 0; value < 256; value++)
3933                                 if (!isDIGIT(value))
3934                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3935                         }
3936                     }
3937                     else {
3938                         if (data->start_class->flags & ANYOF_LOCALE)
3939                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3940                         for (value = 0; value < 256; value++)
3941                             if (isDIGIT(value))
3942                                 ANYOF_BITMAP_SET(data->start_class, value);
3943                     }
3944                     break;
3945                 case NDIGIT:
3946                     if (flags & SCF_DO_STCLASS_AND) {
3947                         if (!(data->start_class->flags & ANYOF_LOCALE))
3948                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3949                         for (value = 0; value < 256; value++)
3950                             if (isDIGIT(value))
3951                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3952                     }
3953                     else {
3954                         if (data->start_class->flags & ANYOF_LOCALE)
3955                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3956                         for (value = 0; value < 256; value++)
3957                             if (!isDIGIT(value))
3958                                 ANYOF_BITMAP_SET(data->start_class, value);
3959                     }
3960                     break;
3961                 CASE_SYNST_FNC(VERTWS);
3962                 CASE_SYNST_FNC(HORIZWS);
3963
3964                 }
3965                 if (flags & SCF_DO_STCLASS_OR)
3966                     cl_and(data->start_class, and_withp);
3967                 flags &= ~SCF_DO_STCLASS;
3968             }
3969         }
3970         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3971             data->flags |= (OP(scan) == MEOL
3972                             ? SF_BEFORE_MEOL
3973                             : SF_BEFORE_SEOL);
3974         }
3975         else if (  PL_regkind[OP(scan)] == BRANCHJ
3976                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3977                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3978                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3979             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3980                 || OP(scan) == UNLESSM )
3981             {
3982                 /* Negative Lookahead/lookbehind
3983                    In this case we can't do fixed string optimisation.
3984                 */
3985
3986                 I32 deltanext, minnext, fake = 0;
3987                 regnode *nscan;
3988                 struct regnode_charclass_class intrnl;
3989                 int f = 0;
3990
3991                 data_fake.flags = 0;
3992                 if (data) {
3993                     data_fake.whilem_c = data->whilem_c;
3994                     data_fake.last_closep = data->last_closep;
3995                 }
3996                 else
3997                     data_fake.last_closep = &fake;
3998                 data_fake.pos_delta = delta;
3999                 if ( flags & SCF_DO_STCLASS && !scan->flags
4000                      && OP(scan) == IFMATCH ) { /* Lookahead */
4001                     cl_init(pRExC_state, &intrnl);
4002                     data_fake.start_class = &intrnl;
4003                     f |= SCF_DO_STCLASS_AND;
4004                 }
4005                 if (flags & SCF_WHILEM_VISITED_POS)
4006                     f |= SCF_WHILEM_VISITED_POS;
4007                 next = regnext(scan);
4008                 nscan = NEXTOPER(NEXTOPER(scan));
4009                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4010                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4011                 if (scan->flags) {
4012                     if (deltanext) {
4013                         FAIL("Variable length lookbehind not implemented");
4014                     }
4015                     else if (minnext > (I32)U8_MAX) {
4016                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4017                     }
4018                     scan->flags = (U8)minnext;
4019                 }
4020                 if (data) {
4021                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4022                         pars++;
4023                     if (data_fake.flags & SF_HAS_EVAL)
4024                         data->flags |= SF_HAS_EVAL;
4025                     data->whilem_c = data_fake.whilem_c;
4026                 }
4027                 if (f & SCF_DO_STCLASS_AND) {
4028                     if (flags & SCF_DO_STCLASS_OR) {
4029                         /* OR before, AND after: ideally we would recurse with
4030                          * data_fake to get the AND applied by study of the
4031                          * remainder of the pattern, and then derecurse;
4032                          * *** HACK *** for now just treat as "no information".
4033                          * See [perl #56690].
4034                          */
4035                         cl_init(pRExC_state, data->start_class);
4036                     }  else {
4037                         /* AND before and after: combine and continue */
4038                         const int was = (data->start_class->flags & ANYOF_EOS);
4039
4040                         cl_and(data->start_class, &intrnl);
4041                         if (was)
4042                             data->start_class->flags |= ANYOF_EOS;
4043                     }
4044                 }
4045             }
4046 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4047             else {
4048                 /* Positive Lookahead/lookbehind
4049                    In this case we can do fixed string optimisation,
4050                    but we must be careful about it. Note in the case of
4051                    lookbehind the positions will be offset by the minimum
4052                    length of the pattern, something we won't know about
4053                    until after the recurse.
4054                 */
4055                 I32 deltanext, fake = 0;
4056                 regnode *nscan;
4057                 struct regnode_charclass_class intrnl;
4058                 int f = 0;
4059                 /* We use SAVEFREEPV so that when the full compile
4060                     is finished perl will clean up the allocated
4061                     minlens when it's all done. This way we don't
4062                     have to worry about freeing them when we know
4063                     they wont be used, which would be a pain.
4064                  */
4065                 I32 *minnextp;
4066                 Newx( minnextp, 1, I32 );
4067                 SAVEFREEPV(minnextp);
4068
4069                 if (data) {
4070                     StructCopy(data, &data_fake, scan_data_t);
4071                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4072                         f |= SCF_DO_SUBSTR;
4073                         if (scan->flags)
4074                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4075                         data_fake.last_found=newSVsv(data->last_found);
4076                     }
4077                 }
4078                 else
4079                     data_fake.last_closep = &fake;
4080                 data_fake.flags = 0;
4081                 data_fake.pos_delta = delta;
4082                 if (is_inf)
4083                     data_fake.flags |= SF_IS_INF;
4084                 if ( flags & SCF_DO_STCLASS && !scan->flags
4085                      && OP(scan) == IFMATCH ) { /* Lookahead */
4086                     cl_init(pRExC_state, &intrnl);
4087                     data_fake.start_class = &intrnl;
4088                     f |= SCF_DO_STCLASS_AND;
4089                 }
4090                 if (flags & SCF_WHILEM_VISITED_POS)
4091                     f |= SCF_WHILEM_VISITED_POS;
4092                 next = regnext(scan);
4093                 nscan = NEXTOPER(NEXTOPER(scan));
4094
4095                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4096                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4097                 if (scan->flags) {
4098                     if (deltanext) {
4099                         FAIL("Variable length lookbehind not implemented");
4100                     }
4101                     else if (*minnextp > (I32)U8_MAX) {
4102                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4103                     }
4104                     scan->flags = (U8)*minnextp;
4105                 }
4106
4107                 *minnextp += min;
4108
4109                 if (f & SCF_DO_STCLASS_AND) {
4110                     const int was = (data->start_class->flags & ANYOF_EOS);
4111
4112                     cl_and(data->start_class, &intrnl);
4113                     if (was)
4114                         data->start_class->flags |= ANYOF_EOS;
4115                 }
4116                 if (data) {
4117                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4118                         pars++;
4119                     if (data_fake.flags & SF_HAS_EVAL)
4120                         data->flags |= SF_HAS_EVAL;
4121                     data->whilem_c = data_fake.whilem_c;
4122                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4123                         if (RExC_rx->minlen<*minnextp)
4124                             RExC_rx->minlen=*minnextp;
4125                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4126                         SvREFCNT_dec(data_fake.last_found);
4127
4128                         if ( data_fake.minlen_fixed != minlenp )
4129                         {
4130                             data->offset_fixed= data_fake.offset_fixed;
4131                             data->minlen_fixed= data_fake.minlen_fixed;
4132                             data->lookbehind_fixed+= scan->flags;
4133                         }
4134                         if ( data_fake.minlen_float != minlenp )
4135                         {
4136                             data->minlen_float= data_fake.minlen_float;
4137                             data->offset_float_min=data_fake.offset_float_min;
4138                             data->offset_float_max=data_fake.offset_float_max;
4139                             data->lookbehind_float+= scan->flags;
4140                         }
4141                     }
4142                 }
4143
4144
4145             }
4146 #endif
4147         }
4148         else if (OP(scan) == OPEN) {
4149             if (stopparen != (I32)ARG(scan))
4150                 pars++;
4151         }
4152         else if (OP(scan) == CLOSE) {
4153             if (stopparen == (I32)ARG(scan)) {
4154                 break;
4155             }
4156             if ((I32)ARG(scan) == is_par) {
4157                 next = regnext(scan);
4158
4159                 if ( next && (OP(next) != WHILEM) && next < last)
4160                     is_par = 0;         /* Disable optimization */
4161             }
4162             if (data)
4163                 *(data->last_closep) = ARG(scan);
4164         }
4165         else if (OP(scan) == EVAL) {
4166                 if (data)
4167                     data->flags |= SF_HAS_EVAL;
4168         }
4169         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4170             if (flags & SCF_DO_SUBSTR) {
4171                 SCAN_COMMIT(pRExC_state,data,minlenp);
4172                 flags &= ~SCF_DO_SUBSTR;
4173             }
4174             if (data && OP(scan)==ACCEPT) {
4175                 data->flags |= SCF_SEEN_ACCEPT;
4176                 if (stopmin > min)
4177                     stopmin = min;
4178             }
4179         }
4180         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4181         {
4182                 if (flags & SCF_DO_SUBSTR) {
4183                     SCAN_COMMIT(pRExC_state,data,minlenp);
4184                     data->longest = &(data->longest_float);
4185                 }
4186                 is_inf = is_inf_internal = 1;
4187                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4188                     cl_anything(pRExC_state, data->start_class);
4189                 flags &= ~SCF_DO_STCLASS;
4190         }
4191         else if (OP(scan) == GPOS) {
4192             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4193                 !(delta || is_inf || (data && data->pos_delta)))
4194             {
4195                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4196                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4197                 if (RExC_rx->gofs < (U32)min)
4198                     RExC_rx->gofs = min;
4199             } else {
4200                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4201                 RExC_rx->gofs = 0;
4202             }
4203         }
4204 #ifdef TRIE_STUDY_OPT
4205 #ifdef FULL_TRIE_STUDY
4206         else if (PL_regkind[OP(scan)] == TRIE) {
4207             /* NOTE - There is similar code to this block above for handling
4208                BRANCH nodes on the initial study.  If you change stuff here
4209                check there too. */
4210             regnode *trie_node= scan;
4211             regnode *tail= regnext(scan);
4212             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4213             I32 max1 = 0, min1 = I32_MAX;
4214             struct regnode_charclass_class accum;
4215
4216             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4217                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4218             if (flags & SCF_DO_STCLASS)
4219                 cl_init_zero(pRExC_state, &accum);
4220
4221             if (!trie->jump) {
4222                 min1= trie->minlen;
4223                 max1= trie->maxlen;
4224             } else {
4225                 const regnode *nextbranch= NULL;
4226                 U32 word;
4227
4228                 for ( word=1 ; word <= trie->wordcount ; word++)
4229                 {
4230                     I32 deltanext=0, minnext=0, f = 0, fake;
4231                     struct regnode_charclass_class this_class;
4232
4233                     data_fake.flags = 0;
4234                     if (data) {
4235                         data_fake.whilem_c = data->whilem_c;
4236                         data_fake.last_closep = data->last_closep;
4237                     }
4238                     else
4239                         data_fake.last_closep = &fake;
4240                     data_fake.pos_delta = delta;
4241                     if (flags & SCF_DO_STCLASS) {
4242                         cl_init(pRExC_state, &this_class);
4243                         data_fake.start_class = &this_class;
4244                         f = SCF_DO_STCLASS_AND;
4245                     }
4246                     if (flags & SCF_WHILEM_VISITED_POS)
4247                         f |= SCF_WHILEM_VISITED_POS;
4248
4249                     if (trie->jump[word]) {
4250                         if (!nextbranch)
4251                             nextbranch = trie_node + trie->jump[0];
4252                         scan= trie_node + trie->jump[word];
4253                         /* We go from the jump point to the branch that follows
4254                            it. Note this means we need the vestigal unused branches
4255                            even though they arent otherwise used.
4256                          */
4257                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4258                             &deltanext, (regnode *)nextbranch, &data_fake,
4259                             stopparen, recursed, NULL, f,depth+1);
4260                     }
4261                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4262                         nextbranch= regnext((regnode*)nextbranch);
4263
4264                     if (min1 > (I32)(minnext + trie->minlen))
4265                         min1 = minnext + trie->minlen;
4266                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4267                         max1 = minnext + deltanext + trie->maxlen;
4268                     if (deltanext == I32_MAX)
4269                         is_inf = is_inf_internal = 1;
4270
4271                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4272                         pars++;
4273                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4274                         if ( stopmin > min + min1)
4275                             stopmin = min + min1;
4276                         flags &= ~SCF_DO_SUBSTR;
4277                         if (data)
4278                             data->flags |= SCF_SEEN_ACCEPT;
4279                     }
4280                     if (data) {
4281                         if (data_fake.flags & SF_HAS_EVAL)
4282                             data->flags |= SF_HAS_EVAL;
4283                         data->whilem_c = data_fake.whilem_c;
4284                     }
4285                     if (flags & SCF_DO_STCLASS)
4286                         cl_or(pRExC_state, &accum, &this_class);
4287                 }
4288             }
4289             if (flags & SCF_DO_SUBSTR) {
4290                 data->pos_min += min1;
4291                 data->pos_delta += max1 - min1;
4292                 if (max1 != min1 || is_inf)
4293                     data->longest = &(data->longest_float);
4294             }
4295             min += min1;
4296             delta += max1 - min1;
4297             if (flags & SCF_DO_STCLASS_OR) {
4298                 cl_or(pRExC_state, data->start_class, &accum);
4299                 if (min1) {
4300                     cl_and(data->start_class, and_withp);
4301                     flags &= ~SCF_DO_STCLASS;
4302                 }
4303             }
4304             else if (flags & SCF_DO_STCLASS_AND) {
4305                 if (min1) {
4306                     cl_and(data->start_class, &accum);
4307                     flags &= ~SCF_DO_STCLASS;
4308                 }
4309                 else {
4310                     /* Switch to OR mode: cache the old value of
4311                      * data->start_class */
4312                     INIT_AND_WITHP;
4313                     StructCopy(data->start_class, and_withp,
4314                                struct regnode_charclass_class);
4315                     flags &= ~SCF_DO_STCLASS_AND;
4316                     StructCopy(&accum, data->start_class,
4317                                struct regnode_charclass_class);
4318                     flags |= SCF_DO_STCLASS_OR;
4319                     data->start_class->flags |= ANYOF_EOS;
4320                 }
4321             }
4322             scan= tail;
4323             continue;
4324         }
4325 #else
4326         else if (PL_regkind[OP(scan)] == TRIE) {
4327             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4328             U8*bang=NULL;
4329
4330             min += trie->minlen;
4331             delta += (trie->maxlen - trie->minlen);
4332             flags &= ~SCF_DO_STCLASS; /* xxx */
4333             if (flags & SCF_DO_SUBSTR) {
4334                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4335                 data->pos_min += trie->minlen;
4336                 data->pos_delta += (trie->maxlen - trie->minlen);
4337                 if (trie->maxlen != trie->minlen)
4338                     data->longest = &(data->longest_float);
4339             }
4340             if (trie->jump) /* no more substrings -- for now /grr*/
4341                 flags &= ~SCF_DO_SUBSTR;
4342         }
4343 #endif /* old or new */
4344 #endif /* TRIE_STUDY_OPT */
4345
4346         /* Else: zero-length, ignore. */
4347         scan = regnext(scan);
4348     }
4349     if (frame) {
4350         last = frame->last;
4351         scan = frame->next;
4352         stopparen = frame->stop;
4353         frame = frame->prev;
4354         goto fake_study_recurse;
4355     }
4356
4357   finish:
4358     assert(!frame);
4359     DEBUG_STUDYDATA("pre-fin:",data,depth);
4360
4361     *scanp = scan;
4362     *deltap = is_inf_internal ? I32_MAX : delta;
4363     if (flags & SCF_DO_SUBSTR && is_inf)
4364         data->pos_delta = I32_MAX - data->pos_min;
4365     if (is_par > (I32)U8_MAX)
4366         is_par = 0;
4367     if (is_par && pars==1 && data) {
4368         data->flags |= SF_IN_PAR;
4369         data->flags &= ~SF_HAS_PAR;
4370     }
4371     else if (pars && data) {
4372         data->flags |= SF_HAS_PAR;
4373         data->flags &= ~SF_IN_PAR;
4374     }
4375     if (flags & SCF_DO_STCLASS_OR)
4376         cl_and(data->start_class, and_withp);
4377     if (flags & SCF_TRIE_RESTUDY)
4378         data->flags |=  SCF_TRIE_RESTUDY;
4379
4380     DEBUG_STUDYDATA("post-fin:",data,depth);
4381
4382     return min < stopmin ? min : stopmin;
4383 }
4384
4385 STATIC U32
4386 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4387 {
4388     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4389
4390     PERL_ARGS_ASSERT_ADD_DATA;
4391
4392     Renewc(RExC_rxi->data,
4393            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4394            char, struct reg_data);
4395     if(count)
4396         Renew(RExC_rxi->data->what, count + n, U8);
4397     else
4398         Newx(RExC_rxi->data->what, n, U8);
4399     RExC_rxi->data->count = count + n;
4400     Copy(s, RExC_rxi->data->what + count, n, U8);
4401     return count;
4402 }
4403
4404 /*XXX: todo make this not included in a non debugging perl */
4405 #ifndef PERL_IN_XSUB_RE
4406 void
4407 Perl_reginitcolors(pTHX)
4408 {
4409     dVAR;
4410     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4411     if (s) {
4412         char *t = savepv(s);
4413         int i = 0;
4414         PL_colors[0] = t;
4415         while (++i < 6) {
4416             t = strchr(t, '\t');
4417             if (t) {
4418                 *t = '\0';
4419                 PL_colors[i] = ++t;
4420             }
4421             else
4422                 PL_colors[i] = t = (char *)"";
4423         }
4424     } else {
4425         int i = 0;
4426         while (i < 6)
4427             PL_colors[i++] = (char *)"";
4428     }
4429     PL_colorset = 1;
4430 }
4431 #endif
4432
4433
4434 #ifdef TRIE_STUDY_OPT
4435 #define CHECK_RESTUDY_GOTO                                  \
4436         if (                                                \
4437               (data.flags & SCF_TRIE_RESTUDY)               \
4438               && ! restudied++                              \
4439         )     goto reStudy
4440 #else
4441 #define CHECK_RESTUDY_GOTO
4442 #endif
4443
4444 /*
4445  - pregcomp - compile a regular expression into internal code
4446  *
4447  * We can't allocate space until we know how big the compiled form will be,
4448  * but we can't compile it (and thus know how big it is) until we've got a
4449  * place to put the code.  So we cheat:  we compile it twice, once with code
4450  * generation turned off and size counting turned on, and once "for real".
4451  * This also means that we don't allocate space until we are sure that the
4452  * thing really will compile successfully, and we never have to move the
4453  * code and thus invalidate pointers into it.  (Note that it has to be in
4454  * one piece because free() must be able to free it all.) [NB: not true in perl]
4455  *
4456  * Beware that the optimization-preparation code in here knows about some
4457  * of the structure of the compiled regexp.  [I'll say.]
4458  */
4459
4460
4461
4462 #ifndef PERL_IN_XSUB_RE
4463 #define RE_ENGINE_PTR &PL_core_reg_engine
4464 #else
4465 extern const struct regexp_engine my_reg_engine;
4466 #define RE_ENGINE_PTR &my_reg_engine
4467 #endif
4468
4469 #ifndef PERL_IN_XSUB_RE
4470 REGEXP *
4471 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4472 {
4473     dVAR;
4474     HV * const table = GvHV(PL_hintgv);
4475
4476     PERL_ARGS_ASSERT_PREGCOMP;
4477
4478     /* Dispatch a request to compile a regexp to correct
4479        regexp engine. */
4480     if (table) {
4481         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4482         GET_RE_DEBUG_FLAGS_DECL;
4483         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4484             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4485             DEBUG_COMPILE_r({
4486                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4487                     SvIV(*ptr));
4488             });
4489             return CALLREGCOMP_ENG(eng, pattern, flags);
4490         }
4491     }
4492     return Perl_re_compile(aTHX_ pattern, flags);
4493 }
4494 #endif
4495
4496 REGEXP *
4497 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4498 {
4499     dVAR;
4500     REGEXP *rx;
4501     struct regexp *r;
4502     register regexp_internal *ri;
4503     STRLEN plen;
4504     char  *exp;
4505     char* xend;
4506     regnode *scan;
4507     I32 flags;
4508     I32 minlen = 0;
4509     U32 pm_flags;
4510
4511     /* these are all flags - maybe they should be turned
4512      * into a single int with different bit masks */
4513     I32 sawlookahead = 0;
4514     I32 sawplus = 0;
4515     I32 sawopen = 0;
4516     bool used_setjump = FALSE;
4517     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4518
4519     U8 jump_ret = 0;
4520     dJMPENV;
4521     scan_data_t data;
4522     RExC_state_t RExC_state;
4523     RExC_state_t * const pRExC_state = &RExC_state;
4524 #ifdef TRIE_STUDY_OPT
4525     int restudied;
4526     RExC_state_t copyRExC_state;
4527 #endif
4528     GET_RE_DEBUG_FLAGS_DECL;
4529
4530     PERL_ARGS_ASSERT_RE_COMPILE;
4531
4532     DEBUG_r(if (!PL_colorset) reginitcolors());
4533
4534     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4535     RExC_uni_semantics = 0;
4536     RExC_contains_locale = 0;
4537
4538     /****************** LONG JUMP TARGET HERE***********************/
4539     /* Longjmp back to here if have to switch in midstream to utf8 */
4540     if (! RExC_orig_utf8) {
4541         JMPENV_PUSH(jump_ret);
4542         used_setjump = TRUE;
4543     }
4544
4545     if (jump_ret == 0) {    /* First time through */
4546         exp = SvPV(pattern, plen);
4547         xend = exp + plen;
4548         /* ignore the utf8ness if the pattern is 0 length */
4549         if (plen == 0) {
4550             RExC_utf8 = RExC_orig_utf8 = 0;
4551         }
4552
4553         DEBUG_COMPILE_r({
4554             SV *dsv= sv_newmortal();
4555             RE_PV_QUOTED_DECL(s, RExC_utf8,
4556                 dsv, exp, plen, 60);
4557             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4558                            PL_colors[4],PL_colors[5],s);
4559         });
4560     }
4561     else {  /* longjumped back */
4562         STRLEN len = plen;
4563
4564         /* If the cause for the longjmp was other than changing to utf8, pop
4565          * our own setjmp, and longjmp to the correct handler */
4566         if (jump_ret != UTF8_LONGJMP) {
4567             JMPENV_POP;
4568             JMPENV_JUMP(jump_ret);
4569         }
4570
4571         GET_RE_DEBUG_FLAGS;
4572
4573         /* It's possible to write a regexp in ascii that represents Unicode
4574         codepoints outside of the byte range, such as via \x{100}. If we
4575         detect such a sequence we have to convert the entire pattern to utf8
4576         and then recompile, as our sizing calculation will have been based
4577         on 1 byte == 1 character, but we will need to use utf8 to encode
4578         at least some part of the pattern, and therefore must convert the whole
4579         thing.
4580         -- dmq */
4581         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4582             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4583         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4584         xend = exp + len;
4585         RExC_orig_utf8 = RExC_utf8 = 1;
4586         SAVEFREEPV(exp);
4587     }
4588
4589 #ifdef TRIE_STUDY_OPT
4590     restudied = 0;
4591 #endif
4592
4593     pm_flags = orig_pm_flags;
4594
4595     if (initial_charset == REGEX_LOCALE_CHARSET) {
4596         RExC_contains_locale = 1;
4597     }
4598     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4599
4600         /* Set to use unicode semantics if the pattern is in utf8 and has the
4601          * 'depends' charset specified, as it means unicode when utf8  */
4602         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4603     }
4604
4605     RExC_precomp = exp;
4606     RExC_flags = pm_flags;
4607     RExC_sawback = 0;
4608
4609     RExC_seen = 0;
4610     RExC_in_lookbehind = 0;
4611     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4612     RExC_seen_evals = 0;
4613     RExC_extralen = 0;
4614
4615     /* First pass: determine size, legality. */
4616     RExC_parse = exp;
4617     RExC_start = exp;
4618     RExC_end = xend;
4619     RExC_naughty = 0;
4620     RExC_npar = 1;
4621     RExC_nestroot = 0;
4622     RExC_size = 0L;
4623     RExC_emit = &PL_regdummy;
4624     RExC_whilem_seen = 0;
4625     RExC_open_parens = NULL;
4626     RExC_close_parens = NULL;
4627     RExC_opend = NULL;
4628     RExC_paren_names = NULL;
4629 #ifdef DEBUGGING
4630     RExC_paren_name_list = NULL;
4631 #endif
4632     RExC_recurse = NULL;
4633     RExC_recurse_count = 0;
4634
4635 #if 0 /* REGC() is (currently) a NOP at the first pass.
4636        * Clever compilers notice this and complain. --jhi */
4637     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4638 #endif
4639     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4640     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4641         RExC_precomp = NULL;
4642         return(NULL);
4643     }
4644
4645     /* Here, finished first pass.  Get rid of any added setjmp */
4646     if (used_setjump) {
4647         JMPENV_POP;
4648     }
4649
4650     DEBUG_PARSE_r({
4651         PerlIO_printf(Perl_debug_log,
4652             "Required size %"IVdf" nodes\n"
4653             "Starting second pass (creation)\n",
4654             (IV)RExC_size);
4655         RExC_lastnum=0;
4656         RExC_lastparse=NULL;
4657     });
4658
4659     /* The first pass could have found things that force Unicode semantics */
4660     if ((RExC_utf8 || RExC_uni_semantics)
4661          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4662     {
4663         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4664     }
4665
4666     /* Small enough for pointer-storage convention?
4667        If extralen==0, this means that we will not need long jumps. */
4668     if (RExC_size >= 0x10000L && RExC_extralen)
4669         RExC_size += RExC_extralen;
4670     else
4671         RExC_extralen = 0;
4672     if (RExC_whilem_seen > 15)
4673         RExC_whilem_seen = 15;
4674
4675     /* Allocate space and zero-initialize. Note, the two step process
4676        of zeroing when in debug mode, thus anything assigned has to
4677        happen after that */
4678     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4679     r = (struct regexp*)SvANY(rx);
4680     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4681          char, regexp_internal);
4682     if ( r == NULL || ri == NULL )
4683         FAIL("Regexp out of space");
4684 #ifdef DEBUGGING
4685     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4686     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4687 #else
4688     /* bulk initialize base fields with 0. */
4689     Zero(ri, sizeof(regexp_internal), char);
4690 #endif
4691
4692     /* non-zero initialization begins here */
4693     RXi_SET( r, ri );
4694     r->engine= RE_ENGINE_PTR;
4695     r->extflags = pm_flags;
4696     {
4697         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4698         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4699
4700         /* The caret is output if there are any defaults: if not all the STD
4701          * flags are set, or if no character set specifier is needed */
4702         bool has_default =
4703                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4704                     || ! has_charset);
4705         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4706         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4707                             >> RXf_PMf_STD_PMMOD_SHIFT);
4708         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4709         char *p;
4710         /* Allocate for the worst case, which is all the std flags are turned
4711          * on.  If more precision is desired, we could do a population count of
4712          * the flags set.  This could be done with a small lookup table, or by
4713          * shifting, masking and adding, or even, when available, assembly
4714          * language for a machine-language population count.
4715          * We never output a minus, as all those are defaults, so are
4716          * covered by the caret */
4717         const STRLEN wraplen = plen + has_p + has_runon
4718             + has_default       /* If needs a caret */
4719
4720                 /* If needs a character set specifier */
4721             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4722             + (sizeof(STD_PAT_MODS) - 1)
4723             + (sizeof("(?:)") - 1);
4724
4725         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4726         SvPOK_on(rx);
4727         SvFLAGS(rx) |= SvUTF8(pattern);
4728         *p++='('; *p++='?';
4729
4730         /* If a default, cover it using the caret */
4731         if (has_default) {
4732             *p++= DEFAULT_PAT_MOD;
4733         }
4734         if (has_charset) {
4735             STRLEN len;
4736             const char* const name = get_regex_charset_name(r->extflags, &len);
4737             Copy(name, p, len, char);
4738             p += len;
4739         }
4740         if (has_p)
4741             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4742         {
4743             char ch;
4744             while((ch = *fptr++)) {
4745                 if(reganch & 1)
4746                     *p++ = ch;
4747                 reganch >>= 1;
4748             }
4749         }
4750
4751         *p++ = ':';
4752         Copy(RExC_precomp, p, plen, char);
4753         assert ((RX_WRAPPED(rx) - p) < 16);
4754         r->pre_prefix = p - RX_WRAPPED(rx);
4755         p += plen;
4756         if (has_runon)
4757             *p++ = '\n';
4758         *p++ = ')';
4759         *p = 0;
4760         SvCUR_set(rx, p - SvPVX_const(rx));
4761     }
4762
4763     r->intflags = 0;
4764     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4765
4766     if (RExC_seen & REG_SEEN_RECURSE) {
4767         Newxz(RExC_open_parens, RExC_npar,regnode *);
4768         SAVEFREEPV(RExC_open_parens);
4769         Newxz(RExC_close_parens,RExC_npar,regnode *);
4770         SAVEFREEPV(RExC_close_parens);
4771     }
4772
4773     /* Useful during FAIL. */
4774 #ifdef RE_TRACK_PATTERN_OFFSETS
4775     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4776     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4777                           "%s %"UVuf" bytes for offset annotations.\n",
4778                           ri->u.offsets ? "Got" : "Couldn't get",
4779                           (UV)((2*RExC_size+1) * sizeof(U32))));
4780 #endif
4781     SetProgLen(ri,RExC_size);
4782     RExC_rx_sv = rx;
4783     RExC_rx = r;
4784     RExC_rxi = ri;
4785     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4786
4787     /* Second pass: emit code. */
4788     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4789     RExC_parse = exp;
4790     RExC_end = xend;
4791     RExC_naughty = 0;
4792     RExC_npar = 1;
4793     RExC_emit_start = ri->program;
4794     RExC_emit = ri->program;
4795     RExC_emit_bound = ri->program + RExC_size + 1;
4796
4797     /* Store the count of eval-groups for security checks: */
4798     RExC_rx->seen_evals = RExC_seen_evals;
4799     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4800     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4801         ReREFCNT_dec(rx);
4802         return(NULL);
4803     }
4804     /* XXXX To minimize changes to RE engine we always allocate
4805        3-units-long substrs field. */
4806     Newx(r->substrs, 1, struct reg_substr_data);
4807     if (RExC_recurse_count) {
4808         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4809         SAVEFREEPV(RExC_recurse);
4810     }
4811
4812 reStudy:
4813     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4814     Zero(r->substrs, 1, struct reg_substr_data);
4815
4816 #ifdef TRIE_STUDY_OPT
4817     if (!restudied) {
4818         StructCopy(&zero_scan_data, &data, scan_data_t);
4819         copyRExC_state = RExC_state;
4820     } else {
4821         U32 seen=RExC_seen;
4822         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4823
4824         RExC_state = copyRExC_state;
4825         if (seen & REG_TOP_LEVEL_BRANCHES)
4826             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4827         else
4828             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4829         if (data.last_found) {
4830             SvREFCNT_dec(data.longest_fixed);
4831             SvREFCNT_dec(data.longest_float);
4832             SvREFCNT_dec(data.last_found);
4833         }
4834         StructCopy(&zero_scan_data, &data, scan_data_t);
4835     }
4836 #else
4837     StructCopy(&zero_scan_data, &data, scan_data_t);
4838 #endif
4839
4840     /* Dig out information for optimizations. */
4841     r->extflags = RExC_flags; /* was pm_op */
4842     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4843
4844     if (UTF)
4845         SvUTF8_on(rx);  /* Unicode in it? */
4846     ri->regstclass = NULL;
4847     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4848         r->intflags |= PREGf_NAUGHTY;
4849     scan = ri->program + 1;             /* First BRANCH. */
4850
4851     /* testing for BRANCH here tells us whether there is "must appear"
4852        data in the pattern. If there is then we can use it for optimisations */
4853     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4854         I32 fake;
4855         STRLEN longest_float_length, longest_fixed_length;
4856         struct regnode_charclass_class ch_class; /* pointed to by data */
4857         int stclass_flag;
4858         I32 last_close = 0; /* pointed to by data */
4859         regnode *first= scan;
4860         regnode *first_next= regnext(first);
4861         /*
4862          * Skip introductions and multiplicators >= 1
4863          * so that we can extract the 'meat' of the pattern that must
4864          * match in the large if() sequence following.
4865          * NOTE that EXACT is NOT covered here, as it is normally
4866          * picked up by the optimiser separately.
4867          *
4868          * This is unfortunate as the optimiser isnt handling lookahead
4869          * properly currently.
4870          *
4871          */
4872         while ((OP(first) == OPEN && (sawopen = 1)) ||
4873                /* An OR of *one* alternative - should not happen now. */
4874             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4875             /* for now we can't handle lookbehind IFMATCH*/
4876             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4877             (OP(first) == PLUS) ||
4878             (OP(first) == MINMOD) ||
4879                /* An {n,m} with n>0 */
4880             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4881             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4882         {
4883                 /*
4884                  * the only op that could be a regnode is PLUS, all the rest
4885                  * will be regnode_1 or regnode_2.
4886                  *
4887                  */
4888                 if (OP(first) == PLUS)
4889                     sawplus = 1;
4890                 else
4891                     first += regarglen[OP(first)];
4892
4893                 first = NEXTOPER(first);
4894                 first_next= regnext(first);
4895         }
4896
4897         /* Starting-point info. */
4898       again:
4899         DEBUG_PEEP("first:",first,0);
4900         /* Ignore EXACT as we deal with it later. */
4901         if (PL_regkind[OP(first)] == EXACT) {
4902             if (OP(first) == EXACT)
4903                 NOOP;   /* Empty, get anchored substr later. */
4904             else
4905                 ri->regstclass = first;
4906         }
4907 #ifdef TRIE_STCLASS
4908         else if (PL_regkind[OP(first)] == TRIE &&
4909                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4910         {
4911             regnode *trie_op;
4912             /* this can happen only on restudy */
4913             if ( OP(first) == TRIE ) {
4914                 struct regnode_1 *trieop = (struct regnode_1 *)
4915                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4916                 StructCopy(first,trieop,struct regnode_1);
4917                 trie_op=(regnode *)trieop;
4918             } else {
4919                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4920                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4921                 StructCopy(first,trieop,struct regnode_charclass);
4922                 trie_op=(regnode *)trieop;
4923             }
4924             OP(trie_op)+=2;
4925             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4926             ri->regstclass = trie_op;
4927         }
4928 #endif
4929         else if (REGNODE_SIMPLE(OP(first)))
4930             ri->regstclass = first;
4931         else if (PL_regkind[OP(first)] == BOUND ||
4932                  PL_regkind[OP(first)] == NBOUND)
4933             ri->regstclass = first;
4934         else if (PL_regkind[OP(first)] == BOL) {
4935             r->extflags |= (OP(first) == MBOL
4936                            ? RXf_ANCH_MBOL
4937                            : (OP(first) == SBOL
4938                               ? RXf_ANCH_SBOL
4939                               : RXf_ANCH_BOL));
4940             first = NEXTOPER(first);
4941             goto again;
4942         }
4943         else if (OP(first) == GPOS) {
4944             r->extflags |= RXf_ANCH_GPOS;
4945             first = NEXTOPER(first);
4946             goto again;
4947         }
4948         else if ((!sawopen || !RExC_sawback) &&
4949             (OP(first) == STAR &&
4950             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4951             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4952         {
4953             /* turn .* into ^.* with an implied $*=1 */
4954             const int type =
4955                 (OP(NEXTOPER(first)) == REG_ANY)
4956                     ? RXf_ANCH_MBOL
4957                     : RXf_ANCH_SBOL;
4958             r->extflags |= type;
4959             r->intflags |= PREGf_IMPLICIT;
4960             first = NEXTOPER(first);
4961             goto again;
4962         }
4963         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4964             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4965             /* x+ must match at the 1st pos of run of x's */
4966             r->intflags |= PREGf_SKIP;
4967
4968         /* Scan is after the zeroth branch, first is atomic matcher. */
4969 #ifdef TRIE_STUDY_OPT
4970         DEBUG_PARSE_r(
4971             if (!restudied)
4972                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4973                               (IV)(first - scan + 1))
4974         );
4975 #else
4976         DEBUG_PARSE_r(
4977             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4978                 (IV)(first - scan + 1))
4979         );
4980 #endif
4981
4982
4983         /*
4984         * If there's something expensive in the r.e., find the
4985         * longest literal string that must appear and make it the
4986         * regmust.  Resolve ties in favor of later strings, since
4987         * the regstart check works with the beginning of the r.e.
4988         * and avoiding duplication strengthens checking.  Not a
4989         * strong reason, but sufficient in the absence of others.
4990         * [Now we resolve ties in favor of the earlier string if
4991         * it happens that c_offset_min has been invalidated, since the
4992         * earlier string may buy us something the later one won't.]
4993         */
4994
4995         data.longest_fixed = newSVpvs("");
4996         data.longest_float = newSVpvs("");
4997         data.last_found = newSVpvs("");
4998         data.longest = &(data.longest_fixed);
4999         first = scan;
5000         if (!ri->regstclass) {
5001             cl_init(pRExC_state, &ch_class);
5002             data.start_class = &ch_class;
5003             stclass_flag = SCF_DO_STCLASS_AND;
5004         } else                          /* XXXX Check for BOUND? */
5005             stclass_flag = 0;
5006         data.last_closep = &last_close;
5007
5008         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5009             &data, -1, NULL, NULL,
5010             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5011
5012
5013         CHECK_RESTUDY_GOTO;
5014
5015
5016         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5017              && data.last_start_min == 0 && data.last_end > 0
5018              && !RExC_seen_zerolen
5019              && !(RExC_seen & REG_SEEN_VERBARG)
5020              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5021             r->extflags |= RXf_CHECK_ALL;
5022         scan_commit(pRExC_state, &data,&minlen,0);
5023         SvREFCNT_dec(data.last_found);
5024
5025         /* Note that code very similar to this but for anchored string
5026            follows immediately below, changes may need to be made to both.
5027            Be careful.
5028          */
5029         longest_float_length = CHR_SVLEN(data.longest_float);
5030         if (longest_float_length
5031             || (data.flags & SF_FL_BEFORE_EOL
5032                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5033                     || (RExC_flags & RXf_PMf_MULTILINE))))
5034         {
5035             I32 t,ml;
5036
5037             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5038                 && data.offset_fixed == data.offset_float_min
5039                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
5040                     goto remove_float;          /* As in (a)+. */
5041
5042             /* copy the information about the longest float from the reg_scan_data
5043                over to the program. */
5044             if (SvUTF8(data.longest_float)) {
5045                 r->float_utf8 = data.longest_float;
5046                 r->float_substr = NULL;
5047             } else {
5048                 r->float_substr = data.longest_float;
5049                 r->float_utf8 = NULL;
5050             }
5051             /* float_end_shift is how many chars that must be matched that
5052                follow this item. We calculate it ahead of time as once the
5053                lookbehind offset is added in we lose the ability to correctly
5054                calculate it.*/
5055             ml = data.minlen_float ? *(data.minlen_float)
5056                                    : (I32)longest_float_length;
5057             r->float_end_shift = ml - data.offset_float_min
5058                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5059                 + data.lookbehind_float;
5060             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5061             r->float_max_offset = data.offset_float_max;
5062             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5063                 r->float_max_offset -= data.lookbehind_float;
5064
5065             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5066                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5067                            || (RExC_flags & RXf_PMf_MULTILINE)));
5068             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5069         }
5070         else {
5071           remove_float:
5072             r->float_substr = r->float_utf8 = NULL;
5073             SvREFCNT_dec(data.longest_float);
5074             longest_float_length = 0;
5075         }
5076
5077         /* Note that code very similar to this but for floating string
5078            is immediately above, changes may need to be made to both.
5079            Be careful.
5080          */
5081         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5082         if (longest_fixed_length
5083             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5084                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5085                     || (RExC_flags & RXf_PMf_MULTILINE))))
5086         {
5087             I32 t,ml;
5088
5089             /* copy the information about the longest fixed
5090                from the reg_scan_data over to the program. */
5091             if (SvUTF8(data.longest_fixed)) {
5092                 r->anchored_utf8 = data.longest_fixed;
5093                 r->anchored_substr = NULL;
5094             } else {
5095                 r->anchored_substr = data.longest_fixed;
5096                 r->anchored_utf8 = NULL;
5097             }
5098             /* fixed_end_shift is how many chars that must be matched that
5099                follow this item. We calculate it ahead of time as once the
5100                lookbehind offset is added in we lose the ability to correctly
5101                calculate it.*/
5102             ml = data.minlen_fixed ? *(data.minlen_fixed)
5103                                    : (I32)longest_fixed_length;
5104             r->anchored_end_shift = ml - data.offset_fixed
5105                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5106                 + data.lookbehind_fixed;
5107             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5108
5109             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5110                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5111                      || (RExC_flags & RXf_PMf_MULTILINE)));
5112             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5113         }
5114         else {
5115             r->anchored_substr = r->anchored_utf8 = NULL;
5116             SvREFCNT_dec(data.longest_fixed);
5117             longest_fixed_length = 0;
5118         }
5119         if (ri->regstclass
5120             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5121             ri->regstclass = NULL;
5122
5123         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5124             && stclass_flag
5125             && !(data.start_class->flags & ANYOF_EOS)
5126             && !cl_is_anything(data.start_class))
5127         {
5128             const U32 n = add_data(pRExC_state, 1, "f");
5129             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5130
5131             Newx(RExC_rxi->data->data[n], 1,
5132                 struct regnode_charclass_class);
5133             StructCopy(data.start_class,
5134                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5135                        struct regnode_charclass_class);
5136             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5137             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5138             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5139                       regprop(r, sv, (regnode*)data.start_class);
5140                       PerlIO_printf(Perl_debug_log,
5141                                     "synthetic stclass \"%s\".\n",
5142                                     SvPVX_const(sv));});
5143         }
5144
5145         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5146         if (longest_fixed_length > longest_float_length) {
5147             r->check_end_shift = r->anchored_end_shift;
5148             r->check_substr = r->anchored_substr;
5149             r->check_utf8 = r->anchored_utf8;
5150             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5151             if (r->extflags & RXf_ANCH_SINGLE)
5152                 r->extflags |= RXf_NOSCAN;
5153         }
5154         else {
5155             r->check_end_shift = r->float_end_shift;
5156             r->check_substr = r->float_substr;
5157             r->check_utf8 = r->float_utf8;
5158             r->check_offset_min = r->float_min_offset;
5159             r->check_offset_max = r->float_max_offset;
5160         }
5161         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5162            This should be changed ASAP!  */
5163         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5164             r->extflags |= RXf_USE_INTUIT;
5165             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5166                 r->extflags |= RXf_INTUIT_TAIL;
5167         }
5168         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5169         if ( (STRLEN)minlen < longest_float_length )
5170             minlen= longest_float_length;
5171         if ( (STRLEN)minlen < longest_fixed_length )
5172             minlen= longest_fixed_length;
5173         */
5174     }
5175     else {
5176         /* Several toplevels. Best we can is to set minlen. */
5177         I32 fake;
5178         struct regnode_charclass_class ch_class;
5179         I32 last_close = 0;
5180
5181         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5182
5183         scan = ri->program + 1;
5184         cl_init(pRExC_state, &ch_class);
5185         data.start_class = &ch_class;
5186         data.last_closep = &last_close;
5187
5188
5189         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5190             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5191
5192         CHECK_RESTUDY_GOTO;
5193
5194         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5195                 = r->float_substr = r->float_utf8 = NULL;
5196
5197         if (!(data.start_class->flags & ANYOF_EOS)
5198             && !cl_is_anything(data.start_class))
5199         {
5200             const U32 n = add_data(pRExC_state, 1, "f");
5201             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5202
5203             Newx(RExC_rxi->data->data[n], 1,
5204                 struct regnode_charclass_class);
5205             StructCopy(data.start_class,
5206                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5207                        struct regnode_charclass_class);
5208             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5209             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5210             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5211                       regprop(r, sv, (regnode*)data.start_class);
5212                       PerlIO_printf(Perl_debug_log,
5213                                     "synthetic stclass \"%s\".\n",
5214                                     SvPVX_const(sv));});
5215         }
5216     }
5217
5218     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5219        the "real" pattern. */
5220     DEBUG_OPTIMISE_r({
5221         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5222                       (IV)minlen, (IV)r->minlen);
5223     });
5224     r->minlenret = minlen;
5225     if (r->minlen < minlen)
5226         r->minlen = minlen;
5227
5228     if (RExC_seen & REG_SEEN_GPOS)
5229         r->extflags |= RXf_GPOS_SEEN;
5230     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5231         r->extflags |= RXf_LOOKBEHIND_SEEN;
5232     if (RExC_seen & REG_SEEN_EVAL)
5233         r->extflags |= RXf_EVAL_SEEN;
5234     if (RExC_seen & REG_SEEN_CANY)
5235         r->extflags |= RXf_CANY_SEEN;
5236     if (RExC_seen & REG_SEEN_VERBARG)
5237         r->intflags |= PREGf_VERBARG_SEEN;
5238     if (RExC_seen & REG_SEEN_CUTGROUP)
5239         r->intflags |= PREGf_CUTGROUP_SEEN;
5240     if (RExC_paren_names)
5241         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5242     else
5243         RXp_PAREN_NAMES(r) = NULL;
5244
5245 #ifdef STUPID_PATTERN_CHECKS
5246     if (RX_PRELEN(rx) == 0)
5247         r->extflags |= RXf_NULL;
5248     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5249         /* XXX: this should happen BEFORE we compile */
5250         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5251     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5252         r->extflags |= RXf_WHITE;
5253     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5254         r->extflags |= RXf_START_ONLY;
5255 #else
5256     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5257             /* XXX: this should happen BEFORE we compile */
5258             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5259     else {
5260         regnode *first = ri->program + 1;
5261         U8 fop = OP(first);
5262
5263         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5264             r->extflags |= RXf_NULL;
5265         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5266             r->extflags |= RXf_START_ONLY;
5267         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5268                              && OP(regnext(first)) == END)
5269             r->extflags |= RXf_WHITE;
5270     }
5271 #endif
5272 #ifdef DEBUGGING
5273     if (RExC_paren_names) {
5274         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5275         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5276     } else
5277 #endif
5278         ri->name_list_idx = 0;
5279
5280     if (RExC_recurse_count) {
5281         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5282             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5283             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5284         }
5285     }
5286     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5287     /* assume we don't need to swap parens around before we match */
5288
5289     DEBUG_DUMP_r({
5290         PerlIO_printf(Perl_debug_log,"Final program:\n");
5291         regdump(r);
5292     });
5293 #ifdef RE_TRACK_PATTERN_OFFSETS
5294     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5295         const U32 len = ri->u.offsets[0];
5296         U32 i;
5297         GET_RE_DEBUG_FLAGS_DECL;
5298         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5299         for (i = 1; i <= len; i++) {
5300             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5301                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5302                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5303             }
5304         PerlIO_printf(Perl_debug_log, "\n");
5305     });
5306 #endif
5307     return rx;
5308 }
5309
5310 #undef RE_ENGINE_PTR
5311
5312
5313 SV*
5314 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5315                     const U32 flags)
5316 {
5317     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5318
5319     PERL_UNUSED_ARG(value);
5320
5321     if (flags & RXapif_FETCH) {
5322         return reg_named_buff_fetch(rx, key, flags);
5323     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5324         Perl_croak_no_modify(aTHX);
5325         return NULL;
5326     } else if (flags & RXapif_EXISTS) {
5327         return reg_named_buff_exists(rx, key, flags)
5328             ? &PL_sv_yes
5329             : &PL_sv_no;
5330     } else if (flags & RXapif_REGNAMES) {
5331         return reg_named_buff_all(rx, flags);
5332     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5333         return reg_named_buff_scalar(rx, flags);
5334     } else {
5335         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5336         return NULL;
5337     }
5338 }
5339
5340 SV*
5341 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5342                          const U32 flags)
5343 {
5344     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5345     PERL_UNUSED_ARG(lastkey);
5346
5347     if (flags & RXapif_FIRSTKEY)
5348         return reg_named_buff_firstkey(rx, flags);
5349     else if (flags & RXapif_NEXTKEY)
5350         return reg_named_buff_nextkey(rx, flags);
5351     else {
5352         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5353         return NULL;
5354     }
5355 }
5356
5357 SV*
5358 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5359                           const U32 flags)
5360 {
5361     AV *retarray = NULL;
5362     SV *ret;
5363     struct regexp *const rx = (struct regexp *)SvANY(r);
5364
5365     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5366
5367     if (flags & RXapif_ALL)
5368         retarray=newAV();
5369
5370     if (rx && RXp_PAREN_NAMES(rx)) {
5371         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5372         if (he_str) {
5373             IV i;
5374             SV* sv_dat=HeVAL(he_str);
5375             I32 *nums=(I32*)SvPVX(sv_dat);
5376             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5377                 if ((I32)(rx->nparens) >= nums[i]
5378                     && rx->offs[nums[i]].start != -1
5379                     && rx->offs[nums[i]].end != -1)
5380                 {
5381                     ret = newSVpvs("");
5382                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5383                     if (!retarray)
5384                         return ret;
5385                 } else {
5386                     ret = newSVsv(&PL_sv_undef);
5387                 }
5388                 if (retarray)
5389                     av_push(retarray, ret);
5390             }
5391             if (retarray)
5392                 return newRV_noinc(MUTABLE_SV(retarray));
5393         }
5394     }
5395     return NULL;
5396 }
5397
5398 bool
5399 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5400                            const U32 flags)
5401 {
5402     struct regexp *const rx = (struct regexp *)SvANY(r);
5403
5404     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5405
5406     if (rx && RXp_PAREN_NAMES(rx)) {
5407         if (flags & RXapif_ALL) {
5408             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5409         } else {
5410             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5411             if (sv) {
5412                 SvREFCNT_dec(sv);
5413                 return TRUE;
5414             } else {
5415                 return FALSE;
5416             }
5417         }
5418     } else {
5419         return FALSE;
5420     }
5421 }
5422
5423 SV*
5424 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5425 {
5426     struct regexp *const rx = (struct regexp *)SvANY(r);
5427
5428     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5429
5430     if ( rx && RXp_PAREN_NAMES(rx) ) {
5431         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5432
5433         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5434     } else {
5435         return FALSE;
5436     }
5437 }
5438
5439 SV*
5440 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5441 {
5442     struct regexp *const rx = (struct regexp *)SvANY(r);
5443     GET_RE_DEBUG_FLAGS_DECL;
5444
5445     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5446
5447     if (rx && RXp_PAREN_NAMES(rx)) {
5448         HV *hv = RXp_PAREN_NAMES(rx);
5449         HE *temphe;
5450         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5451             IV i;
5452             IV parno = 0;
5453             SV* sv_dat = HeVAL(temphe);
5454             I32 *nums = (I32*)SvPVX(sv_dat);
5455             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5456                 if ((I32)(rx->lastparen) >= nums[i] &&
5457                     rx->offs[nums[i]].start != -1 &&
5458                     rx->offs[nums[i]].end != -1)
5459                 {
5460                     parno = nums[i];
5461                     break;
5462                 }
5463             }
5464             if (parno || flags & RXapif_ALL) {
5465                 return newSVhek(HeKEY_hek(temphe));
5466             }
5467         }
5468     }
5469     return NULL;
5470 }
5471
5472 SV*
5473 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5474 {
5475     SV *ret;
5476     AV *av;
5477     I32 length;
5478     struct regexp *const rx = (struct regexp *)SvANY(r);
5479
5480     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5481
5482     if (rx && RXp_PAREN_NAMES(rx)) {
5483         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5484             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5485         } else if (flags & RXapif_ONE) {
5486             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5487             av = MUTABLE_AV(SvRV(ret));
5488             length = av_len(av);
5489             SvREFCNT_dec(ret);
5490             return newSViv(length + 1);
5491         } else {
5492             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5493             return NULL;
5494         }
5495     }
5496     return &PL_sv_undef;
5497 }
5498
5499 SV*
5500 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5501 {
5502     struct regexp *const rx = (struct regexp *)SvANY(r);
5503     AV *av = newAV();
5504
5505     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5506
5507     if (rx && RXp_PAREN_NAMES(rx)) {
5508         HV *hv= RXp_PAREN_NAMES(rx);
5509         HE *temphe;
5510         (void)hv_iterinit(hv);
5511         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5512             IV i;
5513             IV parno = 0;
5514             SV* sv_dat = HeVAL(temphe);
5515             I32 *nums = (I32*)SvPVX(sv_dat);
5516             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5517                 if ((I32)(rx->lastparen) >= nums[i] &&
5518                     rx->offs[nums[i]].start != -1 &&
5519                     rx->offs[nums[i]].end != -1)
5520                 {
5521                     parno = nums[i];
5522                     break;
5523                 }
5524             }
5525             if (parno || flags & RXapif_ALL) {
5526                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5527             }
5528         }
5529     }
5530
5531     return newRV_noinc(MUTABLE_SV(av));
5532 }
5533
5534 void
5535 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5536                              SV * const sv)
5537 {
5538     struct regexp *const rx = (struct regexp *)SvANY(r);
5539     char *s = NULL;
5540     I32 i = 0;
5541     I32 s1, t1;
5542
5543     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5544
5545     if (!rx->subbeg) {
5546         sv_setsv(sv,&PL_sv_undef);
5547         return;
5548     }
5549     else
5550     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5551         /* $` */
5552         i = rx->offs[0].start;
5553         s = rx->subbeg;
5554     }
5555     else
5556     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5557         /* $' */
5558         s = rx->subbeg + rx->offs[0].end;
5559         i = rx->sublen - rx->offs[0].end;
5560     }
5561     else
5562     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5563         (s1 = rx->offs[paren].start) != -1 &&
5564         (t1 = rx->offs[paren].end) != -1)
5565     {
5566         /* $& $1 ... */
5567         i = t1 - s1;
5568         s = rx->subbeg + s1;
5569     } else {
5570         sv_setsv(sv,&PL_sv_undef);
5571         return;
5572     }
5573     assert(rx->sublen >= (s - rx->subbeg) + i );
5574     if (i >= 0) {
5575         const int oldtainted = PL_tainted;
5576         TAINT_NOT;
5577         sv_setpvn(sv, s, i);
5578         PL_tainted = oldtainted;
5579         if ( (rx->extflags & RXf_CANY_SEEN)
5580             ? (RXp_MATCH_UTF8(rx)
5581                         && (!i || is_utf8_string((U8*)s, i)))
5582             : (RXp_MATCH_UTF8(rx)) )
5583         {
5584             SvUTF8_on(sv);
5585         }
5586         else
5587             SvUTF8_off(sv);
5588         if (PL_tainting) {
5589             if (RXp_MATCH_TAINTED(rx)) {
5590                 if (SvTYPE(sv) >= SVt_PVMG) {
5591                     MAGIC* const mg = SvMAGIC(sv);
5592                     MAGIC* mgt;
5593                     PL_tainted = 1;
5594                     SvMAGIC_set(sv, mg->mg_moremagic);
5595                     SvTAINT(sv);
5596                     if ((mgt = SvMAGIC(sv))) {
5597                         mg->mg_moremagic = mgt;
5598                         SvMAGIC_set(sv, mg);
5599                     }
5600                 } else {
5601                     PL_tainted = 1;
5602                     SvTAINT(sv);
5603                 }
5604             } else
5605                 SvTAINTED_off(sv);
5606         }
5607     } else {
5608         sv_setsv(sv,&PL_sv_undef);
5609         return;
5610     }
5611 }
5612
5613 void
5614 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5615                                                          SV const * const value)
5616 {
5617     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5618
5619     PERL_UNUSED_ARG(rx);
5620     PERL_UNUSED_ARG(paren);
5621     PERL_UNUSED_ARG(value);
5622
5623     if (!PL_localizing)
5624         Perl_croak_no_modify(aTHX);
5625 }
5626
5627 I32
5628 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5629                               const I32 paren)
5630 {
5631     struct regexp *const rx = (struct regexp *)SvANY(r);
5632     I32 i;
5633     I32 s1, t1;
5634
5635     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5636
5637     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5638         switch (paren) {
5639       /* $` / ${^PREMATCH} */
5640       case RX_BUFF_IDX_PREMATCH:
5641         if (rx->offs[0].start != -1) {
5642                         i = rx->offs[0].start;
5643                         if (i > 0) {
5644                                 s1 = 0;
5645                                 t1 = i;
5646                                 goto getlen;
5647                         }
5648             }
5649         return 0;
5650       /* $' / ${^POSTMATCH} */
5651       case RX_BUFF_IDX_POSTMATCH:
5652             if (rx->offs[0].end != -1) {
5653                         i = rx->sublen - rx->offs[0].end;
5654                         if (i > 0) {
5655                                 s1 = rx->offs[0].end;
5656                                 t1 = rx->sublen;
5657                                 goto getlen;
5658                         }
5659             }
5660         return 0;
5661       /* $& / ${^MATCH}, $1, $2, ... */
5662       default:
5663             if (paren <= (I32)rx->nparens &&
5664             (s1 = rx->offs[paren].start) != -1 &&
5665             (t1 = rx->offs[paren].end) != -1)
5666             {
5667             i = t1 - s1;
5668             goto getlen;
5669         } else {
5670             if (ckWARN(WARN_UNINITIALIZED))
5671                 report_uninit((const SV *)sv);
5672             return 0;
5673         }
5674     }
5675   getlen:
5676     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5677         const char * const s = rx->subbeg + s1;
5678         const U8 *ep;
5679         STRLEN el;
5680
5681         i = t1 - s1;
5682         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5683                         i = el;
5684     }
5685     return i;
5686 }
5687
5688 SV*
5689 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5690 {
5691     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5692         PERL_UNUSED_ARG(rx);
5693         if (0)
5694             return NULL;
5695         else
5696             return newSVpvs("Regexp");
5697 }
5698
5699 /* Scans the name of a named buffer from the pattern.
5700  * If flags is REG_RSN_RETURN_NULL returns null.
5701  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5702  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5703  * to the parsed name as looked up in the RExC_paren_names hash.
5704  * If there is an error throws a vFAIL().. type exception.
5705  */
5706
5707 #define REG_RSN_RETURN_NULL    0
5708 #define REG_RSN_RETURN_NAME    1
5709 #define REG_RSN_RETURN_DATA    2
5710
5711 STATIC SV*
5712 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5713 {
5714     char *name_start = RExC_parse;
5715
5716     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5717
5718     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5719          /* skip IDFIRST by using do...while */
5720         if (UTF)
5721             do {
5722                 RExC_parse += UTF8SKIP(RExC_parse);
5723             } while (isALNUM_utf8((U8*)RExC_parse));
5724         else
5725             do {
5726                 RExC_parse++;
5727             } while (isALNUM(*RExC_parse));
5728     }
5729
5730     if ( flags ) {
5731         SV* sv_name
5732             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5733                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5734         if ( flags == REG_RSN_RETURN_NAME)
5735             return sv_name;
5736         else if (flags==REG_RSN_RETURN_DATA) {
5737             HE *he_str = NULL;
5738             SV *sv_dat = NULL;
5739             if ( ! sv_name )      /* should not happen*/
5740                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5741             if (RExC_paren_names)
5742                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5743             if ( he_str )
5744                 sv_dat = HeVAL(he_str);
5745             if ( ! sv_dat )
5746                 vFAIL("Reference to nonexistent named group");
5747             return sv_dat;
5748         }
5749         else {
5750             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5751         }
5752         /* NOT REACHED */
5753     }
5754     return NULL;
5755 }
5756
5757 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5758     int rem=(int)(RExC_end - RExC_parse);                       \
5759     int cut;                                                    \
5760     int num;                                                    \
5761     int iscut=0;                                                \
5762     if (rem>10) {                                               \
5763         rem=10;                                                 \
5764         iscut=1;                                                \
5765     }                                                           \
5766     cut=10-rem;                                                 \
5767     if (RExC_lastparse!=RExC_parse)                             \
5768         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5769             rem, RExC_parse,                                    \
5770             cut + 4,                                            \
5771             iscut ? "..." : "<"                                 \
5772         );                                                      \
5773     else                                                        \
5774         PerlIO_printf(Perl_debug_log,"%16s","");                \
5775                                                                 \
5776     if (SIZE_ONLY)                                              \
5777        num = RExC_size + 1;                                     \
5778     else                                                        \
5779        num=REG_NODE_NUM(RExC_emit);                             \
5780     if (RExC_lastnum!=num)                                      \
5781        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5782     else                                                        \
5783        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5784     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5785         (int)((depth*2)), "",                                   \
5786         (funcname)                                              \
5787     );                                                          \
5788     RExC_lastnum=num;                                           \
5789     RExC_lastparse=RExC_parse;                                  \
5790 })
5791
5792
5793
5794 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5795     DEBUG_PARSE_MSG((funcname));                            \
5796     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5797 })
5798 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5799     DEBUG_PARSE_MSG((funcname));                            \
5800     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5801 })
5802
5803 /* This section of code defines the inversion list object and its methods.  The
5804  * interfaces are highly subject to change, so as much as possible is static to
5805  * this file.  An inversion list is here implemented as a malloc'd C array with
5806  * some added info.  More will be coming when functionality is added later.
5807  *
5808  * Some of the methods should always be private to the implementation, and some
5809  * should eventually be made public */
5810
5811 #define INVLIST_INITIAL_LEN 10
5812 #define INVLIST_ARRAY_KEY "array"
5813 #define INVLIST_MAX_KEY "max"
5814 #define INVLIST_LEN_KEY "len"
5815
5816 PERL_STATIC_INLINE UV*
5817 S_invlist_array(pTHX_ HV* const invlist)
5818 {
5819     /* Returns the pointer to the inversion list's array.  Every time the
5820      * length changes, this needs to be called in case malloc or realloc moved
5821      * it */
5822
5823     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5824
5825     PERL_ARGS_ASSERT_INVLIST_ARRAY;
5826
5827     if (list_ptr == NULL) {
5828         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5829                                                             INVLIST_ARRAY_KEY);
5830     }
5831
5832     return INT2PTR(UV *, SvUV(*list_ptr));
5833 }
5834
5835 PERL_STATIC_INLINE void
5836 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5837 {
5838     PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5839
5840     /* Sets the array stored in the inversion list to the memory beginning with
5841      * the parameter */
5842
5843     if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5844         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5845                                                             INVLIST_ARRAY_KEY);
5846     }
5847 }
5848
5849 PERL_STATIC_INLINE UV
5850 S_invlist_len(pTHX_ HV* const invlist)
5851 {
5852     /* Returns the current number of elements in the inversion list's array */
5853
5854     SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5855
5856     PERL_ARGS_ASSERT_INVLIST_LEN;
5857
5858     if (len_ptr == NULL) {
5859         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5860                                                             INVLIST_LEN_KEY);
5861     }
5862
5863     return SvUV(*len_ptr);
5864 }
5865
5866 PERL_STATIC_INLINE UV
5867 S_invlist_max(pTHX_ HV* const invlist)
5868 {
5869     /* Returns the maximum number of elements storable in the inversion list's
5870      * array, without having to realloc() */
5871
5872     SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5873
5874     PERL_ARGS_ASSERT_INVLIST_MAX;
5875
5876     if (max_ptr == NULL) {
5877         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5878                                                             INVLIST_MAX_KEY);
5879     }
5880
5881     return SvUV(*max_ptr);
5882 }
5883
5884 PERL_STATIC_INLINE void
5885 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5886 {
5887     /* Sets the current number of elements stored in the inversion list */
5888
5889     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5890
5891     if (len != 0 && len > invlist_max(invlist)) {
5892         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5893     }
5894
5895     if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5896         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5897                                                             INVLIST_LEN_KEY);
5898     }
5899 }
5900
5901 PERL_STATIC_INLINE void
5902 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5903 {
5904
5905     /* Sets the maximum number of elements storable in the inversion list
5906      * without having to realloc() */
5907
5908     PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5909
5910     if (max < invlist_len(invlist)) {
5911         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5912     }
5913
5914     if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5915         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5916                                                             INVLIST_LEN_KEY);
5917     }
5918 }
5919
5920 #ifndef PERL_IN_XSUB_RE
5921 HV*
5922 Perl__new_invlist(pTHX_ IV initial_size)
5923 {
5924
5925     /* Return a pointer to a newly constructed inversion list, with enough
5926      * space to store 'initial_size' elements.  If that number is negative, a
5927      * system default is used instead */
5928
5929     HV* invlist = newHV();
5930     UV* list;
5931
5932     if (initial_size < 0) {
5933         initial_size = INVLIST_INITIAL_LEN;
5934     }
5935
5936     /* Allocate the initial space */
5937     Newx(list, initial_size, UV);
5938     invlist_set_array(invlist, list);
5939
5940     /* set_len has to come before set_max, as the latter inspects the len */
5941     invlist_set_len(invlist, 0);
5942     invlist_set_max(invlist, initial_size);
5943
5944     return invlist;
5945 }
5946 #endif
5947
5948 PERL_STATIC_INLINE void
5949 S_invlist_destroy(pTHX_ HV* const invlist)
5950 {
5951    /* Inversion list destructor */
5952
5953     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5954
5955     PERL_ARGS_ASSERT_INVLIST_DESTROY;
5956
5957     if (list_ptr != NULL) {
5958         UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5959         Safefree(list);
5960     }
5961 }
5962
5963 STATIC void
5964 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5965 {
5966     /* Change the maximum size of an inversion list (up or down) */
5967
5968     UV* orig_array;
5969     UV* array;
5970     const UV old_max = invlist_max(invlist);
5971
5972     PERL_ARGS_ASSERT_INVLIST_EXTEND;
5973
5974     if (old_max == new_max) {   /* If a no-op */
5975         return;
5976     }
5977
5978     array = orig_array = invlist_array(invlist);
5979     Renew(array, new_max, UV);
5980
5981     /* If the size change moved the list in memory, set the new one */
5982     if (array != orig_array) {
5983         invlist_set_array(invlist, array);
5984     }
5985
5986     invlist_set_max(invlist, new_max);
5987
5988 }
5989
5990 PERL_STATIC_INLINE void
5991 S_invlist_trim(pTHX_ HV* const invlist)
5992 {
5993     PERL_ARGS_ASSERT_INVLIST_TRIM;
5994
5995     /* Change the length of the inversion list to how many entries it currently
5996      * has */
5997
5998     invlist_extend(invlist, invlist_len(invlist));
5999 }
6000
6001 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6002  * etc */
6003
6004 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
6005
6006 #ifndef PERL_IN_XSUB_RE
6007 void
6008 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6009 {
6010    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6011     * the end of the inversion list.  The range must be above any existing
6012     * ones. */
6013
6014     UV* array = invlist_array(invlist);
6015     UV max = invlist_max(invlist);
6016     UV len = invlist_len(invlist);
6017
6018     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6019
6020     if (len > 0) {
6021
6022         /* Here, the existing list is non-empty. The current max entry in the
6023          * list is generally the first value not in the set, except when the
6024          * set extends to the end of permissible values, in which case it is
6025          * the first entry in that final set, and so this call is an attempt to
6026          * append out-of-order */
6027
6028         UV final_element = len - 1;
6029         if (array[final_element] > start
6030             || ELEMENT_IN_INVLIST_SET(final_element))
6031         {
6032             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
6033         }
6034
6035         /* Here, it is a legal append.  If the new range begins with the first
6036          * value not in the set, it is extending the set, so the new first
6037          * value not in the set is one greater than the newly extended range.
6038          * */
6039         if (array[final_element] == start) {
6040             if (end != UV_MAX) {
6041                 array[final_element] = end + 1;
6042             }
6043             else {
6044                 /* But if the end is the maximum representable on the machine,
6045                  * just let the range that this would extend have no end */
6046                 invlist_set_len(invlist, len - 1);
6047             }
6048             return;
6049         }
6050     }
6051
6052     /* Here the new range doesn't extend any existing set.  Add it */
6053
6054     len += 2;   /* Includes an element each for the start and end of range */
6055
6056     /* If overflows the existing space, extend, which may cause the array to be
6057      * moved */
6058     if (max < len) {
6059         invlist_extend(invlist, len);
6060         array = invlist_array(invlist);
6061     }
6062
6063     invlist_set_len(invlist, len);
6064
6065     /* The next item on the list starts the range, the one after that is
6066      * one past the new range.  */
6067     array[len - 2] = start;
6068     if (end != UV_MAX) {
6069         array[len - 1] = end + 1;
6070     }
6071     else {
6072         /* But if the end is the maximum representable on the machine, just let
6073          * the range have no end */
6074         invlist_set_len(invlist, len - 1);
6075     }
6076 }
6077 #endif
6078
6079 STATIC HV*
6080 S_invlist_union(pTHX_ HV* const a, HV* const b)
6081 {
6082     /* Return a new inversion list which is the union of two inversion lists.
6083      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6084      * Richard Gillam, published by Addison-Wesley, and explained at some
6085      * length there.  The preface says to incorporate its examples into your
6086      * code at your own risk.
6087      *
6088      * The algorithm is like a merge sort.
6089      *
6090      * XXX A potential performance improvement is to keep track as we go along
6091      * if only one of the inputs contributes to the result, meaning the other
6092      * is a subset of that one.  In that case, we can skip the final copy and
6093      * return the larger of the input lists */
6094
6095     UV* array_a = invlist_array(a);   /* a's array */
6096     UV* array_b = invlist_array(b);
6097     UV len_a = invlist_len(a);  /* length of a's array */
6098     UV len_b = invlist_len(b);
6099
6100     HV* u;                      /* the resulting union */
6101     UV* array_u;
6102     UV len_u;
6103
6104     UV i_a = 0;             /* current index into a's array */
6105     UV i_b = 0;
6106     UV i_u = 0;
6107
6108     /* running count, as explained in the algorithm source book; items are
6109      * stopped accumulating and are output when the count changes to/from 0.
6110      * The count is incremented when we start a range that's in the set, and
6111      * decremented when we start a range that's not in the set.  So its range
6112      * is 0 to 2.  Only when the count is zero is something not in the set.
6113      */
6114     UV count = 0;
6115
6116     PERL_ARGS_ASSERT_INVLIST_UNION;
6117
6118     /* Size the union for the worst case: that the sets are completely
6119      * disjoint */
6120     u = _new_invlist(len_a + len_b);
6121     array_u = invlist_array(u);
6122
6123     /* Go through each list item by item, stopping when exhausted one of
6124      * them */
6125     while (i_a < len_a && i_b < len_b) {
6126         UV cp;      /* The element to potentially add to the union's array */
6127         bool cp_in_set;   /* is it in the the input list's set or not */
6128
6129         /* We need to take one or the other of the two inputs for the union.
6130          * Since we are merging two sorted lists, we take the smaller of the
6131          * next items.  In case of a tie, we take the one that is in its set
6132          * first.  If we took one not in the set first, it would decrement the
6133          * count, possibly to 0 which would cause it to be output as ending the
6134          * range, and the next time through we would take the same number, and
6135          * output it again as beginning the next range.  By doing it the
6136          * opposite way, there is no possibility that the count will be
6137          * momentarily decremented to 0, and thus the two adjoining ranges will
6138          * be seamlessly merged.  (In a tie and both are in the set or both not
6139          * in the set, it doesn't matter which we take first.) */
6140         if (array_a[i_a] < array_b[i_b]
6141             || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6142         {
6143             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6144             cp= array_a[i_a++];
6145         }
6146         else {
6147             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6148             cp= array_b[i_b++];
6149         }
6150
6151         /* Here, have chosen which of the two inputs to look at.  Only output
6152          * if the running count changes to/from 0, which marks the
6153          * beginning/end of a range in that's in the set */
6154         if (cp_in_set) {
6155             if (count == 0) {
6156                 array_u[i_u++] = cp;
6157             }
6158             count++;
6159         }
6160         else {
6161             count--;
6162             if (count == 0) {
6163                 array_u[i_u++] = cp;
6164             }
6165         }
6166     }
6167
6168     /* Here, we are finished going through at least one of the lists, which
6169      * means there is something remaining in at most one.  We check if the list
6170      * that hasn't been exhausted is positioned such that we are in the middle
6171      * of a range in its set or not.  (We are in the set if the next item in
6172      * the array marks the beginning of something not in the set)   If in the
6173      * set, we decrement 'count'; if 0, there is potentially more to output.
6174      * There are four cases:
6175      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6176      *     in the union is entirely from the non-exhausted set.
6177      *  2) Both were in their sets, count is 2.  Nothing further should
6178      *     be output, as everything that remains will be in the exhausted
6179      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6180      *     that
6181      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6182      *     Nothing further should be output because the union includes
6183      *     everything from the exhausted set.  Not decrementing insures that.
6184      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6185      *     decrementing to 0 insures that we look at the remainder of the
6186      *     non-exhausted set */
6187     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6188         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6189     {
6190         count--;
6191     }
6192
6193     /* The final length is what we've output so far, plus what else is about to
6194      * be output.  (If 'count' is non-zero, then the input list we exhausted
6195      * has everything remaining up to the machine's limit in its set, and hence
6196      * in the union, so there will be no further output. */
6197     len_u = i_u;
6198     if (count == 0) {
6199         /* At most one of the subexpressions will be non-zero */
6200         len_u += (len_a - i_a) + (len_b - i_b);
6201     }
6202
6203     /* Set result to final length, which can change the pointer to array_u, so
6204      * re-find it */
6205     if (len_u != invlist_len(u)) {
6206         invlist_set_len(u, len_u);
6207         invlist_trim(u);
6208         array_u = invlist_array(u);
6209     }
6210
6211     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6212      * the other) ended with everything above it not in its set.  That means
6213      * that the remaining part of the union is precisely the same as the
6214      * non-exhausted list, so can just copy it unchanged.  (If both list were
6215      * exhausted at the same time, then the operations below will be both 0.)
6216      */
6217     if (count == 0) {
6218         IV copy_count; /* At most one will have a non-zero copy count */
6219         if ((copy_count = len_a - i_a) > 0) {
6220             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6221         }
6222         else if ((copy_count = len_b - i_b) > 0) {
6223             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6224         }
6225     }
6226
6227     return u;
6228 }
6229
6230 STATIC HV*
6231 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6232 {
6233     /* Return the intersection of two inversion lists.  The basis for this
6234      * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6235      * by Addison-Wesley, and explained at some length there.  The preface says
6236      * to incorporate its examples into your code at your own risk.
6237      *
6238      * The algorithm is like a merge sort, and is essentially the same as the
6239      * union above
6240      */
6241
6242     UV* array_a = invlist_array(a);   /* a's array */
6243     UV* array_b = invlist_array(b);
6244     UV len_a = invlist_len(a);  /* length of a's array */
6245     UV len_b = invlist_len(b);
6246
6247     HV* r;                   /* the resulting intersection */
6248     UV* array_r;
6249     UV len_r;
6250
6251     UV i_a = 0;             /* current index into a's array */
6252     UV i_b = 0;
6253     UV i_r = 0;
6254
6255     /* running count, as explained in the algorithm source book; items are
6256      * stopped accumulating and are output when the count changes to/from 2.
6257      * The count is incremented when we start a range that's in the set, and
6258      * decremented when we start a range that's not in the set.  So its range
6259      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6260      */
6261     UV count = 0;
6262
6263     PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6264
6265     /* Size the intersection for the worst case: that the intersection ends up
6266      * fragmenting everything to be completely disjoint */
6267     r= _new_invlist(len_a + len_b);
6268     array_r = invlist_array(r);
6269
6270     /* Go through each list item by item, stopping when exhausted one of
6271      * them */
6272     while (i_a < len_a && i_b < len_b) {
6273         UV cp;      /* The element to potentially add to the intersection's
6274                        array */
6275         bool cp_in_set; /* Is it in the input list's set or not */
6276
6277         /* We need to take one or the other of the two inputs for the union.
6278          * Since we are merging two sorted lists, we take the smaller of the
6279          * next items.  In case of a tie, we take the one that is not in its
6280          * set first (a difference from the union algorithm).  If we took one
6281          * in the set first, it would increment the count, possibly to 2 which
6282          * would cause it to be output as starting a range in the intersection,
6283          * and the next time through we would take that same number, and output
6284          * it again as ending the set.  By doing it the opposite of this, we
6285          * there is no possibility that the count will be momentarily
6286          * incremented to 2.  (In a tie and both are in the set or both not in
6287          * the set, it doesn't matter which we take first.) */
6288         if (array_a[i_a] < array_b[i_b]
6289             || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6290         {
6291             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6292             cp= array_a[i_a++];
6293         }
6294         else {
6295             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6296             cp= array_b[i_b++];
6297         }
6298
6299         /* Here, have chosen which of the two inputs to look at.  Only output
6300          * if the running count changes to/from 2, which marks the
6301          * beginning/end of a range that's in the intersection */
6302         if (cp_in_set) {
6303             count++;
6304             if (count == 2) {
6305                 array_r[i_r++] = cp;
6306             }
6307         }
6308         else {
6309             if (count == 2) {
6310                 array_r[i_r++] = cp;
6311             }
6312             count--;
6313         }
6314     }
6315
6316     /* Here, we are finished going through at least one of the sets, which
6317      * means there is something remaining in at most one.  See the comments in
6318      * the union code */
6319     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6320         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6321     {
6322         count--;
6323     }
6324
6325     /* The final length is what we've output so far plus what else is in the
6326      * intersection.  Only one of the subexpressions below will be non-zero */
6327     len_r = i_r;
6328     if (count == 2) {
6329         len_r += (len_a - i_a) + (len_b - i_b);
6330     }
6331
6332     /* Set result to final length, which can change the pointer to array_r, so
6333      * re-find it */
6334     if (len_r != invlist_len(r)) {
6335         invlist_set_len(r, len_r);
6336         invlist_trim(r);
6337         array_r = invlist_array(r);
6338     }
6339
6340     /* Finish outputting any remaining */
6341     if (count == 2) { /* Only one of will have a non-zero copy count */
6342         IV copy_count;
6343         if ((copy_count = len_a - i_a) > 0) {
6344             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6345         }
6346         else if ((copy_count = len_b - i_b) > 0) {
6347             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6348         }
6349     }
6350
6351     return r;
6352 }
6353
6354 STATIC HV*
6355 S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6356 {
6357     /* Add the range from 'start' to 'end' inclusive to the inversion list's
6358      * set.  A pointer to the inversion list is returned.  This may actually be
6359      * a new list, in which case the passed in one has been destroyed.  The
6360      * passed in inversion list can be NULL, in which case a new one is created
6361      * with just the one range in it */
6362
6363     HV* range_invlist;
6364     HV* added_invlist;
6365     UV len;
6366
6367     if (invlist == NULL) {
6368         invlist = _new_invlist(2);
6369         len = 0;
6370     }
6371     else {
6372         len = invlist_len(invlist);
6373     }
6374
6375     /* If comes after the final entry, can just append it to the end */
6376     if (len == 0
6377         || start >= invlist_array(invlist)
6378                                     [invlist_len(invlist) - 1])
6379     {
6380         _append_range_to_invlist(invlist, start, end);
6381         return invlist;
6382     }
6383
6384     /* Here, can't just append things, create and return a new inversion list
6385      * which is the union of this range and the existing inversion list */
6386     range_invlist = _new_invlist(2);
6387     _append_range_to_invlist(range_invlist, start, end);
6388
6389     added_invlist = invlist_union(invlist, range_invlist);
6390
6391     /* The passed in list can be freed, as well as our temporary */
6392     invlist_destroy(range_invlist);
6393     if (invlist != added_invlist) {
6394         invlist_destroy(invlist);
6395     }
6396
6397     return added_invlist;
6398 }
6399
6400 PERL_STATIC_INLINE HV*
6401 S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6402     return add_range_to_invlist(invlist, cp, cp);
6403 }
6404
6405 /* End of inversion list object */
6406
6407 /*
6408  - reg - regular expression, i.e. main body or parenthesized thing
6409  *
6410  * Caller must absorb opening parenthesis.
6411  *
6412  * Combining parenthesis handling with the base level of regular expression
6413  * is a trifle forced, but the need to tie the tails of the branches to what
6414  * follows makes it hard to avoid.
6415  */
6416 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6417 #ifdef DEBUGGING
6418 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6419 #else
6420 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6421 #endif
6422
6423 STATIC regnode *
6424 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6425     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6426 {
6427     dVAR;
6428     register regnode *ret;              /* Will be the head of the group. */
6429     register regnode *br;
6430     register regnode *lastbr;
6431     register regnode *ender = NULL;
6432     register I32 parno = 0;
6433     I32 flags;
6434     U32 oregflags = RExC_flags;
6435     bool have_branch = 0;
6436     bool is_open = 0;
6437     I32 freeze_paren = 0;
6438     I32 after_freeze = 0;
6439
6440     /* for (?g), (?gc), and (?o) warnings; warning
6441        about (?c) will warn about (?g) -- japhy    */
6442
6443 #define WASTED_O  0x01
6444 #define WASTED_G  0x02
6445 #define WASTED_C  0x04
6446 #define WASTED_GC (0x02|0x04)
6447     I32 wastedflags = 0x00;
6448
6449     char * parse_start = RExC_parse; /* MJD */
6450     char * const oregcomp_parse = RExC_parse;
6451
6452     GET_RE_DEBUG_FLAGS_DECL;
6453
6454     PERL_ARGS_ASSERT_REG;
6455     DEBUG_PARSE("reg ");
6456
6457     *flagp = 0;                         /* Tentatively. */
6458
6459
6460     /* Make an OPEN node, if parenthesized. */
6461     if (paren) {
6462         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6463             char *start_verb = RExC_parse;
6464             STRLEN verb_len = 0;
6465             char *start_arg = NULL;
6466             unsigned char op = 0;
6467             int argok = 1;
6468             int internal_argval = 0; /* internal_argval is only useful if !argok */
6469             while ( *RExC_parse && *RExC_parse != ')' ) {
6470                 if ( *RExC_parse == ':' ) {
6471                     start_arg = RExC_parse + 1;
6472                     break;
6473                 }
6474                 RExC_parse++;
6475             }
6476             ++start_verb;
6477             verb_len = RExC_parse - start_verb;
6478             if ( start_arg ) {
6479                 RExC_parse++;
6480                 while ( *RExC_parse && *RExC_parse != ')' )
6481                     RExC_parse++;
6482                 if ( *RExC_parse != ')' )
6483                     vFAIL("Unterminated verb pattern argument");
6484                 if ( RExC_parse == start_arg )
6485                     start_arg = NULL;
6486             } else {
6487                 if ( *RExC_parse != ')' )
6488                     vFAIL("Unterminated verb pattern");
6489             }
6490
6491             switch ( *start_verb ) {
6492             case 'A':  /* (*ACCEPT) */
6493                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6494                     op = ACCEPT;
6495                     internal_argval = RExC_nestroot;
6496                 }
6497                 break;
6498             case 'C':  /* (*COMMIT) */
6499                 if ( memEQs(start_verb,verb_len,"COMMIT") )
6500                     op = COMMIT;
6501                 break;
6502             case 'F':  /* (*FAIL) */
6503                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6504                     op = OPFAIL;
6505                     argok = 0;
6506                 }
6507                 break;
6508             case ':':  /* (*:NAME) */
6509             case 'M':  /* (*MARK:NAME) */
6510                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6511                     op = MARKPOINT;
6512                     argok = -1;
6513                 }
6514                 break;
6515             case 'P':  /* (*PRUNE) */
6516                 if ( memEQs(start_verb,verb_len,"PRUNE") )
6517                     op = PRUNE;
6518                 break;
6519             case 'S':   /* (*SKIP) */
6520                 if ( memEQs(start_verb,verb_len,"SKIP") )
6521                     op = SKIP;
6522                 break;
6523             case 'T':  /* (*THEN) */
6524                 /* [19:06] <TimToady> :: is then */
6525                 if ( memEQs(start_verb,verb_len,"THEN") ) {
6526                     op = CUTGROUP;
6527                     RExC_seen |= REG_SEEN_CUTGROUP;
6528                 }
6529                 break;
6530             }
6531             if ( ! op ) {
6532                 RExC_parse++;
6533                 vFAIL3("Unknown verb pattern '%.*s'",
6534                     verb_len, start_verb);
6535             }
6536             if ( argok ) {
6537                 if ( start_arg && internal_argval ) {
6538                     vFAIL3("Verb pattern '%.*s' may not have an argument",
6539                         verb_len, start_verb);
6540                 } else if ( argok < 0 && !start_arg ) {
6541                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6542                         verb_len, start_verb);
6543                 } else {
6544                     ret = reganode(pRExC_state, op, internal_argval);
6545                     if ( ! internal_argval && ! SIZE_ONLY ) {
6546                         if (start_arg) {
6547                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6548                             ARG(ret) = add_data( pRExC_state, 1, "S" );
6549                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6550                             ret->flags = 0;
6551                         } else {
6552                             ret->flags = 1;
6553                         }
6554                     }
6555                 }
6556                 if (!internal_argval)
6557                     RExC_seen |= REG_SEEN_VERBARG;
6558             } else if ( start_arg ) {
6559                 vFAIL3("Verb pattern '%.*s' may not have an argument",
6560                         verb_len, start_verb);
6561             } else {
6562                 ret = reg_node(pRExC_state, op);
6563             }
6564             nextchar(pRExC_state);
6565             return ret;
6566         } else
6567         if (*RExC_parse == '?') { /* (?...) */
6568             bool is_logical = 0;
6569             const char * const seqstart = RExC_parse;
6570             bool has_use_defaults = FALSE;
6571
6572             RExC_parse++;
6573             paren = *RExC_parse++;
6574             ret = NULL;                 /* For look-ahead/behind. */
6575             switch (paren) {
6576
6577             case 'P':   /* (?P...) variants for those used to PCRE/Python */
6578                 paren = *RExC_parse++;
6579                 if ( paren == '<')         /* (?P<...>) named capture */
6580                     goto named_capture;
6581                 else if (paren == '>') {   /* (?P>name) named recursion */
6582                     goto named_recursion;
6583                 }
6584                 else if (paren == '=') {   /* (?P=...)  named backref */
6585                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
6586                        you change this make sure you change that */
6587                     char* name_start = RExC_parse;
6588                     U32 num = 0;
6589                     SV *sv_dat = reg_scan_name(pRExC_state,
6590                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6591                     if (RExC_parse == name_start || *RExC_parse != ')')
6592                         vFAIL2("Sequence %.3s... not terminated",parse_start);
6593
6594                     if (!SIZE_ONLY) {
6595                         num = add_data( pRExC_state, 1, "S" );
6596                         RExC_rxi->data->data[num]=(void*)sv_dat;
6597                         SvREFCNT_inc_simple_void(sv_dat);
6598                     }
6599                     RExC_sawback = 1;
6600                     ret = reganode(pRExC_state,
6601                                    ((! FOLD)
6602                                      ? NREF
6603                                      : (MORE_ASCII_RESTRICTED)
6604                                        ? NREFFA
6605                                        : (AT_LEAST_UNI_SEMANTICS)
6606                                          ? NREFFU
6607                                          : (LOC)
6608                                            ? NREFFL
6609                                            : NREFF),
6610                                     num);
6611                     *flagp |= HASWIDTH;
6612
6613                     Set_Node_Offset(ret, parse_start+1);
6614                     Set_Node_Cur_Length(ret); /* MJD */
6615
6616                     nextchar(pRExC_state);
6617                     return ret;
6618                 }
6619                 RExC_parse++;
6620                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6621                 /*NOTREACHED*/
6622             case '<':           /* (?<...) */
6623                 if (*RExC_parse == '!')
6624                     paren = ',';
6625                 else if (*RExC_parse != '=')
6626               named_capture:
6627                 {               /* (?<...>) */
6628                     char *name_start;
6629                     SV *svname;
6630                     paren= '>';
6631             case '\'':          /* (?'...') */
6632                     name_start= RExC_parse;
6633                     svname = reg_scan_name(pRExC_state,
6634                         SIZE_ONLY ?  /* reverse test from the others */
6635                         REG_RSN_RETURN_NAME :
6636                         REG_RSN_RETURN_NULL);
6637                     if (RExC_parse == name_start) {
6638                         RExC_parse++;
6639                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6640                         /*NOTREACHED*/
6641                     }
6642                     if (*RExC_parse != paren)
6643                         vFAIL2("Sequence (?%c... not terminated",
6644                             paren=='>' ? '<' : paren);
6645                     if (SIZE_ONLY) {
6646                         HE *he_str;
6647                         SV *sv_dat = NULL;
6648                         if (!svname) /* shouldn't happen */
6649                             Perl_croak(aTHX_
6650                                 "panic: reg_scan_name returned NULL");
6651                         if (!RExC_paren_names) {
6652                             RExC_paren_names= newHV();
6653                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
6654 #ifdef DEBUGGING
6655                             RExC_paren_name_list= newAV();
6656                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6657 #endif
6658                         }
6659                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6660                         if ( he_str )
6661                             sv_dat = HeVAL(he_str);
6662                         if ( ! sv_dat ) {
6663                             /* croak baby croak */
6664                             Perl_croak(aTHX_
6665                                 "panic: paren_name hash element allocation failed");
6666                         } else if ( SvPOK(sv_dat) ) {
6667                             /* (?|...) can mean we have dupes so scan to check
6668                                its already been stored. Maybe a flag indicating
6669                                we are inside such a construct would be useful,
6670                                but the arrays are likely to be quite small, so
6671                                for now we punt -- dmq */
6672                             IV count = SvIV(sv_dat);
6673                             I32 *pv = (I32*)SvPVX(sv_dat);
6674                             IV i;
6675                             for ( i = 0 ; i < count ; i++ ) {
6676                                 if ( pv[i] == RExC_npar ) {
6677                                     count = 0;
6678                                     break;
6679                                 }
6680                             }
6681                             if ( count ) {
6682                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6683                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6684                                 pv[count] = RExC_npar;
6685                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6686                             }
6687                         } else {
6688                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
6689                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6690                             SvIOK_on(sv_dat);
6691                             SvIV_set(sv_dat, 1);
6692                         }
6693 #ifdef DEBUGGING
6694                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6695                             SvREFCNT_dec(svname);
6696 #endif
6697
6698                         /*sv_dump(sv_dat);*/
6699                     }
6700                     nextchar(pRExC_state);
6701                     paren = 1;
6702                     goto capturing_parens;
6703                 }
6704                 RExC_seen |= REG_SEEN_LOOKBEHIND;
6705                 RExC_in_lookbehind++;
6706                 RExC_parse++;
6707             case '=':           /* (?=...) */
6708                 RExC_seen_zerolen++;
6709                 break;
6710             case '!':           /* (?!...) */
6711                 RExC_seen_zerolen++;
6712                 if (*RExC_parse == ')') {
6713                     ret=reg_node(pRExC_state, OPFAIL);
6714                     nextchar(pRExC_state);
6715                     return ret;
6716                 }
6717                 break;
6718             case '|':           /* (?|...) */
6719                 /* branch reset, behave like a (?:...) except that
6720                    buffers in alternations share the same numbers */
6721                 paren = ':';
6722                 after_freeze = freeze_paren = RExC_npar;
6723                 break;
6724             case ':':           /* (?:...) */
6725             case '>':           /* (?>...) */
6726                 break;
6727             case '$':           /* (?$...) */
6728             case '@':           /* (?@...) */
6729                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6730                 break;
6731             case '#':           /* (?#...) */
6732                 while (*RExC_parse && *RExC_parse != ')')
6733                     RExC_parse++;
6734                 if (*RExC_parse != ')')
6735                     FAIL("Sequence (?#... not terminated");
6736                 nextchar(pRExC_state);
6737                 *flagp = TRYAGAIN;
6738                 return NULL;
6739             case '0' :           /* (?0) */
6740             case 'R' :           /* (?R) */
6741                 if (*RExC_parse != ')')
6742                     FAIL("Sequence (?R) not terminated");
6743                 ret = reg_node(pRExC_state, GOSTART);
6744                 *flagp |= POSTPONED;
6745                 nextchar(pRExC_state);
6746                 return ret;
6747                 /*notreached*/
6748             { /* named and numeric backreferences */
6749                 I32 num;
6750             case '&':            /* (?&NAME) */
6751                 parse_start = RExC_parse - 1;
6752               named_recursion:
6753                 {
6754                     SV *sv_dat = reg_scan_name(pRExC_state,
6755                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6756                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6757                 }
6758                 goto gen_recurse_regop;
6759                 /* NOT REACHED */
6760             case '+':
6761                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6762                     RExC_parse++;
6763                     vFAIL("Illegal pattern");
6764                 }
6765                 goto parse_recursion;
6766                 /* NOT REACHED*/
6767             case '-': /* (?-1) */
6768                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6769                     RExC_parse--; /* rewind to let it be handled later */
6770                     goto parse_flags;
6771                 }
6772                 /*FALLTHROUGH */
6773             case '1': case '2': case '3': case '4': /* (?1) */
6774             case '5': case '6': case '7': case '8': case '9':
6775                 RExC_parse--;
6776               parse_recursion:
6777                 num = atoi(RExC_parse);
6778                 parse_start = RExC_parse - 1; /* MJD */
6779                 if (*RExC_parse == '-')
6780                     RExC_parse++;
6781                 while (isDIGIT(*RExC_parse))
6782                         RExC_parse++;
6783                 if (*RExC_parse!=')')
6784                     vFAIL("Expecting close bracket");
6785
6786               gen_recurse_regop:
6787                 if ( paren == '-' ) {
6788                     /*
6789                     Diagram of capture buffer numbering.
6790                     Top line is the normal capture buffer numbers
6791                     Bottom line is the negative indexing as from
6792                     the X (the (?-2))
6793
6794                     +   1 2    3 4 5 X          6 7
6795                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6796                     -   5 4    3 2 1 X          x x
6797
6798                     */
6799                     num = RExC_npar + num;
6800                     if (num < 1)  {
6801                         RExC_parse++;
6802                         vFAIL("Reference to nonexistent group");
6803                     }
6804                 } else if ( paren == '+' ) {
6805                     num = RExC_npar + num - 1;
6806                 }
6807
6808                 ret = reganode(pRExC_state, GOSUB, num);
6809                 if (!SIZE_ONLY) {
6810                     if (num > (I32)RExC_rx->nparens) {
6811                         RExC_parse++;
6812                         vFAIL("Reference to nonexistent group");
6813                     }
6814                     ARG2L_SET( ret, RExC_recurse_count++);
6815                     RExC_emit++;
6816                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6817                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6818                 } else {
6819                     RExC_size++;
6820                 }
6821                 RExC_seen |= REG_SEEN_RECURSE;
6822                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6823                 Set_Node_Offset(ret, parse_start); /* MJD */
6824
6825                 *flagp |= POSTPONED;
6826                 nextchar(pRExC_state);
6827                 return ret;
6828             } /* named and numeric backreferences */
6829             /* NOT REACHED */
6830
6831             case '?':           /* (??...) */
6832                 is_logical = 1;
6833                 if (*RExC_parse != '{') {
6834                     RExC_parse++;
6835                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6836                     /*NOTREACHED*/
6837                 }
6838                 *flagp |= POSTPONED;
6839                 paren = *RExC_parse++;
6840                 /* FALL THROUGH */
6841             case '{':           /* (?{...}) */
6842             {
6843                 I32 count = 1;
6844                 U32 n = 0;
6845                 char c;
6846                 char *s = RExC_parse;
6847
6848                 RExC_seen_zerolen++;
6849                 RExC_seen |= REG_SEEN_EVAL;
6850                 while (count && (c = *RExC_parse)) {
6851                     if (c == '\\') {
6852                         if (RExC_parse[1])
6853                             RExC_parse++;
6854                     }
6855                     else if (c == '{')
6856                         count++;
6857                     else if (c == '}')
6858                         count--;
6859                     RExC_parse++;
6860                 }
6861                 if (*RExC_parse != ')') {
6862                     RExC_parse = s;
6863                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6864                 }
6865                 if (!SIZE_ONLY) {
6866                     PAD *pad;
6867                     OP_4tree *sop, *rop;
6868                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6869
6870                     ENTER;
6871                     Perl_save_re_context(aTHX);
6872                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6873                     sop->op_private |= OPpREFCOUNTED;
6874                     /* re_dup will OpREFCNT_inc */
6875                     OpREFCNT_set(sop, 1);
6876                     LEAVE;
6877
6878                     n = add_data(pRExC_state, 3, "nop");
6879                     RExC_rxi->data->data[n] = (void*)rop;
6880                     RExC_rxi->data->data[n+1] = (void*)sop;
6881                     RExC_rxi->data->data[n+2] = (void*)pad;
6882                     SvREFCNT_dec(sv);
6883                 }
6884                 else {                                          /* First pass */
6885                     if (PL_reginterp_cnt < ++RExC_seen_evals
6886                         && IN_PERL_RUNTIME)
6887                         /* No compiled RE interpolated, has runtime
6888                            components ===> unsafe.  */
6889                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6890                     if (PL_tainting && PL_tainted)
6891                         FAIL("Eval-group in insecure regular expression");
6892 #if PERL_VERSION > 8
6893                     if (IN_PERL_COMPILETIME)
6894                         PL_cv_has_eval = 1;
6895 #endif
6896                 }
6897
6898                 nextchar(pRExC_state);
6899                 if (is_logical) {
6900                     ret = reg_node(pRExC_state, LOGICAL);
6901                     if (!SIZE_ONLY)
6902                         ret->flags = 2;
6903                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6904                     /* deal with the length of this later - MJD */
6905                     return ret;
6906                 }
6907                 ret = reganode(pRExC_state, EVAL, n);
6908                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6909                 Set_Node_Offset(ret, parse_start);
6910                 return ret;
6911             }
6912             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6913             {
6914                 int is_define= 0;
6915                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6916                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6917                         || RExC_parse[1] == '<'
6918                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6919                         I32 flag;
6920
6921                         ret = reg_node(pRExC_state, LOGICAL);
6922                         if (!SIZE_ONLY)
6923                             ret->flags = 1;
6924                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6925                         goto insert_if;
6926                     }
6927                 }
6928                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6929                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6930                 {
6931                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6932                     char *name_start= RExC_parse++;
6933                     U32 num = 0;
6934                     SV *sv_dat=reg_scan_name(pRExC_state,
6935                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6936                     if (RExC_parse == name_start || *RExC_parse != ch)
6937                         vFAIL2("Sequence (?(%c... not terminated",
6938                             (ch == '>' ? '<' : ch));
6939                     RExC_parse++;
6940                     if (!SIZE_ONLY) {
6941                         num = add_data( pRExC_state, 1, "S" );
6942                         RExC_rxi->data->data[num]=(void*)sv_dat;
6943                         SvREFCNT_inc_simple_void(sv_dat);
6944                     }
6945                     ret = reganode(pRExC_state,NGROUPP,num);
6946                     goto insert_if_check_paren;
6947                 }
6948                 else if (RExC_parse[0] == 'D' &&
6949                          RExC_parse[1] == 'E' &&
6950                          RExC_parse[2] == 'F' &&
6951                          RExC_parse[3] == 'I' &&
6952                          RExC_parse[4] == 'N' &&
6953                          RExC_parse[5] == 'E')
6954                 {
6955                     ret = reganode(pRExC_state,DEFINEP,0);
6956                     RExC_parse +=6 ;
6957                     is_define = 1;
6958                     goto insert_if_check_paren;
6959                 }
6960                 else if (RExC_parse[0] == 'R') {
6961                     RExC_parse++;
6962                     parno = 0;
6963                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6964                         parno = atoi(RExC_parse++);
6965                         while (isDIGIT(*RExC_parse))
6966                             RExC_parse++;
6967                     } else if (RExC_parse[0] == '&') {
6968                         SV *sv_dat;
6969                         RExC_parse++;
6970                         sv_dat = reg_scan_name(pRExC_state,
6971                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6972                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6973                     }
6974                     ret = reganode(pRExC_state,INSUBP,parno);
6975                     goto insert_if_check_paren;
6976                 }
6977                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6978                     /* (?(1)...) */
6979                     char c;
6980                     parno = atoi(RExC_parse++);
6981
6982                     while (isDIGIT(*RExC_parse))
6983                         RExC_parse++;
6984                     ret = reganode(pRExC_state, GROUPP, parno);
6985
6986                  insert_if_check_paren:
6987                     if ((c = *nextchar(pRExC_state)) != ')')
6988                         vFAIL("Switch condition not recognized");
6989                   insert_if:
6990                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6991                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6992                     if (br == NULL)
6993                         br = reganode(pRExC_state, LONGJMP, 0);
6994                     else
6995                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6996                     c = *nextchar(pRExC_state);
6997                     if (flags&HASWIDTH)
6998                         *flagp |= HASWIDTH;
6999                     if (c == '|') {
7000                         if (is_define)
7001                             vFAIL("(?(DEFINE)....) does not allow branches");
7002                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7003                         regbranch(pRExC_state, &flags, 1,depth+1);
7004                         REGTAIL(pRExC_state, ret, lastbr);
7005                         if (flags&HASWIDTH)
7006                             *flagp |= HASWIDTH;
7007                         c = *nextchar(pRExC_state);
7008                     }
7009                     else
7010                         lastbr = NULL;
7011                     if (c != ')')
7012                         vFAIL("Switch (?(condition)... contains too many branches");
7013                     ender = reg_node(pRExC_state, TAIL);
7014                     REGTAIL(pRExC_state, br, ender);
7015                     if (lastbr) {
7016                         REGTAIL(pRExC_state, lastbr, ender);
7017                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7018                     }
7019                     else
7020                         REGTAIL(pRExC_state, ret, ender);
7021                     RExC_size++; /* XXX WHY do we need this?!!
7022                                     For large programs it seems to be required
7023                                     but I can't figure out why. -- dmq*/
7024                     return ret;
7025                 }
7026                 else {
7027                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7028                 }
7029             }
7030             case 0:
7031                 RExC_parse--; /* for vFAIL to print correctly */
7032                 vFAIL("Sequence (? incomplete");
7033                 break;
7034             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
7035                                        that follow */
7036                 has_use_defaults = TRUE;
7037                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7038                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7039                                                 ? REGEX_UNICODE_CHARSET
7040                                                 : REGEX_DEPENDS_CHARSET);
7041                 goto parse_flags;
7042             default:
7043                 --RExC_parse;
7044                 parse_flags:      /* (?i) */
7045             {
7046                 U32 posflags = 0, negflags = 0;
7047                 U32 *flagsp = &posflags;
7048                 bool has_charset_modifier = 0;
7049                 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7050                                     ? REGEX_UNICODE_CHARSET
7051                                     : REGEX_DEPENDS_CHARSET;
7052
7053                 while (*RExC_parse) {
7054                     /* && strchr("iogcmsx", *RExC_parse) */
7055                     /* (?g), (?gc) and (?o) are useless here
7056                        and must be globally applied -- japhy */
7057                     switch (*RExC_parse) {
7058                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7059                     case LOCALE_PAT_MOD:
7060                         if (has_charset_modifier || flagsp == &negflags) {
7061                             goto fail_modifiers;
7062                         }
7063                         cs = REGEX_LOCALE_CHARSET;
7064                         has_charset_modifier = 1;
7065                         RExC_contains_locale = 1;
7066                         break;
7067                     case UNICODE_PAT_MOD:
7068                         if (has_charset_modifier || flagsp == &negflags) {
7069                             goto fail_modifiers;
7070                         }
7071                         cs = REGEX_UNICODE_CHARSET;
7072                         has_charset_modifier = 1;
7073                         break;
7074                     case ASCII_RESTRICT_PAT_MOD:
7075                         if (has_charset_modifier || flagsp == &negflags) {
7076                             goto fail_modifiers;
7077                         }
7078                         if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
7079                             /* Doubled modifier implies more restricted */
7080                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7081                             RExC_parse++;
7082                         }
7083                         else {
7084                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
7085                         }
7086                         has_charset_modifier = 1;
7087                         break;
7088                     case DEPENDS_PAT_MOD:
7089                         if (has_use_defaults
7090                             || has_charset_modifier
7091                             || flagsp == &negflags)
7092                         {
7093                             goto fail_modifiers;
7094                         }
7095
7096                         /* The dual charset means unicode semantics if the
7097                          * pattern (or target, not known until runtime) are
7098                          * utf8, or something in the pattern indicates unicode
7099                          * semantics */
7100                         cs = (RExC_utf8 || RExC_uni_semantics)
7101                              ? REGEX_UNICODE_CHARSET
7102                              : REGEX_DEPENDS_CHARSET;
7103                         has_charset_modifier = 1;
7104                         break;
7105                     case ONCE_PAT_MOD: /* 'o' */
7106                     case GLOBAL_PAT_MOD: /* 'g' */
7107                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7108                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7109                             if (! (wastedflags & wflagbit) ) {
7110                                 wastedflags |= wflagbit;
7111                                 vWARN5(
7112                                     RExC_parse + 1,
7113                                     "Useless (%s%c) - %suse /%c modifier",
7114                                     flagsp == &negflags ? "?-" : "?",
7115                                     *RExC_parse,
7116                                     flagsp == &negflags ? "don't " : "",
7117                                     *RExC_parse
7118                                 );
7119                             }
7120                         }
7121                         break;
7122
7123                     case CONTINUE_PAT_MOD: /* 'c' */
7124                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7125                             if (! (wastedflags & WASTED_C) ) {
7126                                 wastedflags |= WASTED_GC;
7127                                 vWARN3(
7128                                     RExC_parse + 1,
7129                                     "Useless (%sc) - %suse /gc modifier",
7130                                     flagsp == &negflags ? "?-" : "?",
7131                                     flagsp == &negflags ? "don't " : ""
7132                                 );
7133                             }
7134                         }
7135                         break;
7136                     case KEEPCOPY_PAT_MOD: /* 'p' */
7137                         if (flagsp == &negflags) {
7138                             if (SIZE_ONLY)
7139                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7140                         } else {
7141                             *flagsp |= RXf_PMf_KEEPCOPY;
7142                         }
7143                         break;
7144                     case '-':
7145                         /* A flag is a default iff it is following a minus, so
7146                          * if there is a minus, it means will be trying to
7147                          * re-specify a default which is an error */
7148                         if (has_use_defaults || flagsp == &negflags) {
7149             fail_modifiers:
7150                             RExC_parse++;
7151                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7152                             /*NOTREACHED*/
7153                         }
7154                         flagsp = &negflags;
7155                         wastedflags = 0;  /* reset so (?g-c) warns twice */
7156                         break;
7157                     case ':':
7158                         paren = ':';
7159                         /*FALLTHROUGH*/
7160                     case ')':
7161                         RExC_flags |= posflags;
7162                         RExC_flags &= ~negflags;
7163                         set_regex_charset(&RExC_flags, cs);
7164                         if (paren != ':') {
7165                             oregflags |= posflags;
7166                             oregflags &= ~negflags;
7167                             set_regex_charset(&oregflags, cs);
7168                         }
7169                         nextchar(pRExC_state);
7170                         if (paren != ':') {
7171                             *flagp = TRYAGAIN;
7172                             return NULL;
7173                         } else {
7174                             ret = NULL;
7175                             goto parse_rest;
7176                         }
7177                         /*NOTREACHED*/
7178                     default:
7179                         RExC_parse++;
7180                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7181                         /*NOTREACHED*/
7182                     }
7183                     ++RExC_parse;
7184                 }
7185             }} /* one for the default block, one for the switch */
7186         }
7187         else {                  /* (...) */
7188           capturing_parens:
7189             parno = RExC_npar;
7190             RExC_npar++;
7191
7192             ret = reganode(pRExC_state, OPEN, parno);
7193             if (!SIZE_ONLY ){
7194                 if (!RExC_nestroot)
7195                     RExC_nestroot = parno;
7196                 if (RExC_seen & REG_SEEN_RECURSE
7197                     && !RExC_open_parens[parno-1])
7198                 {
7199                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7200                         "Setting open paren #%"IVdf" to %d\n",
7201                         (IV)parno, REG_NODE_NUM(ret)));
7202                     RExC_open_parens[parno-1]= ret;
7203                 }
7204             }
7205             Set_Node_Length(ret, 1); /* MJD */
7206             Set_Node_Offset(ret, RExC_parse); /* MJD */
7207             is_open = 1;
7208         }
7209     }
7210     else                        /* ! paren */
7211         ret = NULL;
7212
7213    parse_rest:
7214     /* Pick up the branches, linking them together. */
7215     parse_start = RExC_parse;   /* MJD */
7216     br = regbranch(pRExC_state, &flags, 1,depth+1);
7217
7218     /*     branch_len = (paren != 0); */
7219
7220     if (br == NULL)
7221         return(NULL);
7222     if (*RExC_parse == '|') {
7223         if (!SIZE_ONLY && RExC_extralen) {
7224             reginsert(pRExC_state, BRANCHJ, br, depth+1);
7225         }
7226         else {                  /* MJD */
7227             reginsert(pRExC_state, BRANCH, br, depth+1);
7228             Set_Node_Length(br, paren != 0);
7229             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7230         }
7231         have_branch = 1;
7232         if (SIZE_ONLY)
7233             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
7234     }
7235     else if (paren == ':') {
7236         *flagp |= flags&SIMPLE;
7237     }
7238     if (is_open) {                              /* Starts with OPEN. */
7239         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
7240     }
7241     else if (paren != '?')              /* Not Conditional */
7242         ret = br;
7243     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7244     lastbr = br;
7245     while (*RExC_parse == '|') {
7246         if (!SIZE_ONLY && RExC_extralen) {
7247             ender = reganode(pRExC_state, LONGJMP,0);
7248             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7249         }
7250         if (SIZE_ONLY)
7251             RExC_extralen += 2;         /* Account for LONGJMP. */
7252         nextchar(pRExC_state);
7253         if (freeze_paren) {
7254             if (RExC_npar > after_freeze)
7255                 after_freeze = RExC_npar;
7256             RExC_npar = freeze_paren;
7257         }
7258         br = regbranch(pRExC_state, &flags, 0, depth+1);
7259
7260         if (br == NULL)
7261             return(NULL);
7262         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
7263         lastbr = br;
7264         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7265     }
7266
7267     if (have_branch || paren != ':') {
7268         /* Make a closing node, and hook it on the end. */
7269         switch (paren) {
7270         case ':':
7271             ender = reg_node(pRExC_state, TAIL);
7272             break;
7273         case 1:
7274             ender = reganode(pRExC_state, CLOSE, parno);
7275             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7276                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7277                         "Setting close paren #%"IVdf" to %d\n",
7278                         (IV)parno, REG_NODE_NUM(ender)));
7279                 RExC_close_parens[parno-1]= ender;
7280                 if (RExC_nestroot == parno)
7281                     RExC_nestroot = 0;
7282             }
7283             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7284             Set_Node_Length(ender,1); /* MJD */
7285             break;
7286         case '<':
7287         case ',':
7288         case '=':
7289         case '!':
7290             *flagp &= ~HASWIDTH;
7291             /* FALL THROUGH */
7292         case '>':
7293             ender = reg_node(pRExC_state, SUCCEED);
7294             break;
7295         case 0:
7296             ender = reg_node(pRExC_state, END);
7297             if (!SIZE_ONLY) {
7298                 assert(!RExC_opend); /* there can only be one! */
7299                 RExC_opend = ender;
7300             }
7301             break;
7302         }
7303         REGTAIL(pRExC_state, lastbr, ender);
7304
7305         if (have_branch && !SIZE_ONLY) {
7306             if (depth==1)
7307                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7308
7309             /* Hook the tails of the branches to the closing node. */
7310             for (br = ret; br; br = regnext(br)) {
7311                 const U8 op = PL_regkind[OP(br)];
7312                 if (op == BRANCH) {
7313                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7314                 }
7315                 else if (op == BRANCHJ) {
7316                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7317                 }
7318             }
7319         }
7320     }
7321
7322     {
7323         const char *p;
7324         static const char parens[] = "=!<,>";
7325
7326         if (paren && (p = strchr(parens, paren))) {
7327             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7328             int flag = (p - parens) > 1;
7329
7330             if (paren == '>')
7331                 node = SUSPEND, flag = 0;
7332             reginsert(pRExC_state, node,ret, depth+1);
7333             Set_Node_Cur_Length(ret);
7334             Set_Node_Offset(ret, parse_start + 1);
7335             ret->flags = flag;
7336             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7337         }
7338     }
7339
7340     /* Check for proper termination. */
7341     if (paren) {
7342         RExC_flags = oregflags;
7343         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7344             RExC_parse = oregcomp_parse;
7345             vFAIL("Unmatched (");
7346         }
7347     }
7348     else if (!paren && RExC_parse < RExC_end) {
7349         if (*RExC_parse == ')') {
7350             RExC_parse++;
7351             vFAIL("Unmatched )");
7352         }
7353         else
7354             FAIL("Junk on end of regexp");      /* "Can't happen". */
7355         /* NOTREACHED */
7356     }
7357
7358     if (RExC_in_lookbehind) {
7359         RExC_in_lookbehind--;
7360     }
7361     if (after_freeze > RExC_npar)
7362         RExC_npar = after_freeze;
7363     return(ret);
7364 }
7365
7366 /*
7367  - regbranch - one alternative of an | operator
7368  *
7369  * Implements the concatenation operator.
7370  */
7371 STATIC regnode *
7372 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7373 {
7374     dVAR;
7375     register regnode *ret;
7376     register regnode *chain = NULL;
7377     register regnode *latest;
7378     I32 flags = 0, c = 0;
7379     GET_RE_DEBUG_FLAGS_DECL;
7380
7381     PERL_ARGS_ASSERT_REGBRANCH;
7382
7383     DEBUG_PARSE("brnc");
7384
7385     if (first)
7386         ret = NULL;
7387     else {
7388         if (!SIZE_ONLY && RExC_extralen)
7389             ret = reganode(pRExC_state, BRANCHJ,0);
7390         else {
7391             ret = reg_node(pRExC_state, BRANCH);
7392             Set_Node_Length(ret, 1);
7393         }
7394     }
7395
7396     if (!first && SIZE_ONLY)
7397         RExC_extralen += 1;                     /* BRANCHJ */
7398
7399     *flagp = WORST;                     /* Tentatively. */
7400
7401     RExC_parse--;
7402     nextchar(pRExC_state);
7403     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7404         flags &= ~TRYAGAIN;
7405         latest = regpiece(pRExC_state, &flags,depth+1);
7406         if (latest == NULL) {
7407             if (flags & TRYAGAIN)
7408                 continue;
7409             return(NULL);
7410         }
7411         else if (ret == NULL)
7412             ret = latest;
7413         *flagp |= flags&(HASWIDTH|POSTPONED);
7414         if (chain == NULL)      /* First piece. */
7415             *flagp |= flags&SPSTART;
7416         else {
7417             RExC_naughty++;
7418             REGTAIL(pRExC_state, chain, latest);
7419         }
7420         chain = latest;
7421         c++;
7422     }
7423     if (chain == NULL) {        /* Loop ran zero times. */
7424         chain = reg_node(pRExC_state, NOTHING);
7425         if (ret == NULL)
7426             ret = chain;
7427     }
7428     if (c == 1) {
7429         *flagp |= flags&SIMPLE;
7430     }
7431
7432     return ret;
7433 }
7434
7435 /*
7436  - regpiece - something followed by possible [*+?]
7437  *
7438  * Note that the branching code sequences used for ? and the general cases
7439  * of * and + are somewhat optimized:  they use the same NOTHING node as
7440  * both the endmarker for their branch list and the body of the last branch.
7441  * It might seem that this node could be dispensed with entirely, but the
7442  * endmarker role is not redundant.
7443  */
7444 STATIC regnode *
7445 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7446 {
7447     dVAR;
7448     register regnode *ret;
7449     register char op;
7450     register char *next;
7451     I32 flags;
7452     const char * const origparse = RExC_parse;
7453     I32 min;
7454     I32 max = REG_INFTY;
7455     char *parse_start;
7456     const char *maxpos = NULL;
7457     GET_RE_DEBUG_FLAGS_DECL;
7458
7459     PERL_ARGS_ASSERT_REGPIECE;
7460
7461     DEBUG_PARSE("piec");
7462
7463     ret = regatom(pRExC_state, &flags,depth+1);
7464     if (ret == NULL) {
7465         if (flags & TRYAGAIN)
7466             *flagp |= TRYAGAIN;
7467         return(NULL);
7468     }
7469
7470     op = *RExC_parse;
7471
7472     if (op == '{' && regcurly(RExC_parse)) {
7473         maxpos = NULL;
7474         parse_start = RExC_parse; /* MJD */
7475         next = RExC_parse + 1;
7476         while (isDIGIT(*next) || *next == ',') {
7477             if (*next == ',') {
7478                 if (maxpos)
7479                     break;
7480                 else
7481                     maxpos = next;
7482             }
7483             next++;
7484         }
7485         if (*next == '}') {             /* got one */
7486             if (!maxpos)
7487                 maxpos = next;
7488             RExC_parse++;
7489             min = atoi(RExC_parse);
7490             if (*maxpos == ',')
7491                 maxpos++;
7492             else
7493                 maxpos = RExC_parse;
7494             max = atoi(maxpos);
7495             if (!max && *maxpos != '0')
7496                 max = REG_INFTY;                /* meaning "infinity" */
7497             else if (max >= REG_INFTY)
7498                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7499             RExC_parse = next;
7500             nextchar(pRExC_state);
7501
7502         do_curly:
7503             if ((flags&SIMPLE)) {
7504                 RExC_naughty += 2 + RExC_naughty / 2;
7505                 reginsert(pRExC_state, CURLY, ret, depth+1);
7506                 Set_Node_Offset(ret, parse_start+1); /* MJD */
7507                 Set_Node_Cur_Length(ret);
7508             }
7509             else {
7510                 regnode * const w = reg_node(pRExC_state, WHILEM);
7511
7512                 w->flags = 0;
7513                 REGTAIL(pRExC_state, ret, w);
7514                 if (!SIZE_ONLY && RExC_extralen) {
7515                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
7516                     reginsert(pRExC_state, NOTHING,ret, depth+1);
7517                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
7518                 }
7519                 reginsert(pRExC_state, CURLYX,ret, depth+1);
7520                                 /* MJD hk */
7521                 Set_Node_Offset(ret, parse_start+1);
7522                 Set_Node_Length(ret,
7523                                 op == '{' ? (RExC_parse - parse_start) : 1);
7524
7525                 if (!SIZE_ONLY && RExC_extralen)
7526                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
7527                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7528                 if (SIZE_ONLY)
7529                     RExC_whilem_seen++, RExC_extralen += 3;
7530                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
7531             }
7532             ret->flags = 0;
7533
7534             if (min > 0)
7535                 *flagp = WORST;
7536             if (max > 0)
7537                 *flagp |= HASWIDTH;
7538             if (max < min)
7539                 vFAIL("Can't do {n,m} with n > m");
7540             if (!SIZE_ONLY) {
7541                 ARG1_SET(ret, (U16)min);
7542                 ARG2_SET(ret, (U16)max);
7543             }
7544
7545             goto nest_check;
7546         }
7547     }
7548
7549     if (!ISMULT1(op)) {
7550         *flagp = flags;
7551         return(ret);
7552     }
7553
7554 #if 0                           /* Now runtime fix should be reliable. */
7555
7556     /* if this is reinstated, don't forget to put this back into perldiag:
7557
7558             =item Regexp *+ operand could be empty at {#} in regex m/%s/
7559
7560            (F) The part of the regexp subject to either the * or + quantifier
7561            could match an empty string. The {#} shows in the regular
7562            expression about where the problem was discovered.
7563
7564     */
7565
7566     if (!(flags&HASWIDTH) && op != '?')
7567       vFAIL("Regexp *+ operand could be empty");
7568 #endif
7569
7570     parse_start = RExC_parse;
7571     nextchar(pRExC_state);
7572
7573     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7574
7575     if (op == '*' && (flags&SIMPLE)) {
7576         reginsert(pRExC_state, STAR, ret, depth+1);
7577         ret->flags = 0;
7578         RExC_naughty += 4;
7579     }
7580     else if (op == '*') {
7581         min = 0;
7582         goto do_curly;
7583     }
7584     else if (op == '+' && (flags&SIMPLE)) {
7585         reginsert(pRExC_state, PLUS, ret, depth+1);
7586         ret->flags = 0;
7587         RExC_naughty += 3;
7588     }
7589     else if (op == '+') {
7590         min = 1;
7591         goto do_curly;
7592     }
7593     else if (op == '?') {
7594         min = 0; max = 1;
7595         goto do_curly;
7596     }
7597   nest_check:
7598     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7599         ckWARN3reg(RExC_parse,
7600                    "%.*s matches null string many times",
7601                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7602                    origparse);
7603     }
7604
7605     if (RExC_parse < RExC_end && *RExC_parse == '?') {
7606         nextchar(pRExC_state);
7607         reginsert(pRExC_state, MINMOD, ret, depth+1);
7608         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7609     }
7610 #ifndef REG_ALLOW_MINMOD_SUSPEND
7611     else
7612 #endif
7613     if (RExC_parse < RExC_end && *RExC_parse == '+') {
7614         regnode *ender;
7615         nextchar(pRExC_state);
7616         ender = reg_node(pRExC_state, SUCCEED);
7617         REGTAIL(pRExC_state, ret, ender);
7618         reginsert(pRExC_state, SUSPEND, ret, depth+1);
7619         ret->flags = 0;
7620         ender = reg_node(pRExC_state, TAIL);
7621         REGTAIL(pRExC_state, ret, ender);
7622         /*ret= ender;*/
7623     }
7624
7625     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7626         RExC_parse++;
7627         vFAIL("Nested quantifiers");
7628     }
7629
7630     return(ret);
7631 }
7632
7633
7634 /* reg_namedseq(pRExC_state,UVp)
7635
7636    This is expected to be called by a parser routine that has
7637    recognized '\N' and needs to handle the rest. RExC_parse is
7638    expected to point at the first char following the N at the time
7639    of the call.
7640
7641    The \N may be inside (indicated by valuep not being NULL) or outside a
7642    character class.
7643
7644    \N may begin either a named sequence, or if outside a character class, mean
7645    to match a non-newline.  For non single-quoted regexes, the tokenizer has
7646    attempted to decide which, and in the case of a named sequence converted it
7647    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7648    where c1... are the characters in the sequence.  For single-quoted regexes,
7649    the tokenizer passes the \N sequence through unchanged; this code will not
7650    attempt to determine this nor expand those.  The net effect is that if the
7651    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7652    signals that this \N occurrence means to match a non-newline.
7653
7654    Only the \N{U+...} form should occur in a character class, for the same
7655    reason that '.' inside a character class means to just match a period: it
7656    just doesn't make sense.
7657
7658    If valuep is non-null then it is assumed that we are parsing inside
7659    of a charclass definition and the first codepoint in the resolved
7660    string is returned via *valuep and the routine will return NULL.
7661    In this mode if a multichar string is returned from the charnames
7662    handler, a warning will be issued, and only the first char in the
7663    sequence will be examined. If the string returned is zero length
7664    then the value of *valuep is undefined and NON-NULL will
7665    be returned to indicate failure. (This will NOT be a valid pointer
7666    to a regnode.)
7667
7668    If valuep is null then it is assumed that we are parsing normal text and a
7669    new EXACT node is inserted into the program containing the resolved string,
7670    and a pointer to the new node is returned.  But if the string is zero length
7671    a NOTHING node is emitted instead.
7672
7673    On success RExC_parse is set to the char following the endbrace.
7674    Parsing failures will generate a fatal error via vFAIL(...)
7675  */
7676 STATIC regnode *
7677 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7678 {
7679     char * endbrace;    /* '}' following the name */
7680     regnode *ret = NULL;
7681 #ifdef DEBUGGING
7682     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
7683 #endif
7684     char* p;
7685
7686     GET_RE_DEBUG_FLAGS_DECL;
7687
7688     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7689
7690     GET_RE_DEBUG_FLAGS;
7691
7692     /* The [^\n] meaning of \N ignores spaces and comments under the /x
7693      * modifier.  The other meaning does not */
7694     p = (RExC_flags & RXf_PMf_EXTENDED)
7695         ? regwhite( pRExC_state, RExC_parse )
7696         : RExC_parse;
7697
7698     /* Disambiguate between \N meaning a named character versus \N meaning
7699      * [^\n].  The former is assumed when it can't be the latter. */
7700     if (*p != '{' || regcurly(p)) {
7701         RExC_parse = p;
7702         if (valuep) {
7703             /* no bare \N in a charclass */
7704             vFAIL("\\N in a character class must be a named character: \\N{...}");
7705         }
7706         nextchar(pRExC_state);
7707         ret = reg_node(pRExC_state, REG_ANY);
7708         *flagp |= HASWIDTH|SIMPLE;
7709         RExC_naughty++;
7710         RExC_parse--;
7711         Set_Node_Length(ret, 1); /* MJD */
7712         return ret;
7713     }
7714
7715     /* Here, we have decided it should be a named sequence */
7716
7717     /* The test above made sure that the next real character is a '{', but
7718      * under the /x modifier, it could be separated by space (or a comment and
7719      * \n) and this is not allowed (for consistency with \x{...} and the
7720      * tokenizer handling of \N{NAME}). */
7721     if (*RExC_parse != '{') {
7722         vFAIL("Missing braces on \\N{}");
7723     }
7724
7725     RExC_parse++;       /* Skip past the '{' */
7726
7727     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7728         || ! (endbrace == RExC_parse            /* nothing between the {} */
7729               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
7730                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7731     {
7732         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
7733         vFAIL("\\N{NAME} must be resolved by the lexer");
7734     }
7735
7736     if (endbrace == RExC_parse) {   /* empty: \N{} */
7737         if (! valuep) {
7738             RExC_parse = endbrace + 1;
7739             return reg_node(pRExC_state,NOTHING);
7740         }
7741
7742         if (SIZE_ONLY) {
7743             ckWARNreg(RExC_parse,
7744                     "Ignoring zero length \\N{} in character class"
7745             );
7746             RExC_parse = endbrace + 1;
7747         }
7748         *valuep = 0;
7749         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7750     }
7751
7752     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
7753     RExC_parse += 2;    /* Skip past the 'U+' */
7754
7755     if (valuep) {   /* In a bracketed char class */
7756         /* We only pay attention to the first char of
7757         multichar strings being returned. I kinda wonder
7758         if this makes sense as it does change the behaviour
7759         from earlier versions, OTOH that behaviour was broken
7760         as well. XXX Solution is to recharacterize as
7761         [rest-of-class]|multi1|multi2... */
7762
7763         STRLEN length_of_hex;
7764         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7765             | PERL_SCAN_DISALLOW_PREFIX
7766             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7767
7768         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7769         if (endchar < endbrace) {
7770             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7771         }
7772
7773         length_of_hex = (STRLEN)(endchar - RExC_parse);
7774         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7775
7776         /* The tokenizer should have guaranteed validity, but it's possible to
7777          * bypass it by using single quoting, so check */
7778         if (length_of_hex == 0
7779             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7780         {
7781             RExC_parse += length_of_hex;        /* Includes all the valid */
7782             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7783                             ? UTF8SKIP(RExC_parse)
7784                             : 1;
7785             /* Guard against malformed utf8 */
7786             if (RExC_parse >= endchar) RExC_parse = endchar;
7787             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7788         }
7789
7790         RExC_parse = endbrace + 1;
7791         if (endchar == endbrace) return NULL;
7792
7793         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7794     }
7795     else {      /* Not a char class */
7796         char *s;            /* String to put in generated EXACT node */
7797         STRLEN len = 0;     /* Its current byte length */
7798         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7799                                stream */
7800         ret = reg_node(pRExC_state,
7801                            (U8) ((! FOLD) ? EXACT
7802                                           : (LOC)
7803                                              ? EXACTFL
7804                                              : (MORE_ASCII_RESTRICTED)
7805                                                ? EXACTFA
7806                                                : (AT_LEAST_UNI_SEMANTICS)
7807                                                  ? EXACTFU
7808                                                  : EXACTF));
7809         s= STRING(ret);
7810
7811         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7812          * the input which is of the form now 'c1.c2.c3...}' until find the
7813          * ending brace or exceed length 255.  The characters that exceed this
7814          * limit are dropped.  The limit could be relaxed should it become
7815          * desirable by reparsing this as (?:\N{NAME}), so could generate
7816          * multiple EXACT nodes, as is done for just regular input.  But this
7817          * is primarily a named character, and not intended to be a huge long
7818          * string, so 255 bytes should be good enough */
7819         while (1) {
7820             STRLEN length_of_hex;
7821             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7822                             | PERL_SCAN_DISALLOW_PREFIX
7823                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7824             UV cp;  /* Ord of current character */
7825             bool use_this_char_fold = FOLD;
7826
7827             /* Code points are separated by dots.  If none, there is only one
7828              * code point, and is terminated by the brace */
7829             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7830
7831             /* The values are Unicode even on EBCDIC machines */
7832             length_of_hex = (STRLEN)(endchar - RExC_parse);
7833             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7834             if ( length_of_hex == 0
7835                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7836             {
7837                 RExC_parse += length_of_hex;        /* Includes all the valid */
7838                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7839                                 ? UTF8SKIP(RExC_parse)
7840                                 : 1;
7841                 /* Guard against malformed utf8 */
7842                 if (RExC_parse >= endchar) RExC_parse = endchar;
7843                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7844             }
7845
7846             /* XXX ? Change to ANYOF node
7847             if (FOLD
7848                 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
7849                 && is_TRICKYFOLD_cp(cp))
7850             {
7851             }
7852             */
7853
7854             /* Under /aa, we can't mix ASCII with non- in a fold.  If we are
7855              * folding, and the source isn't ASCII, look through all the
7856              * characters it folds to.  If any one of them is ASCII, forbid
7857              * this fold.  (cp is uni, so the 127 below is correct even for
7858              * EBCDIC).  Similarly under locale rules, we don't mix under 256
7859              * with above 255.  XXX It really doesn't make sense to have \N{}
7860              * which means a Unicode rules under locale.  I (khw) think this
7861              * should be warned about, but the counter argument is that people
7862              * who have programmed around Perl's earlier lack of specifying the
7863              * rules and used \N{} to force Unicode things in a local
7864              * environment shouldn't get suddenly a warning */
7865             if (use_this_char_fold) {
7866                 if (LOC && cp < 256) {  /* Fold not known until run-time */
7867                     use_this_char_fold = FALSE;
7868                 }
7869                 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7870                          || (cp > 255 && LOC))
7871                 {
7872                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7873                 U8* s = tmpbuf;
7874                 U8* e;
7875                 STRLEN foldlen;
7876
7877                 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7878                 e = s + foldlen;
7879
7880                 while (s < e) {
7881                     if (isASCII(*s)
7882                         || (LOC && (UTF8_IS_INVARIANT(*s)
7883                                     || UTF8_IS_DOWNGRADEABLE_START(*s))))
7884                     {
7885                         use_this_char_fold = FALSE;
7886                         break;
7887                     }
7888                     s += UTF8SKIP(s);
7889                 }
7890                 }
7891             }
7892
7893             if (! use_this_char_fold) { /* Not folding, just append to the
7894                                            string */
7895                 STRLEN unilen;
7896
7897                 /* Quit before adding this character if would exceed limit */
7898                 if (len + UNISKIP(cp) > U8_MAX) break;
7899
7900                 unilen = reguni(pRExC_state, cp, s);
7901                 if (unilen > 0) {
7902                     s   += unilen;
7903                     len += unilen;
7904                 }
7905             } else {    /* Folding, output the folded equivalent */
7906                 STRLEN foldlen,numlen;
7907                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7908                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7909
7910                 /* Quit before exceeding size limit */
7911                 if (len + foldlen > U8_MAX) break;
7912
7913                 for (foldbuf = tmpbuf;
7914                     foldlen;
7915                     foldlen -= numlen)
7916                 {
7917                     cp = utf8_to_uvchr(foldbuf, &numlen);
7918                     if (numlen > 0) {
7919                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7920                         s       += unilen;
7921                         len     += unilen;
7922                         /* In EBCDIC the numlen and unilen can differ. */
7923                         foldbuf += numlen;
7924                         if (numlen >= foldlen)
7925                             break;
7926                     }
7927                     else
7928                         break; /* "Can't happen." */
7929                 }
7930             }
7931
7932             /* Point to the beginning of the next character in the sequence. */
7933             RExC_parse = endchar + 1;
7934
7935             /* Quit if no more characters */
7936             if (RExC_parse >= endbrace) break;
7937         }
7938
7939
7940         if (SIZE_ONLY) {
7941             if (RExC_parse < endbrace) {
7942                 ckWARNreg(RExC_parse - 1,
7943                           "Using just the first characters returned by \\N{}");
7944             }
7945
7946             RExC_size += STR_SZ(len);
7947         } else {
7948             STR_LEN(ret) = len;
7949             RExC_emit += STR_SZ(len);
7950         }
7951
7952         RExC_parse = endbrace + 1;
7953
7954         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7955                                with malformed in t/re/pat_advanced.t */
7956         RExC_parse --;
7957         Set_Node_Cur_Length(ret); /* MJD */
7958         nextchar(pRExC_state);
7959     }
7960
7961     return ret;
7962 }
7963
7964
7965 /*
7966  * reg_recode
7967  *
7968  * It returns the code point in utf8 for the value in *encp.
7969  *    value: a code value in the source encoding
7970  *    encp:  a pointer to an Encode object
7971  *
7972  * If the result from Encode is not a single character,
7973  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7974  */
7975 STATIC UV
7976 S_reg_recode(pTHX_ const char value, SV **encp)
7977 {
7978     STRLEN numlen = 1;
7979     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7980     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7981     const STRLEN newlen = SvCUR(sv);
7982     UV uv = UNICODE_REPLACEMENT;
7983
7984     PERL_ARGS_ASSERT_REG_RECODE;
7985
7986     if (newlen)
7987         uv = SvUTF8(sv)
7988              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7989              : *(U8*)s;
7990
7991     if (!newlen || numlen != newlen) {
7992         uv = UNICODE_REPLACEMENT;
7993         *encp = NULL;
7994     }
7995     return uv;
7996 }
7997
7998
7999 /*
8000  - regatom - the lowest level
8001
8002    Try to identify anything special at the start of the pattern. If there
8003    is, then handle it as required. This may involve generating a single regop,
8004    such as for an assertion; or it may involve recursing, such as to
8005    handle a () structure.
8006
8007    If the string doesn't start with something special then we gobble up
8008    as much literal text as we can.
8009
8010    Once we have been able to handle whatever type of thing started the
8011    sequence, we return.
8012
8013    Note: we have to be careful with escapes, as they can be both literal
8014    and special, and in the case of \10 and friends can either, depending
8015    on context. Specifically there are two separate switches for handling
8016    escape sequences, with the one for handling literal escapes requiring
8017    a dummy entry for all of the special escapes that are actually handled
8018    by the other.
8019 */
8020
8021 STATIC regnode *
8022 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8023 {
8024     dVAR;
8025     register regnode *ret = NULL;
8026     I32 flags;
8027     char *parse_start = RExC_parse;
8028     U8 op;
8029     GET_RE_DEBUG_FLAGS_DECL;
8030     DEBUG_PARSE("atom");
8031     *flagp = WORST;             /* Tentatively. */
8032
8033     PERL_ARGS_ASSERT_REGATOM;
8034
8035 tryagain:
8036     switch ((U8)*RExC_parse) {
8037     case '^':
8038         RExC_seen_zerolen++;
8039         nextchar(pRExC_state);
8040         if (RExC_flags & RXf_PMf_MULTILINE)
8041             ret = reg_node(pRExC_state, MBOL);
8042         else if (RExC_flags & RXf_PMf_SINGLELINE)
8043             ret = reg_node(pRExC_state, SBOL);
8044         else
8045             ret = reg_node(pRExC_state, BOL);
8046         Set_Node_Length(ret, 1); /* MJD */
8047         break;
8048     case '$':
8049         nextchar(pRExC_state);
8050         if (*RExC_parse)
8051             RExC_seen_zerolen++;
8052         if (RExC_flags & RXf_PMf_MULTILINE)
8053             ret = reg_node(pRExC_state, MEOL);
8054         else if (RExC_flags & RXf_PMf_SINGLELINE)
8055             ret = reg_node(pRExC_state, SEOL);
8056         else
8057             ret = reg_node(pRExC_state, EOL);
8058         Set_Node_Length(ret, 1); /* MJD */
8059         break;
8060     case '.':
8061         nextchar(pRExC_state);
8062         if (RExC_flags & RXf_PMf_SINGLELINE)
8063             ret = reg_node(pRExC_state, SANY);
8064         else
8065             ret = reg_node(pRExC_state, REG_ANY);
8066         *flagp |= HASWIDTH|SIMPLE;
8067         RExC_naughty++;
8068         Set_Node_Length(ret, 1); /* MJD */
8069         break;
8070     case '[':
8071     {
8072         char * const oregcomp_parse = ++RExC_parse;
8073         ret = regclass(pRExC_state,depth+1);
8074         if (*RExC_parse != ']') {
8075             RExC_parse = oregcomp_parse;
8076             vFAIL("Unmatched [");
8077         }
8078         nextchar(pRExC_state);
8079         *flagp |= HASWIDTH|SIMPLE;
8080         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8081         break;
8082     }
8083     case '(':
8084         nextchar(pRExC_state);
8085         ret = reg(pRExC_state, 1, &flags,depth+1);
8086         if (ret == NULL) {
8087                 if (flags & TRYAGAIN) {
8088                     if (RExC_parse == RExC_end) {
8089                          /* Make parent create an empty node if needed. */
8090                         *flagp |= TRYAGAIN;
8091                         return(NULL);
8092                     }
8093                     goto tryagain;
8094                 }
8095                 return(NULL);
8096         }
8097         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8098         break;
8099     case '|':
8100     case ')':
8101         if (flags & TRYAGAIN) {
8102             *flagp |= TRYAGAIN;
8103             return NULL;
8104         }
8105         vFAIL("Internal urp");
8106                                 /* Supposed to be caught earlier. */
8107         break;
8108     case '{':
8109         if (!regcurly(RExC_parse)) {
8110             RExC_parse++;
8111             goto defchar;
8112         }
8113         /* FALL THROUGH */
8114     case '?':
8115     case '+':
8116     case '*':
8117         RExC_parse++;
8118         vFAIL("Quantifier follows nothing");
8119         break;
8120     case LATIN_SMALL_LETTER_SHARP_S:
8121     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8122     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8123 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8124 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
8125     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8126 #endif
8127         do_foldchar:
8128         if (!LOC && FOLD) {
8129             U32 len,cp;
8130             len=0; /* silence a spurious compiler warning */
8131             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
8132                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8133                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8134                 ret = reganode(pRExC_state, FOLDCHAR, cp);
8135                 Set_Node_Length(ret, 1); /* MJD */
8136                 nextchar(pRExC_state); /* kill whitespace under /x */
8137                 return ret;
8138             }
8139         }
8140         goto outer_default;
8141     case '\\':
8142         /* Special Escapes
8143
8144            This switch handles escape sequences that resolve to some kind
8145            of special regop and not to literal text. Escape sequnces that
8146            resolve to literal text are handled below in the switch marked
8147            "Literal Escapes".
8148
8149            Every entry in this switch *must* have a corresponding entry
8150            in the literal escape switch. However, the opposite is not
8151            required, as the default for this switch is to jump to the
8152            literal text handling code.
8153         */
8154         switch ((U8)*++RExC_parse) {
8155         case LATIN_SMALL_LETTER_SHARP_S:
8156         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8157         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8158                    goto do_foldchar;
8159         /* Special Escapes */
8160         case 'A':
8161             RExC_seen_zerolen++;
8162             ret = reg_node(pRExC_state, SBOL);
8163             *flagp |= SIMPLE;
8164             goto finish_meta_pat;
8165         case 'G':
8166             ret = reg_node(pRExC_state, GPOS);
8167             RExC_seen |= REG_SEEN_GPOS;
8168             *flagp |= SIMPLE;
8169             goto finish_meta_pat;
8170         case 'K':
8171             RExC_seen_zerolen++;
8172             ret = reg_node(pRExC_state, KEEPS);
8173             *flagp |= SIMPLE;
8174             /* XXX:dmq : disabling in-place substitution seems to
8175              * be necessary here to avoid cases of memory corruption, as
8176              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8177              */
8178             RExC_seen |= REG_SEEN_LOOKBEHIND;
8179             goto finish_meta_pat;
8180         case 'Z':
8181             ret = reg_node(pRExC_state, SEOL);
8182             *flagp |= SIMPLE;
8183             RExC_seen_zerolen++;                /* Do not optimize RE away */
8184             goto finish_meta_pat;
8185         case 'z':
8186             ret = reg_node(pRExC_state, EOS);
8187             *flagp |= SIMPLE;
8188             RExC_seen_zerolen++;                /* Do not optimize RE away */
8189             goto finish_meta_pat;
8190         case 'C':
8191             ret = reg_node(pRExC_state, CANY);
8192             RExC_seen |= REG_SEEN_CANY;
8193             *flagp |= HASWIDTH|SIMPLE;
8194             goto finish_meta_pat;
8195         case 'X':
8196             ret = reg_node(pRExC_state, CLUMP);
8197             *flagp |= HASWIDTH;
8198             goto finish_meta_pat;
8199         case 'w':
8200             switch (get_regex_charset(RExC_flags)) {
8201                 case REGEX_LOCALE_CHARSET:
8202                     op = ALNUML;
8203                     break;
8204                 case REGEX_UNICODE_CHARSET:
8205                     op = ALNUMU;
8206                     break;
8207                 case REGEX_ASCII_RESTRICTED_CHARSET:
8208                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8209                     op = ALNUMA;
8210                     break;
8211                 case REGEX_DEPENDS_CHARSET:
8212                     op = ALNUM;
8213                     break;
8214                 default:
8215                     goto bad_charset;
8216             }
8217             ret = reg_node(pRExC_state, op);
8218             *flagp |= HASWIDTH|SIMPLE;
8219             goto finish_meta_pat;
8220         case 'W':
8221             switch (get_regex_charset(RExC_flags)) {
8222                 case REGEX_LOCALE_CHARSET:
8223                     op = NALNUML;
8224                     break;
8225                 case REGEX_UNICODE_CHARSET:
8226                     op = NALNUMU;
8227                     break;
8228                 case REGEX_ASCII_RESTRICTED_CHARSET:
8229                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8230                     op = NALNUMA;
8231                     break;
8232                 case REGEX_DEPENDS_CHARSET:
8233                     op = NALNUM;
8234                     break;
8235                 default:
8236                     goto bad_charset;
8237             }
8238             ret = reg_node(pRExC_state, op);
8239             *flagp |= HASWIDTH|SIMPLE;
8240             goto finish_meta_pat;
8241         case 'b':
8242             RExC_seen_zerolen++;
8243             RExC_seen |= REG_SEEN_LOOKBEHIND;
8244             switch (get_regex_charset(RExC_flags)) {
8245                 case REGEX_LOCALE_CHARSET:
8246                     op = BOUNDL;
8247                     break;
8248                 case REGEX_UNICODE_CHARSET:
8249                     op = BOUNDU;
8250                     break;
8251                 case REGEX_ASCII_RESTRICTED_CHARSET:
8252                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8253                     op = BOUNDA;
8254                     break;
8255                 case REGEX_DEPENDS_CHARSET:
8256                     op = BOUND;
8257                     break;
8258                 default:
8259                     goto bad_charset;
8260             }
8261             ret = reg_node(pRExC_state, op);
8262             FLAGS(ret) = get_regex_charset(RExC_flags);
8263             *flagp |= SIMPLE;
8264             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8265                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8266             }
8267             goto finish_meta_pat;
8268         case 'B':
8269             RExC_seen_zerolen++;
8270             RExC_seen |= REG_SEEN_LOOKBEHIND;
8271             switch (get_regex_charset(RExC_flags)) {
8272                 case REGEX_LOCALE_CHARSET:
8273                     op = NBOUNDL;
8274                     break;
8275                 case REGEX_UNICODE_CHARSET:
8276                     op = NBOUNDU;
8277                     break;
8278                 case REGEX_ASCII_RESTRICTED_CHARSET:
8279                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8280                     op = NBOUNDA;
8281                     break;
8282                 case REGEX_DEPENDS_CHARSET:
8283                     op = NBOUND;
8284                     break;
8285                 default:
8286                     goto bad_charset;
8287             }
8288             ret = reg_node(pRExC_state, op);
8289             FLAGS(ret) = get_regex_charset(RExC_flags);
8290             *flagp |= SIMPLE;
8291             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8292                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8293             }
8294             goto finish_meta_pat;
8295         case 's':
8296             switch (get_regex_charset(RExC_flags)) {
8297                 case REGEX_LOCALE_CHARSET:
8298                     op = SPACEL;
8299                     break;
8300                 case REGEX_UNICODE_CHARSET:
8301                     op = SPACEU;
8302                     break;
8303                 case REGEX_ASCII_RESTRICTED_CHARSET:
8304                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8305                     op = SPACEA;
8306                     break;
8307                 case REGEX_DEPENDS_CHARSET:
8308                     op = SPACE;
8309                     break;
8310                 default:
8311                     goto bad_charset;
8312             }
8313             ret = reg_node(pRExC_state, op);
8314             *flagp |= HASWIDTH|SIMPLE;
8315             goto finish_meta_pat;
8316         case 'S':
8317             switch (get_regex_charset(RExC_flags)) {
8318                 case REGEX_LOCALE_CHARSET:
8319                     op = NSPACEL;
8320                     break;
8321                 case REGEX_UNICODE_CHARSET:
8322                     op = NSPACEU;
8323                     break;
8324                 case REGEX_ASCII_RESTRICTED_CHARSET:
8325                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8326                     op = NSPACEA;
8327                     break;
8328                 case REGEX_DEPENDS_CHARSET:
8329                     op = NSPACE;
8330                     break;
8331                 default:
8332                     goto bad_charset;
8333             }
8334             ret = reg_node(pRExC_state, op);
8335             *flagp |= HASWIDTH|SIMPLE;
8336             goto finish_meta_pat;
8337         case 'd':
8338             switch (get_regex_charset(RExC_flags)) {
8339                 case REGEX_LOCALE_CHARSET:
8340                     op = DIGITL;
8341                     break;
8342                 case REGEX_ASCII_RESTRICTED_CHARSET:
8343                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8344                     op = DIGITA;
8345                     break;
8346                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8347                 case REGEX_UNICODE_CHARSET:
8348                     op = DIGIT;
8349                     break;
8350                 default:
8351                     goto bad_charset;
8352             }
8353             ret = reg_node(pRExC_state, op);
8354             *flagp |= HASWIDTH|SIMPLE;
8355             goto finish_meta_pat;
8356         case 'D':
8357             switch (get_regex_charset(RExC_flags)) {
8358                 case REGEX_LOCALE_CHARSET:
8359                     op = NDIGITL;
8360                     break;
8361                 case REGEX_ASCII_RESTRICTED_CHARSET:
8362                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8363                     op = NDIGITA;
8364                     break;
8365                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8366                 case REGEX_UNICODE_CHARSET:
8367                     op = NDIGIT;
8368                     break;
8369                 default:
8370                     goto bad_charset;
8371             }
8372             ret = reg_node(pRExC_state, op);
8373             *flagp |= HASWIDTH|SIMPLE;
8374             goto finish_meta_pat;
8375         case 'R':
8376             ret = reg_node(pRExC_state, LNBREAK);
8377             *flagp |= HASWIDTH|SIMPLE;
8378             goto finish_meta_pat;
8379         case 'h':
8380             ret = reg_node(pRExC_state, HORIZWS);
8381             *flagp |= HASWIDTH|SIMPLE;
8382             goto finish_meta_pat;
8383         case 'H':
8384             ret = reg_node(pRExC_state, NHORIZWS);
8385             *flagp |= HASWIDTH|SIMPLE;
8386             goto finish_meta_pat;
8387         case 'v':
8388             ret = reg_node(pRExC_state, VERTWS);
8389             *flagp |= HASWIDTH|SIMPLE;
8390             goto finish_meta_pat;
8391         case 'V':
8392             ret = reg_node(pRExC_state, NVERTWS);
8393             *flagp |= HASWIDTH|SIMPLE;
8394          finish_meta_pat:
8395             nextchar(pRExC_state);
8396             Set_Node_Length(ret, 2); /* MJD */
8397             break;
8398         case 'p':
8399         case 'P':
8400             {
8401                 char* const oldregxend = RExC_end;
8402 #ifdef DEBUGGING
8403                 char* parse_start = RExC_parse - 2;
8404 #endif
8405
8406                 if (RExC_parse[1] == '{') {
8407                   /* a lovely hack--pretend we saw [\pX] instead */
8408                     RExC_end = strchr(RExC_parse, '}');
8409                     if (!RExC_end) {
8410                         const U8 c = (U8)*RExC_parse;
8411                         RExC_parse += 2;
8412                         RExC_end = oldregxend;
8413                         vFAIL2("Missing right brace on \\%c{}", c);
8414                     }
8415                     RExC_end++;
8416                 }
8417                 else {
8418                     RExC_end = RExC_parse + 2;
8419                     if (RExC_end > oldregxend)
8420                         RExC_end = oldregxend;
8421                 }
8422                 RExC_parse--;
8423
8424                 ret = regclass(pRExC_state,depth+1);
8425
8426                 RExC_end = oldregxend;
8427                 RExC_parse--;
8428
8429                 Set_Node_Offset(ret, parse_start + 2);
8430                 Set_Node_Cur_Length(ret);
8431                 nextchar(pRExC_state);
8432                 *flagp |= HASWIDTH|SIMPLE;
8433             }
8434             break;
8435         case 'N':
8436             /* Handle \N and \N{NAME} here and not below because it can be
8437             multicharacter. join_exact() will join them up later on.
8438             Also this makes sure that things like /\N{BLAH}+/ and
8439             \N{BLAH} being multi char Just Happen. dmq*/
8440             ++RExC_parse;
8441             ret= reg_namedseq(pRExC_state, NULL, flagp);
8442             break;
8443         case 'k':    /* Handle \k<NAME> and \k'NAME' */
8444         parse_named_seq:
8445         {
8446             char ch= RExC_parse[1];
8447             if (ch != '<' && ch != '\'' && ch != '{') {
8448                 RExC_parse++;
8449                 vFAIL2("Sequence %.2s... not terminated",parse_start);
8450             } else {
8451                 /* this pretty much dupes the code for (?P=...) in reg(), if
8452                    you change this make sure you change that */
8453                 char* name_start = (RExC_parse += 2);
8454                 U32 num = 0;
8455                 SV *sv_dat = reg_scan_name(pRExC_state,
8456                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8457                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8458                 if (RExC_parse == name_start || *RExC_parse != ch)
8459                     vFAIL2("Sequence %.3s... not terminated",parse_start);
8460
8461                 if (!SIZE_ONLY) {
8462                     num = add_data( pRExC_state, 1, "S" );
8463                     RExC_rxi->data->data[num]=(void*)sv_dat;
8464                     SvREFCNT_inc_simple_void(sv_dat);
8465                 }
8466
8467                 RExC_sawback = 1;
8468                 ret = reganode(pRExC_state,
8469                                ((! FOLD)
8470                                  ? NREF
8471                                  : (MORE_ASCII_RESTRICTED)
8472                                    ? NREFFA
8473                                    : (AT_LEAST_UNI_SEMANTICS)
8474                                      ? NREFFU
8475                                      : (LOC)
8476                                        ? NREFFL
8477                                        : NREFF),
8478                                 num);
8479                 *flagp |= HASWIDTH;
8480
8481                 /* override incorrect value set in reganode MJD */
8482                 Set_Node_Offset(ret, parse_start+1);
8483                 Set_Node_Cur_Length(ret); /* MJD */
8484                 nextchar(pRExC_state);
8485
8486             }
8487             break;
8488         }
8489         case 'g':
8490         case '1': case '2': case '3': case '4':
8491         case '5': case '6': case '7': case '8': case '9':
8492             {
8493                 I32 num;
8494                 bool isg = *RExC_parse == 'g';
8495                 bool isrel = 0;
8496                 bool hasbrace = 0;
8497                 if (isg) {
8498                     RExC_parse++;
8499                     if (*RExC_parse == '{') {
8500                         RExC_parse++;
8501                         hasbrace = 1;
8502                     }
8503                     if (*RExC_parse == '-') {
8504                         RExC_parse++;
8505                         isrel = 1;
8506                     }
8507                     if (hasbrace && !isDIGIT(*RExC_parse)) {
8508                         if (isrel) RExC_parse--;
8509                         RExC_parse -= 2;
8510                         goto parse_named_seq;
8511                 }   }
8512                 num = atoi(RExC_parse);
8513                 if (isg && num == 0)
8514                     vFAIL("Reference to invalid group 0");
8515                 if (isrel) {
8516                     num = RExC_npar - num;
8517                     if (num < 1)
8518                         vFAIL("Reference to nonexistent or unclosed group");
8519                 }
8520                 if (!isg && num > 9 && num >= RExC_npar)
8521                     goto defchar;
8522                 else {
8523                     char * const parse_start = RExC_parse - 1; /* MJD */
8524                     while (isDIGIT(*RExC_parse))
8525                         RExC_parse++;
8526                     if (parse_start == RExC_parse - 1)
8527                         vFAIL("Unterminated \\g... pattern");
8528                     if (hasbrace) {
8529                         if (*RExC_parse != '}')
8530                             vFAIL("Unterminated \\g{...} pattern");
8531                         RExC_parse++;
8532                     }
8533                     if (!SIZE_ONLY) {
8534                         if (num > (I32)RExC_rx->nparens)
8535                             vFAIL("Reference to nonexistent group");
8536                     }
8537                     RExC_sawback = 1;
8538                     ret = reganode(pRExC_state,
8539                                    ((! FOLD)
8540                                      ? REF
8541                                      : (MORE_ASCII_RESTRICTED)
8542                                        ? REFFA
8543                                        : (AT_LEAST_UNI_SEMANTICS)
8544                                          ? REFFU
8545                                          : (LOC)
8546                                            ? REFFL
8547                                            : REFF),
8548                                     num);
8549                     *flagp |= HASWIDTH;
8550
8551                     /* override incorrect value set in reganode MJD */
8552                     Set_Node_Offset(ret, parse_start+1);
8553                     Set_Node_Cur_Length(ret); /* MJD */
8554                     RExC_parse--;
8555                     nextchar(pRExC_state);
8556                 }
8557             }
8558             break;
8559         case '\0':
8560             if (RExC_parse >= RExC_end)
8561                 FAIL("Trailing \\");
8562             /* FALL THROUGH */
8563         default:
8564             /* Do not generate "unrecognized" warnings here, we fall
8565                back into the quick-grab loop below */
8566             parse_start--;
8567             goto defchar;
8568         }
8569         break;
8570
8571     case '#':
8572         if (RExC_flags & RXf_PMf_EXTENDED) {
8573             if ( reg_skipcomment( pRExC_state ) )
8574                 goto tryagain;
8575         }
8576         /* FALL THROUGH */
8577
8578     default:
8579         outer_default:{
8580             register STRLEN len;
8581             register UV ender;
8582             register char *p;
8583             char *s;
8584             STRLEN foldlen;
8585             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8586             regnode * orig_emit;
8587
8588             parse_start = RExC_parse - 1;
8589
8590             RExC_parse++;
8591
8592         defchar:
8593             ender = 0;
8594             orig_emit = RExC_emit; /* Save the original output node position in
8595                                       case we need to output a different node
8596                                       type */
8597             ret = reg_node(pRExC_state,
8598                            (U8) ((! FOLD) ? EXACT
8599                                           : (LOC)
8600                                              ? EXACTFL
8601                                              : (MORE_ASCII_RESTRICTED)
8602                                                ? EXACTFA
8603                                                : (AT_LEAST_UNI_SEMANTICS)
8604                                                  ? EXACTFU
8605                                                  : EXACTF)
8606                     );
8607             s = STRING(ret);
8608             for (len = 0, p = RExC_parse - 1;
8609               len < 127 && p < RExC_end;
8610               len++)
8611             {
8612                 char * const oldp = p;
8613
8614                 if (RExC_flags & RXf_PMf_EXTENDED)
8615                     p = regwhite( pRExC_state, p );
8616                 switch ((U8)*p) {
8617                 case LATIN_SMALL_LETTER_SHARP_S:
8618                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8619                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8620                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8621                                 goto normal_default;
8622                 case '^':
8623                 case '$':
8624                 case '.':
8625                 case '[':
8626                 case '(':
8627                 case ')':
8628                 case '|':
8629                     goto loopdone;
8630                 case '\\':
8631                     /* Literal Escapes Switch
8632
8633                        This switch is meant to handle escape sequences that
8634                        resolve to a literal character.
8635
8636                        Every escape sequence that represents something
8637                        else, like an assertion or a char class, is handled
8638                        in the switch marked 'Special Escapes' above in this
8639                        routine, but also has an entry here as anything that
8640                        isn't explicitly mentioned here will be treated as
8641                        an unescaped equivalent literal.
8642                     */
8643
8644                     switch ((U8)*++p) {
8645                     /* These are all the special escapes. */
8646                     case LATIN_SMALL_LETTER_SHARP_S:
8647                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8648                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8649                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8650                                 goto normal_default;
8651                     case 'A':             /* Start assertion */
8652                     case 'b': case 'B':   /* Word-boundary assertion*/
8653                     case 'C':             /* Single char !DANGEROUS! */
8654                     case 'd': case 'D':   /* digit class */
8655                     case 'g': case 'G':   /* generic-backref, pos assertion */
8656                     case 'h': case 'H':   /* HORIZWS */
8657                     case 'k': case 'K':   /* named backref, keep marker */
8658                     case 'N':             /* named char sequence */
8659                     case 'p': case 'P':   /* Unicode property */
8660                               case 'R':   /* LNBREAK */
8661                     case 's': case 'S':   /* space class */
8662                     case 'v': case 'V':   /* VERTWS */
8663                     case 'w': case 'W':   /* word class */
8664                     case 'X':             /* eXtended Unicode "combining character sequence" */
8665                     case 'z': case 'Z':   /* End of line/string assertion */
8666                         --p;
8667                         goto loopdone;
8668
8669                     /* Anything after here is an escape that resolves to a
8670                        literal. (Except digits, which may or may not)
8671                      */
8672                     case 'n':
8673                         ender = '\n';
8674                         p++;
8675                         break;
8676                     case 'r':
8677                         ender = '\r';
8678                         p++;
8679                         break;
8680                     case 't':
8681                         ender = '\t';
8682                         p++;
8683                         break;
8684                     case 'f':
8685                         ender = '\f';
8686                         p++;
8687                         break;
8688                     case 'e':
8689                           ender = ASCII_TO_NATIVE('\033');
8690                         p++;
8691                         break;
8692                     case 'a':
8693                           ender = ASCII_TO_NATIVE('\007');
8694                         p++;
8695                         break;
8696                     case 'o':
8697                         {
8698                             STRLEN brace_len = len;
8699                             UV result;
8700                             const char* error_msg;
8701
8702                             bool valid = grok_bslash_o(p,
8703                                                        &result,
8704                                                        &brace_len,
8705                                                        &error_msg,
8706                                                        1);
8707                             p += brace_len;
8708                             if (! valid) {
8709                                 RExC_parse = p; /* going to die anyway; point
8710                                                    to exact spot of failure */
8711                                 vFAIL(error_msg);
8712                             }
8713                             else
8714                             {
8715                                 ender = result;
8716                             }
8717                             if (PL_encoding && ender < 0x100) {
8718                                 goto recode_encoding;
8719                             }
8720                             if (ender > 0xff) {
8721                                 REQUIRE_UTF8;
8722                             }
8723                             break;
8724                         }
8725                     case 'x':
8726                         if (*++p == '{') {
8727                             char* const e = strchr(p, '}');
8728
8729                             if (!e) {
8730                                 RExC_parse = p + 1;
8731                                 vFAIL("Missing right brace on \\x{}");
8732                             }
8733                             else {
8734                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8735                                     | PERL_SCAN_DISALLOW_PREFIX;
8736                                 STRLEN numlen = e - p - 1;
8737                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8738                                 if (ender > 0xff)
8739                                     REQUIRE_UTF8;
8740                                 p = e + 1;
8741                             }
8742                         }
8743                         else {
8744                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8745                             STRLEN numlen = 2;
8746                             ender = grok_hex(p, &numlen, &flags, NULL);
8747                             p += numlen;
8748                         }
8749                         if (PL_encoding && ender < 0x100)
8750                             goto recode_encoding;
8751                         break;
8752                     case 'c':
8753                         p++;
8754                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8755                         break;
8756                     case '0': case '1': case '2': case '3':case '4':
8757                     case '5': case '6': case '7': case '8':case '9':
8758                         if (*p == '0' ||
8759                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8760                         {
8761                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8762                             STRLEN numlen = 3;
8763                             ender = grok_oct(p, &numlen, &flags, NULL);
8764                             if (ender > 0xff) {
8765                                 REQUIRE_UTF8;
8766                             }
8767                             p += numlen;
8768                         }
8769                         else {
8770                             --p;
8771                             goto loopdone;
8772                         }
8773                         if (PL_encoding && ender < 0x100)
8774                             goto recode_encoding;
8775                         break;
8776                     recode_encoding:
8777                         {
8778                             SV* enc = PL_encoding;
8779                             ender = reg_recode((const char)(U8)ender, &enc);
8780                             if (!enc && SIZE_ONLY)
8781                                 ckWARNreg(p, "Invalid escape in the specified encoding");
8782                             REQUIRE_UTF8;
8783                         }
8784                         break;
8785                     case '\0':
8786                         if (p >= RExC_end)
8787                             FAIL("Trailing \\");
8788                         /* FALL THROUGH */
8789                     default:
8790                         if (!SIZE_ONLY&& isALPHA(*p)) {
8791                             /* Include any { following the alpha to emphasize
8792                              * that it could be part of an escape at some point
8793                              * in the future */
8794                             int len = (*(p + 1) == '{') ? 2 : 1;
8795                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8796                         }
8797                         goto normal_default;
8798                     }
8799                     break;
8800                 default:
8801                   normal_default:
8802                     if (UTF8_IS_START(*p) && UTF) {
8803                         STRLEN numlen;
8804                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8805                                                &numlen, UTF8_ALLOW_DEFAULT);
8806                         p += numlen;
8807                     }
8808                     else
8809                         ender = (U8) *p++;
8810                     break;
8811                 } /* End of switch on the literal */
8812
8813                 /* Certain characters are problematic because their folded
8814                  * length is so different from their original length that it
8815                  * isn't handleable by the optimizer.  They are therefore not
8816                  * placed in an EXACTish node; and are here handled specially.
8817                  * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8818                  * putting it in a special node keeps regexec from having to
8819                  * deal with a non-utf8 multi-char fold */
8820                 if (FOLD
8821                     && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
8822                     && is_TRICKYFOLD_cp(ender))
8823                 {
8824                     /* If is in middle of outputting characters into an
8825                      * EXACTish node, go output what we have so far, and
8826                      * position the parse so that this will be called again
8827                      * immediately */
8828                     if (len) {
8829                         p  = oldp;
8830                         goto loopdone;
8831                     }
8832                     else {
8833
8834                         /* Here we are ready to output our tricky fold
8835                          * character.  What's done is to pretend it's in a
8836                          * [bracketed] class, and let the code that deals with
8837                          * those handle it, as that code has all the
8838                          * intelligence necessary.  First save the current
8839                          * parse state, get rid of the already allocated EXACT
8840                          * node that the ANYOFV node will replace, and point
8841                          * the parse to a buffer which we fill with the
8842                          * character we want the regclass code to think is
8843                          * being parsed */
8844                         char* const oldregxend = RExC_end;
8845                         char tmpbuf[2];
8846                         RExC_emit = orig_emit;
8847                         RExC_parse = tmpbuf;
8848                         if (UTF) {
8849                             tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8850                             tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8851                             RExC_end = RExC_parse + 2;
8852                         }
8853                         else {
8854                             tmpbuf[0] = (char) ender;
8855                             RExC_end = RExC_parse + 1;
8856                         }
8857
8858                         ret = regclass(pRExC_state,depth+1);
8859
8860                         /* Here, have parsed the buffer.  Reset the parse to
8861                          * the actual input, and return */
8862                         RExC_end = oldregxend;
8863                         RExC_parse = p - 1;
8864
8865                         Set_Node_Offset(ret, RExC_parse);
8866                         Set_Node_Cur_Length(ret);
8867                         nextchar(pRExC_state);
8868                         *flagp |= HASWIDTH|SIMPLE;
8869                         return ret;
8870                     }
8871                 }
8872
8873                 if ( RExC_flags & RXf_PMf_EXTENDED)
8874                     p = regwhite( pRExC_state, p );
8875                 if (UTF && FOLD) {
8876                     /* Prime the casefolded buffer.  Locale rules, which apply
8877                      * only to code points < 256, aren't known until execution,
8878                      * so for them, just output the original character using
8879                      * utf8 */
8880                     if (LOC && ender < 256) {
8881                         if (UNI_IS_INVARIANT(ender)) {
8882                             *tmpbuf = (U8) ender;
8883                             foldlen = 1;
8884                         } else {
8885                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8886                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8887                             foldlen = 2;
8888                         }
8889                     }
8890                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
8891                                                  */
8892                         ender = toLOWER(ender);
8893                         *tmpbuf = (U8) ender;
8894                         foldlen = 1;
8895                     }
8896                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8897
8898                         /* Locale and /aa require more selectivity about the
8899                          * fold, so are handled below.  Otherwise, here, just
8900                          * use the fold */
8901                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8902                     }
8903                     else {
8904                         /* Under locale rules or /aa we are not to mix,
8905                          * respectively, ords < 256 or ASCII with non-.  So
8906                          * reject folds that mix them, using only the
8907                          * non-folded code point.  So do the fold to a
8908                          * temporary, and inspect each character in it. */
8909                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8910                         U8* s = trialbuf;
8911                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8912                         U8* e = s + foldlen;
8913                         bool fold_ok = TRUE;
8914
8915                         while (s < e) {
8916                             if (isASCII(*s)
8917                                 || (LOC && (UTF8_IS_INVARIANT(*s)
8918                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
8919                             {
8920                                 fold_ok = FALSE;
8921                                 break;
8922                             }
8923                             s += UTF8SKIP(s);
8924                         }
8925                         if (fold_ok) {
8926                             Copy(trialbuf, tmpbuf, foldlen, U8);
8927                             ender = tmpender;
8928                         }
8929                         else {
8930                             uvuni_to_utf8(tmpbuf, ender);
8931                             foldlen = UNISKIP(ender);
8932                         }
8933                     }
8934                 }
8935                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8936                     if (len)
8937                         p = oldp;
8938                     else if (UTF) {
8939                          if (FOLD) {
8940                               /* Emit all the Unicode characters. */
8941                               STRLEN numlen;
8942                               for (foldbuf = tmpbuf;
8943                                    foldlen;
8944                                    foldlen -= numlen) {
8945                                    ender = utf8_to_uvchr(foldbuf, &numlen);
8946                                    if (numlen > 0) {
8947                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
8948                                         s       += unilen;
8949                                         len     += unilen;
8950                                         /* In EBCDIC the numlen
8951                                          * and unilen can differ. */
8952                                         foldbuf += numlen;
8953                                         if (numlen >= foldlen)
8954                                              break;
8955                                    }
8956                                    else
8957                                         break; /* "Can't happen." */
8958                               }
8959                          }
8960                          else {
8961                               const STRLEN unilen = reguni(pRExC_state, ender, s);
8962                               if (unilen > 0) {
8963                                    s   += unilen;
8964                                    len += unilen;
8965                               }
8966                          }
8967                     }
8968                     else {
8969                         len++;
8970                         REGC((char)ender, s++);
8971                     }
8972                     break;
8973                 }
8974                 if (UTF) {
8975                      if (FOLD) {
8976                           /* Emit all the Unicode characters. */
8977                           STRLEN numlen;
8978                           for (foldbuf = tmpbuf;
8979                                foldlen;
8980                                foldlen -= numlen) {
8981                                ender = utf8_to_uvchr(foldbuf, &numlen);
8982                                if (numlen > 0) {
8983                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
8984                                     len     += unilen;
8985                                     s       += unilen;
8986                                     /* In EBCDIC the numlen
8987                                      * and unilen can differ. */
8988                                     foldbuf += numlen;
8989                                     if (numlen >= foldlen)
8990                                          break;
8991                                }
8992                                else
8993                                     break;
8994                           }
8995                      }
8996                      else {
8997                           const STRLEN unilen = reguni(pRExC_state, ender, s);
8998                           if (unilen > 0) {
8999                                s   += unilen;
9000                                len += unilen;
9001                           }
9002                      }
9003                      len--;
9004                 }
9005                 else
9006                     REGC((char)ender, s++);
9007             }
9008         loopdone:   /* Jumped to when encounters something that shouldn't be in
9009                        the node */
9010             RExC_parse = p - 1;
9011             Set_Node_Cur_Length(ret); /* MJD */
9012             nextchar(pRExC_state);
9013             {
9014                 /* len is STRLEN which is unsigned, need to copy to signed */
9015                 IV iv = len;
9016                 if (iv < 0)
9017                     vFAIL("Internal disaster");
9018             }
9019             if (len > 0)
9020                 *flagp |= HASWIDTH;
9021             if (len == 1 && UNI_IS_INVARIANT(ender))
9022                 *flagp |= SIMPLE;
9023
9024             if (SIZE_ONLY)
9025                 RExC_size += STR_SZ(len);
9026             else {
9027                 STR_LEN(ret) = len;
9028                 RExC_emit += STR_SZ(len);
9029             }
9030         }
9031         break;
9032     }
9033
9034     return(ret);
9035
9036 /* Jumped to when an unrecognized character set is encountered */
9037 bad_charset:
9038     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9039     return(NULL);
9040 }
9041
9042 STATIC char *
9043 S_regwhite( RExC_state_t *pRExC_state, char *p )
9044 {
9045     const char *e = RExC_end;
9046
9047     PERL_ARGS_ASSERT_REGWHITE;
9048
9049     while (p < e) {
9050         if (isSPACE(*p))
9051             ++p;
9052         else if (*p == '#') {
9053             bool ended = 0;
9054             do {
9055                 if (*p++ == '\n') {
9056                     ended = 1;
9057                     break;
9058                 }
9059             } while (p < e);
9060             if (!ended)
9061                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9062         }
9063         else
9064             break;
9065     }
9066     return p;
9067 }
9068
9069 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9070    Character classes ([:foo:]) can also be negated ([:^foo:]).
9071    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9072    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9073    but trigger failures because they are currently unimplemented. */
9074
9075 #define POSIXCC_DONE(c)   ((c) == ':')
9076 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9077 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9078
9079 STATIC I32
9080 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9081 {
9082     dVAR;
9083     I32 namedclass = OOB_NAMEDCLASS;
9084
9085     PERL_ARGS_ASSERT_REGPPOSIXCC;
9086
9087     if (value == '[' && RExC_parse + 1 < RExC_end &&
9088         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9089         POSIXCC(UCHARAT(RExC_parse))) {
9090         const char c = UCHARAT(RExC_parse);
9091         char* const s = RExC_parse++;
9092
9093         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9094             RExC_parse++;
9095         if (RExC_parse == RExC_end)
9096             /* Grandfather lone [:, [=, [. */
9097             RExC_parse = s;
9098         else {
9099             const char* const t = RExC_parse++; /* skip over the c */
9100             assert(*t == c);
9101
9102             if (UCHARAT(RExC_parse) == ']') {
9103                 const char *posixcc = s + 1;
9104                 RExC_parse++; /* skip over the ending ] */
9105
9106                 if (*s == ':') {
9107                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9108                     const I32 skip = t - posixcc;
9109
9110                     /* Initially switch on the length of the name.  */
9111                     switch (skip) {
9112                     case 4:
9113                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9114                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9115                         break;
9116                     case 5:
9117                         /* Names all of length 5.  */
9118                         /* alnum alpha ascii blank cntrl digit graph lower
9119                            print punct space upper  */
9120                         /* Offset 4 gives the best switch position.  */
9121                         switch (posixcc[4]) {
9122                         case 'a':
9123                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9124                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9125                             break;
9126                         case 'e':
9127                             if (memEQ(posixcc, "spac", 4)) /* space */
9128                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9129                             break;
9130                         case 'h':
9131                             if (memEQ(posixcc, "grap", 4)) /* graph */
9132                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9133                             break;
9134                         case 'i':
9135                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9136                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9137                             break;
9138                         case 'k':
9139                             if (memEQ(posixcc, "blan", 4)) /* blank */
9140                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9141                             break;
9142                         case 'l':
9143                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9144                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9145                             break;
9146                         case 'm':
9147                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9148                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9149                             break;
9150                         case 'r':
9151                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9152                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9153                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9154                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9155                             break;
9156                         case 't':
9157                             if (memEQ(posixcc, "digi", 4)) /* digit */
9158                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9159                             else if (memEQ(posixcc, "prin", 4)) /* print */
9160                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9161                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9162                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9163                             break;
9164                         }
9165                         break;
9166                     case 6:
9167                         if (memEQ(posixcc, "xdigit", 6))
9168                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9169                         break;
9170                     }
9171
9172                     if (namedclass == OOB_NAMEDCLASS)
9173                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9174                                       t - s - 1, s + 1);
9175                     assert (posixcc[skip] == ':');
9176                     assert (posixcc[skip+1] == ']');
9177                 } else if (!SIZE_ONLY) {
9178                     /* [[=foo=]] and [[.foo.]] are still future. */
9179
9180                     /* adjust RExC_parse so the warning shows after
9181                        the class closes */
9182                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9183                         RExC_parse++;
9184                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9185                 }
9186             } else {
9187                 /* Maternal grandfather:
9188                  * "[:" ending in ":" but not in ":]" */
9189                 RExC_parse = s;
9190             }
9191         }
9192     }
9193
9194     return namedclass;
9195 }
9196
9197 STATIC void
9198 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9199 {
9200     dVAR;
9201
9202     PERL_ARGS_ASSERT_CHECKPOSIXCC;
9203
9204     if (POSIXCC(UCHARAT(RExC_parse))) {
9205         const char *s = RExC_parse;
9206         const char  c = *s++;
9207
9208         while (isALNUM(*s))
9209             s++;
9210         if (*s && c == *s && s[1] == ']') {
9211             ckWARN3reg(s+2,
9212                        "POSIX syntax [%c %c] belongs inside character classes",
9213                        c, c);
9214
9215             /* [[=foo=]] and [[.foo.]] are still future. */
9216             if (POSIXCC_NOTYET(c)) {
9217                 /* adjust RExC_parse so the error shows after
9218                    the class closes */
9219                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9220                     NOOP;
9221                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9222             }
9223         }
9224     }
9225 }
9226
9227 /* No locale test, and always Unicode semantics */
9228 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
9229 ANYOF_##NAME:                                                                  \
9230         for (value = 0; value < 256; value++)                                  \
9231             if (TEST)                                                          \
9232             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9233     yesno = '+';                                                               \
9234     what = WORD;                                                               \
9235     break;                                                                     \
9236 case ANYOF_N##NAME:                                                            \
9237         for (value = 0; value < 256; value++)                                  \
9238             if (!TEST)                                                         \
9239             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9240     yesno = '!';                                                               \
9241     what = WORD;                                                               \
9242     break
9243
9244 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9245  * there are two tests passed in, to use depending on that. There aren't any
9246  * cases where the label is different from the name, so no need for that
9247  * parameter */
9248 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
9249 ANYOF_##NAME:                                                                  \
9250     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
9251     else if (UNI_SEMANTICS) {                                                  \
9252         for (value = 0; value < 256; value++) {                                \
9253             if (TEST_8(value)) stored +=                                       \
9254                       set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9255         }                                                                      \
9256     }                                                                          \
9257     else {                                                                     \
9258         for (value = 0; value < 128; value++) {                                \
9259             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
9260                 set_regclass_bit(pRExC_state, ret,                     \
9261                                    (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
9262         }                                                                      \
9263     }                                                                          \
9264     yesno = '+';                                                               \
9265     what = WORD;                                                               \
9266     break;                                                                     \
9267 case ANYOF_N##NAME:                                                            \
9268     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
9269     else if (UNI_SEMANTICS) {                                                  \
9270         for (value = 0; value < 256; value++) {                                \
9271             if (! TEST_8(value)) stored +=                                     \
9272                     set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
9273         }                                                                      \
9274     }                                                                          \
9275     else {                                                                     \
9276         for (value = 0; value < 128; value++) {                                \
9277             if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
9278                         pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
9279         }                                                                      \
9280         if (AT_LEAST_ASCII_RESTRICTED) {                                       \
9281             for (value = 128; value < 256; value++) {                          \
9282              stored += set_regclass_bit(                                     \
9283                            pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9284             }                                                                  \
9285             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
9286         }                                                                      \
9287         else {                                                                 \
9288             /* For a non-ut8 target string with DEPENDS semantics, all above   \
9289              * ASCII Latin1 code points match the complement of any of the     \
9290              * classes.  But in utf8, they have their Unicode semantics, so    \
9291              * can't just set them in the bitmap, or else regexec.c will think \
9292              * they matched when they shouldn't. */                            \
9293             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
9294         }                                                                      \
9295     }                                                                          \
9296     yesno = '!';                                                               \
9297     what = WORD;                                                               \
9298     break
9299
9300 STATIC U8
9301 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9302 {
9303
9304     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9305      * Locale folding is done at run-time, so this function should not be
9306      * called for nodes that are for locales.
9307      *
9308      * This function sets the bit corresponding to the fold of the input
9309      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
9310      * 'F' is 'f'.
9311      *
9312      * It also knows about the characters that are in the bitmap that have
9313      * folds that are matchable only outside it, and sets the appropriate lists
9314      * and flags.
9315      *
9316      * It returns the number of bits that actually changed from 0 to 1 */
9317
9318     U8 stored = 0;
9319     U8 fold;
9320
9321     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9322
9323     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9324                                     : PL_fold[value];
9325
9326     /* It assumes the bit for 'value' has already been set */
9327     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9328         ANYOF_BITMAP_SET(node, fold);
9329         stored++;
9330     }
9331     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9332         /* Certain Latin1 characters have matches outside the bitmap.  To get
9333          * here, 'value' is one of those characters.   None of these matches is
9334          * valid for ASCII characters under /aa, which have been excluded by
9335          * the 'if' above.  The matches fall into three categories:
9336          * 1) They are singly folded-to or -from an above 255 character, as
9337          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9338          *    WITH DIAERESIS;
9339          * 2) They are part of a multi-char fold with another character in the
9340          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9341          * 3) They are part of a multi-char fold with a character not in the
9342          *    bitmap, such as various ligatures.
9343          * We aren't dealing fully with multi-char folds, except we do deal
9344          * with the pattern containing a character that has a multi-char fold
9345          * (not so much the inverse).
9346          * For types 1) and 3), the matches only happen when the target string
9347          * is utf8; that's not true for 2), and we set a flag for it.
9348          *
9349          * The code below adds to the passed in inversion list the single fold
9350          * closures for 'value'.  The values are hard-coded here so that an
9351          * innocent-looking character class, like /[ks]/i won't have to go out
9352          * to disk to find the possible matches.  XXX It would be better to
9353          * generate these via regen, in case a new version of the Unicode
9354          * standard adds new mappings, though that is not really likely. */
9355         switch (value) {
9356             case 'k':
9357             case 'K':
9358                 /* KELVIN SIGN */
9359                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9360                 break;
9361             case 's':
9362             case 'S':
9363                 /* LATIN SMALL LETTER LONG S */
9364                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9365                 break;
9366             case MICRO_SIGN:
9367                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9368                                                  GREEK_SMALL_LETTER_MU);
9369                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9370                                                  GREEK_CAPITAL_LETTER_MU);
9371                 break;
9372             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9373             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9374                 /* ANGSTROM SIGN */
9375                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9376                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
9377                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9378                                                      PL_fold_latin1[value]);
9379                 }
9380                 break;
9381             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9382                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9383                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9384                 break;
9385             case LATIN_SMALL_LETTER_SHARP_S:
9386                 /* 0x1E9E is LATIN CAPITAL LETTER SHARP S */
9387                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x1E9E);
9388
9389                 /* Under /a, /d, and /u, this can match the two chars "ss" */
9390                 if (! MORE_ASCII_RESTRICTED) {
9391                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
9392
9393                     /* And under /u or /a, it can match even if the target is
9394                      * not utf8 */
9395                     if (AT_LEAST_UNI_SEMANTICS) {
9396                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9397                     }
9398                 }
9399                 break;
9400             case 'F': case 'f':
9401             case 'I': case 'i':
9402             case 'L': case 'l':
9403             case 'T': case 't':
9404                 /* These all are targets of multi-character folds, which can
9405                  * occur with only non-Latin1 characters in the fold, so they
9406                  * can match if the target string isn't UTF-8 */
9407                 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9408                 break;
9409             case 'A': case 'a':
9410             case 'H': case 'h':
9411             case 'J': case 'j':
9412             case 'N': case 'n':
9413             case 'W': case 'w':
9414             case 'Y': case 'y':
9415                 /* These all are targets of multi-character folds, which occur
9416                  * only with a non-Latin1 character as part of the fold, so
9417                  * they can't match unless the target string is in UTF-8, so no
9418                  * action here is necessary */
9419                 break;
9420             default:
9421                 /* Use deprecated warning to increase the chances of this
9422                  * being output */
9423                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9424                 break;
9425         }
9426     }
9427     else if (DEPENDS_SEMANTICS
9428             && ! isASCII(value)
9429             && PL_fold_latin1[value] != value)
9430     {
9431            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9432             * folds only when the target string is in UTF-8.  We add the fold
9433             * here to the list of things to match outside the bitmap, which
9434             * won't be looked at unless it is UTF8 (or else if something else
9435             * says to look even if not utf8, but those things better not happen
9436             * under DEPENDS semantics. */
9437         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9438     }
9439
9440     return stored;
9441 }
9442
9443
9444 PERL_STATIC_INLINE U8
9445 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9446 {
9447     /* This inline function sets a bit in the bitmap if not already set, and if
9448      * appropriate, its fold, returning the number of bits that actually
9449      * changed from 0 to 1 */
9450
9451     U8 stored;
9452
9453     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9454
9455     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
9456         return 0;
9457     }
9458
9459     ANYOF_BITMAP_SET(node, value);
9460     stored = 1;
9461
9462     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
9463         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9464     }
9465
9466     return stored;
9467 }
9468
9469 STATIC void
9470 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9471 {
9472     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9473      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
9474      * the multi-character folds of characters in the node */
9475     SV *sv;
9476
9477     PERL_ARGS_ASSERT_ADD_ALTERNATE;
9478
9479     if (! *alternate_ptr) {
9480         *alternate_ptr = newAV();
9481     }
9482     sv = newSVpvn_utf8((char*)string, len, TRUE);
9483     av_push(*alternate_ptr, sv);
9484     return;
9485 }
9486
9487 /*
9488    parse a class specification and produce either an ANYOF node that
9489    matches the pattern or perhaps will be optimized into an EXACTish node
9490    instead. The node contains a bit map for the first 256 characters, with the
9491    corresponding bit set if that character is in the list.  For characters
9492    above 255, a range list is used */
9493
9494 STATIC regnode *
9495 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9496 {
9497     dVAR;
9498     register UV nextvalue;
9499     register IV prevvalue = OOB_UNICODE;
9500     register IV range = 0;
9501     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9502     register regnode *ret;
9503     STRLEN numlen;
9504     IV namedclass;
9505     char *rangebegin = NULL;
9506     bool need_class = 0;
9507     SV *listsv = NULL;
9508     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9509                                       than just initialized.  */
9510     UV n;
9511
9512     /* code points this node matches that can't be stored in the bitmap */
9513     HV* nonbitmap = NULL;
9514
9515     /* The items that are to match that aren't stored in the bitmap, but are a
9516      * result of things that are stored there.  This is the fold closure of
9517      * such a character, either because it has DEPENDS semantics and shouldn't
9518      * be matched unless the target string is utf8, or is a code point that is
9519      * too large for the bit map, as for example, the fold of the MICRO SIGN is
9520      * above 255.  This all is solely for performance reasons.  By having this
9521      * code know the outside-the-bitmap folds that the bitmapped characters are
9522      * involved with, we don't have to go out to disk to find the list of
9523      * matches, unless the character class includes code points that aren't
9524      * storable in the bit map.  That means that a character class with an 's'
9525      * in it, for example, doesn't need to go out to disk to find everything
9526      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
9527      * empty unless there is something whose fold we don't know about, and will
9528      * have to go out to the disk to find. */
9529     HV* l1_fold_invlist = NULL;
9530
9531     /* List of multi-character folds that are matched by this node */
9532     AV* unicode_alternate  = NULL;
9533 #ifdef EBCDIC
9534     UV literal_endpoint = 0;
9535 #endif
9536     UV stored = 0;  /* how many chars stored in the bitmap */
9537
9538     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9539         case we need to change the emitted regop to an EXACT. */
9540     const char * orig_parse = RExC_parse;
9541     GET_RE_DEBUG_FLAGS_DECL;
9542
9543     PERL_ARGS_ASSERT_REGCLASS;
9544 #ifndef DEBUGGING
9545     PERL_UNUSED_ARG(depth);
9546 #endif
9547
9548     DEBUG_PARSE("clas");
9549
9550     /* Assume we are going to generate an ANYOF node. */
9551     ret = reganode(pRExC_state, ANYOF, 0);
9552
9553
9554     if (!SIZE_ONLY) {
9555         ANYOF_FLAGS(ret) = 0;
9556     }
9557
9558     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
9559         RExC_naughty++;
9560         RExC_parse++;
9561         if (!SIZE_ONLY)
9562             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9563     }
9564
9565     if (SIZE_ONLY) {
9566         RExC_size += ANYOF_SKIP;
9567         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9568     }
9569     else {
9570         RExC_emit += ANYOF_SKIP;
9571         if (LOC) {
9572             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9573         }
9574         ANYOF_BITMAP_ZERO(ret);
9575         listsv = newSVpvs("# comment\n");
9576         initial_listsv_len = SvCUR(listsv);
9577     }
9578
9579     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9580
9581     if (!SIZE_ONLY && POSIXCC(nextvalue))
9582         checkposixcc(pRExC_state);
9583
9584     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9585     if (UCHARAT(RExC_parse) == ']')
9586         goto charclassloop;
9587
9588 parseit:
9589     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9590
9591     charclassloop:
9592
9593         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9594
9595         if (!range)
9596             rangebegin = RExC_parse;
9597         if (UTF) {
9598             value = utf8n_to_uvchr((U8*)RExC_parse,
9599                                    RExC_end - RExC_parse,
9600                                    &numlen, UTF8_ALLOW_DEFAULT);
9601             RExC_parse += numlen;
9602         }
9603         else
9604             value = UCHARAT(RExC_parse++);
9605
9606         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9607         if (value == '[' && POSIXCC(nextvalue))
9608             namedclass = regpposixcc(pRExC_state, value);
9609         else if (value == '\\') {
9610             if (UTF) {
9611                 value = utf8n_to_uvchr((U8*)RExC_parse,
9612                                    RExC_end - RExC_parse,
9613                                    &numlen, UTF8_ALLOW_DEFAULT);
9614                 RExC_parse += numlen;
9615             }
9616             else
9617                 value = UCHARAT(RExC_parse++);
9618             /* Some compilers cannot handle switching on 64-bit integer
9619              * values, therefore value cannot be an UV.  Yes, this will
9620              * be a problem later if we want switch on Unicode.
9621              * A similar issue a little bit later when switching on
9622              * namedclass. --jhi */
9623             switch ((I32)value) {
9624             case 'w':   namedclass = ANYOF_ALNUM;       break;
9625             case 'W':   namedclass = ANYOF_NALNUM;      break;
9626             case 's':   namedclass = ANYOF_SPACE;       break;
9627             case 'S':   namedclass = ANYOF_NSPACE;      break;
9628             case 'd':   namedclass = ANYOF_DIGIT;       break;
9629             case 'D':   namedclass = ANYOF_NDIGIT;      break;
9630             case 'v':   namedclass = ANYOF_VERTWS;      break;
9631             case 'V':   namedclass = ANYOF_NVERTWS;     break;
9632             case 'h':   namedclass = ANYOF_HORIZWS;     break;
9633             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
9634             case 'N':  /* Handle \N{NAME} in class */
9635                 {
9636                     /* We only pay attention to the first char of
9637                     multichar strings being returned. I kinda wonder
9638                     if this makes sense as it does change the behaviour
9639                     from earlier versions, OTOH that behaviour was broken
9640                     as well. */
9641                     UV v; /* value is register so we cant & it /grrr */
9642                     if (reg_namedseq(pRExC_state, &v, NULL)) {
9643                         goto parseit;
9644                     }
9645                     value= v;
9646                 }
9647                 break;
9648             case 'p':
9649             case 'P':
9650                 {
9651                 char *e;
9652                 if (RExC_parse >= RExC_end)
9653                     vFAIL2("Empty \\%c{}", (U8)value);
9654                 if (*RExC_parse == '{') {
9655                     const U8 c = (U8)value;
9656                     e = strchr(RExC_parse++, '}');
9657                     if (!e)
9658                         vFAIL2("Missing right brace on \\%c{}", c);
9659                     while (isSPACE(UCHARAT(RExC_parse)))
9660                         RExC_parse++;
9661                     if (e == RExC_parse)
9662                         vFAIL2("Empty \\%c{}", c);
9663                     n = e - RExC_parse;
9664                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9665                         n--;
9666                 }
9667                 else {
9668                     e = RExC_parse;
9669                     n = 1;
9670                 }
9671                 if (!SIZE_ONLY) {
9672                     if (UCHARAT(RExC_parse) == '^') {
9673                          RExC_parse++;
9674                          n--;
9675                          value = value == 'p' ? 'P' : 'p'; /* toggle */
9676                          while (isSPACE(UCHARAT(RExC_parse))) {
9677                               RExC_parse++;
9678                               n--;
9679                          }
9680                     }
9681
9682                     /* Add the property name to the list.  If /i matching, give
9683                      * a different name which consists of the normal name
9684                      * sandwiched between two underscores and '_i'.  The design
9685                      * is discussed in the commit message for this. */
9686                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9687                                         (value=='p' ? '+' : '!'),
9688                                         (FOLD) ? "__" : "",
9689                                         (int)n,
9690                                         RExC_parse,
9691                                         (FOLD) ? "_i" : ""
9692                                     );
9693                 }
9694                 RExC_parse = e + 1;
9695
9696                 /* The \p could match something in the Latin1 range, hence
9697                  * something that isn't utf8 */
9698                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9699                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
9700
9701                 /* \p means they want Unicode semantics */
9702                 RExC_uni_semantics = 1;
9703                 }
9704                 break;
9705             case 'n':   value = '\n';                   break;
9706             case 'r':   value = '\r';                   break;
9707             case 't':   value = '\t';                   break;
9708             case 'f':   value = '\f';                   break;
9709             case 'b':   value = '\b';                   break;
9710             case 'e':   value = ASCII_TO_NATIVE('\033');break;
9711             case 'a':   value = ASCII_TO_NATIVE('\007');break;
9712             case 'o':
9713                 RExC_parse--;   /* function expects to be pointed at the 'o' */
9714                 {
9715                     const char* error_msg;
9716                     bool valid = grok_bslash_o(RExC_parse,
9717                                                &value,
9718                                                &numlen,
9719                                                &error_msg,
9720                                                SIZE_ONLY);
9721                     RExC_parse += numlen;
9722                     if (! valid) {
9723                         vFAIL(error_msg);
9724                     }
9725                 }
9726                 if (PL_encoding && value < 0x100) {
9727                     goto recode_encoding;
9728                 }
9729                 break;
9730             case 'x':
9731                 if (*RExC_parse == '{') {
9732                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9733                         | PERL_SCAN_DISALLOW_PREFIX;
9734                     char * const e = strchr(RExC_parse++, '}');
9735                     if (!e)
9736                         vFAIL("Missing right brace on \\x{}");
9737
9738                     numlen = e - RExC_parse;
9739                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9740                     RExC_parse = e + 1;
9741                 }
9742                 else {
9743                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9744                     numlen = 2;
9745                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9746                     RExC_parse += numlen;
9747                 }
9748                 if (PL_encoding && value < 0x100)
9749                     goto recode_encoding;
9750                 break;
9751             case 'c':
9752                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9753                 break;
9754             case '0': case '1': case '2': case '3': case '4':
9755             case '5': case '6': case '7':
9756                 {
9757                     /* Take 1-3 octal digits */
9758                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9759                     numlen = 3;
9760                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9761                     RExC_parse += numlen;
9762                     if (PL_encoding && value < 0x100)
9763                         goto recode_encoding;
9764                     break;
9765                 }
9766             recode_encoding:
9767                 {
9768                     SV* enc = PL_encoding;
9769                     value = reg_recode((const char)(U8)value, &enc);
9770                     if (!enc && SIZE_ONLY)
9771                         ckWARNreg(RExC_parse,
9772                                   "Invalid escape in the specified encoding");
9773                     break;
9774                 }
9775             default:
9776                 /* Allow \_ to not give an error */
9777                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9778                     ckWARN2reg(RExC_parse,
9779                                "Unrecognized escape \\%c in character class passed through",
9780                                (int)value);
9781                 }
9782                 break;
9783             }
9784         } /* end of \blah */
9785 #ifdef EBCDIC
9786         else
9787             literal_endpoint++;
9788 #endif
9789
9790         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9791
9792             /* What matches in a locale is not known until runtime, so need to
9793              * (one time per class) allocate extra space to pass to regexec.
9794              * The space will contain a bit for each named class that is to be
9795              * matched against.  This isn't needed for \p{} and pseudo-classes,
9796              * as they are not affected by locale, and hence are dealt with
9797              * separately */
9798             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9799                 need_class = 1;
9800                 if (SIZE_ONLY) {
9801                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9802                 }
9803                 else {
9804                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9805                     ANYOF_CLASS_ZERO(ret);
9806                 }
9807                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9808             }
9809
9810             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
9811              * literal, as is the character that began the false range, i.e.
9812              * the 'a' in the examples */
9813             if (range) {
9814                 if (!SIZE_ONLY) {
9815                     const int w =
9816                         RExC_parse >= rangebegin ?
9817                         RExC_parse - rangebegin : 0;
9818                     ckWARN4reg(RExC_parse,
9819                                "False [] range \"%*.*s\"",
9820                                w, w, rangebegin);
9821
9822                     stored +=
9823                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9824                     if (prevvalue < 256) {
9825                         stored +=
9826                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9827                     }
9828                     else {
9829                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9830                     }
9831                 }
9832
9833                 range = 0; /* this was not a true range */
9834             }
9835
9836
9837
9838             if (!SIZE_ONLY) {
9839                 const char *what = NULL;
9840                 char yesno = 0;
9841
9842                 /* Possible truncation here but in some 64-bit environments
9843                  * the compiler gets heartburn about switch on 64-bit values.
9844                  * A similar issue a little earlier when switching on value.
9845                  * --jhi */
9846                 switch ((I32)namedclass) {
9847
9848                 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9849                 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9850                 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9851                 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9852                 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9853                 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9854                 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9855                 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9856                 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9857                 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9858                 /* \s, \w match all unicode if utf8. */
9859                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9860                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9861                 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9862                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9863                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9864                 case ANYOF_ASCII:
9865                     if (LOC)
9866                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9867                     else {
9868                         for (value = 0; value < 128; value++)
9869                             stored +=
9870                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9871                     }
9872                     yesno = '+';
9873                     what = NULL;        /* Doesn't match outside ascii, so
9874                                            don't want to add +utf8:: */
9875                     break;
9876                 case ANYOF_NASCII:
9877                     if (LOC)
9878                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9879                     else {
9880                         for (value = 128; value < 256; value++)
9881                             stored +=
9882                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9883                     }
9884                     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9885                     yesno = '!';
9886                     what = "ASCII";
9887                     break;
9888                 case ANYOF_DIGIT:
9889                     if (LOC)
9890                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9891                     else {
9892                         /* consecutive digits assumed */
9893                         for (value = '0'; value <= '9'; value++)
9894                             stored +=
9895                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9896                     }
9897                     yesno = '+';
9898                     what = "Digit";
9899                     break;
9900                 case ANYOF_NDIGIT:
9901                     if (LOC)
9902                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9903                     else {
9904                         /* consecutive digits assumed */
9905                         for (value = 0; value < '0'; value++)
9906                             stored +=
9907                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9908                         for (value = '9' + 1; value < 256; value++)
9909                             stored +=
9910                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9911                     }
9912                     yesno = '!';
9913                     what = "Digit";
9914                     if (AT_LEAST_ASCII_RESTRICTED ) {
9915                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9916                     }
9917                     break;
9918                 case ANYOF_MAX:
9919                     /* this is to handle \p and \P */
9920                     break;
9921                 default:
9922                     vFAIL("Invalid [::] class");
9923                     break;
9924                 }
9925                 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9926                     /* Strings such as "+utf8::isWord\n" */
9927                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9928                 }
9929
9930                 continue;
9931             }
9932         } /* end of namedclass \blah */
9933
9934         if (range) {
9935             if (prevvalue > (IV)value) /* b-a */ {
9936                 const int w = RExC_parse - rangebegin;
9937                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9938                 range = 0; /* not a valid range */
9939             }
9940         }
9941         else {
9942             prevvalue = value; /* save the beginning of the range */
9943             if (RExC_parse+1 < RExC_end
9944                 && *RExC_parse == '-'
9945                 && RExC_parse[1] != ']')
9946             {
9947                 RExC_parse++;
9948
9949                 /* a bad range like \w-, [:word:]- ? */
9950                 if (namedclass > OOB_NAMEDCLASS) {
9951                     if (ckWARN(WARN_REGEXP)) {
9952                         const int w =
9953                             RExC_parse >= rangebegin ?
9954                             RExC_parse - rangebegin : 0;
9955                         vWARN4(RExC_parse,
9956                                "False [] range \"%*.*s\"",
9957                                w, w, rangebegin);
9958                     }
9959                     if (!SIZE_ONLY)
9960                         stored +=
9961                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9962                 } else
9963                     range = 1;  /* yeah, it's a range! */
9964                 continue;       /* but do it the next time */
9965             }
9966         }
9967
9968         /* non-Latin1 code point implies unicode semantics.  Must be set in
9969          * pass1 so is there for the whole of pass 2 */
9970         if (value > 255) {
9971             RExC_uni_semantics = 1;
9972         }
9973
9974         /* now is the next time */
9975         if (!SIZE_ONLY) {
9976             if (prevvalue < 256) {
9977                 const IV ceilvalue = value < 256 ? value : 255;
9978                 IV i;
9979 #ifdef EBCDIC
9980                 /* In EBCDIC [\x89-\x91] should include
9981                  * the \x8e but [i-j] should not. */
9982                 if (literal_endpoint == 2 &&
9983                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9984                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9985                 {
9986                     if (isLOWER(prevvalue)) {
9987                         for (i = prevvalue; i <= ceilvalue; i++)
9988                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9989                                 stored +=
9990                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9991                             }
9992                     } else {
9993                         for (i = prevvalue; i <= ceilvalue; i++)
9994                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9995                                 stored +=
9996                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9997                             }
9998                     }
9999                 }
10000                 else
10001 #endif
10002                       for (i = prevvalue; i <= ceilvalue; i++) {
10003                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10004                       }
10005           }
10006           if (value > 255) {
10007             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
10008             const UV natvalue      = NATIVE_TO_UNI(value);
10009             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
10010         }
10011 #ifdef EBCDIC
10012             literal_endpoint = 0;
10013 #endif
10014         }
10015
10016         range = 0; /* this range (if it was one) is done now */
10017     }
10018
10019
10020
10021     if (SIZE_ONLY)
10022         return ret;
10023     /****** !SIZE_ONLY AFTER HERE *********/
10024
10025     /* If folding and there are code points above 255, we calculate all
10026      * characters that could fold to or from the ones already on the list */
10027     if (FOLD && nonbitmap) {
10028         UV i;
10029
10030         HV* fold_intersection;
10031         UV* fold_list;
10032
10033         /* This is a list of all the characters that participate in folds
10034             * (except marks, etc in multi-char folds */
10035         if (! PL_utf8_foldable) {
10036             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10037             PL_utf8_foldable = _swash_to_invlist(swash);
10038         }
10039
10040         /* This is a hash that for a particular fold gives all characters
10041             * that are involved in it */
10042         if (! PL_utf8_foldclosures) {
10043
10044             /* If we were unable to find any folds, then we likely won't be
10045              * able to find the closures.  So just create an empty list.
10046              * Folding will effectively be restricted to the non-Unicode rules
10047              * hard-coded into Perl.  (This case happens legitimately during
10048              * compilation of Perl itself before the Unicode tables are
10049              * generated) */
10050             if (invlist_len(PL_utf8_foldable) == 0) {
10051                 PL_utf8_foldclosures = _new_invlist(0);
10052             } else {
10053                 /* If the folds haven't been read in, call a fold function
10054                     * to force that */
10055                 if (! PL_utf8_tofold) {
10056                     U8 dummy[UTF8_MAXBYTES+1];
10057                     STRLEN dummy_len;
10058                     to_utf8_fold((U8*) "A", dummy, &dummy_len);
10059                 }
10060                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10061             }
10062         }
10063
10064         /* Only the characters in this class that participate in folds need
10065             * be checked.  Get the intersection of this class and all the
10066             * possible characters that are foldable.  This can quickly narrow
10067             * down a large class */
10068         fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10069
10070         /* Now look at the foldable characters in this class individually */
10071         fold_list = invlist_array(fold_intersection);
10072         for (i = 0; i < invlist_len(fold_intersection); i++) {
10073             UV j;
10074
10075             /* The next entry is the beginning of the range that is in the
10076              * class */
10077             UV start = fold_list[i++];
10078
10079
10080             /* The next entry is the beginning of the next range, which
10081                 * isn't in the class, so the end of the current range is one
10082                 * less than that */
10083             UV end = fold_list[i] - 1;
10084
10085             /* Look at every character in the range */
10086             for (j = start; j <= end; j++) {
10087
10088                 /* Get its fold */
10089                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10090                 STRLEN foldlen;
10091                 const UV f = to_uni_fold(j, foldbuf, &foldlen);
10092
10093                 if (foldlen > (STRLEN)UNISKIP(f)) {
10094
10095                     /* Any multicharacter foldings (disallowed in
10096                         * lookbehind patterns) require the following
10097                         * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10098                         * E folds into "pq" and F folds into "rst", all other
10099                         * characters fold to single characters.  We save away
10100                         * these multicharacter foldings, to be later saved as
10101                         * part of the additional "s" data. */
10102                     if (! RExC_in_lookbehind) {
10103                         U8* loc = foldbuf;
10104                         U8* e = foldbuf + foldlen;
10105
10106                         /* If any of the folded characters of this are in
10107                             * the Latin1 range, tell the regex engine that
10108                             * this can match a non-utf8 target string.  The
10109                             * only multi-byte fold whose source is in the
10110                             * Latin1 range (U+00DF) applies only when the
10111                             * target string is utf8, or under unicode rules */
10112                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10113                             while (loc < e) {
10114
10115                                 /* Can't mix ascii with non- under /aa */
10116                                 if (MORE_ASCII_RESTRICTED
10117                                     && (isASCII(*loc) != isASCII(j)))
10118                                 {
10119                                     goto end_multi_fold;
10120                                 }
10121                                 if (UTF8_IS_INVARIANT(*loc)
10122                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
10123                                 {
10124                                     /* Can't mix above and below 256 under
10125                                         * LOC */
10126                                     if (LOC) {
10127                                         goto end_multi_fold;
10128                                     }
10129                                     ANYOF_FLAGS(ret)
10130                                             |= ANYOF_NONBITMAP_NON_UTF8;
10131                                     break;
10132                                 }
10133                                 loc += UTF8SKIP(loc);
10134                             }
10135                         }
10136
10137                         add_alternate(&unicode_alternate, foldbuf, foldlen);
10138                     end_multi_fold: ;
10139                     }
10140                 }
10141                 else {
10142                     /* Single character fold.  Add everything in its fold
10143                         * closure to the list that this node should match */
10144                     SV** listp;
10145
10146                     /* The fold closures data structure is a hash with the
10147                         * keys being every character that is folded to, like
10148                         * 'k', and the values each an array of everything that
10149                         * folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
10150                     if ((listp = hv_fetch(PL_utf8_foldclosures,
10151                                     (char *) foldbuf, foldlen, FALSE)))
10152                     {
10153                         AV* list = (AV*) *listp;
10154                         IV k;
10155                         for (k = 0; k <= av_len(list); k++) {
10156                             SV** c_p = av_fetch(list, k, FALSE);
10157                             UV c;
10158                             if (c_p == NULL) {
10159                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10160                             }
10161                             c = SvUV(*c_p);
10162
10163                             /* /aa doesn't allow folds between ASCII and
10164                                 * non-; /l doesn't allow them between above
10165                                 * and below 256 */
10166                             if ((MORE_ASCII_RESTRICTED
10167                                  && (isASCII(c) != isASCII(j)))
10168                                     || (LOC && ((c < 256) != (j < 256))))
10169                             {
10170                                 continue;
10171                             }
10172
10173                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10174                                 stored += set_regclass_bit(pRExC_state,
10175                                         ret,
10176                                         (U8) c,
10177                                         &l1_fold_invlist, &unicode_alternate);
10178                             }
10179                                 /* It may be that the code point is already
10180                                     * in this range or already in the bitmap,
10181                                     * in which case we need do nothing */
10182                             else if ((c < start || c > end)
10183                                         && (c > 255
10184                                             || ! ANYOF_BITMAP_TEST(ret, c)))
10185                             {
10186                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10187                             }
10188                         }
10189                     }
10190                 }
10191             }
10192         }
10193         invlist_destroy(fold_intersection);
10194     }
10195
10196     /* Combine the two lists into one. */
10197     if (l1_fold_invlist) {
10198         if (nonbitmap) {
10199             nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10200         }
10201         else {
10202             nonbitmap = l1_fold_invlist;
10203         }
10204     }
10205
10206     /* Here, we have calculated what code points should be in the character
10207      * class.   Now we can see about various optimizations.  Fold calculation
10208      * needs to take place before inversion.  Otherwise /[^k]/i would invert to
10209      * include K, which under /i would match k. */
10210
10211     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
10212      * set the FOLD flag yet, so this this does optimize those.  It doesn't
10213      * optimize locale.  Doing so perhaps could be done as long as there is
10214      * nothing like \w in it; some thought also would have to be given to the
10215      * interaction with above 0x100 chars */
10216     if (! LOC
10217         && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10218         && ! unicode_alternate
10219         && ! nonbitmap
10220         && SvCUR(listsv) == initial_listsv_len)
10221     {
10222         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10223             ANYOF_BITMAP(ret)[value] ^= 0xFF;
10224         stored = 256 - stored;
10225
10226         /* The inversion means that everything above 255 is matched; and at the
10227          * same time we clear the invert flag */
10228         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10229     }
10230
10231     /* Folding in the bitmap is taken care of above, but not for locale (for
10232      * which we have to wait to see what folding is in effect at runtime), and
10233      * for things not in the bitmap.  Set run-time fold flag for these */
10234     if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10235         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10236     }
10237
10238     /* A single character class can be "optimized" into an EXACTish node.
10239      * Note that since we don't currently count how many characters there are
10240      * outside the bitmap, we are XXX missing optimization possibilities for
10241      * them.  This optimization can't happen unless this is a truly single
10242      * character class, which means that it can't be an inversion into a
10243      * many-character class, and there must be no possibility of there being
10244      * things outside the bitmap.  'stored' (only) for locales doesn't include
10245      * \w, etc, so have to make a special test that they aren't present
10246      *
10247      * Similarly A 2-character class of the very special form like [bB] can be
10248      * optimized into an EXACTFish node, but only for non-locales, and for
10249      * characters which only have the two folds; so things like 'fF' and 'Ii'
10250      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10251      * FI'. */
10252     if (! nonbitmap
10253         && ! unicode_alternate
10254         && SvCUR(listsv) == initial_listsv_len
10255         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10256         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10257                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10258             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10259                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10260                                  /* If the latest code point has a fold whose
10261                                   * bit is set, it must be the only other one */
10262                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10263                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10264     {
10265         /* Note that the information needed to decide to do this optimization
10266          * is not currently available until the 2nd pass, and that the actually
10267          * used EXACTish node takes less space than the calculated ANYOF node,
10268          * and hence the amount of space calculated in the first pass is larger
10269          * than actually used, so this optimization doesn't gain us any space.
10270          * But an EXACT node is faster than an ANYOF node, and can be combined
10271          * with any adjacent EXACT nodes later by the optimizer for further
10272          * gains.  The speed of executing an EXACTF is similar to an ANYOF
10273          * node, so the optimization advantage comes from the ability to join
10274          * it to adjacent EXACT nodes */
10275
10276         const char * cur_parse= RExC_parse;
10277         U8 op;
10278         RExC_emit = (regnode *)orig_emit;
10279         RExC_parse = (char *)orig_parse;
10280
10281         if (stored == 1) {
10282
10283             /* A locale node with one point can be folded; all the other cases
10284              * with folding will have two points, since we calculate them above
10285              */
10286             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10287                  op = EXACTFL;
10288             }
10289             else {
10290                 op = EXACT;
10291             }
10292         }   /* else 2 chars in the bit map: the folds of each other */
10293         else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10294
10295             /* To join adjacent nodes, they must be the exact EXACTish type.
10296              * Try to use the most likely type, by using EXACTFU if the regex
10297              * calls for them, or is required because the character is
10298              * non-ASCII */
10299             op = EXACTFU;
10300         }
10301         else {    /* Otherwise, more likely to be EXACTF type */
10302             op = EXACTF;
10303         }
10304
10305         ret = reg_node(pRExC_state, op);
10306         RExC_parse = (char *)cur_parse;
10307         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10308             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10309             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10310             STR_LEN(ret)= 2;
10311             RExC_emit += STR_SZ(2);
10312         }
10313         else {
10314             *STRING(ret)= (char)value;
10315             STR_LEN(ret)= 1;
10316             RExC_emit += STR_SZ(1);
10317         }
10318         SvREFCNT_dec(listsv);
10319         return ret;
10320     }
10321
10322     if (nonbitmap) {
10323         UV* nonbitmap_array = invlist_array(nonbitmap);
10324         UV nonbitmap_len = invlist_len(nonbitmap);
10325         UV i;
10326
10327         /*  Here have the full list of items to match that aren't in the
10328          *  bitmap.  Convert to the structure that the rest of the code is
10329          *  expecting.   XXX That rest of the code should convert to this
10330          *  structure */
10331         for (i = 0; i < nonbitmap_len; i++) {
10332
10333             /* The next entry is the beginning of the range that is in the
10334              * class */
10335             UV start = nonbitmap_array[i++];
10336             UV end;
10337
10338             /* The next entry is the beginning of the next range, which isn't
10339              * in the class, so the end of the current range is one less than
10340              * that.  But if there is no next range, it means that the range
10341              * begun by 'start' extends to infinity, which for this platform
10342              * ends at UV_MAX */
10343             if (i == nonbitmap_len) {
10344                 end = UV_MAX;
10345             }
10346             else {
10347                 end = nonbitmap_array[i] - 1;
10348             }
10349
10350             if (start == end) {
10351                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10352             }
10353             else {
10354                 /* The \t sets the whole range */
10355                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10356                         /* XXX EBCDIC */
10357                                    start, end);
10358             }
10359         }
10360         invlist_destroy(nonbitmap);
10361     }
10362
10363     if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10364         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10365         SvREFCNT_dec(listsv);
10366         SvREFCNT_dec(unicode_alternate);
10367     }
10368     else {
10369
10370         AV * const av = newAV();
10371         SV *rv;
10372         /* The 0th element stores the character class description
10373          * in its textual form: used later (regexec.c:Perl_regclass_swash())
10374          * to initialize the appropriate swash (which gets stored in
10375          * the 1st element), and also useful for dumping the regnode.
10376          * The 2nd element stores the multicharacter foldings,
10377          * used later (regexec.c:S_reginclass()). */
10378         av_store(av, 0, listsv);
10379         av_store(av, 1, NULL);
10380         av_store(av, 2, MUTABLE_SV(unicode_alternate));
10381         if (unicode_alternate) { /* This node is variable length */
10382             OP(ret) = ANYOFV;
10383         }
10384         rv = newRV_noinc(MUTABLE_SV(av));
10385         n = add_data(pRExC_state, 1, "s");
10386         RExC_rxi->data->data[n] = (void*)rv;
10387         ARG_SET(ret, n);
10388     }
10389     return ret;
10390 }
10391 #undef _C_C_T_
10392
10393
10394 /* reg_skipcomment()
10395
10396    Absorbs an /x style # comments from the input stream.
10397    Returns true if there is more text remaining in the stream.
10398    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10399    terminates the pattern without including a newline.
10400
10401    Note its the callers responsibility to ensure that we are
10402    actually in /x mode
10403
10404 */
10405
10406 STATIC bool
10407 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10408 {
10409     bool ended = 0;
10410
10411     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10412
10413     while (RExC_parse < RExC_end)
10414         if (*RExC_parse++ == '\n') {
10415             ended = 1;
10416             break;
10417         }
10418     if (!ended) {
10419         /* we ran off the end of the pattern without ending
10420            the comment, so we have to add an \n when wrapping */
10421         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10422         return 0;
10423     } else
10424         return 1;
10425 }
10426
10427 /* nextchar()
10428
10429    Advances the parse position, and optionally absorbs
10430    "whitespace" from the inputstream.
10431
10432    Without /x "whitespace" means (?#...) style comments only,
10433    with /x this means (?#...) and # comments and whitespace proper.
10434
10435    Returns the RExC_parse point from BEFORE the scan occurs.
10436
10437    This is the /x friendly way of saying RExC_parse++.
10438 */
10439
10440 STATIC char*
10441 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10442 {
10443     char* const retval = RExC_parse++;
10444
10445     PERL_ARGS_ASSERT_NEXTCHAR;
10446
10447     for (;;) {
10448         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10449                 RExC_parse[2] == '#') {
10450             while (*RExC_parse != ')') {
10451                 if (RExC_parse == RExC_end)
10452                     FAIL("Sequence (?#... not terminated");
10453                 RExC_parse++;
10454             }
10455             RExC_parse++;
10456             continue;
10457         }
10458         if (RExC_flags & RXf_PMf_EXTENDED) {
10459             if (isSPACE(*RExC_parse)) {
10460                 RExC_parse++;
10461                 continue;
10462             }
10463             else if (*RExC_parse == '#') {
10464                 if ( reg_skipcomment( pRExC_state ) )
10465                     continue;
10466             }
10467         }
10468         return retval;
10469     }
10470 }
10471
10472 /*
10473 - reg_node - emit a node
10474 */
10475 STATIC regnode *                        /* Location. */
10476 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10477 {
10478     dVAR;
10479     register regnode *ptr;
10480     regnode * const ret = RExC_emit;
10481     GET_RE_DEBUG_FLAGS_DECL;
10482
10483     PERL_ARGS_ASSERT_REG_NODE;
10484
10485     if (SIZE_ONLY) {
10486         SIZE_ALIGN(RExC_size);
10487         RExC_size += 1;
10488         return(ret);
10489     }
10490     if (RExC_emit >= RExC_emit_bound)
10491         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10492
10493     NODE_ALIGN_FILL(ret);
10494     ptr = ret;
10495     FILL_ADVANCE_NODE(ptr, op);
10496     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 1);
10497 #ifdef RE_TRACK_PATTERN_OFFSETS
10498     if (RExC_offsets) {         /* MJD */
10499         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10500               "reg_node", __LINE__,
10501               PL_reg_name[op],
10502               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10503                 ? "Overwriting end of array!\n" : "OK",
10504               (UV)(RExC_emit - RExC_emit_start),
10505               (UV)(RExC_parse - RExC_start),
10506               (UV)RExC_offsets[0]));
10507         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10508     }
10509 #endif
10510     RExC_emit = ptr;
10511     return(ret);
10512 }
10513
10514 /*
10515 - reganode - emit a node with an argument
10516 */
10517 STATIC regnode *                        /* Location. */
10518 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10519 {
10520     dVAR;
10521     register regnode *ptr;
10522     regnode * const ret = RExC_emit;
10523     GET_RE_DEBUG_FLAGS_DECL;
10524
10525     PERL_ARGS_ASSERT_REGANODE;
10526
10527     if (SIZE_ONLY) {
10528         SIZE_ALIGN(RExC_size);
10529         RExC_size += 2;
10530         /*
10531            We can't do this:
10532
10533            assert(2==regarglen[op]+1);
10534
10535            Anything larger than this has to allocate the extra amount.
10536            If we changed this to be:
10537
10538            RExC_size += (1 + regarglen[op]);
10539
10540            then it wouldn't matter. Its not clear what side effect
10541            might come from that so its not done so far.
10542            -- dmq
10543         */
10544         return(ret);
10545     }
10546     if (RExC_emit >= RExC_emit_bound)
10547         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10548
10549     NODE_ALIGN_FILL(ret);
10550     ptr = ret;
10551     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10552     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 2);
10553 #ifdef RE_TRACK_PATTERN_OFFSETS
10554     if (RExC_offsets) {         /* MJD */
10555         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10556               "reganode",
10557               __LINE__,
10558               PL_reg_name[op],
10559               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10560               "Overwriting end of array!\n" : "OK",
10561               (UV)(RExC_emit - RExC_emit_start),
10562               (UV)(RExC_parse - RExC_start),
10563               (UV)RExC_offsets[0]));
10564         Set_Cur_Node_Offset;
10565     }
10566 #endif
10567     RExC_emit = ptr;
10568     return(ret);
10569 }
10570
10571 /*
10572 - reguni - emit (if appropriate) a Unicode character
10573 */
10574 STATIC STRLEN
10575 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10576 {
10577     dVAR;
10578
10579     PERL_ARGS_ASSERT_REGUNI;
10580
10581     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10582 }
10583
10584 /*
10585 - reginsert - insert an operator in front of already-emitted operand
10586 *
10587 * Means relocating the operand.
10588 */
10589 STATIC void
10590 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10591 {
10592     dVAR;
10593     register regnode *src;
10594     register regnode *dst;
10595     register regnode *place;
10596     const int offset = regarglen[(U8)op];
10597     const int size = NODE_STEP_REGNODE + offset;
10598     GET_RE_DEBUG_FLAGS_DECL;
10599
10600     PERL_ARGS_ASSERT_REGINSERT;
10601     PERL_UNUSED_ARG(depth);
10602 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10603     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10604     if (SIZE_ONLY) {
10605         RExC_size += size;
10606         return;
10607     }
10608
10609     src = RExC_emit;
10610     RExC_emit += size;
10611     dst = RExC_emit;
10612     if (RExC_open_parens) {
10613         int paren;
10614         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10615         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10616             if ( RExC_open_parens[paren] >= opnd ) {
10617                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10618                 RExC_open_parens[paren] += size;
10619             } else {
10620                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10621             }
10622             if ( RExC_close_parens[paren] >= opnd ) {
10623                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10624                 RExC_close_parens[paren] += size;
10625             } else {
10626                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10627             }
10628         }
10629     }
10630
10631     while (src > opnd) {
10632         StructCopy(--src, --dst, regnode);
10633 #ifdef RE_TRACK_PATTERN_OFFSETS
10634         if (RExC_offsets) {     /* MJD 20010112 */
10635             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10636                   "reg_insert",
10637                   __LINE__,
10638                   PL_reg_name[op],
10639                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10640                     ? "Overwriting end of array!\n" : "OK",
10641                   (UV)(src - RExC_emit_start),
10642                   (UV)(dst - RExC_emit_start),
10643                   (UV)RExC_offsets[0]));
10644             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10645             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10646         }
10647 #endif
10648     }
10649
10650
10651     place = opnd;               /* Op node, where operand used to be. */
10652 #ifdef RE_TRACK_PATTERN_OFFSETS
10653     if (RExC_offsets) {         /* MJD */
10654         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10655               "reginsert",
10656               __LINE__,
10657               PL_reg_name[op],
10658               (UV)(place - RExC_emit_start) > RExC_offsets[0]
10659               ? "Overwriting end of array!\n" : "OK",
10660               (UV)(place - RExC_emit_start),
10661               (UV)(RExC_parse - RExC_start),
10662               (UV)RExC_offsets[0]));
10663         Set_Node_Offset(place, RExC_parse);
10664         Set_Node_Length(place, 1);
10665     }
10666 #endif
10667     src = NEXTOPER(place);
10668     FILL_ADVANCE_NODE(place, op);
10669     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (place) - 1);
10670     Zero(src, offset, regnode);
10671 }
10672
10673 /*
10674 - regtail - set the next-pointer at the end of a node chain of p to val.
10675 - SEE ALSO: regtail_study
10676 */
10677 /* TODO: All three parms should be const */
10678 STATIC void
10679 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10680 {
10681     dVAR;
10682     register regnode *scan;
10683     GET_RE_DEBUG_FLAGS_DECL;
10684
10685     PERL_ARGS_ASSERT_REGTAIL;
10686 #ifndef DEBUGGING
10687     PERL_UNUSED_ARG(depth);
10688 #endif
10689
10690     if (SIZE_ONLY)
10691         return;
10692
10693     /* Find last node. */
10694     scan = p;
10695     for (;;) {
10696         regnode * const temp = regnext(scan);
10697         DEBUG_PARSE_r({
10698             SV * const mysv=sv_newmortal();
10699             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10700             regprop(RExC_rx, mysv, scan);
10701             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10702                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10703                     (temp == NULL ? "->" : ""),
10704                     (temp == NULL ? PL_reg_name[OP(val)] : "")
10705             );
10706         });
10707         if (temp == NULL)
10708             break;
10709         scan = temp;
10710     }
10711
10712     if (reg_off_by_arg[OP(scan)]) {
10713         ARG_SET(scan, val - scan);
10714     }
10715     else {
10716         NEXT_OFF(scan) = val - scan;
10717     }
10718 }
10719
10720 #ifdef DEBUGGING
10721 /*
10722 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10723 - Look for optimizable sequences at the same time.
10724 - currently only looks for EXACT chains.
10725
10726 This is experimental code. The idea is to use this routine to perform
10727 in place optimizations on branches and groups as they are constructed,
10728 with the long term intention of removing optimization from study_chunk so
10729 that it is purely analytical.
10730
10731 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10732 to control which is which.
10733
10734 */
10735 /* TODO: All four parms should be const */
10736
10737 STATIC U8
10738 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10739 {
10740     dVAR;
10741     register regnode *scan;
10742     U8 exact = PSEUDO;
10743 #ifdef EXPERIMENTAL_INPLACESCAN
10744     I32 min = 0;
10745 #endif
10746     GET_RE_DEBUG_FLAGS_DECL;
10747
10748     PERL_ARGS_ASSERT_REGTAIL_STUDY;
10749
10750
10751     if (SIZE_ONLY)
10752         return exact;
10753
10754     /* Find last node. */
10755
10756     scan = p;
10757     for (;;) {
10758         regnode * const temp = regnext(scan);
10759 #ifdef EXPERIMENTAL_INPLACESCAN
10760         if (PL_regkind[OP(scan)] == EXACT)
10761             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10762                 return EXACT;
10763 #endif
10764         if ( exact ) {
10765             switch (OP(scan)) {
10766                 case EXACT:
10767                 case EXACTF:
10768                 case EXACTFA:
10769                 case EXACTFU:
10770                 case EXACTFL:
10771                         if( exact == PSEUDO )
10772                             exact= OP(scan);
10773                         else if ( exact != OP(scan) )
10774                             exact= 0;
10775                 case NOTHING:
10776                     break;
10777                 default:
10778                     exact= 0;
10779             }
10780         }
10781         DEBUG_PARSE_r({
10782             SV * const mysv=sv_newmortal();
10783             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10784             regprop(RExC_rx, mysv, scan);
10785             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10786                 SvPV_nolen_const(mysv),
10787                 REG_NODE_NUM(scan),
10788                 PL_reg_name[exact]);
10789         });
10790         if (temp == NULL)
10791             break;
10792         scan = temp;
10793     }
10794     DEBUG_PARSE_r({
10795         SV * const mysv_val=sv_newmortal();
10796         DEBUG_PARSE_MSG("");
10797         regprop(RExC_rx, mysv_val, val);
10798         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10799                       SvPV_nolen_const(mysv_val),
10800                       (IV)REG_NODE_NUM(val),
10801                       (IV)(val - scan)
10802         );
10803     });
10804     if (reg_off_by_arg[OP(scan)]) {
10805         ARG_SET(scan, val - scan);
10806     }
10807     else {
10808         NEXT_OFF(scan) = val - scan;
10809     }
10810
10811     return exact;
10812 }
10813 #endif
10814
10815 /*
10816  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10817  */
10818 #ifdef DEBUGGING
10819 static void
10820 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10821 {
10822     int bit;
10823     int set=0;
10824     regex_charset cs;
10825
10826     for (bit=0; bit<32; bit++) {
10827         if (flags & (1<<bit)) {
10828             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
10829                 continue;
10830             }
10831             if (!set++ && lead)
10832                 PerlIO_printf(Perl_debug_log, "%s",lead);
10833             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10834         }
10835     }
10836     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10837             if (!set++ && lead) {
10838                 PerlIO_printf(Perl_debug_log, "%s",lead);
10839             }
10840             switch (cs) {
10841                 case REGEX_UNICODE_CHARSET:
10842                     PerlIO_printf(Perl_debug_log, "UNICODE");
10843                     break;
10844                 case REGEX_LOCALE_CHARSET:
10845                     PerlIO_printf(Perl_debug_log, "LOCALE");
10846                     break;
10847                 case REGEX_ASCII_RESTRICTED_CHARSET:
10848                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10849                     break;
10850                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10851                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10852                     break;
10853                 default:
10854                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10855                     break;
10856             }
10857     }
10858     if (lead)  {
10859         if (set)
10860             PerlIO_printf(Perl_debug_log, "\n");
10861         else
10862             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10863     }
10864 }
10865 #endif
10866
10867 void
10868 Perl_regdump(pTHX_ const regexp *r)
10869 {
10870 #ifdef DEBUGGING
10871     dVAR;
10872     SV * const sv = sv_newmortal();
10873     SV *dsv= sv_newmortal();
10874     RXi_GET_DECL(r,ri);
10875     GET_RE_DEBUG_FLAGS_DECL;
10876
10877     PERL_ARGS_ASSERT_REGDUMP;
10878
10879     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10880
10881     /* Header fields of interest. */
10882     if (r->anchored_substr) {
10883         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10884             RE_SV_DUMPLEN(r->anchored_substr), 30);
10885         PerlIO_printf(Perl_debug_log,
10886                       "anchored %s%s at %"IVdf" ",
10887                       s, RE_SV_TAIL(r->anchored_substr),
10888                       (IV)r->anchored_offset);
10889     } else if (r->anchored_utf8) {
10890         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10891             RE_SV_DUMPLEN(r->anchored_utf8), 30);
10892         PerlIO_printf(Perl_debug_log,
10893                       "anchored utf8 %s%s at %"IVdf" ",
10894                       s, RE_SV_TAIL(r->anchored_utf8),
10895                       (IV)r->anchored_offset);
10896     }
10897     if (r->float_substr) {
10898         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10899             RE_SV_DUMPLEN(r->float_substr), 30);
10900         PerlIO_printf(Perl_debug_log,
10901                       "floating %s%s at %"IVdf"..%"UVuf" ",
10902                       s, RE_SV_TAIL(r->float_substr),
10903                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10904     } else if (r->float_utf8) {
10905         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10906             RE_SV_DUMPLEN(r->float_utf8), 30);
10907         PerlIO_printf(Perl_debug_log,
10908                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10909                       s, RE_SV_TAIL(r->float_utf8),
10910                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10911     }
10912     if (r->check_substr || r->check_utf8)
10913         PerlIO_printf(Perl_debug_log,
10914                       (const char *)
10915                       (r->check_substr == r->float_substr
10916                        && r->check_utf8 == r->float_utf8
10917                        ? "(checking floating" : "(checking anchored"));
10918     if (r->extflags & RXf_NOSCAN)
10919         PerlIO_printf(Perl_debug_log, " noscan");
10920     if (r->extflags & RXf_CHECK_ALL)
10921         PerlIO_printf(Perl_debug_log, " isall");
10922     if (r->check_substr || r->check_utf8)
10923         PerlIO_printf(Perl_debug_log, ") ");
10924
10925     if (ri->regstclass) {
10926         regprop(r, sv, ri->regstclass);
10927         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10928     }
10929     if (r->extflags & RXf_ANCH) {
10930         PerlIO_printf(Perl_debug_log, "anchored");
10931         if (r->extflags & RXf_ANCH_BOL)
10932             PerlIO_printf(Perl_debug_log, "(BOL)");
10933         if (r->extflags & RXf_ANCH_MBOL)
10934             PerlIO_printf(Perl_debug_log, "(MBOL)");
10935         if (r->extflags & RXf_ANCH_SBOL)
10936             PerlIO_printf(Perl_debug_log, "(SBOL)");
10937         if (r->extflags & RXf_ANCH_GPOS)
10938             PerlIO_printf(Perl_debug_log, "(GPOS)");
10939         PerlIO_putc(Perl_debug_log, ' ');
10940     }
10941     if (r->extflags & RXf_GPOS_SEEN)
10942         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10943     if (r->intflags & PREGf_SKIP)
10944         PerlIO_printf(Perl_debug_log, "plus ");
10945     if (r->intflags & PREGf_IMPLICIT)
10946         PerlIO_printf(Perl_debug_log, "implicit ");
10947     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10948     if (r->extflags & RXf_EVAL_SEEN)
10949         PerlIO_printf(Perl_debug_log, "with eval ");
10950     PerlIO_printf(Perl_debug_log, "\n");
10951     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10952 #else
10953     PERL_ARGS_ASSERT_REGDUMP;
10954     PERL_UNUSED_CONTEXT;
10955     PERL_UNUSED_ARG(r);
10956 #endif  /* DEBUGGING */
10957 }
10958
10959 /*
10960 - regprop - printable representation of opcode
10961 */
10962 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10963 STMT_START { \
10964         if (do_sep) {                           \
10965             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10966             if (flags & ANYOF_INVERT)           \
10967                 /*make sure the invert info is in each */ \
10968                 sv_catpvs(sv, "^");             \
10969             do_sep = 0;                         \
10970         }                                       \
10971 } STMT_END
10972
10973 void
10974 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10975 {
10976 #ifdef DEBUGGING
10977     dVAR;
10978     register int k;
10979     RXi_GET_DECL(prog,progi);
10980     GET_RE_DEBUG_FLAGS_DECL;
10981
10982     PERL_ARGS_ASSERT_REGPROP;
10983
10984     sv_setpvs(sv, "");
10985
10986     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
10987         /* It would be nice to FAIL() here, but this may be called from
10988            regexec.c, and it would be hard to supply pRExC_state. */
10989         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10990     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10991
10992     k = PL_regkind[OP(o)];
10993
10994     if (k == EXACT) {
10995         sv_catpvs(sv, " ");
10996         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10997          * is a crude hack but it may be the best for now since
10998          * we have no flag "this EXACTish node was UTF-8"
10999          * --jhi */
11000         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
11001                   PERL_PV_ESCAPE_UNI_DETECT |
11002                   PERL_PV_ESCAPE_NONASCII   |
11003                   PERL_PV_PRETTY_ELLIPSES   |
11004                   PERL_PV_PRETTY_LTGT       |
11005                   PERL_PV_PRETTY_NOCLEAR
11006                   );
11007     } else if (k == TRIE) {
11008         /* print the details of the trie in dumpuntil instead, as
11009          * progi->data isn't available here */
11010         const char op = OP(o);
11011         const U32 n = ARG(o);
11012         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11013                (reg_ac_data *)progi->data->data[n] :
11014                NULL;
11015         const reg_trie_data * const trie
11016             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11017
11018         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11019         DEBUG_TRIE_COMPILE_r(
11020             Perl_sv_catpvf(aTHX_ sv,
11021                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11022                 (UV)trie->startstate,
11023                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11024                 (UV)trie->wordcount,
11025                 (UV)trie->minlen,
11026                 (UV)trie->maxlen,
11027                 (UV)TRIE_CHARCOUNT(trie),
11028                 (UV)trie->uniquecharcount
11029             )
11030         );
11031         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11032             int i;
11033             int rangestart = -1;
11034             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11035             sv_catpvs(sv, "[");
11036             for (i = 0; i <= 256; i++) {
11037                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
11038                     if (rangestart == -1)
11039                         rangestart = i;
11040                 } else if (rangestart != -1) {
11041                     if (i <= rangestart + 3)
11042                         for (; rangestart < i; rangestart++)
11043                             put_byte(sv, rangestart);
11044                     else {
11045                         put_byte(sv, rangestart);
11046                         sv_catpvs(sv, "-");
11047                         put_byte(sv, i - 1);
11048                     }
11049                     rangestart = -1;
11050                 }
11051             }
11052             sv_catpvs(sv, "]");
11053         }
11054
11055     } else if (k == CURLY) {
11056         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11057             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11058         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11059     }
11060     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
11061         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11062     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11063         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
11064         if ( RXp_PAREN_NAMES(prog) ) {
11065             if ( k != REF || (OP(o) < NREF)) {
11066                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11067                 SV **name= av_fetch(list, ARG(o), 0 );
11068                 if (name)
11069                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11070             }
11071             else {
11072                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11073                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11074                 I32 *nums=(I32*)SvPVX(sv_dat);
11075                 SV **name= av_fetch(list, nums[0], 0 );
11076                 I32 n;
11077                 if (name) {
11078                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
11079                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11080                                     (n ? "," : ""), (IV)nums[n]);
11081                     }
11082                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11083                 }
11084             }
11085         }
11086     } else if (k == GOSUB)
11087         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11088     else if (k == VERB) {
11089         if (!o->flags)
11090             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11091                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11092     } else if (k == LOGICAL)
11093         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
11094     else if (k == FOLDCHAR)
11095         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11096     else if (k == ANYOF) {
11097         int i, rangestart = -1;
11098         const U8 flags = ANYOF_FLAGS(o);
11099         int do_sep = 0;
11100
11101         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11102         static const char * const anyofs[] = {
11103             "\\w",
11104             "\\W",
11105             "\\s",
11106             "\\S",
11107             "\\d",
11108             "\\D",
11109             "[:alnum:]",
11110             "[:^alnum:]",
11111             "[:alpha:]",
11112             "[:^alpha:]",
11113             "[:ascii:]",
11114             "[:^ascii:]",
11115             "[:cntrl:]",
11116             "[:^cntrl:]",
11117             "[:graph:]",
11118             "[:^graph:]",
11119             "[:lower:]",
11120             "[:^lower:]",
11121             "[:print:]",
11122             "[:^print:]",
11123             "[:punct:]",
11124             "[:^punct:]",
11125             "[:upper:]",
11126             "[:^upper:]",
11127             "[:xdigit:]",
11128             "[:^xdigit:]",
11129             "[:space:]",
11130             "[:^space:]",
11131             "[:blank:]",
11132             "[:^blank:]"
11133         };
11134
11135         if (flags & ANYOF_LOCALE)
11136             sv_catpvs(sv, "{loc}");
11137         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11138             sv_catpvs(sv, "{i}");
11139         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11140         if (flags & ANYOF_INVERT)
11141             sv_catpvs(sv, "^");
11142
11143         /* output what the standard cp 0-255 bitmap matches */
11144         for (i = 0; i <= 256; i++) {
11145             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11146                 if (rangestart == -1)
11147                     rangestart = i;
11148             } else if (rangestart != -1) {
11149                 if (i <= rangestart + 3)
11150                     for (; rangestart < i; rangestart++)
11151                         put_byte(sv, rangestart);
11152                 else {
11153                     put_byte(sv, rangestart);
11154                     sv_catpvs(sv, "-");
11155                     put_byte(sv, i - 1);
11156                 }
11157                 do_sep = 1;
11158                 rangestart = -1;
11159             }
11160         }
11161
11162         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11163         /* output any special charclass tests (used entirely under use locale) */
11164         if (ANYOF_CLASS_TEST_ANY_SET(o))
11165             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11166                 if (ANYOF_CLASS_TEST(o,i)) {
11167                     sv_catpv(sv, anyofs[i]);
11168                     do_sep = 1;
11169                 }
11170
11171         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11172
11173         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11174             sv_catpvs(sv, "{non-utf8-latin1-all}");
11175         }
11176
11177         /* output information about the unicode matching */
11178         if (flags & ANYOF_UNICODE_ALL)
11179             sv_catpvs(sv, "{unicode_all}");
11180         else if (ANYOF_NONBITMAP(o))
11181             sv_catpvs(sv, "{unicode}");
11182         if (flags & ANYOF_NONBITMAP_NON_UTF8)
11183             sv_catpvs(sv, "{outside bitmap}");
11184
11185         if (ANYOF_NONBITMAP(o)) {
11186             SV *lv;
11187             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11188
11189             if (lv) {
11190                 if (sw) {
11191                     U8 s[UTF8_MAXBYTES_CASE+1];
11192
11193                     for (i = 0; i <= 256; i++) { /* just the first 256 */
11194                         uvchr_to_utf8(s, i);
11195
11196                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
11197                             if (rangestart == -1)
11198                                 rangestart = i;
11199                         } else if (rangestart != -1) {
11200                             if (i <= rangestart + 3)
11201                                 for (; rangestart < i; rangestart++) {
11202                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
11203                                     U8 *p;
11204                                     for(p = s; p < e; p++)
11205                                         put_byte(sv, *p);
11206                                 }
11207                             else {
11208                                 const U8 *e = uvchr_to_utf8(s,rangestart);
11209                                 U8 *p;
11210                                 for (p = s; p < e; p++)
11211                                     put_byte(sv, *p);
11212                                 sv_catpvs(sv, "-");
11213                                 e = uvchr_to_utf8(s, i-1);
11214                                 for (p = s; p < e; p++)
11215                                     put_byte(sv, *p);
11216                                 }
11217                                 rangestart = -1;
11218                             }
11219                         }
11220
11221                     sv_catpvs(sv, "..."); /* et cetera */
11222                 }
11223
11224                 {
11225                     char *s = savesvpv(lv);
11226                     char * const origs = s;
11227
11228                     while (*s && *s != '\n')
11229                         s++;
11230
11231                     if (*s == '\n') {
11232                         const char * const t = ++s;
11233
11234                         while (*s) {
11235                             if (*s == '\n')
11236                                 *s = ' ';
11237                             s++;
11238                         }
11239                         if (s[-1] == ' ')
11240                             s[-1] = 0;
11241
11242                         sv_catpv(sv, t);
11243                     }
11244
11245                     Safefree(origs);
11246                 }
11247             }
11248         }
11249
11250         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11251     }
11252     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11253         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11254 #else
11255     PERL_UNUSED_CONTEXT;
11256     PERL_UNUSED_ARG(sv);
11257     PERL_UNUSED_ARG(o);
11258     PERL_UNUSED_ARG(prog);
11259 #endif  /* DEBUGGING */
11260 }
11261
11262 SV *
11263 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11264 {                               /* Assume that RE_INTUIT is set */
11265     dVAR;
11266     struct regexp *const prog = (struct regexp *)SvANY(r);
11267     GET_RE_DEBUG_FLAGS_DECL;
11268
11269     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11270     PERL_UNUSED_CONTEXT;
11271
11272     DEBUG_COMPILE_r(
11273         {
11274             const char * const s = SvPV_nolen_const(prog->check_substr
11275                       ? prog->check_substr : prog->check_utf8);
11276
11277             if (!PL_colorset) reginitcolors();
11278             PerlIO_printf(Perl_debug_log,
11279                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11280                       PL_colors[4],
11281                       prog->check_substr ? "" : "utf8 ",
11282                       PL_colors[5],PL_colors[0],
11283                       s,
11284                       PL_colors[1],
11285                       (strlen(s) > 60 ? "..." : ""));
11286         } );
11287
11288     return prog->check_substr ? prog->check_substr : prog->check_utf8;
11289 }
11290
11291 /*
11292    pregfree()
11293
11294    handles refcounting and freeing the perl core regexp structure. When
11295    it is necessary to actually free the structure the first thing it
11296    does is call the 'free' method of the regexp_engine associated to
11297    the regexp, allowing the handling of the void *pprivate; member
11298    first. (This routine is not overridable by extensions, which is why
11299    the extensions free is called first.)
11300
11301    See regdupe and regdupe_internal if you change anything here.
11302 */
11303 #ifndef PERL_IN_XSUB_RE
11304 void
11305 Perl_pregfree(pTHX_ REGEXP *r)
11306 {
11307     SvREFCNT_dec(r);
11308 }
11309
11310 void
11311 Perl_pregfree2(pTHX_ REGEXP *rx)
11312 {
11313     dVAR;
11314     struct regexp *const r = (struct regexp *)SvANY(rx);
11315     GET_RE_DEBUG_FLAGS_DECL;
11316
11317     PERL_ARGS_ASSERT_PREGFREE2;
11318
11319     if (r->mother_re) {
11320         ReREFCNT_dec(r->mother_re);
11321     } else {
11322         CALLREGFREE_PVT(rx); /* free the private data */
11323         SvREFCNT_dec(RXp_PAREN_NAMES(r));
11324     }
11325     if (r->substrs) {
11326         SvREFCNT_dec(r->anchored_substr);
11327         SvREFCNT_dec(r->anchored_utf8);
11328         SvREFCNT_dec(r->float_substr);
11329         SvREFCNT_dec(r->float_utf8);
11330         Safefree(r->substrs);
11331     }
11332     RX_MATCH_COPY_FREE(rx);
11333 #ifdef PERL_OLD_COPY_ON_WRITE
11334     SvREFCNT_dec(r->saved_copy);
11335 #endif
11336     Safefree(r->offs);
11337 }
11338
11339 /*  reg_temp_copy()
11340
11341     This is a hacky workaround to the structural issue of match results
11342     being stored in the regexp structure which is in turn stored in
11343     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11344     could be PL_curpm in multiple contexts, and could require multiple
11345     result sets being associated with the pattern simultaneously, such
11346     as when doing a recursive match with (??{$qr})
11347
11348     The solution is to make a lightweight copy of the regexp structure
11349     when a qr// is returned from the code executed by (??{$qr}) this
11350     lightweight copy doesn't actually own any of its data except for
11351     the starp/end and the actual regexp structure itself.
11352
11353 */
11354
11355
11356 REGEXP *
11357 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11358 {
11359     struct regexp *ret;
11360     struct regexp *const r = (struct regexp *)SvANY(rx);
11361     register const I32 npar = r->nparens+1;
11362
11363     PERL_ARGS_ASSERT_REG_TEMP_COPY;
11364
11365     if (!ret_x)
11366         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11367     ret = (struct regexp *)SvANY(ret_x);
11368
11369     (void)ReREFCNT_inc(rx);
11370     /* We can take advantage of the existing "copied buffer" mechanism in SVs
11371        by pointing directly at the buffer, but flagging that the allocated
11372        space in the copy is zero. As we've just done a struct copy, it's now
11373        a case of zero-ing that, rather than copying the current length.  */
11374     SvPV_set(ret_x, RX_WRAPPED(rx));
11375     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11376     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11377            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11378     SvLEN_set(ret_x, 0);
11379     SvSTASH_set(ret_x, NULL);
11380     SvMAGIC_set(ret_x, NULL);
11381     Newx(ret->offs, npar, regexp_paren_pair);
11382     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11383     if (r->substrs) {
11384         Newx(ret->substrs, 1, struct reg_substr_data);
11385         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11386
11387         SvREFCNT_inc_void(ret->anchored_substr);
11388         SvREFCNT_inc_void(ret->anchored_utf8);
11389         SvREFCNT_inc_void(ret->float_substr);
11390         SvREFCNT_inc_void(ret->float_utf8);
11391
11392         /* check_substr and check_utf8, if non-NULL, point to either their
11393            anchored or float namesakes, and don't hold a second reference.  */
11394     }
11395     RX_MATCH_COPIED_off(ret_x);
11396 #ifdef PERL_OLD_COPY_ON_WRITE
11397     ret->saved_copy = NULL;
11398 #endif
11399     ret->mother_re = rx;
11400
11401     return ret_x;
11402 }
11403 #endif
11404
11405 /* regfree_internal()
11406
11407    Free the private data in a regexp. This is overloadable by
11408    extensions. Perl takes care of the regexp structure in pregfree(),
11409    this covers the *pprivate pointer which technically perl doesn't
11410    know about, however of course we have to handle the
11411    regexp_internal structure when no extension is in use.
11412
11413    Note this is called before freeing anything in the regexp
11414    structure.
11415  */
11416
11417 void
11418 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11419 {
11420     dVAR;
11421     struct regexp *const r = (struct regexp *)SvANY(rx);
11422     RXi_GET_DECL(r,ri);
11423     GET_RE_DEBUG_FLAGS_DECL;
11424
11425     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11426
11427     DEBUG_COMPILE_r({
11428         if (!PL_colorset)
11429             reginitcolors();
11430         {
11431             SV *dsv= sv_newmortal();
11432             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11433                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11434             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11435                 PL_colors[4],PL_colors[5],s);
11436         }
11437     });
11438 #ifdef RE_TRACK_PATTERN_OFFSETS
11439     if (ri->u.offsets)
11440         Safefree(ri->u.offsets);             /* 20010421 MJD */
11441 #endif
11442     if (ri->data) {
11443         int n = ri->data->count;
11444         PAD* new_comppad = NULL;
11445         PAD* old_comppad;
11446         PADOFFSET refcnt;
11447
11448         while (--n >= 0) {
11449           /* If you add a ->what type here, update the comment in regcomp.h */
11450             switch (ri->data->what[n]) {
11451             case 'a':
11452             case 's':
11453             case 'S':
11454             case 'u':
11455                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11456                 break;
11457             case 'f':
11458                 Safefree(ri->data->data[n]);
11459                 break;
11460             case 'p':
11461                 new_comppad = MUTABLE_AV(ri->data->data[n]);
11462                 break;
11463             case 'o':
11464                 if (new_comppad == NULL)
11465                     Perl_croak(aTHX_ "panic: pregfree comppad");
11466                 PAD_SAVE_LOCAL(old_comppad,
11467                     /* Watch out for global destruction's random ordering. */
11468                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11469                 );
11470                 OP_REFCNT_LOCK;
11471                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11472                 OP_REFCNT_UNLOCK;
11473                 if (!refcnt)
11474                     op_free((OP_4tree*)ri->data->data[n]);
11475
11476                 PAD_RESTORE_LOCAL(old_comppad);
11477                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11478                 new_comppad = NULL;
11479                 break;
11480             case 'n':
11481                 break;
11482             case 'T':
11483                 { /* Aho Corasick add-on structure for a trie node.
11484                      Used in stclass optimization only */
11485                     U32 refcount;
11486                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11487                     OP_REFCNT_LOCK;
11488                     refcount = --aho->refcount;
11489                     OP_REFCNT_UNLOCK;
11490                     if ( !refcount ) {
11491                         PerlMemShared_free(aho->states);
11492                         PerlMemShared_free(aho->fail);
11493                          /* do this last!!!! */
11494                         PerlMemShared_free(ri->data->data[n]);
11495                         PerlMemShared_free(ri->regstclass);
11496                     }
11497                 }
11498                 break;
11499             case 't':
11500                 {
11501                     /* trie structure. */
11502                     U32 refcount;
11503                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11504                     OP_REFCNT_LOCK;
11505                     refcount = --trie->refcount;
11506                     OP_REFCNT_UNLOCK;
11507                     if ( !refcount ) {
11508                         PerlMemShared_free(trie->charmap);
11509                         PerlMemShared_free(trie->states);
11510                         PerlMemShared_free(trie->trans);
11511                         if (trie->bitmap)
11512                             PerlMemShared_free(trie->bitmap);
11513                         if (trie->jump)
11514                             PerlMemShared_free(trie->jump);
11515                         PerlMemShared_free(trie->wordinfo);
11516                         /* do this last!!!! */
11517                         PerlMemShared_free(ri->data->data[n]);
11518                     }
11519                 }
11520                 break;
11521             default:
11522                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11523             }
11524         }
11525         Safefree(ri->data->what);
11526         Safefree(ri->data);
11527     }
11528
11529     Safefree(ri);
11530 }
11531
11532 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11533 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11534 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
11535
11536 /*
11537    re_dup - duplicate a regexp.
11538
11539    This routine is expected to clone a given regexp structure. It is only
11540    compiled under USE_ITHREADS.
11541
11542    After all of the core data stored in struct regexp is duplicated
11543    the regexp_engine.dupe method is used to copy any private data
11544    stored in the *pprivate pointer. This allows extensions to handle
11545    any duplication it needs to do.
11546
11547    See pregfree() and regfree_internal() if you change anything here.
11548 */
11549 #if defined(USE_ITHREADS)
11550 #ifndef PERL_IN_XSUB_RE
11551 void
11552 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11553 {
11554     dVAR;
11555     I32 npar;
11556     const struct regexp *r = (const struct regexp *)SvANY(sstr);
11557     struct regexp *ret = (struct regexp *)SvANY(dstr);
11558
11559     PERL_ARGS_ASSERT_RE_DUP_GUTS;
11560
11561     npar = r->nparens+1;
11562     Newx(ret->offs, npar, regexp_paren_pair);
11563     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11564     if(ret->swap) {
11565         /* no need to copy these */
11566         Newx(ret->swap, npar, regexp_paren_pair);
11567     }
11568
11569     if (ret->substrs) {
11570         /* Do it this way to avoid reading from *r after the StructCopy().
11571            That way, if any of the sv_dup_inc()s dislodge *r from the L1
11572            cache, it doesn't matter.  */
11573         const bool anchored = r->check_substr
11574             ? r->check_substr == r->anchored_substr
11575             : r->check_utf8 == r->anchored_utf8;
11576         Newx(ret->substrs, 1, struct reg_substr_data);
11577         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11578
11579         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11580         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11581         ret->float_substr = sv_dup_inc(ret->float_substr, param);
11582         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11583
11584         /* check_substr and check_utf8, if non-NULL, point to either their
11585            anchored or float namesakes, and don't hold a second reference.  */
11586
11587         if (ret->check_substr) {
11588             if (anchored) {
11589                 assert(r->check_utf8 == r->anchored_utf8);
11590                 ret->check_substr = ret->anchored_substr;
11591                 ret->check_utf8 = ret->anchored_utf8;
11592             } else {
11593                 assert(r->check_substr == r->float_substr);
11594                 assert(r->check_utf8 == r->float_utf8);
11595                 ret->check_substr = ret->float_substr;
11596                 ret->check_utf8 = ret->float_utf8;
11597             }
11598         } else if (ret->check_utf8) {
11599             if (anchored) {
11600                 ret->check_utf8 = ret->anchored_utf8;
11601             } else {
11602                 ret->check_utf8 = ret->float_utf8;
11603             }
11604         }
11605     }
11606
11607     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11608
11609     if (ret->pprivate)
11610         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11611
11612     if (RX_MATCH_COPIED(dstr))
11613         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
11614     else
11615         ret->subbeg = NULL;
11616 #ifdef PERL_OLD_COPY_ON_WRITE
11617     ret->saved_copy = NULL;
11618 #endif
11619
11620     if (ret->mother_re) {
11621         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11622             /* Our storage points directly to our mother regexp, but that's
11623                1: a buffer in a different thread
11624                2: something we no longer hold a reference on
11625                so we need to copy it locally.  */
11626             /* Note we need to sue SvCUR() on our mother_re, because it, in
11627                turn, may well be pointing to its own mother_re.  */
11628             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11629                                    SvCUR(ret->mother_re)+1));
11630             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11631         }
11632         ret->mother_re      = NULL;
11633     }
11634     ret->gofs = 0;
11635 }
11636 #endif /* PERL_IN_XSUB_RE */
11637
11638 /*
11639    regdupe_internal()
11640
11641    This is the internal complement to regdupe() which is used to copy
11642    the structure pointed to by the *pprivate pointer in the regexp.
11643    This is the core version of the extension overridable cloning hook.
11644    The regexp structure being duplicated will be copied by perl prior
11645    to this and will be provided as the regexp *r argument, however
11646    with the /old/ structures pprivate pointer value. Thus this routine
11647    may override any copying normally done by perl.
11648
11649    It returns a pointer to the new regexp_internal structure.
11650 */
11651
11652 void *
11653 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11654 {
11655     dVAR;
11656     struct regexp *const r = (struct regexp *)SvANY(rx);
11657     regexp_internal *reti;
11658     int len, npar;
11659     RXi_GET_DECL(r,ri);
11660
11661     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11662
11663     npar = r->nparens+1;
11664     len = ProgLen(ri);
11665
11666     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11667     Copy(ri->program, reti->program, len+1, regnode);
11668
11669
11670     reti->regstclass = NULL;
11671
11672     if (ri->data) {
11673         struct reg_data *d;
11674         const int count = ri->data->count;
11675         int i;
11676
11677         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11678                 char, struct reg_data);
11679         Newx(d->what, count, U8);
11680
11681         d->count = count;
11682         for (i = 0; i < count; i++) {
11683             d->what[i] = ri->data->what[i];
11684             switch (d->what[i]) {
11685                 /* legal options are one of: sSfpontTua
11686                    see also regcomp.h and pregfree() */
11687             case 'a': /* actually an AV, but the dup function is identical.  */
11688             case 's':
11689             case 'S':
11690             case 'p': /* actually an AV, but the dup function is identical.  */
11691             case 'u': /* actually an HV, but the dup function is identical.  */
11692                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11693                 break;
11694             case 'f':
11695                 /* This is cheating. */
11696                 Newx(d->data[i], 1, struct regnode_charclass_class);
11697                 StructCopy(ri->data->data[i], d->data[i],
11698                             struct regnode_charclass_class);
11699                 reti->regstclass = (regnode*)d->data[i];
11700                 break;
11701             case 'o':
11702                 /* Compiled op trees are readonly and in shared memory,
11703                    and can thus be shared without duplication. */
11704                 OP_REFCNT_LOCK;
11705                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11706                 OP_REFCNT_UNLOCK;
11707                 break;
11708             case 'T':
11709                 /* Trie stclasses are readonly and can thus be shared
11710                  * without duplication. We free the stclass in pregfree
11711                  * when the corresponding reg_ac_data struct is freed.
11712                  */
11713                 reti->regstclass= ri->regstclass;
11714                 /* Fall through */
11715             case 't':
11716                 OP_REFCNT_LOCK;
11717                 ((reg_trie_data*)ri->data->data[i])->refcount++;
11718                 OP_REFCNT_UNLOCK;
11719                 /* Fall through */
11720             case 'n':
11721                 d->data[i] = ri->data->data[i];
11722                 break;
11723             default:
11724                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11725             }
11726         }
11727
11728         reti->data = d;
11729     }
11730     else
11731         reti->data = NULL;
11732
11733     reti->name_list_idx = ri->name_list_idx;
11734
11735 #ifdef RE_TRACK_PATTERN_OFFSETS
11736     if (ri->u.offsets) {
11737         Newx(reti->u.offsets, 2*len+1, U32);
11738         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11739     }
11740 #else
11741     SetProgLen(reti,len);
11742 #endif
11743
11744     return (void*)reti;
11745 }
11746
11747 #endif    /* USE_ITHREADS */
11748
11749 #ifndef PERL_IN_XSUB_RE
11750
11751 /*
11752  - regnext - dig the "next" pointer out of a node
11753  */
11754 regnode *
11755 Perl_regnext(pTHX_ register regnode *p)
11756 {
11757     dVAR;
11758     register I32 offset;
11759
11760     if (!p)
11761         return(NULL);
11762
11763     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
11764         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11765     }
11766
11767     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11768     if (offset == 0)
11769         return(NULL);
11770
11771     return(p+offset);
11772 }
11773 #endif
11774
11775 STATIC void
11776 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11777 {
11778     va_list args;
11779     STRLEN l1 = strlen(pat1);
11780     STRLEN l2 = strlen(pat2);
11781     char buf[512];
11782     SV *msv;
11783     const char *message;
11784
11785     PERL_ARGS_ASSERT_RE_CROAK2;
11786
11787     if (l1 > 510)
11788         l1 = 510;
11789     if (l1 + l2 > 510)
11790         l2 = 510 - l1;
11791     Copy(pat1, buf, l1 , char);
11792     Copy(pat2, buf + l1, l2 , char);
11793     buf[l1 + l2] = '\n';
11794     buf[l1 + l2 + 1] = '\0';
11795 #ifdef I_STDARG
11796     /* ANSI variant takes additional second argument */
11797     va_start(args, pat2);
11798 #else
11799     va_start(args);
11800 #endif
11801     msv = vmess(buf, &args);
11802     va_end(args);
11803     message = SvPV_const(msv,l1);
11804     if (l1 > 512)
11805         l1 = 512;
11806     Copy(message, buf, l1 , char);
11807     buf[l1-1] = '\0';                   /* Overwrite \n */
11808     Perl_croak(aTHX_ "%s", buf);
11809 }
11810
11811 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
11812
11813 #ifndef PERL_IN_XSUB_RE
11814 void
11815 Perl_save_re_context(pTHX)
11816 {
11817     dVAR;
11818
11819     struct re_save_state *state;
11820
11821     SAVEVPTR(PL_curcop);
11822     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11823
11824     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11825     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11826     SSPUSHUV(SAVEt_RE_STATE);
11827
11828     Copy(&PL_reg_state, state, 1, struct re_save_state);
11829
11830     PL_reg_start_tmp = 0;
11831     PL_reg_start_tmpl = 0;
11832     PL_reg_oldsaved = NULL;
11833     PL_reg_oldsavedlen = 0;
11834     PL_reg_maxiter = 0;
11835     PL_reg_leftiter = 0;
11836     PL_reg_poscache = NULL;
11837     PL_reg_poscache_size = 0;
11838 #ifdef PERL_OLD_COPY_ON_WRITE
11839     PL_nrs = NULL;
11840 #endif
11841
11842     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11843     if (PL_curpm) {
11844         const REGEXP * const rx = PM_GETRE(PL_curpm);
11845         if (rx) {
11846             U32 i;
11847             for (i = 1; i <= RX_NPARENS(rx); i++) {
11848                 char digits[TYPE_CHARS(long)];
11849                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11850                 GV *const *const gvp
11851                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11852
11853                 if (gvp) {
11854                     GV * const gv = *gvp;
11855                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11856                         save_scalar(gv);
11857                 }
11858             }
11859         }
11860     }
11861 }
11862 #endif
11863
11864 static void
11865 clear_re(pTHX_ void *r)
11866 {
11867     dVAR;
11868     ReREFCNT_dec((REGEXP *)r);
11869 }
11870
11871 #ifdef DEBUGGING
11872
11873 STATIC void
11874 S_put_byte(pTHX_ SV *sv, int c)
11875 {
11876     PERL_ARGS_ASSERT_PUT_BYTE;
11877
11878     /* Our definition of isPRINT() ignores locales, so only bytes that are
11879        not part of UTF-8 are considered printable. I assume that the same
11880        holds for UTF-EBCDIC.
11881        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11882        which Wikipedia says:
11883
11884        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11885        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11886        identical, to the ASCII delete (DEL) or rubout control character.
11887        ) So the old condition can be simplified to !isPRINT(c)  */
11888     if (!isPRINT(c)) {
11889         if (c < 256) {
11890             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11891         }
11892         else {
11893             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11894         }
11895     }
11896     else {
11897         const char string = c;
11898         if (c == '-' || c == ']' || c == '\\' || c == '^')
11899             sv_catpvs(sv, "\\");
11900         sv_catpvn(sv, &string, 1);
11901     }
11902 }
11903
11904
11905 #define CLEAR_OPTSTART \
11906     if (optstart) STMT_START { \
11907             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11908             optstart=NULL; \
11909     } STMT_END
11910
11911 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11912
11913 STATIC const regnode *
11914 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11915             const regnode *last, const regnode *plast,
11916             SV* sv, I32 indent, U32 depth)
11917 {
11918     dVAR;
11919     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
11920     register const regnode *next;
11921     const regnode *optstart= NULL;
11922
11923     RXi_GET_DECL(r,ri);
11924     GET_RE_DEBUG_FLAGS_DECL;
11925
11926     PERL_ARGS_ASSERT_DUMPUNTIL;
11927
11928 #ifdef DEBUG_DUMPUNTIL
11929     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11930         last ? last-start : 0,plast ? plast-start : 0);
11931 #endif
11932
11933     if (plast && plast < last)
11934         last= plast;
11935
11936     while (PL_regkind[op] != END && (!last || node < last)) {
11937         /* While that wasn't END last time... */
11938         NODE_ALIGN(node);
11939         op = OP(node);
11940         if (op == CLOSE || op == WHILEM)
11941             indent--;
11942         next = regnext((regnode *)node);
11943
11944         /* Where, what. */
11945         if (OP(node) == OPTIMIZED) {
11946             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11947                 optstart = node;
11948             else
11949                 goto after_print;
11950         } else
11951             CLEAR_OPTSTART;
11952
11953         regprop(r, sv, node);
11954         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11955                       (int)(2*indent + 1), "", SvPVX_const(sv));
11956
11957         if (OP(node) != OPTIMIZED) {
11958             if (next == NULL)           /* Next ptr. */
11959                 PerlIO_printf(Perl_debug_log, " (0)");
11960             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11961                 PerlIO_printf(Perl_debug_log, " (FAIL)");
11962             else
11963                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11964             (void)PerlIO_putc(Perl_debug_log, '\n');
11965         }
11966
11967       after_print:
11968         if (PL_regkind[(U8)op] == BRANCHJ) {
11969             assert(next);
11970             {
11971                 register const regnode *nnode = (OP(next) == LONGJMP
11972                                              ? regnext((regnode *)next)
11973                                              : next);
11974                 if (last && nnode > last)
11975                     nnode = last;
11976                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11977             }
11978         }
11979         else if (PL_regkind[(U8)op] == BRANCH) {
11980             assert(next);
11981             DUMPUNTIL(NEXTOPER(node), next);
11982         }
11983         else if ( PL_regkind[(U8)op]  == TRIE ) {
11984             const regnode *this_trie = node;
11985             const char op = OP(node);
11986             const U32 n = ARG(node);
11987             const reg_ac_data * const ac = op>=AHOCORASICK ?
11988                (reg_ac_data *)ri->data->data[n] :
11989                NULL;
11990             const reg_trie_data * const trie =
11991                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11992 #ifdef DEBUGGING
11993             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11994 #endif
11995             const regnode *nextbranch= NULL;
11996             I32 word_idx;
11997             sv_setpvs(sv, "");
11998             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
11999                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
12000
12001                 PerlIO_printf(Perl_debug_log, "%*s%s ",
12002                    (int)(2*(indent+3)), "",
12003                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12004                             PL_colors[0], PL_colors[1],
12005                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12006                             PERL_PV_PRETTY_ELLIPSES    |
12007                             PERL_PV_PRETTY_LTGT
12008                             )
12009                             : "???"
12010                 );
12011                 if (trie->jump) {
12012                     U16 dist= trie->jump[word_idx+1];
12013                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12014                                   (UV)((dist ? this_trie + dist : next) - start));
12015                     if (dist) {
12016                         if (!nextbranch)
12017                             nextbranch= this_trie + trie->jump[0];
12018                         DUMPUNTIL(this_trie + dist, nextbranch);
12019                     }
12020                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12021                         nextbranch= regnext((regnode *)nextbranch);
12022                 } else {
12023                     PerlIO_printf(Perl_debug_log, "\n");
12024                 }
12025             }
12026             if (last && next > last)
12027                 node= last;
12028             else
12029                 node= next;
12030         }
12031         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
12032             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12033                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12034         }
12035         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
12036             assert(next);
12037             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
12038         }
12039         else if ( op == PLUS || op == STAR) {
12040             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
12041         }
12042         else if (PL_regkind[(U8)op] == ANYOF) {
12043             /* arglen 1 + class block */
12044             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
12045                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
12046             node = NEXTOPER(node);
12047         }
12048         else if (PL_regkind[(U8)op] == EXACT) {
12049             /* Literal string, where present. */
12050             node += NODE_SZ_STR(node) - 1;
12051             node = NEXTOPER(node);
12052         }
12053         else {
12054             node = NEXTOPER(node);
12055             node += regarglen[(U8)op];
12056         }
12057         if (op == CURLYX || op == OPEN)
12058             indent++;
12059     }
12060     CLEAR_OPTSTART;
12061 #ifdef DEBUG_DUMPUNTIL
12062     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12063 #endif
12064     return node;
12065 }
12066
12067 #endif  /* DEBUGGING */
12068
12069 /*
12070  * Local variables:
12071  * c-indentation-style: bsd
12072  * c-basic-offset: 4
12073  * indent-tabs-mode: t
12074  * End:
12075  *
12076  * ex: set ts=8 sts=4 sw=4 noet:
12077  */